1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ |
---|
2 | /* e-html-utils.c |
---|
3 | * Copyright (C) 2000 Ximian, Inc. |
---|
4 | * Author: Dan Winship <danw@ximian.com> |
---|
5 | * |
---|
6 | * This library is free software; you can redistribute it and/or |
---|
7 | * modify it under the terms of version 2 of the GNU General Public |
---|
8 | * License as published by the Free Software Foundation. |
---|
9 | * |
---|
10 | * This program is distributed in the hope that it will be useful, |
---|
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
13 | * General Public License for more details. |
---|
14 | * |
---|
15 | * You should have received a copy of the GNU General Public |
---|
16 | * License along with this library; if not, write to the |
---|
17 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
---|
18 | * Boston, MA 02111-1307, USA. |
---|
19 | */ |
---|
20 | |
---|
21 | #ifdef HAVE_CONFIG_H |
---|
22 | #include <config.h> |
---|
23 | #endif |
---|
24 | |
---|
25 | #include <ctype.h> |
---|
26 | #include <stdio.h> |
---|
27 | #include <string.h> |
---|
28 | #include <glib.h> |
---|
29 | #include <gal/unicode/gunicode.h> |
---|
30 | |
---|
31 | #include "e-html-utils.h" |
---|
32 | |
---|
/*
 * check_size:
 * @buffer: address of the output buffer pointer; updated if the buffer
 *          is reallocated
 * @buffer_size: address of the current allocation size in bytes; updated
 *               if the buffer grows
 * @out: current write position inside *@buffer
 * @len: number of bytes the caller is about to write at @out (not
 *       counting the terminating NUL)
 *
 * Makes sure that @len bytes plus one terminating NUL fit at @out.  When
 * they do not, the buffer is grown with g_realloc() to the larger of
 * "exactly enough" and twice its current size.
 *
 * Returns: the write position to use from now on.  This differs from
 * @out whenever the buffer was reallocated, so callers must always
 * continue with the returned pointer.
 */
static char *
check_size (char **buffer, int *buffer_size, char *out, int len)
{
	int index = out - *buffer;

	/* Compare integer offsets instead of pointers: computing
	 * "out + len + 1" may yield an address beyond the allocation,
	 * which is undefined behaviour even if it is only compared.
	 */
	if (index + len + 1 > *buffer_size) {
		*buffer_size = MAX (index + len + 1, *buffer_size * 2);
		*buffer = g_realloc (*buffer, *buffer_size);
		out = *buffer + index;
	}

	return out;
}
---|
45 | |
---|
/* Classification table for the 128 7-bit ASCII codes, indexed by
 * character code.  Each entry is a bitmask:
 *
 *   1 = non-email-address chars: ()<>@,;:\"[]`'{}|
 *       (space and the control characters carry this bit as well)
 *   2 = trailing url garbage: ,.!?;:>)]}`'-_|
 *       (the control characters carry this bit as well)
 *   4 = dns chars
 */
static const int special_chars[] = {
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* nul - 0x0f */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* 0x10 - 0x1f */
	1, 2, 1, 0, 0, 0, 0, 3, 1, 3, 0, 0, 3, 6, 6, 0,	/* sp - / */
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 1, 0, 3, 2,	/* 0 - ? */
	1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,	/* @ - O */
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 3, 0, 2,	/* P - _ */
	3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,	/* ` - o */
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 3, 3, 0, 3	/* p - del */
};

/* Classification helpers.  The argument must be non-negative (the callers
 * in this file pass unsigned char values) and is evaluated more than
 * once, so it must not have side effects.  Values above 127 never index
 * the table: they are treated as "not an address char", "not a domain
 * char" and "trailing garbage".
 */
#define is_addr_char(c)        ((c) < 128 && !(special_chars[(c)] & 1))
#define is_trailing_garbage(c) ((c) > 127 || (special_chars[(c)] & 2))
#define is_domain_name_char(c) ((c) < 128 && (special_chars[(c)] & 4))
---|
63 | |
---|
64 | static char * |
---|
65 | url_extract (const unsigned char **text, gboolean check) |
---|
66 | { |
---|
67 | const unsigned char *end = *text, *p; |
---|
68 | char *out; |
---|
69 | |
---|
70 | while (*end && !isspace (*end) && (*end != '"') && (*end < 0x80)) |
---|
71 | end++; |
---|
72 | |
---|
73 | /* Back up if we probably went too far. */ |
---|
74 | while (end > *text && is_trailing_garbage (*(end - 1))) |
---|
75 | end--; |
---|
76 | |
---|
77 | if (check) { |
---|
78 | /* Make sure we weren't fooled. */ |
---|
79 | p = memchr (*text, ':', end - *text); |
---|
80 | if (!p || end - p < 4) |
---|
81 | return NULL; |
---|
82 | } |
---|
83 | |
---|
84 | out = g_strndup (*text, end - *text); |
---|
85 | *text = end; |
---|
86 | return out; |
---|
87 | } |
---|
88 | |
---|
89 | static char * |
---|
90 | email_address_extract (const unsigned char **cur, char **out, const unsigned char *linestart) |
---|
91 | { |
---|
92 | const unsigned char *start, *end, *dot; |
---|
93 | char *addr; |
---|
94 | |
---|
95 | /* *cur points to the '@'. Look backward for a valid local-part */ |
---|
96 | for (start = *cur; start - 1 >= linestart && is_addr_char (*(start - 1)); start--) |
---|
97 | ; |
---|
98 | if (start == *cur) |
---|
99 | return NULL; |
---|
100 | |
---|
101 | /* Now look forward for a valid domain part */ |
---|
102 | for (end = *cur + 1, dot = NULL; is_domain_name_char (*end); end++) { |
---|
103 | if (*end == '.' && !dot) |
---|
104 | dot = end; |
---|
105 | } |
---|
106 | if (!dot) |
---|
107 | return NULL; |
---|
108 | |
---|
109 | /* Remove trailing garbage */ |
---|
110 | while (is_trailing_garbage (*(end - 1))) |
---|
111 | end--; |
---|
112 | if (dot > end) |
---|
113 | return NULL; |
---|
114 | |
---|
115 | addr = g_strndup (start, end - start); |
---|
116 | *out -= *cur - start; |
---|
117 | *cur = end; |
---|
118 | |
---|
119 | return addr; |
---|
120 | } |
---|
121 | |
---|
122 | static gboolean |
---|
123 | is_citation (const unsigned char *c, gboolean saw_citation) |
---|
124 | { |
---|
125 | const unsigned char *p; |
---|
126 | |
---|
127 | if (*c != '>') |
---|
128 | return FALSE; |
---|
129 | |
---|
130 | /* A line that starts with a ">" is a citation, unless it's |
---|
131 | * just mbox From-mangling... |
---|
132 | */ |
---|
133 | if (strncmp (c, ">From ", 6) != 0) |
---|
134 | return TRUE; |
---|
135 | |
---|
136 | /* If the previous line was a citation, then say this |
---|
137 | * one is too. |
---|
138 | */ |
---|
139 | if (saw_citation) |
---|
140 | return TRUE; |
---|
141 | |
---|
142 | /* Same if the next line is */ |
---|
143 | p = (const unsigned char *)strchr ((const char *)c, '\n'); |
---|
144 | if (p && *++p == '>') |
---|
145 | return TRUE; |
---|
146 | |
---|
147 | /* Otherwise, it was just an isolated ">From" line. */ |
---|
148 | return FALSE; |
---|
149 | } |
---|
150 | |
---|
151 | /** |
---|
152 | * e_text_to_html_full: |
---|
153 | * @input: a NUL-terminated input buffer |
---|
154 | * @flags: some combination of the E_TEXT_TO_HTML_* flags defined |
---|
155 | * in e-html-utils.h |
---|
156 | * @color: color for citation highlighting |
---|
157 | * |
---|
158 | * This takes a buffer of text as input and produces a buffer of |
---|
159 | * "equivalent" HTML, subject to certain transformation rules. |
---|
160 | * |
---|
161 | * The set of possible flags is: |
---|
162 | * |
---|
163 | * - E_TEXT_TO_HTML_PRE: wrap the output HTML in <PRE> and </PRE>. |
---|
164 | * Should only be used if @input is the entire buffer to be |
---|
165 | * converted. If e_text_to_html is being called with small pieces |
---|
166 | * of data, you should wrap the entire result in <PRE> yourself. |
---|
167 | * |
---|
168 | * - E_TEXT_TO_HTML_CONVERT_NL: convert "\n" to "<BR>\n" on output. |
---|
169 | * (should not be used with E_TEXT_TO_HTML_PRE, since that would |
---|
170 | * result in double-newlines). |
---|
171 | * |
---|
172 | * - E_TEXT_TO_HTML_CONVERT_SPACES: convert a block of N spaces |
---|
173 | * into N-1 non-breaking spaces and one normal space. A space |
---|
174 | * at the start of the buffer is always converted to a |
---|
175 | * non-breaking space, regardless of the following character, |
---|
176 | * which probably means you don't want to use this flag on |
---|
177 | * pieces of data that aren't delimited by at least line breaks. |
---|
178 | * |
---|
179 | * If E_TEXT_TO_HTML_CONVERT_NL and E_TEXT_TO_HTML_CONVERT_SPACES |
---|
180 | * are both defined, then TABs will also be converted to spaces. |
---|
181 | * |
---|
182 | * - E_TEXT_TO_HTML_CONVERT_URLS: wrap <a href="..."> </a> around |
---|
183 | * strings that look like URLs. |
---|
184 | * |
---|
185 | * - E_TEXT_TO_HTML_CONVERT_ADDRESSES: wrap <a href="mailto:..."> </a> around |
---|
186 | * strings that look like mail addresses. |
---|
187 | * |
---|
188 | * - E_TEXT_TO_HTML_MARK_CITATION: wrap <font color="..."> </font> around |
---|
189 | * citations (lines beginning with "> ", etc). |
---|
190 | * |
---|
191 | * - E_TEXT_TO_HTML_ESCAPE_8BIT: flatten everything to US-ASCII |
---|
192 | * |
---|
193 | * - E_TEXT_TO_HTML_CITE: quote the text with "> " at the start of each |
---|
194 | * line. |
---|
195 | **/ |
---|
196 | char * |
---|
197 | e_text_to_html_full (const char *input, unsigned int flags, guint32 color) |
---|
198 | { |
---|
199 | const unsigned char *cur, *next, *linestart; |
---|
200 | char *buffer = NULL; |
---|
201 | char *out = NULL; |
---|
202 | int buffer_size = 0, col; |
---|
203 | gboolean colored = FALSE, saw_citation = FALSE; |
---|
204 | |
---|
205 | /* Allocate a translation buffer. */ |
---|
206 | buffer_size = strlen (input) * 2 + 5; |
---|
207 | buffer = g_malloc (buffer_size); |
---|
208 | |
---|
209 | out = buffer; |
---|
210 | if (flags & E_TEXT_TO_HTML_PRE) |
---|
211 | out += sprintf (out, "<PRE>"); |
---|
212 | |
---|
213 | col = 0; |
---|
214 | |
---|
215 | for (cur = linestart = input; cur && *cur; cur = next) { |
---|
216 | gunichar u; |
---|
217 | |
---|
218 | if (flags & E_TEXT_TO_HTML_MARK_CITATION && col == 0) { |
---|
219 | saw_citation = is_citation (cur, saw_citation); |
---|
220 | if (saw_citation) { |
---|
221 | if (!colored) { |
---|
222 | gchar font [25]; |
---|
223 | |
---|
224 | g_snprintf (font, 25, "<FONT COLOR=\"#%06x\">", color); |
---|
225 | |
---|
226 | out = check_size (&buffer, &buffer_size, out, 25); |
---|
227 | out += sprintf (out, "%s", font); |
---|
228 | colored = TRUE; |
---|
229 | } |
---|
230 | } else if (colored) { |
---|
231 | gchar *no_font = "</FONT>"; |
---|
232 | |
---|
233 | out = check_size (&buffer, &buffer_size, out, 9); |
---|
234 | out += sprintf (out, "%s", no_font); |
---|
235 | colored = FALSE; |
---|
236 | } |
---|
237 | |
---|
238 | /* Display mbox-mangled ">From" as "From" */ |
---|
239 | if (*cur == '>' && !saw_citation) |
---|
240 | cur++; |
---|
241 | } else if (flags & E_TEXT_TO_HTML_CITE && col == 0) { |
---|
242 | out = check_size (&buffer, &buffer_size, out, 5); |
---|
243 | out += sprintf (out, "> "); |
---|
244 | } |
---|
245 | |
---|
246 | u = g_utf8_get_char (cur); |
---|
247 | if (g_unichar_isalpha (u) && |
---|
248 | (flags & E_TEXT_TO_HTML_CONVERT_URLS)) { |
---|
249 | char *tmpurl = NULL, *refurl = NULL, *dispurl = NULL; |
---|
250 | |
---|
251 | if (!strncasecmp (cur, "http://", 7) || |
---|
252 | !strncasecmp (cur, "https://", 8) || |
---|
253 | !strncasecmp (cur, "ftp://", 6) || |
---|
254 | !strncasecmp (cur, "nntp://", 7) || |
---|
255 | !strncasecmp (cur, "mailto:", 7) || |
---|
256 | !strncasecmp (cur, "news:", 5) || |
---|
257 | !strncasecmp (cur, "file:", 5)) { |
---|
258 | tmpurl = url_extract (&cur, TRUE); |
---|
259 | if (tmpurl) { |
---|
260 | refurl = e_text_to_html (tmpurl, 0); |
---|
261 | dispurl = g_strdup (refurl); |
---|
262 | } |
---|
263 | } else if (!strncasecmp (cur, "www.", 4) && |
---|
264 | (*(cur + 4) < 0x80) && |
---|
265 | g_unichar_isalnum (*(cur + 4))) { |
---|
266 | tmpurl = url_extract (&cur, FALSE); |
---|
267 | dispurl = e_text_to_html (tmpurl, 0); |
---|
268 | refurl = g_strdup_printf ("http://%s", |
---|
269 | dispurl); |
---|
270 | } |
---|
271 | |
---|
272 | if (tmpurl) { |
---|
273 | out = check_size (&buffer, &buffer_size, out, |
---|
274 | strlen (refurl) + |
---|
275 | strlen (dispurl) + 15); |
---|
276 | out += sprintf (out, |
---|
277 | "<a href=\"%s\">%s</a>", |
---|
278 | refurl, dispurl); |
---|
279 | col += strlen (tmpurl); |
---|
280 | g_free (tmpurl); |
---|
281 | g_free (refurl); |
---|
282 | g_free (dispurl); |
---|
283 | } |
---|
284 | |
---|
285 | if (!*cur) |
---|
286 | break; |
---|
287 | u = g_utf8_get_char (cur); |
---|
288 | } |
---|
289 | |
---|
290 | if (u == '@' && (flags & E_TEXT_TO_HTML_CONVERT_ADDRESSES)) { |
---|
291 | char *addr, *dispaddr, *outaddr; |
---|
292 | |
---|
293 | addr = email_address_extract (&cur, &out, linestart); |
---|
294 | if (addr) { |
---|
295 | dispaddr = e_text_to_html (addr, 0); |
---|
296 | outaddr = g_strdup_printf ("<a href=\"mailto:%s\">%s</a>", |
---|
297 | addr, dispaddr); |
---|
298 | out = check_size (&buffer, &buffer_size, out, strlen (outaddr)); |
---|
299 | out += sprintf (out, "%s", outaddr); |
---|
300 | col += strlen (addr); |
---|
301 | g_free (addr); |
---|
302 | g_free (dispaddr); |
---|
303 | g_free (outaddr); |
---|
304 | |
---|
305 | if (!*cur) |
---|
306 | break; |
---|
307 | u = g_utf8_get_char (cur); |
---|
308 | } |
---|
309 | } |
---|
310 | |
---|
311 | if (!g_unichar_validate (u)) { |
---|
312 | /* Sigh. Someone sent undeclared 8-bit data. |
---|
313 | * Assume it's iso-8859-1. |
---|
314 | */ |
---|
315 | u = *cur; |
---|
316 | next = cur + 1; |
---|
317 | } else |
---|
318 | next = g_utf8_next_char (cur); |
---|
319 | |
---|
320 | out = check_size (&buffer, &buffer_size, out, 10); |
---|
321 | |
---|
322 | switch (u) { |
---|
323 | case '<': |
---|
324 | strcpy (out, "<"); |
---|
325 | out += 4; |
---|
326 | col++; |
---|
327 | break; |
---|
328 | |
---|
329 | case '>': |
---|
330 | strcpy (out, ">"); |
---|
331 | out += 4; |
---|
332 | col++; |
---|
333 | break; |
---|
334 | |
---|
335 | case '&': |
---|
336 | strcpy (out, "&"); |
---|
337 | out += 5; |
---|
338 | col++; |
---|
339 | break; |
---|
340 | |
---|
341 | case '"': |
---|
342 | strcpy (out, """); |
---|
343 | out += 6; |
---|
344 | col++; |
---|
345 | break; |
---|
346 | |
---|
347 | case '\n': |
---|
348 | if (flags & E_TEXT_TO_HTML_CONVERT_NL) { |
---|
349 | strcpy (out, "<br>"); |
---|
350 | out += 4; |
---|
351 | } |
---|
352 | *out++ = *cur; |
---|
353 | linestart = cur; |
---|
354 | col = 0; |
---|
355 | break; |
---|
356 | |
---|
357 | case '\t': |
---|
358 | if (flags & (E_TEXT_TO_HTML_CONVERT_SPACES | |
---|
359 | E_TEXT_TO_HTML_CONVERT_NL)) { |
---|
360 | do { |
---|
361 | out = check_size (&buffer, &buffer_size, |
---|
362 | out, 7); |
---|
363 | strcpy (out, " "); |
---|
364 | out += 6; |
---|
365 | col++; |
---|
366 | } while (col % 8); |
---|
367 | break; |
---|
368 | } |
---|
369 | /* otherwise, FALL THROUGH */ |
---|
370 | |
---|
371 | case ' ': |
---|
372 | if (flags & E_TEXT_TO_HTML_CONVERT_SPACES) { |
---|
373 | if (cur == (const unsigned char *)input || |
---|
374 | *(cur + 1) == ' ' || *(cur + 1) == '\t' || |
---|
375 | *(cur - 1) == '\n') { |
---|
376 | strcpy (out, " "); |
---|
377 | out += 6; |
---|
378 | col++; |
---|
379 | break; |
---|
380 | } |
---|
381 | } |
---|
382 | /* otherwise, FALL THROUGH */ |
---|
383 | |
---|
384 | default: |
---|
385 | if ((u >= 0x20 && u < 0x80) || |
---|
386 | (u == '\r' || u == '\t')) { |
---|
387 | /* Default case, just copy. */ |
---|
388 | *out++ = u; |
---|
389 | } else { |
---|
390 | if (flags & E_TEXT_TO_HTML_ESCAPE_8BIT) |
---|
391 | *out++ = '?'; |
---|
392 | else |
---|
393 | out += g_snprintf(out, 9, "&#%d;", u); |
---|
394 | } |
---|
395 | col++; |
---|
396 | break; |
---|
397 | } |
---|
398 | } |
---|
399 | |
---|
400 | out = check_size (&buffer, &buffer_size, out, 7); |
---|
401 | if (flags & E_TEXT_TO_HTML_PRE) |
---|
402 | strcpy (out, "</PRE>"); |
---|
403 | else |
---|
404 | *out = '\0'; |
---|
405 | |
---|
406 | return buffer; |
---|
407 | } |
---|
408 | |
---|
409 | char * |
---|
410 | e_text_to_html (const char *input, unsigned int flags) |
---|
411 | { |
---|
412 | return e_text_to_html_full (input, flags, 0); |
---|
413 | } |
---|