Context Navigation

source: trunk/third/evolution/e-util/e-html-utils.c @ 19030

Visit:

Revision 19030, 10.8 KB checked in by ghudson, 21 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r19029, which included commits to RCS files with non-trunk default branches.

Line
1	/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2	/* e-html-utils.c
3	* Copyright (C) 2000 Ximian, Inc.
4	* Author: Dan Winship <danw@ximian.com>
5	*
6	* This library is free software; you can redistribute it and/or
7	* modify it under the terms of version 2 of the GNU General Public
8	* License as published by the Free Software Foundation.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	* General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public
16	* License along with this library; if not, write to the
17	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18	* Boston, MA 02111-1307, USA.
19	*/
20
21	#ifdef HAVE_CONFIG_H
22	#include <config.h>
23	#endif
24
25	#include <ctype.h>
26	#include <stdio.h>
27	#include <string.h>
28	#include <glib.h>
29	#include <gal/unicode/gunicode.h>
30
31	#include "e-html-utils.h"
32
/*
 * check_size:
 * @buffer: address of the caller's output buffer pointer; rewritten if the
 *          buffer is reallocated
 * @buffer_size: address of the buffer's capacity in bytes; rewritten when
 *               the buffer grows
 * @out: current write position inside *@buffer
 * @len: number of bytes the caller is about to write at @out
 *
 * Ensures that @len bytes plus one extra byte (room for a terminating NUL)
 * fit between @out and the end of *@buffer, growing the buffer with
 * g_realloc() when they do not.
 *
 * Returns: the write position to use from now on.  It differs from @out
 * only when the buffer was reallocated, so callers must always continue
 * with the returned pointer.
 */
static char *
check_size (char **buffer, int *buffer_size, char *out, int len)
{
	/* Work with the offset rather than with pointers: the comparison
	 * never forms a pointer past the allocation, and the offset stays
	 * valid if g_realloc() moves the block.
	 */
	int index = out - *buffer;

	if (index + len + 1 > *buffer_size) {
		/* Grow to what is needed, but at least double the capacity. */
		*buffer_size = MAX (index + len + 1, *buffer_size * 2);
		*buffer = g_realloc (*buffer, *buffer_size);
		out = *buffer + index;
	}
	return out;
}
45
/*
 * Classification bits for the 128 ASCII characters, indexed by character
 * code.  A table entry is the OR of:
 *
 *   1 = non-email-address chars: ()<>@,;:\"[]`'{}| (and space)
 *   2 = trailing url garbage:    ,.!?;:>)]}`'-_|
 *   4 = dns chars:               letters, digits, '-' and '.'
 *
 * Control characters and DEL carry bits 1 and 2.
 */
static const int special_chars[] = {
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* nul - 0x0f */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* 0x10 - 0x1f */
	1, 2, 1, 0, 0, 0, 0, 3, 1, 3, 0, 0, 3, 6, 6, 0,	/* sp - / */
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 1, 0, 3, 2,	/* 0 - ? */
	1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,	/* @ - O */
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 3, 0, 2,	/* P - _ */
	3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,	/* ` - o */
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 3, 3, 0, 3	/* p - del */
};

/*
 * Character-class predicates.  @c must be a non-negative character value
 * (pass bytes as unsigned char); it is evaluated more than once, so it
 * must not have side effects.  Values above 127 are never looked up in
 * the table: they count as trailing garbage and as neither address nor
 * domain-name characters.
 */
#define is_addr_char(c)        ((c) < 128 && !(special_chars[(c)] & 1))
#define is_trailing_garbage(c) ((c) > 127 || (special_chars[(c)] & 2))
#define is_domain_name_char(c) ((c) < 128 && (special_chars[(c)] & 4))
63
64	static char *
65	url_extract (const unsigned char **text, gboolean check)
66	{
67	const unsigned char end = text, *p;
68	char *out;
69
70	while (end && !isspace (end) && (end != '"') && (end < 0x80))
71	end++;
72
73	/* Back up if we probably went too far. */
74	while (end > text && is_trailing_garbage ((end - 1)))
75	end--;
76
77	if (check) {
78	/* Make sure we weren't fooled. */
79	p = memchr (text, ':', end - text);
80	if (!p \|\| end - p < 4)
81	return NULL;
82	}
83
84	out = g_strndup (text, end - text);
85	*text = end;
86	return out;
87	}
88
/*
 * email_address_extract:
 * @cur: address of a pointer to the '@' of a candidate address; on success
 *       it is advanced to the first byte after the address
 * @out: address of the caller's output write position; on success it is
 *       moved back by the length of the local part
 * @linestart: lower bound for the backward scan for the local part
 *
 * Tries to recognise "local-part@domain" around the '@' at *@cur.  The
 * local part is the run of is_addr_char() bytes directly before the '@';
 * the domain is the run of is_domain_name_char() bytes after it, minus
 * trailing garbage, and must still contain a '.'.
 *
 * Returns: a newly allocated copy of the address (release with g_free()),
 * or NULL if no address was recognised, in which case *@cur and *@out are
 * unchanged.
 *
 * NOTE(review): the rewind of *@out subtracts the number of INPUT bytes in
 * the local part, i.e. it assumes the caller emitted exactly one output
 * byte per local-part byte.  '&' is accepted by is_addr_char(), and a
 * caller that escapes it as a multi-byte entity would be rewound too
 * little - confirm against the caller.
 */
static char *
email_address_extract (const unsigned char **cur, char **out, const unsigned char *linestart)
{
	const unsigned char *start, *end, *dot;
	char *addr;

	/* *cur points to the '@'.  Look backward for a valid local-part.
	 * Comparing start against linestart (instead of start - 1) avoids
	 * forming a pointer in front of the text.
	 */
	for (start = *cur; start > linestart && is_addr_char (*(start - 1)); start--)
		;
	if (start == *cur)
		return NULL;

	/* Now look forward for a valid domain part */
	for (end = *cur + 1, dot = NULL; is_domain_name_char (*end); end++) {
		if (*end == '.' && !dot)
			dot = end;
	}
	if (!dot)
		return NULL;

	/* Remove trailing garbage.  The '@' itself is not garbage, so this
	 * never backs up past *cur + 1.
	 */
	while (is_trailing_garbage (*(end - 1)))
		end--;

	/* The first dot must survive the trimming; otherwise something
	 * like "x@." would be accepted as the address "x@".
	 */
	if (dot >= end)
		return NULL;

	addr = g_strndup ((const char *) start, end - start);
	*out -= *cur - start;
	*cur = end;

	return addr;
}
121
122	static gboolean
123	is_citation (const unsigned char *c, gboolean saw_citation)
124	{
125	const unsigned char *p;
126
127	if (*c != '>')
128	return FALSE;
129
130	/* A line that starts with a ">" is a citation, unless it's
131	* just mbox From-mangling...
132	*/
133	if (strncmp (c, ">From ", 6) != 0)
134	return TRUE;
135
136	/* If the previous line was a citation, then say this
137	* one is too.
138	*/
139	if (saw_citation)
140	return TRUE;
141
142	/* Same if the next line is */
143	p = (const unsigned char )strchr ((const char )c, '\n');
144	if (p && *++p == '>')
145	return TRUE;
146
147	/* Otherwise, it was just an isolated ">From" line. */
148	return FALSE;
149	}
150
151	/**
152	* e_text_to_html_full:
153	* @input: a NUL-terminated input buffer
154	* @flags: some combination of the E_TEXT_TO_HTML_* flags defined
155	* in e-html-utils.h
156	* @color: color for citation highlighting
157	*
158	* This takes a buffer of text as input and produces a buffer of
159	* "equivalent" HTML, subject to certain transformation rules.
160	*
161	* The set of possible flags is:
162	*
163	* - E_TEXT_TO_HTML_PRE: wrap the output HTML in <PRE> and </PRE>.
164	* Should only be used if @input is the entire buffer to be
165	* converted. If e_text_to_html is being called with small pieces
166	* of data, you should wrap the entire result in <PRE> yourself.
167	*
168	* - E_TEXT_TO_HTML_CONVERT_NL: convert "\n" to "<BR>\n" on output.
169	* (should not be used with E_TEXT_TO_HTML_PRE, since that would
170	* result in double-newlines).
171	*
172	* - E_TEXT_TO_HTML_CONVERT_SPACES: convert a block of N spaces
173	* into N-1 non-breaking spaces and one normal space. A space
174	* at the start of the buffer is always converted to a
175	* non-breaking space, regardless of the following character,
176	* which probably means you don't want to use this flag on
177	* pieces of data that aren't delimited by at least line breaks.
178	*
179	* If E_TEXT_TO_HTML_CONVERT_NL and E_TEXT_TO_HTML_CONVERT_SPACES
180	* are both defined, then TABs will also be converted to spaces.
181	*
182	* - E_TEXT_TO_HTML_CONVERT_URLS: wrap <a href="..."> </a> around
183	* strings that look like URLs.
184	*
185	* - E_TEXT_TO_HTML_CONVERT_ADDRESSES: wrap <a href="mailto:..."> </a> around
186	* strings that look like mail addresses.
187	*
188	* - E_TEXT_TO_HTML_MARK_CITATION: wrap <font color="..."> </font> around
189	* citations (lines beginning with "> ", etc).
190	*
191	* - E_TEXT_TO_HTML_ESCAPE_8BIT: flatten everything to US-ASCII
192	*
193	* - E_TEXT_TO_HTML_CITE: quote the text with "> " at the start of each
194	* line.
195	**/
196	char *
197	e_text_to_html_full (const char *input, unsigned int flags, guint32 color)
198	{
199	const unsigned char cur, next, *linestart;
200	char *buffer = NULL;
201	char *out = NULL;
202	int buffer_size = 0, col;
203	gboolean colored = FALSE, saw_citation = FALSE;
204
205	/* Allocate a translation buffer. */
206	buffer_size = strlen (input) * 2 + 5;
207	buffer = g_malloc (buffer_size);
208
209	out = buffer;
210	if (flags & E_TEXT_TO_HTML_PRE)
211	out += sprintf (out, "<PRE>");
212
213	col = 0;
214
215	for (cur = linestart = input; cur && *cur; cur = next) {
216	gunichar u;
217
218	if (flags & E_TEXT_TO_HTML_MARK_CITATION && col == 0) {
219	saw_citation = is_citation (cur, saw_citation);
220	if (saw_citation) {
221	if (!colored) {
222	gchar font [25];
223
224	g_snprintf (font, 25, "<FONT COLOR=\"#%06x\">", color);
225
226	out = check_size (&buffer, &buffer_size, out, 25);
227	out += sprintf (out, "%s", font);
228	colored = TRUE;
229	}
230	} else if (colored) {
231	gchar *no_font = "</FONT>";
232
233	out = check_size (&buffer, &buffer_size, out, 9);
234	out += sprintf (out, "%s", no_font);
235	colored = FALSE;
236	}
237
238	/* Display mbox-mangled ">From" as "From" */
239	if (*cur == '>' && !saw_citation)
240	cur++;
241	} else if (flags & E_TEXT_TO_HTML_CITE && col == 0) {
242	out = check_size (&buffer, &buffer_size, out, 5);
243	out += sprintf (out, "> ");
244	}
245
246	u = g_utf8_get_char (cur);
247	if (g_unichar_isalpha (u) &&
248	(flags & E_TEXT_TO_HTML_CONVERT_URLS)) {
249	char tmpurl = NULL, refurl = NULL, *dispurl = NULL;
250
251	if (!strncasecmp (cur, "http://", 7) \|\|
252	!strncasecmp (cur, "https://", 8) \|\|
253	!strncasecmp (cur, "ftp://", 6) \|\|
254	!strncasecmp (cur, "nntp://", 7) \|\|
255	!strncasecmp (cur, "mailto:", 7) \|\|
256	!strncasecmp (cur, "news:", 5) \|\|
257	!strncasecmp (cur, "file:", 5)) {
258	tmpurl = url_extract (&cur, TRUE);
259	if (tmpurl) {
260	refurl = e_text_to_html (tmpurl, 0);
261	dispurl = g_strdup (refurl);
262	}
263	} else if (!strncasecmp (cur, "www.", 4) &&
264	(*(cur + 4) < 0x80) &&
265	g_unichar_isalnum (*(cur + 4))) {
266	tmpurl = url_extract (&cur, FALSE);
267	dispurl = e_text_to_html (tmpurl, 0);
268	refurl = g_strdup_printf ("http://%s",
269	dispurl);
270	}
271
272	if (tmpurl) {
273	out = check_size (&buffer, &buffer_size, out,
274	strlen (refurl) +
275	strlen (dispurl) + 15);
276	out += sprintf (out,
277	"<a href=\"%s\">%s</a>",
278	refurl, dispurl);
279	col += strlen (tmpurl);
280	g_free (tmpurl);
281	g_free (refurl);
282	g_free (dispurl);
283	}
284
285	if (!*cur)
286	break;
287	u = g_utf8_get_char (cur);
288	}
289
290	if (u == '@' && (flags & E_TEXT_TO_HTML_CONVERT_ADDRESSES)) {
291	char addr, dispaddr, *outaddr;
292
293	addr = email_address_extract (&cur, &out, linestart);
294	if (addr) {
295	dispaddr = e_text_to_html (addr, 0);
296	outaddr = g_strdup_printf ("<a href=\"mailto:%s\">%s</a>",
297	addr, dispaddr);
298	out = check_size (&buffer, &buffer_size, out, strlen (outaddr));
299	out += sprintf (out, "%s", outaddr);
300	col += strlen (addr);
301	g_free (addr);
302	g_free (dispaddr);
303	g_free (outaddr);
304
305	if (!*cur)
306	break;
307	u = g_utf8_get_char (cur);
308	}
309	}
310
311	if (!g_unichar_validate (u)) {
312	/* Sigh. Someone sent undeclared 8-bit data.
313	* Assume it's iso-8859-1.
314	*/
315	u = *cur;
316	next = cur + 1;
317	} else
318	next = g_utf8_next_char (cur);
319
320	out = check_size (&buffer, &buffer_size, out, 10);
321
322	switch (u) {
323	case '<':
324	strcpy (out, "<");
325	out += 4;
326	col++;
327	break;
328
329	case '>':
330	strcpy (out, ">");
331	out += 4;
332	col++;
333	break;
334
335	case '&':
336	strcpy (out, "&");
337	out += 5;
338	col++;
339	break;
340
341	case '"':
342	strcpy (out, """);
343	out += 6;
344	col++;
345	break;
346
347	case '\n':
348	if (flags & E_TEXT_TO_HTML_CONVERT_NL) {
349	strcpy (out, "<br>");
350	out += 4;
351	}
352	out++ = cur;
353	linestart = cur;
354	col = 0;
355	break;
356
357	case '\t':
358	if (flags & (E_TEXT_TO_HTML_CONVERT_SPACES \|
359	E_TEXT_TO_HTML_CONVERT_NL)) {
360	do {
361	out = check_size (&buffer, &buffer_size,
362	out, 7);
363	strcpy (out, " ");
364	out += 6;
365	col++;
366	} while (col % 8);
367	break;
368	}
369	/* otherwise, FALL THROUGH */
370
371	case ' ':
372	if (flags & E_TEXT_TO_HTML_CONVERT_SPACES) {
373	if (cur == (const unsigned char *)input \|\|
374	(cur + 1) == ' ' \|\| (cur + 1) == '\t' \|\|
375	*(cur - 1) == '\n') {
376	strcpy (out, " ");
377	out += 6;
378	col++;
379	break;
380	}
381	}
382	/* otherwise, FALL THROUGH */
383
384	default:
385	if ((u >= 0x20 && u < 0x80) \|\|
386	(u == '\r' \|\| u == '\t')) {
387	/* Default case, just copy. */
388	*out++ = u;
389	} else {
390	if (flags & E_TEXT_TO_HTML_ESCAPE_8BIT)
391	*out++ = '?';
392	else
393	out += g_snprintf(out, 9, "&#%d;", u);
394	}
395	col++;
396	break;
397	}
398	}
399
400	out = check_size (&buffer, &buffer_size, out, 7);
401	if (flags & E_TEXT_TO_HTML_PRE)
402	strcpy (out, "</PRE>");
403	else
404	*out = '\0';
405
406	return buffer;
407	}
408
409	char *
410	e_text_to_html (const char *input, unsigned int flags)
411	{
412	return e_text_to_html_full (input, flags, 0);
413	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: