Context Navigation

source: trunk/third/evolution/e-util/ename/e-name-western.c @ 16770

Visit:

Revision 16770, 17.9 KB checked in by ghudson, 23 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r16769, which included commits to RCS files with non-trunk default branches.

Line
1	/*
2	* A simple Western name parser.
3	*
4	* <Nat> Jamie, do you know anything about name parsing?
5	* <jwz> Are you going down that rat hole? Bring a flashlight.
6	*
7	* Author:
8	* Nat Friedman (nat@ximian.com)
9	*
10	* Copyright 1999, Ximian, Inc.
11	*/
12
13	#include <ctype.h>
14	#include <string.h>
15	#include <glib.h>
16
17	#include <ename/e-name-western.h>
18	#include <ename/e-name-western-tables.h>
19
/*
 * Offsets into the full name string at which each parsed component
 * begins.  The parser sets every field to -1 before extraction, and -1
 * is used throughout this file to mean "component not found".
 */
typedef struct {
	int prefix_idx;   /* start of the prefix, or -1 */
	int first_idx;    /* start of the first name, or -1 */
	int middle_idx;   /* start of the middle name, or -1 */
	int nick_idx;     /* start of the quoted nickname, or -1 */
	int last_idx;     /* start of the last name, or -1 */
	int suffix_idx;   /* start of the suffix, or -1 */
} ENameWesternIdxs;
28
/*
 * Counts the space-separated words in @str.
 *
 * The count is one plus the number of ' ' characters found after the
 * first character of the string, so consecutive spaces each start a new
 * "word" and a leading space is not counted.  An empty string counts as
 * one word; a NULL string counts as zero.
 */
static int
e_name_western_str_count_words (const char *str)
{
	const char *p;
	int word_count;

	if (str == NULL)
		return 0;

	word_count = 1;

	/* Never step past the terminating NUL of an empty string. */
	if (*str == '\0')
		return word_count;

	for (p = strchr (str + 1, ' '); p != NULL; p = strchr (p + 1, ' '))
		word_count ++;

	return word_count;
}
44
/*
 * Replaces *str with a newly allocated copy that has all leading and
 * trailing whitespace and commas removed.  The old string is freed.
 * Does nothing when *str is NULL.
 */
static void
e_name_western_cleanup_string (char **str)
{
	char *newstr;
	char *p;
	size_t len;

	if (str == NULL || *str == NULL)
		return;

	/* Skip any spaces and commas at the start of the string. */
	p = *str;
	while (isspace ((unsigned char) *p) || *p == ',')
		p ++;

	/* Make the copy we're going to hand back. */
	newstr = g_strdup (p);

	/* Trim spaces and commas off the end of the copy. */
	len = strlen (newstr);
	while (len > 0 &&
	       (isspace ((unsigned char) newstr [len - 1]) ||
		newstr [len - 1] == ','))
		len --;
	newstr [len] = '\0';

	g_free (*str);
	*str = newstr;
}
77
/*
 * Returns a newly allocated copy of up to @num_words whitespace-separated
 * words of @str, starting at byte offset @idx.
 *
 * When the walk stops at the start of a following word, the single
 * character before that word is dropped from the copy.  When the walk
 * reaches the end of the string, everything up to the end (including
 * any trailing whitespace) is copied.  The caller frees the result.
 */
static char *
e_name_western_get_words_at_idx (char *str, int idx, int num_words)
{
	char *words;
	char *p;
	int word_count;
	int words_len;

	/*
	 * Walk to the end of the words.
	 */
	word_count = 0;
	p = str + idx;
	while (word_count < num_words && *p != '\0') {
		/* Skip the word itself... */
		while (*p != '\0' && ! isspace ((unsigned char) *p))
			p ++;

		/* ...and the whitespace that follows it. */
		while (isspace ((unsigned char) *p))
			p ++;

		word_count ++;
	}

	words_len = p - str - idx - 1;

	if (*p == '\0')
		words_len ++;

	/* Happens when no word was walked (num_words <= 0). */
	if (words_len < 0)
		words_len = 0;

	words = g_malloc0 (1 + words_len);
	memcpy (words, str + idx, words_len);

	return words;
}
111
/*
 * Returns the larger of @a and @b.  A plain function is used here
 * instead of glib's MAX macro.
 */
static int
e_name_western_max (const int a, const int b)
{
	return (a > b) ? a : b;
}
123
124	static gboolean
125	e_name_western_word_is_suffix (char *word)
126	{
127	int i;
128
129	for (i = 0; e_name_western_sfx_table [i] != NULL; i ++) {
130	if (g_strcasecmp (word, e_name_western_sfx_table [i]))
131	continue;
132
133	return TRUE;
134	}
135
136	return FALSE;
137	}
138
139	static char *
140	e_name_western_get_one_prefix_at_str (char *str)
141	{
142	char *word;
143	int i;
144
145	/*
146	* Check for prefixes from our table.
147	*/
148	for (i = 0; e_name_western_pfx_table [i] != NULL; i ++) {
149	int pfx_words;
150	char *words;
151
152	pfx_words = e_name_western_str_count_words (e_name_western_pfx_table [i]);
153	words = e_name_western_get_words_at_idx (str, 0, pfx_words);
154
155	if (! g_strcasecmp (words, e_name_western_pfx_table [i]))
156	return words;
157
158	g_free (words);
159	}
160
161	/*
162	* Check for prefixes we don't know about. These are always a
163	* sequence of more than one letters followed by a period.
164	*/
165	word = e_name_western_get_words_at_idx (str, 0, 1);
166
167	if (strlen (word) > 2 &&
168	isalpha ((unsigned char) word [0]) &&
169	isalpha ((unsigned char) word [1]) &&
170	word [strlen (word) - 1] == '.')
171	return word;
172
173	g_free (word);
174
175	return NULL;
176	}
177
/*
 * Returns a newly allocated copy of the prefix at the start of @str,
 * which may consist of one or two consecutive prefixes.  Returns NULL
 * when @str does not begin with a prefix.  The caller frees the result.
 */
static char *
e_name_western_get_prefix_at_str (char *str)
{
	char *pfx;
	char *pfx1;
	char *pfx2;
	char *p;

	/* Get the first prefix. */
	pfx1 = e_name_western_get_one_prefix_at_str (str);

	if (pfx1 == NULL)
		return NULL;

	/* Check for a second prefix after the whitespace. */
	p = str + strlen (pfx1);
	while (isspace ((unsigned char) *p))
		p ++;

	pfx2 = e_name_western_get_one_prefix_at_str (p);

	if (pfx2 != NULL) {
		int pfx_len;

		/* Copy both prefixes, including the gap between them. */
		pfx_len = (p + strlen (pfx2)) - str;
		pfx = g_malloc0 (pfx_len + 1);
		memcpy (pfx, str, pfx_len);

		g_free (pfx1);
		g_free (pfx2);
	} else {
		pfx = pfx1;
	}

	return pfx;
}
214
215	static void
216	e_name_western_extract_prefix (ENameWestern name, ENameWesternIdxs idxs)
217	{
218	char *pfx;
219
220	pfx = e_name_western_get_prefix_at_str (name->full);
221
222	if (pfx == NULL)
223	return;
224
225	idxs->prefix_idx = 0;
226	name->prefix = pfx;
227	}
228
229	static gboolean
230	e_name_western_is_complex_last_beginning (char *word)
231	{
232	int i;
233
234	for (i = 0; e_name_western_complex_last_table [i] != NULL; i ++) {
235
236	if (! g_strcasecmp (
237	word, e_name_western_complex_last_table [i]))
238	return TRUE;
239	}
240
241	return FALSE;
242	}
243
244	static void
245	e_name_western_extract_first (ENameWestern name, ENameWesternIdxs idxs)
246	{
247	/*
248	* If there's a prefix, then the first name is right after it.
249	*/
250	if (idxs->prefix_idx != -1) {
251	int first_idx;
252	char *p;
253
254	first_idx = idxs->prefix_idx + strlen (name->prefix);
255
256	/* Skip past white space. */
257	p = name->full + first_idx;
258	while (isspace (p) && p != '\0')
259	p++;
260
261	if (*p == '\0')
262	return;
263
264	idxs->first_idx = p - name->full;
265	name->first = e_name_western_get_words_at_idx (
266	name->full, idxs->first_idx, 1);
267
268	} else {
269
270	/*
271	* Otherwise, the first name is probably the first string.
272	*/
273	idxs->first_idx = 0;
274	name->first = e_name_western_get_words_at_idx (
275	name->full, idxs->first_idx, 1);
276	}
277
278	/*
279	* Check that we didn't just assign the beginning of a
280	* compound last name to the first name.
281	*/
282	if (name->first != NULL) {
283	if (e_name_western_is_complex_last_beginning (name->first)) {
284	g_free (name->first);
285	name->first = NULL;
286	idxs->first_idx = -1;
287	}
288	}
289	}
290
291	static void
292	e_name_western_extract_middle (ENameWestern name, ENameWesternIdxs idxs)
293	{
294	char *word;
295	int middle_idx;
296
297	/*
298	* Middle names can only exist if you have a first name.
299	*/
300	if (idxs->first_idx == -1)
301	return;
302
303	middle_idx = idxs->first_idx + strlen (name->first) + 1;
304
305	if (middle_idx > strlen (name->full))
306	return;
307
308	/*
309	* Search for the first space (or the terminating \0)
310	*/
311	while (isspace (name->full [middle_idx]) &&
312	name->full [middle_idx] != '\0')
313	middle_idx ++;
314
315	if (name->full [middle_idx] == '\0')
316	return;
317
318	/*
319	* Skip past the nickname, if it's there.
320	*/
321	if (name->full [middle_idx] == '\"') {
322	if (idxs->nick_idx == -1)
323	return;
324
325	middle_idx = idxs->nick_idx + strlen (name->nick) + 1;
326
327	while (isspace (name->full [middle_idx]) &&
328	name->full [middle_idx] != '\0')
329	middle_idx ++;
330
331	if (name->full [middle_idx] == '\0')
332	return;
333	}
334
335	/*
336	* Make sure this isn't the beginning of a complex last name.
337	*/
338	word = e_name_western_get_words_at_idx (name->full, middle_idx, 1);
339	if (e_name_western_is_complex_last_beginning (word)) {
340	g_free (word);
341	return;
342	}
343
344	/*
345	* Make sure this isn't a suffix.
346	*/
347	e_name_western_cleanup_string (& word);
348	if (e_name_western_word_is_suffix (word)) {
349	g_free (word);
350	return;
351	}
352
353	/*
354	* Make sure we didn't just grab a cute nickname.
355	*/
356	if (word [0] == '\"') {
357	g_free (word);
358	return;
359	}
360
361	idxs->middle_idx = middle_idx;
362	name->middle = word;
363	}
364
365	static void
366	e_name_western_extract_nickname (ENameWestern name, ENameWesternIdxs idxs)
367	{
368	int idx;
369	int start_idx;
370	char *str;
371
372	if (idxs->first_idx == -1)
373	return;
374
375	if (idxs->middle_idx > idxs->first_idx)
376	idx = idxs->middle_idx + strlen (name->middle);
377	else
378	idx = idxs->first_idx + strlen (name->first);
379
380	while (name->full [idx] != '\"' && name->full [idx] != '\0')
381	idx ++;
382
383	if (name->full [idx] != '\"')
384	return;
385
386	start_idx = idx;
387
388	/*
389	* Advance to the next double quote.
390	*/
391	idx ++;
392
393	while (name->full [idx] != '\"' && name->full [idx] != '\0')
394	idx ++;
395
396	if (name->full [idx] == '\0')
397	return;
398
399	str = g_malloc0 (idx - start_idx + 2);
400	strncpy (str, name->full + start_idx, idx - start_idx + 1);
401
402	name->nick = str;
403	idxs->nick_idx = start_idx;
404	}
405
406	static int
407	e_name_western_last_get_max_idx (ENameWestern name, ENameWesternIdxs idxs)
408	{
409	int max_idx = -1;
410
411	if (name->prefix != NULL)
412	max_idx = e_name_western_max (
413	max_idx, idxs->prefix_idx + strlen (name->prefix));
414
415	if (name->first != NULL)
416	max_idx = e_name_western_max (
417	max_idx, idxs->first_idx + strlen (name->first));
418
419	if (name->middle != NULL)
420	max_idx = e_name_western_max (
421	max_idx, idxs->middle_idx + strlen (name->middle));
422
423	if (name->nick != NULL)
424	max_idx = e_name_western_max (
425	max_idx, idxs->nick_idx + strlen (name->nick));
426
427	return max_idx;
428	}
429
430	static void
431	e_name_western_extract_last (ENameWestern name, ENameWesternIdxs idxs)
432	{
433	char *word;
434	int idx = -1;
435
436	idx = e_name_western_last_get_max_idx (name, idxs);
437
438	/*
439	* In the case where there is no preceding name element, the
440	* name is either just a first name ("Nat", "John"), is a
441	* single-element name ("Cher", which we treat as a first
442	* name), or is just a last name. The only time we can
443	* differentiate a last name alone from a single-element name
444	* or a first name alone is if it's a complex last name ("de
445	* Icaza", "van Josephsen"). So if there is no preceding name
446	* element, we check to see whether or not the first part of
447	* the name is the beginning of a complex name. If it is,
448	* we subsume the entire string. If we accidentally subsume
449	* the suffix, this will get fixed in the fixup routine.
450	*/
451	if (idx == -1) {
452	word = e_name_western_get_words_at_idx (name->full, 0, 1);
453	if (! e_name_western_is_complex_last_beginning (word)) {
454	g_free (word);
455	return;
456	}
457
458	name->last = g_strdup (name->full);
459	idxs->last_idx = 0;
460	return;
461	}
462
463	/* Skip past the white space. */
464	while (isspace (name->full [idx]) && name->full [idx] != '\0')
465	idx ++;
466
467	if (name->full [idx] == '\0')
468	return;
469
470	word = e_name_western_get_words_at_idx (name->full, idx, 1);
471	e_name_western_cleanup_string (& word);
472	if (e_name_western_word_is_suffix (word)) {
473	g_free (word);
474	return;
475	}
476	g_free (word);
477
478	/*
479	* Subsume the rest of the string into the last name. If we
480	* accidentally include the prefix, it will get fixed later.
481	* This is the only way to handle things like "Miguel de Icaza
482	* Amozorrutia" without dropping data and forcing the user
483	* to retype it.
484	*/
485	name->last = g_strdup (name->full + idx);
486	idxs->last_idx = idx;
487	}
488
/*
 * Returns a newly allocated copy of the word of @str that ends at byte
 * offset @idx: whitespace is skipped backwards from @idx, then the
 * preceding run of non-whitespace is located, and everything from the
 * start of that run up to (not including) @idx is copied.
 *
 * The result is never NULL; it is an empty string when no word precedes
 * @idx.  The caller frees the result.
 */
static char *
e_name_western_get_preceding_word (char *str, int idx)
{
	int word_len;
	char *word;
	char *p;

	if (idx < 0)
		idx = 0;

	p = str + idx;

	/* Back up over white space, then over the word itself. */
	while (isspace ((unsigned char) *p) && p > str)
		p --;

	while (! isspace ((unsigned char) *p) && p > str)
		p --;

	/* We stopped on the space before the word; step onto the word. */
	if (isspace ((unsigned char) *p))
		p ++;

	/*
	 * p can end up past str + idx (e.g. idx == 0 on a leading space);
	 * clamp so we always allocate a valid, empty string.
	 */
	word_len = (str + idx) - p;
	if (word_len < 0)
		word_len = 0;

	word = g_malloc0 (word_len + 1);
	if (word_len > 0)
		memcpy (word, p, word_len);

	return word;
}
514
515	static char *
516	e_name_western_get_suffix_at_str_end (char *str)
517	{
518	char *suffix;
519	char *p;
520
521	/*
522	* Walk backwards till we reach the beginning of the
523	* (potentially-comma-separated) list of suffixes.
524	*/
525	p = str + strlen (str);
526	while (1) {
527	char *nextp;
528	char *word;
529
530	word = e_name_western_get_preceding_word (str, p - str);
531	nextp = p - strlen (word) - 1;
532
533	e_name_western_cleanup_string (& word);
534
535	if (e_name_western_word_is_suffix (word)) {
536	p = nextp;
537	g_free (word);
538	} else {
539	g_free (word);
540	break;
541	}
542	}
543
544	if (p == (str + strlen (str)))
545	return NULL;
546
547	suffix = g_strdup (p);
548	e_name_western_cleanup_string (& suffix);
549
550	if (strlen (suffix) == 0) {
551	g_free (suffix);
552	return NULL;
553	}
554
555	return suffix;
556	}
557
558	static void
559	e_name_western_extract_suffix (ENameWestern name, ENameWesternIdxs idxs)
560	{
561
562	name->suffix = e_name_western_get_suffix_at_str_end (name->full);
563
564	if (name->suffix == NULL)
565	return;
566
567	idxs->suffix_idx = strlen (name->full) - strlen (name->suffix);
568	}
569
570	static gboolean
571	e_name_western_detect_backwards (ENameWestern name, ENameWesternIdxs idxs)
572	{
573	char *comma;
574	char *word;
575
576	comma = strchr (name->full, ',');
577
578	if (comma == NULL)
579	return FALSE;
580
581	/*
582	* If there's a comma, we need to detect whether it's
583	* separating the last name from the first or just separating
584	* suffixes. So we grab the word which comes before the
585	* comma and check if it's a suffix.
586	*/
587	word = e_name_western_get_preceding_word (name->full, comma - name->full);
588
589	if (e_name_western_word_is_suffix (word)) {
590	g_free (word);
591	return FALSE;
592	}
593
594	g_free (word);
595	return TRUE;
596	}
597
598	static void
599	e_name_western_reorder_asshole (ENameWestern name, ENameWesternIdxs idxs)
600	{
601	char *prefix;
602	char *last;
603	char *suffix;
604	char *firstmidnick;
605	char *newfull;
606
607	char *comma;
608	char *p;
609
610	if (! e_name_western_detect_backwards (name, idxs))
611	return;
612
613	/*
614	* Convert
615	* <Prefix> <Last name>, <First name> <Middle[+nick] name> <Suffix>
616	* to
617	* <Prefix> <First name> <Middle[+nick] name> <Last name> <Suffix>
618	*/
619
620	/*
621	* Grab the prefix from the beginning.
622	*/
623	prefix = e_name_western_get_prefix_at_str (name->full);
624
625	/*
626	* Everything from the end of the prefix to the comma is the
627	* last name.
628	*/
629	comma = strchr (name->full, ',');
630	if (comma == NULL)
631	return;
632
633	p = name->full + (prefix == NULL ? 0 : strlen (prefix));
634
635	while (isspace (p) && p != '\0')
636	p ++;
637
638	last = g_malloc0 (comma - p + 1);
639	strncpy (last, p, comma - p);
640
641	/*
642	* Get the suffix off the end.
643	*/
644	suffix = e_name_western_get_suffix_at_str_end (name->full);
645
646	/*
647	* Firstmidnick is everything from the comma to the beginning
648	* of the suffix.
649	*/
650	p = comma + 1;
651
652	while (isspace (p) && p != '\0')
653	p ++;
654
655	if (suffix != NULL) {
656	char *q;
657
658	/*
659	* Point q at the beginning of the suffix.
660	*/
661	q = name->full + strlen (name->full) - strlen (suffix) - 1;
662
663	/*
664	* Walk backwards until we hit the space which
665	* separates the suffix from firstmidnick.
666	*/
667	while (! isspace (*q) && q > comma)
668	q --;
669
670	if ((q - p + 1) > 0) {
671	firstmidnick = g_malloc0 (q - p + 1);
672	strncpy (firstmidnick, p, q - p);
673	} else
674	firstmidnick = NULL;
675	} else {
676	firstmidnick = g_strdup (p);
677	}
678
679	/*
680	* Create our new reordered version of the name.
681	*/
682	#define NULLSTR(a) ((a) == NULL ? "" : (a))
683	newfull = g_strdup_printf ("%s %s %s %s", NULLSTR (prefix), NULLSTR (firstmidnick),
684	NULLSTR (last), NULLSTR (suffix));
685	g_strstrip (newfull);
686	g_free (name->full);
687	name->full = newfull;
688
689
690	g_free (prefix);
691	g_free (firstmidnick);
692	g_free (last);
693	g_free (suffix);
694	}
695
696	static void
697	e_name_western_zap_nil (char *str, int idx)
698	{
699	if (*str == NULL)
700	return;
701
702	if (strlen (*str) != 0)
703	return;
704
705	*idx = -1;
706	g_free (*str);
707	*str = NULL;
708	}
709
710	static void
711	e_name_western_fixup (ENameWestern name, ENameWesternIdxs idxs)
712	{
713	/*
714	* The middle and last names cannot be the same.
715	*/
716	if (idxs->middle_idx != -1 && idxs->middle_idx == idxs->last_idx) {
717	idxs->middle_idx = -1;
718	g_free (name->middle);
719	name->middle = NULL;
720	}
721
722	/*
723	* If we have a middle name and no last name, then we mistook
724	* the last name for the middle name.
725	*/
726	if (idxs->last_idx == -1 && idxs->middle_idx != -1) {
727	idxs->last_idx = idxs->middle_idx;
728	name->last = name->middle;
729	name->middle = NULL;
730	idxs->middle_idx = -1;
731	}
732
733	/*
734	* Check to see if we accidentally included the suffix in the
735	* last name.
736	*/
737	if (idxs->suffix_idx != -1 && idxs->last_idx != -1 &&
738	idxs->suffix_idx < (idxs->last_idx + strlen (name->last))) {
739	char *sfx;
740
741	sfx = name->last + (idxs->suffix_idx - idxs->last_idx);
742	if (sfx != NULL) {
743	char *newlast;
744	char *p;
745
746	p = sfx - 1;
747	while (isspace (*p) && p > name->last)
748	p --;
749	p ++;
750
751	newlast = g_malloc0 (p - name->last + 1);
752	strncpy (newlast, name->last, p - name->last);
753	g_free (name->last);
754	name->last = newlast;
755	}
756	}
757
758	/*
759	* If we have a prefix and a first name, but no last name,
760	* then we need to assign the first name to the last name.
761	* This way we get things like "Mr Friedman" correctly.
762	*/
763	if (idxs->first_idx != -1 && idxs->prefix_idx != -1 &&
764	idxs->last_idx == -1) {
765	name->last = name->first;
766	idxs->last_idx = idxs->first_idx;
767	idxs->first_idx = -1;
768	name->first = NULL;
769	}
770
771	/*
772	* Remove stray spaces and commas (although there don't seem
773	* to be any in the test cases, they might show up later).
774	*/
775	e_name_western_cleanup_string (& name->prefix);
776	e_name_western_cleanup_string (& name->first);
777	e_name_western_cleanup_string (& name->middle);
778	e_name_western_cleanup_string (& name->nick);
779	e_name_western_cleanup_string (& name->last);
780	e_name_western_cleanup_string (& name->suffix);
781
782	/*
783	* Make zero-length strings just NULL.
784	*/
785	e_name_western_zap_nil (& name->prefix, & idxs->prefix_idx);
786	e_name_western_zap_nil (& name->first, & idxs->first_idx);
787	e_name_western_zap_nil (& name->middle, & idxs->middle_idx);
788	e_name_western_zap_nil (& name->nick, & idxs->nick_idx);
789	e_name_western_zap_nil (& name->last, & idxs->last_idx);
790	e_name_western_zap_nil (& name->suffix, & idxs->suffix_idx);
791	}
792
793	/**
794	* e_name_western_western_parse_fullname:
795	* @full_name: A string containing a Western name.
796	*
797	* Parses @full_name and returns an #ENameWestern object filled with
798	* the component parts of the name.
799	*/
800	ENameWestern *
801	e_name_western_parse (const char *full_name)
802	{
803	ENameWesternIdxs *idxs;
804	ENameWestern *wname;
805
806	wname = g_new0 (ENameWestern, 1);
807
808	wname->full = g_strdup (full_name);
809
810	idxs = g_new0 (ENameWesternIdxs, 1);
811
812	idxs->prefix_idx = -1;
813	idxs->first_idx = -1;
814	idxs->middle_idx = -1;
815	idxs->nick_idx = -1;
816	idxs->last_idx = -1;
817	idxs->suffix_idx = -1;
818
819	/*
820	* An extremely simple algorithm.
821	*
822	* The goal here is to get it right 95% of the time for
823	* Western names.
824	*
825	* First we check to see if this is an ass-backwards name
826	* ("Prefix Last, First Middle Suffix"). These names really
827	* suck (imagine "Dr von Johnson, Albert Roderick Jr"), so
828	* we reorder them first and then parse them.
829	*
830	* Next, we grab the most obvious assignments for the various
831	* parts of the name. Once this is done, we check for stupid
832	* errors and fix them up.
833	*/
834	e_name_western_reorder_asshole (wname, idxs);
835
836	e_name_western_extract_prefix (wname, idxs);
837	e_name_western_extract_first (wname, idxs);
838	e_name_western_extract_nickname (wname, idxs);
839	e_name_western_extract_middle (wname, idxs);
840	e_name_western_extract_last (wname, idxs);
841	e_name_western_extract_suffix (wname, idxs);
842
843	e_name_western_fixup (wname, idxs);
844
845	g_free (idxs);
846
847	return wname;
848	}
849
850	/**
851	* e_name_western_free:
852	* @name: An ENameWestern object which needs to be freed.
853	*
854	* Deep-frees @name
855	*/
856	void
857	e_name_western_free (ENameWestern *w)
858	{
859
860	g_free (w->prefix);
861	g_free (w->first);
862	g_free (w->middle);
863	g_free (w->nick);
864	g_free (w->last);
865	g_free (w->suffix);
866
867	g_free (w->full);
868
869	g_free (w);
870	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: