source: trunk/third/evolution/e-util/ename/e-name-western.c @ 16770

Revision 16770, 17.9 KB checked in by ghudson, 23 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r16769, which included commits to RCS files with non-trunk default branches.
Line 
/*
 * A simple Western name parser.
 *
 * <Nat> Jamie, do you know anything about name parsing?
 * <jwz> Are you going down that rat hole?  Bring a flashlight.
 *
 * Author:
 *   Nat Friedman (nat@ximian.com)
 *
 * Copyright 1999, Ximian, Inc.
 */
12
13#include <ctype.h>
14#include <string.h>
15#include <glib.h>
16 
17#include <ename/e-name-western.h>
18#include <ename/e-name-western-tables.h>
19
/*
 * Scratch record used while parsing: for each name component, the
 * offset into the full name string at which that component starts.
 * The parser sets a field to -1 while the component is not found.
 */
typedef struct {
        int prefix_idx;         /* offset of the prefix */
        int first_idx;          /* offset of the first name */
        int middle_idx;         /* offset of the middle name */
        int nick_idx;           /* offset of the quoted nickname */
        int last_idx;           /* offset of the last name */
        int suffix_idx;         /* offset of the suffix */
} ENameWesternIdxs;
28
/*
 * Counts the words in @str as one more than the number of ' '
 * characters found after its first character.
 *
 * Returns 0 when @str is NULL and 1 when it is empty.  Only ' ' acts
 * as a separator; a space in the very first position is not counted
 * and each of several consecutive spaces is counted separately.
 */
static int
e_name_western_str_count_words (char *str)
{
        int word_count;
        char *p;

        if (str == NULL)
                return 0;

        word_count = 1;

        /*
         * An empty string has nothing after its first byte to search;
         * stepping past the terminator here would read out of bounds.
         */
        if (*str == '\0')
                return word_count;

        for (p = strchr (str + 1, ' '); p != NULL; p = strchr (p + 1, ' '))
                word_count ++;

        return word_count;
}
44
/*
 * Replaces *@str with a newly allocated copy that has all leading and
 * trailing whitespace and commas removed, and frees the old string.
 * Does nothing when *@str is NULL.
 */
static void
e_name_western_cleanup_string (char **str)
{
        char   *newstr;
        char   *p;
        size_t  len;

        if (*str == NULL)
                return;

        /*
         * Skip any spaces and commas at the start of the string.  The
         * cast keeps isspace() defined for bytes above 0x7f (accented
         * letters) when plain char is signed.
         */
        p = *str;
        while (isspace ((unsigned char) *p) || *p == ',')
                p ++;

        /* make the copy we're going to return */
        newstr = g_strdup (p);

        /* now trim from the back, dropping any spaces and commas */
        len = strlen (newstr);
        while (len > 0 &&
               (isspace ((unsigned char) newstr [len - 1]) ||
                newstr [len - 1] == ','))
                len --;
        newstr [len] = '\0';

        g_free (*str);
        *str = newstr;
}
77
/*
 * Returns a newly allocated copy of up to @num_words
 * whitespace-separated words of @str, starting at offset @idx.
 *
 * The copy ends with the last character of the final word, so it
 * never carries trailing whitespace; whitespace between the copied
 * words is kept as it appears in @str.  An empty string is returned
 * when @num_words is not positive, when @str + @idx is the end of the
 * string, or when @str + @idx points at whitespace.
 *
 * The caller frees the result with g_free().
 */
static char *
e_name_western_get_words_at_idx (char *str, int idx, int num_words)
{
        char   *words;
        char   *start;
        char   *end;
        char   *p;
        int     word_count;
        size_t  words_len;

        start = str + idx;
        end   = start;
        p     = start;

        /*
         * Walk to the end of the words.
         */
        word_count = 0;
        while (word_count < num_words && *p != '\0') {
                while (*p != '\0' && ! isspace ((unsigned char) *p))
                        p ++;

                /* One past the last character of the word just walked. */
                end = p;

                /* isspace ('\0') is false, so this stops at the terminator. */
                while (isspace ((unsigned char) *p))
                        p ++;

                word_count ++;
        }

        words_len = end - start;

        words = g_malloc0 (words_len + 1);
        memcpy (words, start, words_len);

        return words;
}
111
/*
 * Returns the larger of @a and @b.
 *
 * NOTE(review): presumably written as a function instead of using
 * glib's MAX macro so that arguments built from an int plus strlen()
 * are converted to int before being compared -- confirm.
 */
static int
e_name_western_max (const int a, const int b)
{
        return (a > b) ? a : b;
}
123
124static gboolean
125e_name_western_word_is_suffix (char *word)
126{
127        int i;
128
129        for (i = 0; e_name_western_sfx_table [i] != NULL; i ++) {
130                if (g_strcasecmp (word, e_name_western_sfx_table [i]))
131                        continue;
132
133                return TRUE;
134        }
135
136        return FALSE;
137}
138
139static char *
140e_name_western_get_one_prefix_at_str (char *str)
141{
142        char *word;
143        int   i;
144
145        /*
146         * Check for prefixes from our table.
147         */
148        for (i = 0; e_name_western_pfx_table [i] != NULL; i ++) {
149                int pfx_words;
150                char *words;
151
152                pfx_words = e_name_western_str_count_words (e_name_western_pfx_table [i]);
153                words = e_name_western_get_words_at_idx (str, 0, pfx_words);
154
155                if (! g_strcasecmp (words, e_name_western_pfx_table [i]))
156                        return words;
157
158                g_free (words);
159        }
160
161        /*
162         * Check for prefixes we don't know about.  These are always a
163         * sequence of more than one letters followed by a period.
164         */
165        word = e_name_western_get_words_at_idx (str, 0, 1);
166
167        if (strlen (word) > 2 &&
168            isalpha ((unsigned char) word [0]) &&
169            isalpha ((unsigned char) word [1]) &&
170            word [strlen (word) - 1] == '.')
171                return word;
172
173        g_free (word);
174
175        return NULL;
176}
177
/*
 * Returns a newly allocated copy of the prefix at the start of @str,
 * made of one prefix or of two consecutive ones ("Dr. Prof."), or
 * NULL when @str does not start with a prefix.
 *
 * When two prefixes are found the result is the text of @str from its
 * start through the end of the second one, including the whitespace
 * between them.  The caller frees the result with g_free().
 */
static char *
e_name_western_get_prefix_at_str (char *str)
{
        char   *pfx;
        char   *pfx1;
        char   *pfx2;
        char   *p;
        size_t  pfx_len;

        /* Get the first prefix. */
        pfx1 = e_name_western_get_one_prefix_at_str (str);

        if (pfx1 == NULL)
                return NULL;

        /*
         * Check for a second prefix.  The cast keeps isspace() defined
         * for bytes above 0x7f when plain char is signed.
         */
        p = str + strlen (pfx1);
        while (isspace ((unsigned char) *p))
                p ++;

        pfx2 = e_name_western_get_one_prefix_at_str (p);

        /* Only one prefix: hand the copy we already own to the caller. */
        if (pfx2 == NULL)
                return pfx1;

        pfx_len = (p + strlen (pfx2)) - str;
        pfx = g_malloc0 (pfx_len + 1);
        memcpy (pfx, str, pfx_len);

        g_free (pfx1);
        g_free (pfx2);

        return pfx;
}
214
215static void
216e_name_western_extract_prefix (ENameWestern *name, ENameWesternIdxs *idxs)
217{
218        char *pfx;
219
220        pfx = e_name_western_get_prefix_at_str (name->full);
221
222        if (pfx == NULL)
223                return;
224
225        idxs->prefix_idx = 0;
226        name->prefix     = pfx;
227}
228
229static gboolean
230e_name_western_is_complex_last_beginning (char *word)
231{
232        int i;
233
234        for (i = 0; e_name_western_complex_last_table [i] != NULL; i ++) {
235
236                if (! g_strcasecmp (
237                        word, e_name_western_complex_last_table [i]))
238                        return TRUE;
239        }
240
241        return FALSE;
242}
243
244static void
245e_name_western_extract_first (ENameWestern *name, ENameWesternIdxs *idxs)
246{
247        /*
248         * If there's a prefix, then the first name is right after it.
249         */
250        if (idxs->prefix_idx != -1) {
251                int   first_idx;
252                char *p;
253
254                first_idx = idxs->prefix_idx + strlen (name->prefix);
255
256                /* Skip past white space. */
257                p = name->full + first_idx;
258                while (isspace (*p) && *p != '\0')
259                        p++;
260
261                if (*p == '\0')
262                        return;
263
264                idxs->first_idx = p - name->full;
265                name->first = e_name_western_get_words_at_idx (
266                        name->full, idxs->first_idx, 1);
267
268        } else {
269
270                /*
271                 * Otherwise, the first name is probably the first string.
272                 */
273                idxs->first_idx = 0;
274                name->first = e_name_western_get_words_at_idx (
275                        name->full, idxs->first_idx, 1);
276        }
277
278        /*
279         * Check that we didn't just assign the beginning of a
280         * compound last name to the first name.
281         */
282        if (name->first != NULL) {
283                if (e_name_western_is_complex_last_beginning (name->first)) {
284                        g_free (name->first);
285                        name->first = NULL;
286                        idxs->first_idx = -1;
287                }
288        }
289}
290
291static void
292e_name_western_extract_middle (ENameWestern *name, ENameWesternIdxs *idxs)
293{
294        char *word;
295        int   middle_idx;
296
297        /*
298         * Middle names can only exist if you have a first name.
299         */
300        if (idxs->first_idx == -1)
301                return;
302
303        middle_idx = idxs->first_idx + strlen (name->first) + 1;
304
305        if (middle_idx > strlen (name->full))
306                return;
307       
308        /*
309         * Search for the first space (or the terminating \0)
310         */
311        while (isspace (name->full [middle_idx]) &&
312               name->full [middle_idx] != '\0')
313                middle_idx ++;
314               
315        if (name->full [middle_idx] == '\0')
316                return;
317
318        /*
319         * Skip past the nickname, if it's there.
320         */
321        if (name->full [middle_idx] == '\"') {
322                if (idxs->nick_idx == -1)
323                        return;
324
325                middle_idx = idxs->nick_idx + strlen (name->nick) + 1;
326               
327                while (isspace (name->full [middle_idx]) &&
328                       name->full [middle_idx] != '\0')
329                        middle_idx ++;
330
331                if (name->full [middle_idx] == '\0')
332                        return;
333        }
334
335        /*
336         * Make sure this isn't the beginning of a complex last name.
337         */
338        word = e_name_western_get_words_at_idx (name->full, middle_idx, 1);
339        if (e_name_western_is_complex_last_beginning (word)) {
340                g_free (word);
341                return;
342        }
343
344        /*
345         * Make sure this isn't a suffix.
346         */
347        e_name_western_cleanup_string (& word);
348        if (e_name_western_word_is_suffix (word)) {
349                g_free (word);
350                return;
351        }
352
353        /*
354         * Make sure we didn't just grab a cute nickname.
355         */
356        if (word [0] == '\"') {
357                g_free (word);
358                return;
359        }
360       
361        idxs->middle_idx = middle_idx;
362        name->middle = word;
363}
364
365static void
366e_name_western_extract_nickname (ENameWestern *name, ENameWesternIdxs *idxs)
367{
368        int   idx;
369        int   start_idx;
370        char *str;
371
372        if (idxs->first_idx == -1)
373                return;
374
375        if (idxs->middle_idx > idxs->first_idx)
376                idx = idxs->middle_idx + strlen (name->middle);
377        else
378                idx = idxs->first_idx + strlen (name->first);
379
380        while (name->full [idx] != '\"' && name->full [idx] != '\0')
381                idx ++;
382
383        if (name->full [idx] != '\"')
384                return;
385
386        start_idx = idx;
387
388        /*
389         * Advance to the next double quote.
390         */
391        idx ++;
392       
393        while (name->full [idx] != '\"' && name->full [idx] != '\0')
394                idx ++;
395
396        if (name->full [idx] == '\0')
397                return;
398
399        str = g_malloc0 (idx - start_idx + 2);
400        strncpy (str, name->full + start_idx, idx - start_idx + 1);
401
402        name->nick = str;
403        idxs->nick_idx = start_idx;
404}
405
406static int
407e_name_western_last_get_max_idx (ENameWestern *name, ENameWesternIdxs *idxs)
408{
409        int max_idx = -1;
410
411        if (name->prefix != NULL)
412                max_idx = e_name_western_max (
413                        max_idx, idxs->prefix_idx + strlen (name->prefix));
414
415        if (name->first != NULL)
416                max_idx = e_name_western_max (
417                        max_idx, idxs->first_idx + strlen (name->first));
418
419        if (name->middle != NULL)
420                max_idx = e_name_western_max (
421                        max_idx, idxs->middle_idx + strlen (name->middle));
422
423        if (name->nick != NULL)
424                max_idx = e_name_western_max (
425                        max_idx, idxs->nick_idx + strlen (name->nick));
426
427        return max_idx;
428}
429
430static void
431e_name_western_extract_last (ENameWestern *name, ENameWesternIdxs *idxs)
432{
433        char *word;
434        int   idx = -1;
435
436        idx = e_name_western_last_get_max_idx (name, idxs);
437
438        /*
439         * In the case where there is no preceding name element, the
440         * name is either just a first name ("Nat", "John"), is a
441         * single-element name ("Cher", which we treat as a first
442         * name), or is just a last name.  The only time we can
443         * differentiate a last name alone from a single-element name
444         * or a first name alone is if it's a complex last name ("de
445         * Icaza", "van Josephsen").  So if there is no preceding name
446         * element, we check to see whether or not the first part of
447         * the name is the beginning of a complex name.  If it is,
448         * we subsume the entire string.  If we accidentally subsume
449         * the suffix, this will get fixed in the fixup routine.
450         */
451        if (idx == -1) {
452                word = e_name_western_get_words_at_idx (name->full, 0, 1);
453                if (! e_name_western_is_complex_last_beginning (word)) {
454                        g_free (word);
455                        return;
456                }
457
458                name->last     = g_strdup (name->full);
459                idxs->last_idx = 0;
460                return;
461        }
462
463        /* Skip past the white space. */
464        while (isspace (name->full [idx]) && name->full [idx] != '\0')
465                idx ++;
466
467        if (name->full [idx] == '\0')
468                return;
469
470        word = e_name_western_get_words_at_idx (name->full, idx, 1);
471        e_name_western_cleanup_string (& word);
472        if (e_name_western_word_is_suffix (word)) {
473                g_free (word);
474                return;
475        }
476        g_free (word);
477
478        /*
479         * Subsume the rest of the string into the last name.  If we
480         * accidentally include the prefix, it will get fixed later.
481         * This is the only way to handle things like "Miguel de Icaza
482         * Amozorrutia" without dropping data and forcing the user
483         * to retype it.
484         */
485        name->last = g_strdup (name->full + idx);
486        idxs->last_idx = idx;
487}
488
/*
 * Returns a newly allocated copy of the text of @str that runs from
 * the start of the word preceding offset @idx up to, but not
 * including, offset @idx.
 *
 * White space located at @idx and directly before it is skipped when
 * looking for the word, and is therefore part of the returned text.
 * The result is never NULL; it is an empty string when no word
 * precedes @idx.  The caller frees it with g_free().
 */
static char *
e_name_western_get_preceding_word (char *str, int idx)
{
        char   *end;
        char   *p;
        char   *word;
        size_t  word_len;

        end = str + idx;
        p   = end;

        /* Back over white space, then back over the word itself. */
        while (p > str && isspace ((unsigned char) *p))
                p --;

        while (p > str && ! isspace ((unsigned char) *p))
                p --;

        /*
         * If we stopped on the separator in front of the word, step
         * onto the word.  Never step beyond @idx: that would give a
         * negative length and a NULL result from g_malloc0 (0).
         */
        if (p < end && isspace ((unsigned char) *p))
                p ++;

        word_len = end - p;
        word = g_malloc0 (word_len + 1);
        memcpy (word, p, word_len);

        return word;
}
514
515static char *
516e_name_western_get_suffix_at_str_end (char *str)
517{
518        char *suffix;
519        char *p;
520
521        /*
522         * Walk backwards till we reach the beginning of the
523         * (potentially-comma-separated) list of suffixes.
524         */
525        p = str + strlen (str);
526        while (1) {
527                char *nextp;
528                char *word;
529
530                word = e_name_western_get_preceding_word (str, p - str);
531                nextp = p - strlen (word) - 1;
532               
533                e_name_western_cleanup_string (& word);
534
535                if (e_name_western_word_is_suffix (word)) {
536                        p = nextp;
537                        g_free (word);
538                } else {
539                        g_free (word);
540                        break;
541                }
542        }
543
544        if (p == (str + strlen (str)))
545                return NULL;
546
547        suffix = g_strdup (p);
548        e_name_western_cleanup_string (& suffix);
549
550        if (strlen (suffix) == 0) {
551                g_free (suffix);
552                return NULL;
553        }
554
555        return suffix;
556}
557
558static void
559e_name_western_extract_suffix (ENameWestern *name, ENameWesternIdxs *idxs)
560{
561
562        name->suffix = e_name_western_get_suffix_at_str_end (name->full);
563
564        if (name->suffix == NULL)
565                return;
566
567        idxs->suffix_idx = strlen (name->full) - strlen (name->suffix);
568}
569
570static gboolean
571e_name_western_detect_backwards (ENameWestern *name, ENameWesternIdxs *idxs)
572{
573        char *comma;
574        char *word;
575
576        comma = strchr (name->full, ',');
577
578        if (comma == NULL)
579                return FALSE;
580
581        /*
582         * If there's a comma, we need to detect whether it's
583         * separating the last name from the first or just separating
584         * suffixes.  So we grab the word which comes before the
585         * comma and check if it's a suffix.
586         */
587        word = e_name_western_get_preceding_word (name->full, comma - name->full);
588
589        if (e_name_western_word_is_suffix (word)) {
590                g_free (word);
591                return FALSE;
592        }
593
594        g_free (word);
595        return TRUE;
596}
597
598static void
599e_name_western_reorder_asshole (ENameWestern *name, ENameWesternIdxs *idxs)
600{
601        char *prefix;
602        char *last;
603        char *suffix;
604        char *firstmidnick;
605        char *newfull;
606
607        char *comma;
608        char *p;
609
610        if (! e_name_western_detect_backwards (name, idxs))
611                return;
612
613        /*
614         * Convert
615         *    <Prefix> <Last name>, <First name> <Middle[+nick] name> <Suffix>
616         * to
617         *    <Prefix> <First name> <Middle[+nick] name> <Last name> <Suffix>
618         */
619       
620        /*
621         * Grab the prefix from the beginning.
622         */
623        prefix = e_name_western_get_prefix_at_str (name->full);
624
625        /*
626         * Everything from the end of the prefix to the comma is the
627         * last name.
628         */
629        comma = strchr (name->full, ',');
630        if (comma == NULL)
631                return;
632
633        p = name->full + (prefix == NULL ? 0 : strlen (prefix));
634
635        while (isspace (*p) && *p != '\0')
636                p ++;
637
638        last = g_malloc0 (comma - p + 1);
639        strncpy (last, p, comma - p);
640
641        /*
642         * Get the suffix off the end.
643         */
644        suffix = e_name_western_get_suffix_at_str_end (name->full);
645
646        /*
647         * Firstmidnick is everything from the comma to the beginning
648         * of the suffix.
649         */
650        p = comma + 1;
651
652        while (isspace (*p) && *p != '\0')
653                p ++;
654
655        if (suffix != NULL) {
656                char *q;
657
658                /*
659                 * Point q at the beginning of the suffix.
660                 */
661                q = name->full + strlen (name->full) - strlen (suffix) - 1;
662
663                /*
664                 * Walk backwards until we hit the space which
665                 * separates the suffix from firstmidnick.
666                 */
667                while (! isspace (*q) && q > comma)
668                        q --;
669
670                if ((q - p + 1) > 0) {
671                        firstmidnick = g_malloc0 (q - p + 1);
672                        strncpy (firstmidnick, p, q - p);
673                } else
674                        firstmidnick = NULL;
675        } else {
676                firstmidnick = g_strdup (p);
677        }
678
679        /*
680         * Create our new reordered version of the name.
681         */
682#define NULLSTR(a) ((a) == NULL ? "" : (a))
683        newfull = g_strdup_printf ("%s %s %s %s", NULLSTR (prefix), NULLSTR (firstmidnick),
684                                   NULLSTR (last), NULLSTR (suffix));
685        g_strstrip (newfull);
686        g_free (name->full);
687        name->full = newfull;
688
689
690        g_free (prefix);
691        g_free (firstmidnick);
692        g_free (last);
693        g_free (suffix);
694}
695
696static void
697e_name_western_zap_nil (char **str, int *idx)
698{
699        if (*str == NULL)
700                return;
701
702        if (strlen (*str) != 0)
703                return;
704
705        *idx = -1;
706        g_free (*str);
707        *str = NULL;
708}
709
710static void
711e_name_western_fixup (ENameWestern *name, ENameWesternIdxs *idxs)
712{
713        /*
714         * The middle and last names cannot be the same.
715         */
716        if (idxs->middle_idx != -1 && idxs->middle_idx == idxs->last_idx) {
717                idxs->middle_idx = -1;
718                g_free (name->middle);
719                name->middle = NULL;
720        }
721
722        /*
723         * If we have a middle name and no last name, then we mistook
724         * the last name for the middle name.
725         */
726        if (idxs->last_idx == -1 && idxs->middle_idx != -1) {
727                idxs->last_idx   = idxs->middle_idx;
728                name->last       = name->middle;
729                name->middle     = NULL;
730                idxs->middle_idx = -1;
731        }
732
733        /*
734         * Check to see if we accidentally included the suffix in the
735         * last name.
736         */
737        if (idxs->suffix_idx != -1 && idxs->last_idx != -1 &&
738            idxs->suffix_idx < (idxs->last_idx + strlen (name->last))) {
739                char *sfx;
740
741                sfx = name->last + (idxs->suffix_idx - idxs->last_idx);
742                if (sfx != NULL) {
743                        char *newlast;
744                        char *p;
745
746                        p = sfx - 1;
747                        while (isspace (*p) && p > name->last)
748                                p --;
749                        p ++;
750
751                        newlast = g_malloc0 (p - name->last + 1);
752                        strncpy (newlast, name->last, p - name->last);
753                        g_free (name->last);
754                        name->last = newlast;
755                }
756        }
757
758        /*
759         * If we have a prefix and a first name, but no last name,
760         * then we need to assign the first name to the last name.
761         * This way we get things like "Mr Friedman" correctly.
762         */
763        if (idxs->first_idx != -1 && idxs->prefix_idx != -1 &&
764            idxs->last_idx == -1) {
765                name->last      = name->first;
766                idxs->last_idx  = idxs->first_idx;
767                idxs->first_idx = -1;
768                name->first     = NULL;
769        }
770
771        /*
772         * Remove stray spaces and commas (although there don't seem
773         * to be any in the test cases, they might show up later).
774         */
775        e_name_western_cleanup_string (& name->prefix);
776        e_name_western_cleanup_string (& name->first);
777        e_name_western_cleanup_string (& name->middle);
778        e_name_western_cleanup_string (& name->nick);
779        e_name_western_cleanup_string (& name->last);
780        e_name_western_cleanup_string (& name->suffix);
781
782        /*
783         * Make zero-length strings just NULL.
784         */
785        e_name_western_zap_nil (& name->prefix, & idxs->prefix_idx);
786        e_name_western_zap_nil (& name->first,  & idxs->first_idx);
787        e_name_western_zap_nil (& name->middle, & idxs->middle_idx);
788        e_name_western_zap_nil (& name->nick,   & idxs->nick_idx);
789        e_name_western_zap_nil (& name->last,   & idxs->last_idx);
790        e_name_western_zap_nil (& name->suffix, & idxs->suffix_idx);
791}
792
793/**
794 * e_name_western_western_parse_fullname:
795 * @full_name: A string containing a Western name.
796 *
797 * Parses @full_name and returns an #ENameWestern object filled with
798 * the component parts of the name.
799 */
800ENameWestern *
801e_name_western_parse (const char *full_name)
802{
803        ENameWesternIdxs *idxs;
804        ENameWestern *wname;
805
806        wname = g_new0 (ENameWestern, 1);
807
808        wname->full = g_strdup (full_name);
809
810        idxs = g_new0 (ENameWesternIdxs, 1);
811
812        idxs->prefix_idx = -1;
813        idxs->first_idx  = -1;
814        idxs->middle_idx = -1;
815        idxs->nick_idx   = -1;
816        idxs->last_idx   = -1;
817        idxs->suffix_idx = -1;
818       
819        /*
820         * An extremely simple algorithm.
821         *
822         * The goal here is to get it right 95% of the time for
823         * Western names.
824         *
825         * First we check to see if this is an ass-backwards name
826         * ("Prefix Last, First Middle Suffix").  These names really
827         * suck (imagine "Dr von Johnson, Albert Roderick Jr"), so
828         * we reorder them first and then parse them.
829         *
830         * Next, we grab the most obvious assignments for the various
831         * parts of the name.  Once this is done, we check for stupid
832         * errors and fix them up.
833         */
834        e_name_western_reorder_asshole  (wname, idxs);
835
836        e_name_western_extract_prefix   (wname, idxs);
837        e_name_western_extract_first    (wname, idxs);
838        e_name_western_extract_nickname (wname, idxs);
839        e_name_western_extract_middle   (wname, idxs);
840        e_name_western_extract_last     (wname, idxs);
841        e_name_western_extract_suffix   (wname, idxs);
842
843        e_name_western_fixup            (wname, idxs);
844
845        g_free (idxs);
846
847        return wname;
848}
849
850/**
851 * e_name_western_free:
852 * @name: An ENameWestern object which needs to be freed.
853 *
854 * Deep-frees @name
855 */
856void
857e_name_western_free (ENameWestern *w)
858{
859
860        g_free (w->prefix);
861        g_free (w->first);
862        g_free (w->middle);
863        g_free (w->nick);
864        g_free (w->last);
865        g_free (w->suffix);
866       
867        g_free (w->full);
868
869        g_free (w);
870}
Note: See TracBrowser for help on using the repository browser.