1 | /* |
---|
2 | * A simple Western name parser. |
---|
3 | * |
---|
4 | * <Nat> Jamie, do you know anything about name parsing? |
---|
5 | * <jwz> Are you going down that rat hole? Bring a flashlight. |
---|
6 | * |
---|
7 | * Author: |
---|
8 | * Nat Friedman (nat@ximian.com) |
---|
9 | * |
---|
10 | * Copyright 1999, Ximian, Inc. |
---|
11 | */ |
---|
12 | |
---|
13 | #include <ctype.h> |
---|
14 | #include <string.h> |
---|
15 | #include <glib.h> |
---|
16 | |
---|
17 | #include <ename/e-name-western.h> |
---|
18 | #include <ename/e-name-western-tables.h> |
---|
19 | |
---|
/*
 * Offsets into ENameWestern.full at which each parsed component
 * starts.  The parser initialises every field to -1, which is the
 * value used throughout this file to mean "component not present".
 */
typedef struct {
	int prefix_idx;		/* start of name->prefix in name->full */
	int first_idx;		/* start of name->first in name->full */
	int middle_idx;		/* start of name->middle in name->full */
	int nick_idx;		/* start of name->nick (its opening quote) */
	int last_idx;		/* start of name->last in name->full */
	int suffix_idx;		/* start of name->suffix in name->full */
} ENameWesternIdxs;
---|
28 | |
---|
/*
 * Count the space-separated words in @str.
 *
 * The result is one plus the number of ' ' characters found after the
 * first character, so a leading space is not counted and consecutive
 * spaces each count as a separator.  A NULL or empty string contains
 * no words and yields 0; previously an empty string made the scan
 * start one byte past the terminating NUL.
 */
static int
e_name_western_str_count_words (char *str)
{
	int word_count;
	char *p;

	if (str == NULL || *str == '\0')
		return 0;

	word_count = 1;

	/* str [0] is known not to be NUL, so str + 1 is inside the string. */
	for (p = strchr (str + 1, ' '); p != NULL; p = strchr (p + 1, ' '))
		word_count ++;

	return word_count;
}
---|
44 | |
---|
/*
 * Strip leading and trailing whitespace and commas from *@str.
 *
 * The string pointed to by @str is freed and replaced with a newly
 * allocated, trimmed copy.  Nothing happens if @str or *@str is NULL.
 *
 * Characters are cast to unsigned char before being handed to
 * isspace(): passing a negative char (any byte >= 0x80, as found in
 * accented names) is undefined behaviour.
 */
static void
e_name_western_cleanup_string (char **str)
{
	char *newstr;
	char *p;
	size_t len;

	if (str == NULL || *str == NULL)
		return;

	/* Skip any spaces and commas at the start of the string. */
	p = *str;
	while (isspace ((unsigned char) *p) || *p == ',')
		p ++;

	/* Make the copy we are going to hand back. */
	newstr = g_strdup (p);

	/* Chop spaces and commas off the end of the copy. */
	len = strlen (newstr);
	while (len > 0 &&
	       (isspace ((unsigned char) newstr [len - 1]) ||
		newstr [len - 1] == ','))
		len --;
	newstr [len] = '\0';

	g_free (*str);
	*str = newstr;
}
---|
77 | |
---|
/*
 * Return a newly allocated copy of up to @num_words whitespace
 * separated words of @str, starting at offset @idx.
 *
 * The copy runs up to, but not including, the character just before
 * the next word, or to the end of the string if there is no next word.
 * Consequently the result keeps trailing whitespace when words are
 * separated by more than one whitespace character or when the string
 * ends in whitespace; callers trim with
 * e_name_western_cleanup_string() where that matters.
 *
 * A non-positive @num_words yields an empty string (this used to
 * compute a length of -1 and copy through a NULL pointer).
 *
 * The caller owns the result and must g_free() it.
 */
static char *
e_name_western_get_words_at_idx (char *str, int idx, int num_words)
{
	const char *start;
	const char *p;
	char *words;
	int word_count;
	int words_len;

	start = str + idx;

	/*
	 * Walk to the end of the words.  The casts keep isspace()
	 * well defined for bytes >= 0x80.
	 */
	word_count = 0;
	p = start;
	while (word_count < num_words && *p != '\0') {
		while (*p != '\0' && ! isspace ((unsigned char) *p))
			p ++;

		while (isspace ((unsigned char) *p))
			p ++;

		word_count ++;
	}

	/*
	 * p is now either on the first character of the following word,
	 * in which case one separator character is dropped, or on the
	 * terminating NUL, in which case everything is kept.
	 */
	words_len = p - start;
	if (*p != '\0')
		words_len --;

	if (words_len < 0)
		words_len = 0;

	words = g_malloc0 (words_len + 1);
	memcpy (words, start, words_len);

	return words;
}
---|
111 | |
---|
/*
 * Return the larger of @a and @b.
 *
 * Used instead of glib's MAX macro; being a real function, both
 * arguments are converted to int before they are compared.
 */
static int
e_name_western_max (const int a, const int b)
{
	return (a > b) ? a : b;
}
---|
123 | |
---|
124 | static gboolean |
---|
125 | e_name_western_word_is_suffix (char *word) |
---|
126 | { |
---|
127 | int i; |
---|
128 | |
---|
129 | for (i = 0; e_name_western_sfx_table [i] != NULL; i ++) { |
---|
130 | if (g_strcasecmp (word, e_name_western_sfx_table [i])) |
---|
131 | continue; |
---|
132 | |
---|
133 | return TRUE; |
---|
134 | } |
---|
135 | |
---|
136 | return FALSE; |
---|
137 | } |
---|
138 | |
---|
139 | static char * |
---|
140 | e_name_western_get_one_prefix_at_str (char *str) |
---|
141 | { |
---|
142 | char *word; |
---|
143 | int i; |
---|
144 | |
---|
145 | /* |
---|
146 | * Check for prefixes from our table. |
---|
147 | */ |
---|
148 | for (i = 0; e_name_western_pfx_table [i] != NULL; i ++) { |
---|
149 | int pfx_words; |
---|
150 | char *words; |
---|
151 | |
---|
152 | pfx_words = e_name_western_str_count_words (e_name_western_pfx_table [i]); |
---|
153 | words = e_name_western_get_words_at_idx (str, 0, pfx_words); |
---|
154 | |
---|
155 | if (! g_strcasecmp (words, e_name_western_pfx_table [i])) |
---|
156 | return words; |
---|
157 | |
---|
158 | g_free (words); |
---|
159 | } |
---|
160 | |
---|
161 | /* |
---|
162 | * Check for prefixes we don't know about. These are always a |
---|
163 | * sequence of more than one letters followed by a period. |
---|
164 | */ |
---|
165 | word = e_name_western_get_words_at_idx (str, 0, 1); |
---|
166 | |
---|
167 | if (strlen (word) > 2 && |
---|
168 | isalpha ((unsigned char) word [0]) && |
---|
169 | isalpha ((unsigned char) word [1]) && |
---|
170 | word [strlen (word) - 1] == '.') |
---|
171 | return word; |
---|
172 | |
---|
173 | g_free (word); |
---|
174 | |
---|
175 | return NULL; |
---|
176 | } |
---|
177 | |
---|
/*
 * Return the prefix found at the start of @str, which may consist of
 * one or two consecutive prefixes as recognised by
 * e_name_western_get_one_prefix_at_str().
 *
 * When two prefixes are found the result is the exact span of @str
 * covering both, including the whitespace between them.
 *
 * Returns a newly allocated string the caller must g_free(), or NULL
 * if @str does not start with a prefix.
 */
static char *
e_name_western_get_prefix_at_str (char *str)
{
	char *pfx;
	char *pfx1;
	char *pfx2;
	char *p;
	size_t pfx_len;

	/* Get the first prefix. */
	pfx1 = e_name_western_get_one_prefix_at_str (str);

	if (pfx1 == NULL)
		return NULL;

	/*
	 * Check for a second prefix.  The cast keeps isspace() well
	 * defined for bytes >= 0x80.
	 */
	p = str + strlen (pfx1);
	while (isspace ((unsigned char) *p))
		p ++;

	pfx2 = e_name_western_get_one_prefix_at_str (p);

	/* Only one prefix: hand ownership of it straight to the caller. */
	if (pfx2 == NULL)
		return pfx1;

	pfx_len = (p + strlen (pfx2)) - str;
	pfx = g_malloc0 (pfx_len + 1);
	memcpy (pfx, str, pfx_len);

	g_free (pfx1);
	g_free (pfx2);

	return pfx;
}
---|
214 | |
---|
215 | static void |
---|
216 | e_name_western_extract_prefix (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
217 | { |
---|
218 | char *pfx; |
---|
219 | |
---|
220 | pfx = e_name_western_get_prefix_at_str (name->full); |
---|
221 | |
---|
222 | if (pfx == NULL) |
---|
223 | return; |
---|
224 | |
---|
225 | idxs->prefix_idx = 0; |
---|
226 | name->prefix = pfx; |
---|
227 | } |
---|
228 | |
---|
229 | static gboolean |
---|
230 | e_name_western_is_complex_last_beginning (char *word) |
---|
231 | { |
---|
232 | int i; |
---|
233 | |
---|
234 | for (i = 0; e_name_western_complex_last_table [i] != NULL; i ++) { |
---|
235 | |
---|
236 | if (! g_strcasecmp ( |
---|
237 | word, e_name_western_complex_last_table [i])) |
---|
238 | return TRUE; |
---|
239 | } |
---|
240 | |
---|
241 | return FALSE; |
---|
242 | } |
---|
243 | |
---|
244 | static void |
---|
245 | e_name_western_extract_first (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
246 | { |
---|
247 | /* |
---|
248 | * If there's a prefix, then the first name is right after it. |
---|
249 | */ |
---|
250 | if (idxs->prefix_idx != -1) { |
---|
251 | int first_idx; |
---|
252 | char *p; |
---|
253 | |
---|
254 | first_idx = idxs->prefix_idx + strlen (name->prefix); |
---|
255 | |
---|
256 | /* Skip past white space. */ |
---|
257 | p = name->full + first_idx; |
---|
258 | while (isspace (*p) && *p != '\0') |
---|
259 | p++; |
---|
260 | |
---|
261 | if (*p == '\0') |
---|
262 | return; |
---|
263 | |
---|
264 | idxs->first_idx = p - name->full; |
---|
265 | name->first = e_name_western_get_words_at_idx ( |
---|
266 | name->full, idxs->first_idx, 1); |
---|
267 | |
---|
268 | } else { |
---|
269 | |
---|
270 | /* |
---|
271 | * Otherwise, the first name is probably the first string. |
---|
272 | */ |
---|
273 | idxs->first_idx = 0; |
---|
274 | name->first = e_name_western_get_words_at_idx ( |
---|
275 | name->full, idxs->first_idx, 1); |
---|
276 | } |
---|
277 | |
---|
278 | /* |
---|
279 | * Check that we didn't just assign the beginning of a |
---|
280 | * compound last name to the first name. |
---|
281 | */ |
---|
282 | if (name->first != NULL) { |
---|
283 | if (e_name_western_is_complex_last_beginning (name->first)) { |
---|
284 | g_free (name->first); |
---|
285 | name->first = NULL; |
---|
286 | idxs->first_idx = -1; |
---|
287 | } |
---|
288 | } |
---|
289 | } |
---|
290 | |
---|
291 | static void |
---|
292 | e_name_western_extract_middle (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
293 | { |
---|
294 | char *word; |
---|
295 | int middle_idx; |
---|
296 | |
---|
297 | /* |
---|
298 | * Middle names can only exist if you have a first name. |
---|
299 | */ |
---|
300 | if (idxs->first_idx == -1) |
---|
301 | return; |
---|
302 | |
---|
303 | middle_idx = idxs->first_idx + strlen (name->first) + 1; |
---|
304 | |
---|
305 | if (middle_idx > strlen (name->full)) |
---|
306 | return; |
---|
307 | |
---|
308 | /* |
---|
309 | * Search for the first space (or the terminating \0) |
---|
310 | */ |
---|
311 | while (isspace (name->full [middle_idx]) && |
---|
312 | name->full [middle_idx] != '\0') |
---|
313 | middle_idx ++; |
---|
314 | |
---|
315 | if (name->full [middle_idx] == '\0') |
---|
316 | return; |
---|
317 | |
---|
318 | /* |
---|
319 | * Skip past the nickname, if it's there. |
---|
320 | */ |
---|
321 | if (name->full [middle_idx] == '\"') { |
---|
322 | if (idxs->nick_idx == -1) |
---|
323 | return; |
---|
324 | |
---|
325 | middle_idx = idxs->nick_idx + strlen (name->nick) + 1; |
---|
326 | |
---|
327 | while (isspace (name->full [middle_idx]) && |
---|
328 | name->full [middle_idx] != '\0') |
---|
329 | middle_idx ++; |
---|
330 | |
---|
331 | if (name->full [middle_idx] == '\0') |
---|
332 | return; |
---|
333 | } |
---|
334 | |
---|
335 | /* |
---|
336 | * Make sure this isn't the beginning of a complex last name. |
---|
337 | */ |
---|
338 | word = e_name_western_get_words_at_idx (name->full, middle_idx, 1); |
---|
339 | if (e_name_western_is_complex_last_beginning (word)) { |
---|
340 | g_free (word); |
---|
341 | return; |
---|
342 | } |
---|
343 | |
---|
344 | /* |
---|
345 | * Make sure this isn't a suffix. |
---|
346 | */ |
---|
347 | e_name_western_cleanup_string (& word); |
---|
348 | if (e_name_western_word_is_suffix (word)) { |
---|
349 | g_free (word); |
---|
350 | return; |
---|
351 | } |
---|
352 | |
---|
353 | /* |
---|
354 | * Make sure we didn't just grab a cute nickname. |
---|
355 | */ |
---|
356 | if (word [0] == '\"') { |
---|
357 | g_free (word); |
---|
358 | return; |
---|
359 | } |
---|
360 | |
---|
361 | idxs->middle_idx = middle_idx; |
---|
362 | name->middle = word; |
---|
363 | } |
---|
364 | |
---|
365 | static void |
---|
366 | e_name_western_extract_nickname (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
367 | { |
---|
368 | int idx; |
---|
369 | int start_idx; |
---|
370 | char *str; |
---|
371 | |
---|
372 | if (idxs->first_idx == -1) |
---|
373 | return; |
---|
374 | |
---|
375 | if (idxs->middle_idx > idxs->first_idx) |
---|
376 | idx = idxs->middle_idx + strlen (name->middle); |
---|
377 | else |
---|
378 | idx = idxs->first_idx + strlen (name->first); |
---|
379 | |
---|
380 | while (name->full [idx] != '\"' && name->full [idx] != '\0') |
---|
381 | idx ++; |
---|
382 | |
---|
383 | if (name->full [idx] != '\"') |
---|
384 | return; |
---|
385 | |
---|
386 | start_idx = idx; |
---|
387 | |
---|
388 | /* |
---|
389 | * Advance to the next double quote. |
---|
390 | */ |
---|
391 | idx ++; |
---|
392 | |
---|
393 | while (name->full [idx] != '\"' && name->full [idx] != '\0') |
---|
394 | idx ++; |
---|
395 | |
---|
396 | if (name->full [idx] == '\0') |
---|
397 | return; |
---|
398 | |
---|
399 | str = g_malloc0 (idx - start_idx + 2); |
---|
400 | strncpy (str, name->full + start_idx, idx - start_idx + 1); |
---|
401 | |
---|
402 | name->nick = str; |
---|
403 | idxs->nick_idx = start_idx; |
---|
404 | } |
---|
405 | |
---|
406 | static int |
---|
407 | e_name_western_last_get_max_idx (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
408 | { |
---|
409 | int max_idx = -1; |
---|
410 | |
---|
411 | if (name->prefix != NULL) |
---|
412 | max_idx = e_name_western_max ( |
---|
413 | max_idx, idxs->prefix_idx + strlen (name->prefix)); |
---|
414 | |
---|
415 | if (name->first != NULL) |
---|
416 | max_idx = e_name_western_max ( |
---|
417 | max_idx, idxs->first_idx + strlen (name->first)); |
---|
418 | |
---|
419 | if (name->middle != NULL) |
---|
420 | max_idx = e_name_western_max ( |
---|
421 | max_idx, idxs->middle_idx + strlen (name->middle)); |
---|
422 | |
---|
423 | if (name->nick != NULL) |
---|
424 | max_idx = e_name_western_max ( |
---|
425 | max_idx, idxs->nick_idx + strlen (name->nick)); |
---|
426 | |
---|
427 | return max_idx; |
---|
428 | } |
---|
429 | |
---|
430 | static void |
---|
431 | e_name_western_extract_last (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
432 | { |
---|
433 | char *word; |
---|
434 | int idx = -1; |
---|
435 | |
---|
436 | idx = e_name_western_last_get_max_idx (name, idxs); |
---|
437 | |
---|
438 | /* |
---|
439 | * In the case where there is no preceding name element, the |
---|
440 | * name is either just a first name ("Nat", "John"), is a |
---|
441 | * single-element name ("Cher", which we treat as a first |
---|
442 | * name), or is just a last name. The only time we can |
---|
443 | * differentiate a last name alone from a single-element name |
---|
444 | * or a first name alone is if it's a complex last name ("de |
---|
445 | * Icaza", "van Josephsen"). So if there is no preceding name |
---|
446 | * element, we check to see whether or not the first part of |
---|
447 | * the name is the beginning of a complex name. If it is, |
---|
448 | * we subsume the entire string. If we accidentally subsume |
---|
449 | * the suffix, this will get fixed in the fixup routine. |
---|
450 | */ |
---|
451 | if (idx == -1) { |
---|
452 | word = e_name_western_get_words_at_idx (name->full, 0, 1); |
---|
453 | if (! e_name_western_is_complex_last_beginning (word)) { |
---|
454 | g_free (word); |
---|
455 | return; |
---|
456 | } |
---|
457 | |
---|
458 | name->last = g_strdup (name->full); |
---|
459 | idxs->last_idx = 0; |
---|
460 | return; |
---|
461 | } |
---|
462 | |
---|
463 | /* Skip past the white space. */ |
---|
464 | while (isspace (name->full [idx]) && name->full [idx] != '\0') |
---|
465 | idx ++; |
---|
466 | |
---|
467 | if (name->full [idx] == '\0') |
---|
468 | return; |
---|
469 | |
---|
470 | word = e_name_western_get_words_at_idx (name->full, idx, 1); |
---|
471 | e_name_western_cleanup_string (& word); |
---|
472 | if (e_name_western_word_is_suffix (word)) { |
---|
473 | g_free (word); |
---|
474 | return; |
---|
475 | } |
---|
476 | g_free (word); |
---|
477 | |
---|
478 | /* |
---|
479 | * Subsume the rest of the string into the last name. If we |
---|
480 | * accidentally include the prefix, it will get fixed later. |
---|
481 | * This is the only way to handle things like "Miguel de Icaza |
---|
482 | * Amozorrutia" without dropping data and forcing the user |
---|
483 | * to retype it. |
---|
484 | */ |
---|
485 | name->last = g_strdup (name->full + idx); |
---|
486 | idxs->last_idx = idx; |
---|
487 | } |
---|
488 | |
---|
/*
 * Return a newly allocated copy of the text of @str between the start
 * of the word that precedes offset @idx and @idx itself.
 *
 * Whitespace at @idx and directly before it is skipped to find the
 * word, but is included in the returned text because the copy always
 * runs up to @idx.
 *
 * The result is never NULL; it is the empty string when @idx is not
 * positive.  (Such an @idx used to read before the start of @str and
 * could return NULL.)  The caller must g_free() the result.
 */
static char *
e_name_western_get_preceding_word (char *str, int idx)
{
	const char *end;
	const char *p;
	int word_len;
	char *word;

	if (idx <= 0)
		return g_malloc0 (1);

	end = str + idx;
	p = end;

	/*
	 * Bounds are tested before dereferencing, and the casts keep
	 * isspace() well defined for bytes >= 0x80.
	 */
	while (p > str && isspace ((unsigned char) *p))
		p --;

	while (p > str && ! isspace ((unsigned char) *p))
		p --;

	/* We stopped on the separator in front of the word; step over it. */
	if (isspace ((unsigned char) *p))
		p ++;

	word_len = end - p;
	if (word_len < 0)
		word_len = 0;

	word = g_malloc0 (word_len + 1);
	if (word_len > 0)
		memcpy (word, p, word_len);

	return word;
}
---|
514 | |
---|
515 | static char * |
---|
516 | e_name_western_get_suffix_at_str_end (char *str) |
---|
517 | { |
---|
518 | char *suffix; |
---|
519 | char *p; |
---|
520 | |
---|
521 | /* |
---|
522 | * Walk backwards till we reach the beginning of the |
---|
523 | * (potentially-comma-separated) list of suffixes. |
---|
524 | */ |
---|
525 | p = str + strlen (str); |
---|
526 | while (1) { |
---|
527 | char *nextp; |
---|
528 | char *word; |
---|
529 | |
---|
530 | word = e_name_western_get_preceding_word (str, p - str); |
---|
531 | nextp = p - strlen (word) - 1; |
---|
532 | |
---|
533 | e_name_western_cleanup_string (& word); |
---|
534 | |
---|
535 | if (e_name_western_word_is_suffix (word)) { |
---|
536 | p = nextp; |
---|
537 | g_free (word); |
---|
538 | } else { |
---|
539 | g_free (word); |
---|
540 | break; |
---|
541 | } |
---|
542 | } |
---|
543 | |
---|
544 | if (p == (str + strlen (str))) |
---|
545 | return NULL; |
---|
546 | |
---|
547 | suffix = g_strdup (p); |
---|
548 | e_name_western_cleanup_string (& suffix); |
---|
549 | |
---|
550 | if (strlen (suffix) == 0) { |
---|
551 | g_free (suffix); |
---|
552 | return NULL; |
---|
553 | } |
---|
554 | |
---|
555 | return suffix; |
---|
556 | } |
---|
557 | |
---|
558 | static void |
---|
559 | e_name_western_extract_suffix (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
560 | { |
---|
561 | |
---|
562 | name->suffix = e_name_western_get_suffix_at_str_end (name->full); |
---|
563 | |
---|
564 | if (name->suffix == NULL) |
---|
565 | return; |
---|
566 | |
---|
567 | idxs->suffix_idx = strlen (name->full) - strlen (name->suffix); |
---|
568 | } |
---|
569 | |
---|
570 | static gboolean |
---|
571 | e_name_western_detect_backwards (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
572 | { |
---|
573 | char *comma; |
---|
574 | char *word; |
---|
575 | |
---|
576 | comma = strchr (name->full, ','); |
---|
577 | |
---|
578 | if (comma == NULL) |
---|
579 | return FALSE; |
---|
580 | |
---|
581 | /* |
---|
582 | * If there's a comma, we need to detect whether it's |
---|
583 | * separating the last name from the first or just separating |
---|
584 | * suffixes. So we grab the word which comes before the |
---|
585 | * comma and check if it's a suffix. |
---|
586 | */ |
---|
587 | word = e_name_western_get_preceding_word (name->full, comma - name->full); |
---|
588 | |
---|
589 | if (e_name_western_word_is_suffix (word)) { |
---|
590 | g_free (word); |
---|
591 | return FALSE; |
---|
592 | } |
---|
593 | |
---|
594 | g_free (word); |
---|
595 | return TRUE; |
---|
596 | } |
---|
597 | |
---|
598 | static void |
---|
599 | e_name_western_reorder_asshole (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
600 | { |
---|
601 | char *prefix; |
---|
602 | char *last; |
---|
603 | char *suffix; |
---|
604 | char *firstmidnick; |
---|
605 | char *newfull; |
---|
606 | |
---|
607 | char *comma; |
---|
608 | char *p; |
---|
609 | |
---|
610 | if (! e_name_western_detect_backwards (name, idxs)) |
---|
611 | return; |
---|
612 | |
---|
613 | /* |
---|
614 | * Convert |
---|
615 | * <Prefix> <Last name>, <First name> <Middle[+nick] name> <Suffix> |
---|
616 | * to |
---|
617 | * <Prefix> <First name> <Middle[+nick] name> <Last name> <Suffix> |
---|
618 | */ |
---|
619 | |
---|
620 | /* |
---|
621 | * Grab the prefix from the beginning. |
---|
622 | */ |
---|
623 | prefix = e_name_western_get_prefix_at_str (name->full); |
---|
624 | |
---|
625 | /* |
---|
626 | * Everything from the end of the prefix to the comma is the |
---|
627 | * last name. |
---|
628 | */ |
---|
629 | comma = strchr (name->full, ','); |
---|
630 | if (comma == NULL) |
---|
631 | return; |
---|
632 | |
---|
633 | p = name->full + (prefix == NULL ? 0 : strlen (prefix)); |
---|
634 | |
---|
635 | while (isspace (*p) && *p != '\0') |
---|
636 | p ++; |
---|
637 | |
---|
638 | last = g_malloc0 (comma - p + 1); |
---|
639 | strncpy (last, p, comma - p); |
---|
640 | |
---|
641 | /* |
---|
642 | * Get the suffix off the end. |
---|
643 | */ |
---|
644 | suffix = e_name_western_get_suffix_at_str_end (name->full); |
---|
645 | |
---|
646 | /* |
---|
647 | * Firstmidnick is everything from the comma to the beginning |
---|
648 | * of the suffix. |
---|
649 | */ |
---|
650 | p = comma + 1; |
---|
651 | |
---|
652 | while (isspace (*p) && *p != '\0') |
---|
653 | p ++; |
---|
654 | |
---|
655 | if (suffix != NULL) { |
---|
656 | char *q; |
---|
657 | |
---|
658 | /* |
---|
659 | * Point q at the beginning of the suffix. |
---|
660 | */ |
---|
661 | q = name->full + strlen (name->full) - strlen (suffix) - 1; |
---|
662 | |
---|
663 | /* |
---|
664 | * Walk backwards until we hit the space which |
---|
665 | * separates the suffix from firstmidnick. |
---|
666 | */ |
---|
667 | while (! isspace (*q) && q > comma) |
---|
668 | q --; |
---|
669 | |
---|
670 | if ((q - p + 1) > 0) { |
---|
671 | firstmidnick = g_malloc0 (q - p + 1); |
---|
672 | strncpy (firstmidnick, p, q - p); |
---|
673 | } else |
---|
674 | firstmidnick = NULL; |
---|
675 | } else { |
---|
676 | firstmidnick = g_strdup (p); |
---|
677 | } |
---|
678 | |
---|
679 | /* |
---|
680 | * Create our new reordered version of the name. |
---|
681 | */ |
---|
682 | #define NULLSTR(a) ((a) == NULL ? "" : (a)) |
---|
683 | newfull = g_strdup_printf ("%s %s %s %s", NULLSTR (prefix), NULLSTR (firstmidnick), |
---|
684 | NULLSTR (last), NULLSTR (suffix)); |
---|
685 | g_strstrip (newfull); |
---|
686 | g_free (name->full); |
---|
687 | name->full = newfull; |
---|
688 | |
---|
689 | |
---|
690 | g_free (prefix); |
---|
691 | g_free (firstmidnick); |
---|
692 | g_free (last); |
---|
693 | g_free (suffix); |
---|
694 | } |
---|
695 | |
---|
696 | static void |
---|
697 | e_name_western_zap_nil (char **str, int *idx) |
---|
698 | { |
---|
699 | if (*str == NULL) |
---|
700 | return; |
---|
701 | |
---|
702 | if (strlen (*str) != 0) |
---|
703 | return; |
---|
704 | |
---|
705 | *idx = -1; |
---|
706 | g_free (*str); |
---|
707 | *str = NULL; |
---|
708 | } |
---|
709 | |
---|
710 | static void |
---|
711 | e_name_western_fixup (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
712 | { |
---|
713 | /* |
---|
714 | * The middle and last names cannot be the same. |
---|
715 | */ |
---|
716 | if (idxs->middle_idx != -1 && idxs->middle_idx == idxs->last_idx) { |
---|
717 | idxs->middle_idx = -1; |
---|
718 | g_free (name->middle); |
---|
719 | name->middle = NULL; |
---|
720 | } |
---|
721 | |
---|
722 | /* |
---|
723 | * If we have a middle name and no last name, then we mistook |
---|
724 | * the last name for the middle name. |
---|
725 | */ |
---|
726 | if (idxs->last_idx == -1 && idxs->middle_idx != -1) { |
---|
727 | idxs->last_idx = idxs->middle_idx; |
---|
728 | name->last = name->middle; |
---|
729 | name->middle = NULL; |
---|
730 | idxs->middle_idx = -1; |
---|
731 | } |
---|
732 | |
---|
733 | /* |
---|
734 | * Check to see if we accidentally included the suffix in the |
---|
735 | * last name. |
---|
736 | */ |
---|
737 | if (idxs->suffix_idx != -1 && idxs->last_idx != -1 && |
---|
738 | idxs->suffix_idx < (idxs->last_idx + strlen (name->last))) { |
---|
739 | char *sfx; |
---|
740 | |
---|
741 | sfx = name->last + (idxs->suffix_idx - idxs->last_idx); |
---|
742 | if (sfx != NULL) { |
---|
743 | char *newlast; |
---|
744 | char *p; |
---|
745 | |
---|
746 | p = sfx - 1; |
---|
747 | while (isspace (*p) && p > name->last) |
---|
748 | p --; |
---|
749 | p ++; |
---|
750 | |
---|
751 | newlast = g_malloc0 (p - name->last + 1); |
---|
752 | strncpy (newlast, name->last, p - name->last); |
---|
753 | g_free (name->last); |
---|
754 | name->last = newlast; |
---|
755 | } |
---|
756 | } |
---|
757 | |
---|
758 | /* |
---|
759 | * If we have a prefix and a first name, but no last name, |
---|
760 | * then we need to assign the first name to the last name. |
---|
761 | * This way we get things like "Mr Friedman" correctly. |
---|
762 | */ |
---|
763 | if (idxs->first_idx != -1 && idxs->prefix_idx != -1 && |
---|
764 | idxs->last_idx == -1) { |
---|
765 | name->last = name->first; |
---|
766 | idxs->last_idx = idxs->first_idx; |
---|
767 | idxs->first_idx = -1; |
---|
768 | name->first = NULL; |
---|
769 | } |
---|
770 | |
---|
771 | /* |
---|
772 | * Remove stray spaces and commas (although there don't seem |
---|
773 | * to be any in the test cases, they might show up later). |
---|
774 | */ |
---|
775 | e_name_western_cleanup_string (& name->prefix); |
---|
776 | e_name_western_cleanup_string (& name->first); |
---|
777 | e_name_western_cleanup_string (& name->middle); |
---|
778 | e_name_western_cleanup_string (& name->nick); |
---|
779 | e_name_western_cleanup_string (& name->last); |
---|
780 | e_name_western_cleanup_string (& name->suffix); |
---|
781 | |
---|
782 | /* |
---|
783 | * Make zero-length strings just NULL. |
---|
784 | */ |
---|
785 | e_name_western_zap_nil (& name->prefix, & idxs->prefix_idx); |
---|
786 | e_name_western_zap_nil (& name->first, & idxs->first_idx); |
---|
787 | e_name_western_zap_nil (& name->middle, & idxs->middle_idx); |
---|
788 | e_name_western_zap_nil (& name->nick, & idxs->nick_idx); |
---|
789 | e_name_western_zap_nil (& name->last, & idxs->last_idx); |
---|
790 | e_name_western_zap_nil (& name->suffix, & idxs->suffix_idx); |
---|
791 | } |
---|
792 | |
---|
793 | /** |
---|
794 | * e_name_western_western_parse_fullname: |
---|
795 | * @full_name: A string containing a Western name. |
---|
796 | * |
---|
797 | * Parses @full_name and returns an #ENameWestern object filled with |
---|
798 | * the component parts of the name. |
---|
799 | */ |
---|
800 | ENameWestern * |
---|
801 | e_name_western_parse (const char *full_name) |
---|
802 | { |
---|
803 | ENameWesternIdxs *idxs; |
---|
804 | ENameWestern *wname; |
---|
805 | |
---|
806 | wname = g_new0 (ENameWestern, 1); |
---|
807 | |
---|
808 | wname->full = g_strdup (full_name); |
---|
809 | |
---|
810 | idxs = g_new0 (ENameWesternIdxs, 1); |
---|
811 | |
---|
812 | idxs->prefix_idx = -1; |
---|
813 | idxs->first_idx = -1; |
---|
814 | idxs->middle_idx = -1; |
---|
815 | idxs->nick_idx = -1; |
---|
816 | idxs->last_idx = -1; |
---|
817 | idxs->suffix_idx = -1; |
---|
818 | |
---|
819 | /* |
---|
820 | * An extremely simple algorithm. |
---|
821 | * |
---|
822 | * The goal here is to get it right 95% of the time for |
---|
823 | * Western names. |
---|
824 | * |
---|
825 | * First we check to see if this is an ass-backwards name |
---|
826 | * ("Prefix Last, First Middle Suffix"). These names really |
---|
827 | * suck (imagine "Dr von Johnson, Albert Roderick Jr"), so |
---|
828 | * we reorder them first and then parse them. |
---|
829 | * |
---|
830 | * Next, we grab the most obvious assignments for the various |
---|
831 | * parts of the name. Once this is done, we check for stupid |
---|
832 | * errors and fix them up. |
---|
833 | */ |
---|
834 | e_name_western_reorder_asshole (wname, idxs); |
---|
835 | |
---|
836 | e_name_western_extract_prefix (wname, idxs); |
---|
837 | e_name_western_extract_first (wname, idxs); |
---|
838 | e_name_western_extract_nickname (wname, idxs); |
---|
839 | e_name_western_extract_middle (wname, idxs); |
---|
840 | e_name_western_extract_last (wname, idxs); |
---|
841 | e_name_western_extract_suffix (wname, idxs); |
---|
842 | |
---|
843 | e_name_western_fixup (wname, idxs); |
---|
844 | |
---|
845 | g_free (idxs); |
---|
846 | |
---|
847 | return wname; |
---|
848 | } |
---|
849 | |
---|
850 | /** |
---|
851 | * e_name_western_free: |
---|
852 | * @name: An ENameWestern object which needs to be freed. |
---|
853 | * |
---|
854 | * Deep-frees @name |
---|
855 | */ |
---|
856 | void |
---|
857 | e_name_western_free (ENameWestern *w) |
---|
858 | { |
---|
859 | |
---|
860 | g_free (w->prefix); |
---|
861 | g_free (w->first); |
---|
862 | g_free (w->middle); |
---|
863 | g_free (w->nick); |
---|
864 | g_free (w->last); |
---|
865 | g_free (w->suffix); |
---|
866 | |
---|
867 | g_free (w->full); |
---|
868 | |
---|
869 | g_free (w); |
---|
870 | } |
---|