1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ |
---|
2 | /* |
---|
3 | * A simple Western name parser. |
---|
4 | * |
---|
5 | * <Nat> Jamie, do you know anything about name parsing? |
---|
6 | * <jwz> Are you going down that rat hole? Bring a flashlight. |
---|
7 | * |
---|
8 | * Authors: |
---|
9 | * Nat Friedman <nat@ximian.com> |
---|
10 | * |
---|
11 | * Copyright 1999 - 2001, Ximian, Inc. |
---|
12 | */ |
---|
13 | |
---|
14 | #include <ctype.h> |
---|
15 | #include <string.h> |
---|
16 | #include <glib.h> |
---|
17 | |
---|
18 | #include <ename/e-name-western.h> |
---|
19 | #include <ename/e-name-western-tables.h> |
---|
20 | |
---|
/*
 * Offsets into ENameWestern.full at which each parsed component
 * begins.  The parser initialises every field to -1 and treats -1 as
 * "component not found".
 */
typedef struct {
	int prefix_idx;		/* start of the prefix ("Mr", "Dr.", ...) */
	int first_idx;		/* start of the first name */
	int middle_idx;		/* start of the middle name */
	int nick_idx;		/* start of the quoted nickname */
	int last_idx;		/* start of the last name */
	int suffix_idx;		/* start of the suffix ("Jr", "III", ...) */
} ENameWesternIdxs;
---|
29 | |
---|
/*
 * Counts the words in a string whose words are separated by single
 * spaces.
 *
 * The result is one plus the number of ' ' characters that occur
 * after the first character of @str.  A NULL @str yields 0 and an
 * empty string yields 1.
 */
static int
e_name_western_str_count_words (char *str)
{
	int word_count = 0;
	char *p;

	for (p = str; p != NULL; p = strchr (p, ' ')) {
		word_count ++;

		/*
		 * Step over the character we are sitting on so the
		 * next strchr() does not find the same space again,
		 * but never step past the terminator: doing so on an
		 * empty string made strchr() read out of bounds.
		 */
		if (*p == '\0')
			break;
		p ++;
	}

	return word_count;
}
---|
45 | |
---|
/*
 * Replaces *str with a newly allocated copy that has all leading and
 * trailing whitespace and commas removed.  The old string is freed.
 * Does nothing when *str is NULL.
 */
static void
e_name_western_cleanup_string (char **str)
{
	char *start;
	char *trimmed;
	size_t len;

	if (*str == NULL)
		return;

	/* Find the first character that is neither whitespace nor a comma. */
	for (start = *str; isspace ((unsigned char) *start) || *start == ','; start ++)
		;

	trimmed = g_strdup (start);

	/* Shorten the copy until it no longer ends in whitespace or a comma. */
	len = strlen (trimmed);
	while (len > 0 &&
	       (isspace ((unsigned char) trimmed [len - 1]) || trimmed [len - 1] == ','))
		len --;
	trimmed [len] = '\0';

	g_free (*str);
	*str = trimmed;
}
---|
78 | |
---|
/*
 * Returns a newly allocated copy of up to @num_words whitespace
 * separated words of @str, starting at offset @idx.
 *
 * When the copied words are followed by more text, exactly one
 * separator character is dropped from the end, so a run of several
 * separators leaves trailing whitespace in the result.  The result is
 * always a valid (possibly empty) string; the caller frees it.
 */
static char *
e_name_western_get_words_at_idx (char *str, int idx, int num_words)
{
	const char *start = str + idx;
	const char *p = start;
	char *words;
	int word_count = 0;
	int words_len;

	/*
	 * Walk to the end of the words.
	 */
	while (word_count < num_words && *p != '\0') {
		while (*p != '\0' && ! isspace ((unsigned char) *p))
			p ++;

		while (isspace ((unsigned char) *p))
			p ++;

		word_count ++;
	}

	words_len = p - start;

	/* We stopped on the next word: drop one separator character. */
	if (*p != '\0')
		words_len --;

	/*
	 * Asking for zero words of a non-empty string used to produce
	 * a length of -1, i.e. a zero-byte allocation followed by a
	 * copy of (size_t) -1 bytes.  Return an empty string instead.
	 */
	if (words_len < 0)
		words_len = 0;

	words = g_malloc0 (words_len + 1);
	memcpy (words, start, words_len);

	return words;
}
---|
112 | |
---|
/*
 * Returns the larger of two ints.  A plain function is used here
 * instead of glib's MAX macro.
 */
static int
e_name_western_max (const int a, const int b)
{
	return (a > b) ? a : b;
}
---|
124 | |
---|
125 | static gboolean |
---|
126 | e_name_western_word_is_suffix (char *word) |
---|
127 | { |
---|
128 | int i; |
---|
129 | |
---|
130 | for (i = 0; e_name_western_sfx_table [i] != NULL; i ++) { |
---|
131 | int length = strlen (e_name_western_sfx_table [i]); |
---|
132 | if (!g_strcasecmp (word, e_name_western_sfx_table [i]) || |
---|
133 | ( !g_strncasecmp (word, e_name_western_sfx_table [i], length) && |
---|
134 | strlen(word) == length + 1 && |
---|
135 | word[length] == '.' )) |
---|
136 | return TRUE; |
---|
137 | } |
---|
138 | |
---|
139 | return FALSE; |
---|
140 | } |
---|
141 | |
---|
142 | static char * |
---|
143 | e_name_western_get_one_prefix_at_str (char *str) |
---|
144 | { |
---|
145 | char *word; |
---|
146 | int i; |
---|
147 | |
---|
148 | /* |
---|
149 | * Check for prefixes from our table. |
---|
150 | */ |
---|
151 | for (i = 0; e_name_western_pfx_table [i] != NULL; i ++) { |
---|
152 | int pfx_words; |
---|
153 | char *words; |
---|
154 | |
---|
155 | pfx_words = e_name_western_str_count_words (e_name_western_pfx_table [i]); |
---|
156 | words = e_name_western_get_words_at_idx (str, 0, pfx_words); |
---|
157 | |
---|
158 | if (! g_strcasecmp (words, e_name_western_pfx_table [i])) |
---|
159 | return words; |
---|
160 | |
---|
161 | g_free (words); |
---|
162 | } |
---|
163 | |
---|
164 | /* |
---|
165 | * Check for prefixes we don't know about. These are always a |
---|
166 | * sequence of more than one letters followed by a period. |
---|
167 | */ |
---|
168 | word = e_name_western_get_words_at_idx (str, 0, 1); |
---|
169 | |
---|
170 | if (strlen (word) > 2 && |
---|
171 | isalpha ((unsigned char) word [0]) && |
---|
172 | isalpha ((unsigned char) word [1]) && |
---|
173 | word [strlen (word) - 1] == '.') |
---|
174 | return word; |
---|
175 | |
---|
176 | g_free (word); |
---|
177 | |
---|
178 | return NULL; |
---|
179 | } |
---|
180 | |
---|
/*
 * Returns a newly allocated copy of the prefix at the start of @str,
 * which may consist of one or two consecutive prefixes (the text
 * between them is kept as it appears in @str).  Returns NULL when
 * @str does not start with a prefix.
 */
static char *
e_name_western_get_prefix_at_str (char *str)
{
	char *first_pfx;
	char *second_pfx;
	char *combined;
	char *rest;
	int combined_len;

	first_pfx = e_name_western_get_one_prefix_at_str (str);
	if (first_pfx == NULL)
		return NULL;

	/* Look for a second prefix right after the first one. */
	for (rest = str + strlen (first_pfx); isspace ((unsigned char) *rest); rest ++)
		;

	second_pfx = e_name_western_get_one_prefix_at_str (rest);
	if (second_pfx == NULL)
		return first_pfx;

	/* Copy everything from the start of @str to the end of the second prefix. */
	combined_len = (rest + strlen (second_pfx)) - str;
	combined = g_malloc0 (combined_len + 1);
	memcpy (combined, str, combined_len);

	g_free (first_pfx);
	g_free (second_pfx);

	return combined;
}
---|
217 | |
---|
218 | static void |
---|
219 | e_name_western_extract_prefix (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
220 | { |
---|
221 | char *pfx; |
---|
222 | |
---|
223 | pfx = e_name_western_get_prefix_at_str (name->full); |
---|
224 | |
---|
225 | if (pfx == NULL) |
---|
226 | return; |
---|
227 | |
---|
228 | idxs->prefix_idx = 0; |
---|
229 | name->prefix = pfx; |
---|
230 | } |
---|
231 | |
---|
232 | static gboolean |
---|
233 | e_name_western_is_complex_last_beginning (char *word) |
---|
234 | { |
---|
235 | int i; |
---|
236 | |
---|
237 | for (i = 0; e_name_western_complex_last_table [i] != NULL; i ++) { |
---|
238 | |
---|
239 | if (! g_strcasecmp ( |
---|
240 | word, e_name_western_complex_last_table [i])) |
---|
241 | return TRUE; |
---|
242 | } |
---|
243 | |
---|
244 | return FALSE; |
---|
245 | } |
---|
246 | |
---|
247 | static void |
---|
248 | e_name_western_extract_first (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
249 | { |
---|
250 | /* |
---|
251 | * If there's a prefix, then the first name is right after it. |
---|
252 | */ |
---|
253 | if (idxs->prefix_idx != -1) { |
---|
254 | int first_idx; |
---|
255 | char *p; |
---|
256 | |
---|
257 | first_idx = idxs->prefix_idx + strlen (name->prefix); |
---|
258 | |
---|
259 | /* Skip past white space. */ |
---|
260 | p = name->full + first_idx; |
---|
261 | while (isspace ((unsigned char)*p) && *p != '\0') |
---|
262 | p++; |
---|
263 | |
---|
264 | if (*p == '\0') |
---|
265 | return; |
---|
266 | |
---|
267 | idxs->first_idx = p - name->full; |
---|
268 | name->first = e_name_western_get_words_at_idx ( |
---|
269 | name->full, idxs->first_idx, 1); |
---|
270 | |
---|
271 | } else { |
---|
272 | |
---|
273 | /* |
---|
274 | * Otherwise, the first name is probably the first string. |
---|
275 | */ |
---|
276 | idxs->first_idx = 0; |
---|
277 | name->first = e_name_western_get_words_at_idx ( |
---|
278 | name->full, idxs->first_idx, 1); |
---|
279 | } |
---|
280 | |
---|
281 | /* |
---|
282 | * Check that we didn't just assign the beginning of a |
---|
283 | * compound last name to the first name. |
---|
284 | */ |
---|
285 | if (name->first != NULL) { |
---|
286 | if (e_name_western_is_complex_last_beginning (name->first)) { |
---|
287 | g_free (name->first); |
---|
288 | name->first = NULL; |
---|
289 | idxs->first_idx = -1; |
---|
290 | } |
---|
291 | } |
---|
292 | } |
---|
293 | |
---|
294 | static void |
---|
295 | e_name_western_extract_middle (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
296 | { |
---|
297 | char *word; |
---|
298 | int middle_idx; |
---|
299 | |
---|
300 | /* |
---|
301 | * Middle names can only exist if you have a first name. |
---|
302 | */ |
---|
303 | if (idxs->first_idx == -1) |
---|
304 | return; |
---|
305 | |
---|
306 | middle_idx = idxs->first_idx + strlen (name->first) + 1; |
---|
307 | |
---|
308 | if (middle_idx > strlen (name->full)) |
---|
309 | return; |
---|
310 | |
---|
311 | /* |
---|
312 | * Search for the first space (or the terminating \0) |
---|
313 | */ |
---|
314 | while (isspace ((unsigned char)name->full [middle_idx]) && |
---|
315 | name->full [middle_idx] != '\0') |
---|
316 | middle_idx ++; |
---|
317 | |
---|
318 | if (name->full [middle_idx] == '\0') |
---|
319 | return; |
---|
320 | |
---|
321 | /* |
---|
322 | * Skip past the nickname, if it's there. |
---|
323 | */ |
---|
324 | if (name->full [middle_idx] == '\"') { |
---|
325 | if (idxs->nick_idx == -1) |
---|
326 | return; |
---|
327 | |
---|
328 | middle_idx = idxs->nick_idx + strlen (name->nick) + 1; |
---|
329 | |
---|
330 | while (isspace ((unsigned char)name->full [middle_idx]) && |
---|
331 | name->full [middle_idx] != '\0') |
---|
332 | middle_idx ++; |
---|
333 | |
---|
334 | if (name->full [middle_idx] == '\0') |
---|
335 | return; |
---|
336 | } |
---|
337 | |
---|
338 | /* |
---|
339 | * Make sure this isn't the beginning of a complex last name. |
---|
340 | */ |
---|
341 | word = e_name_western_get_words_at_idx (name->full, middle_idx, 1); |
---|
342 | if (e_name_western_is_complex_last_beginning (word)) { |
---|
343 | g_free (word); |
---|
344 | return; |
---|
345 | } |
---|
346 | |
---|
347 | /* |
---|
348 | * Make sure this isn't a suffix. |
---|
349 | */ |
---|
350 | e_name_western_cleanup_string (& word); |
---|
351 | if (e_name_western_word_is_suffix (word)) { |
---|
352 | g_free (word); |
---|
353 | return; |
---|
354 | } |
---|
355 | |
---|
356 | /* |
---|
357 | * Make sure we didn't just grab a cute nickname. |
---|
358 | */ |
---|
359 | if (word [0] == '\"') { |
---|
360 | g_free (word); |
---|
361 | return; |
---|
362 | } |
---|
363 | |
---|
364 | idxs->middle_idx = middle_idx; |
---|
365 | name->middle = word; |
---|
366 | } |
---|
367 | |
---|
368 | static void |
---|
369 | e_name_western_extract_nickname (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
370 | { |
---|
371 | int idx; |
---|
372 | int start_idx; |
---|
373 | char *str; |
---|
374 | |
---|
375 | if (idxs->first_idx == -1) |
---|
376 | return; |
---|
377 | |
---|
378 | if (idxs->middle_idx > idxs->first_idx) |
---|
379 | idx = idxs->middle_idx + strlen (name->middle); |
---|
380 | else |
---|
381 | idx = idxs->first_idx + strlen (name->first); |
---|
382 | |
---|
383 | while (name->full [idx] != '\"' && name->full [idx] != '\0') |
---|
384 | idx ++; |
---|
385 | |
---|
386 | if (name->full [idx] != '\"') |
---|
387 | return; |
---|
388 | |
---|
389 | start_idx = idx; |
---|
390 | |
---|
391 | /* |
---|
392 | * Advance to the next double quote. |
---|
393 | */ |
---|
394 | idx ++; |
---|
395 | |
---|
396 | while (name->full [idx] != '\"' && name->full [idx] != '\0') |
---|
397 | idx ++; |
---|
398 | |
---|
399 | if (name->full [idx] == '\0') |
---|
400 | return; |
---|
401 | |
---|
402 | str = g_malloc0 (idx - start_idx + 2); |
---|
403 | strncpy (str, name->full + start_idx, idx - start_idx + 1); |
---|
404 | |
---|
405 | name->nick = str; |
---|
406 | idxs->nick_idx = start_idx; |
---|
407 | } |
---|
408 | |
---|
409 | static int |
---|
410 | e_name_western_last_get_max_idx (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
411 | { |
---|
412 | int max_idx = -1; |
---|
413 | |
---|
414 | if (name->prefix != NULL) |
---|
415 | max_idx = e_name_western_max ( |
---|
416 | max_idx, idxs->prefix_idx + strlen (name->prefix)); |
---|
417 | |
---|
418 | if (name->first != NULL) |
---|
419 | max_idx = e_name_western_max ( |
---|
420 | max_idx, idxs->first_idx + strlen (name->first)); |
---|
421 | |
---|
422 | if (name->middle != NULL) |
---|
423 | max_idx = e_name_western_max ( |
---|
424 | max_idx, idxs->middle_idx + strlen (name->middle)); |
---|
425 | |
---|
426 | if (name->nick != NULL) |
---|
427 | max_idx = e_name_western_max ( |
---|
428 | max_idx, idxs->nick_idx + strlen (name->nick)); |
---|
429 | |
---|
430 | return max_idx; |
---|
431 | } |
---|
432 | |
---|
433 | static void |
---|
434 | e_name_western_extract_last (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
435 | { |
---|
436 | char *word; |
---|
437 | int idx = -1; |
---|
438 | |
---|
439 | idx = e_name_western_last_get_max_idx (name, idxs); |
---|
440 | |
---|
441 | /* |
---|
442 | * In the case where there is no preceding name element, the |
---|
443 | * name is either just a first name ("Nat", "John"), is a |
---|
444 | * single-element name ("Cher", which we treat as a first |
---|
445 | * name), or is just a last name. The only time we can |
---|
446 | * differentiate a last name alone from a single-element name |
---|
447 | * or a first name alone is if it's a complex last name ("de |
---|
448 | * Icaza", "van Josephsen"). So if there is no preceding name |
---|
449 | * element, we check to see whether or not the first part of |
---|
450 | * the name is the beginning of a complex name. If it is, |
---|
451 | * we subsume the entire string. If we accidentally subsume |
---|
452 | * the suffix, this will get fixed in the fixup routine. |
---|
453 | */ |
---|
454 | if (idx == -1) { |
---|
455 | word = e_name_western_get_words_at_idx (name->full, 0, 1); |
---|
456 | if (! e_name_western_is_complex_last_beginning (word)) { |
---|
457 | g_free (word); |
---|
458 | return; |
---|
459 | } |
---|
460 | |
---|
461 | name->last = g_strdup (name->full); |
---|
462 | idxs->last_idx = 0; |
---|
463 | return; |
---|
464 | } |
---|
465 | |
---|
466 | /* Skip past the white space. */ |
---|
467 | while (isspace ((unsigned char)name->full [idx]) && name->full [idx] != '\0') |
---|
468 | idx ++; |
---|
469 | |
---|
470 | if (name->full [idx] == '\0') |
---|
471 | return; |
---|
472 | |
---|
473 | word = e_name_western_get_words_at_idx (name->full, idx, 1); |
---|
474 | e_name_western_cleanup_string (& word); |
---|
475 | if (e_name_western_word_is_suffix (word)) { |
---|
476 | g_free (word); |
---|
477 | return; |
---|
478 | } |
---|
479 | g_free (word); |
---|
480 | |
---|
481 | /* |
---|
482 | * Subsume the rest of the string into the last name. If we |
---|
483 | * accidentally include the prefix, it will get fixed later. |
---|
484 | * This is the only way to handle things like "Miguel de Icaza |
---|
485 | * Amozorrutia" without dropping data and forcing the user |
---|
486 | * to retype it. |
---|
487 | */ |
---|
488 | name->last = g_strdup (name->full + idx); |
---|
489 | idxs->last_idx = idx; |
---|
490 | } |
---|
491 | |
---|
/*
 * Returns a newly allocated copy of the text between the start of
 * the word that precedes offset @idx in @str and @idx itself.
 *
 * The scan first steps backwards over whitespace at @idx, then over
 * non-whitespace, and the copy runs from there up to (not including)
 * @idx.  The result is always a valid, possibly empty, string that
 * the caller must free.
 */
static char *
e_name_western_get_preceding_word (char *str, int idx)
{
	int word_len;
	char *word;
	char *p;

	p = str + idx;

	/* Back over any white space we start on... */
	while (p > str && isspace ((unsigned char) *p))
		p --;

	/* ...then back over the word itself. */
	while (p > str && ! isspace ((unsigned char) *p))
		p --;

	/* We stopped on the separator before the word; step onto the word. */
	if (isspace ((unsigned char) *p))
		p ++;

	/*
	 * When @idx is 0 and the string starts with white space, p
	 * ends up past @idx.  The resulting length of -1 used to turn
	 * into a zero-byte allocation, i.e. a NULL return that callers
	 * hand straight to strlen().  Return an empty string instead.
	 */
	word_len = (str + idx) - p;
	if (word_len < 0)
		word_len = 0;

	word = g_malloc0 (word_len + 1);
	memcpy (word, p, word_len);

	return word;
}
---|
517 | |
---|
518 | static char * |
---|
519 | e_name_western_get_suffix_at_str_end (char *str) |
---|
520 | { |
---|
521 | char *suffix; |
---|
522 | char *p; |
---|
523 | |
---|
524 | /* |
---|
525 | * Walk backwards till we reach the beginning of the |
---|
526 | * (potentially-comma-separated) list of suffixes. |
---|
527 | */ |
---|
528 | p = str + strlen (str); |
---|
529 | while (1) { |
---|
530 | char *nextp; |
---|
531 | char *word; |
---|
532 | |
---|
533 | word = e_name_western_get_preceding_word (str, p - str); |
---|
534 | nextp = p - strlen (word) - 1; |
---|
535 | |
---|
536 | e_name_western_cleanup_string (& word); |
---|
537 | |
---|
538 | if (e_name_western_word_is_suffix (word)) { |
---|
539 | p = nextp; |
---|
540 | g_free (word); |
---|
541 | } else { |
---|
542 | g_free (word); |
---|
543 | break; |
---|
544 | } |
---|
545 | } |
---|
546 | |
---|
547 | if (p == (str + strlen (str))) |
---|
548 | return NULL; |
---|
549 | |
---|
550 | suffix = g_strdup (p); |
---|
551 | e_name_western_cleanup_string (& suffix); |
---|
552 | |
---|
553 | if (strlen (suffix) == 0) { |
---|
554 | g_free (suffix); |
---|
555 | return NULL; |
---|
556 | } |
---|
557 | |
---|
558 | return suffix; |
---|
559 | } |
---|
560 | |
---|
561 | static void |
---|
562 | e_name_western_extract_suffix (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
563 | { |
---|
564 | |
---|
565 | name->suffix = e_name_western_get_suffix_at_str_end (name->full); |
---|
566 | |
---|
567 | if (name->suffix == NULL) |
---|
568 | return; |
---|
569 | |
---|
570 | idxs->suffix_idx = strlen (name->full) - strlen (name->suffix); |
---|
571 | } |
---|
572 | |
---|
573 | static gboolean |
---|
574 | e_name_western_detect_backwards (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
575 | { |
---|
576 | char *comma; |
---|
577 | char *word; |
---|
578 | |
---|
579 | comma = strchr (name->full, ','); |
---|
580 | |
---|
581 | if (comma == NULL) |
---|
582 | return FALSE; |
---|
583 | |
---|
584 | /* |
---|
585 | * If there's a comma, we need to detect whether it's |
---|
586 | * separating the last name from the first or just separating |
---|
587 | * suffixes. So we grab the word which comes before the |
---|
588 | * comma and check if it's a suffix. |
---|
589 | */ |
---|
590 | word = e_name_western_get_preceding_word (name->full, comma - name->full); |
---|
591 | |
---|
592 | if (e_name_western_word_is_suffix (word)) { |
---|
593 | g_free (word); |
---|
594 | return FALSE; |
---|
595 | } |
---|
596 | |
---|
597 | g_free (word); |
---|
598 | return TRUE; |
---|
599 | } |
---|
600 | |
---|
601 | static void |
---|
602 | e_name_western_reorder_asshole (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
603 | { |
---|
604 | char *prefix; |
---|
605 | char *last; |
---|
606 | char *suffix; |
---|
607 | char *firstmidnick; |
---|
608 | char *newfull; |
---|
609 | |
---|
610 | char *comma; |
---|
611 | char *p; |
---|
612 | |
---|
613 | if (! e_name_western_detect_backwards (name, idxs)) |
---|
614 | return; |
---|
615 | |
---|
616 | /* |
---|
617 | * Convert |
---|
618 | * <Prefix> <Last name>, <First name> <Middle[+nick] name> <Suffix> |
---|
619 | * to |
---|
620 | * <Prefix> <First name> <Middle[+nick] name> <Last name> <Suffix> |
---|
621 | */ |
---|
622 | |
---|
623 | /* |
---|
624 | * Grab the prefix from the beginning. |
---|
625 | */ |
---|
626 | prefix = e_name_western_get_prefix_at_str (name->full); |
---|
627 | |
---|
628 | /* |
---|
629 | * Everything from the end of the prefix to the comma is the |
---|
630 | * last name. |
---|
631 | */ |
---|
632 | comma = strchr (name->full, ','); |
---|
633 | if (comma == NULL) |
---|
634 | return; |
---|
635 | |
---|
636 | p = name->full + (prefix == NULL ? 0 : strlen (prefix)); |
---|
637 | |
---|
638 | while (isspace ((unsigned char)*p) && *p != '\0') |
---|
639 | p ++; |
---|
640 | |
---|
641 | last = g_malloc0 (comma - p + 1); |
---|
642 | strncpy (last, p, comma - p); |
---|
643 | |
---|
644 | /* |
---|
645 | * Get the suffix off the end. |
---|
646 | */ |
---|
647 | suffix = e_name_western_get_suffix_at_str_end (name->full); |
---|
648 | |
---|
649 | /* |
---|
650 | * Firstmidnick is everything from the comma to the beginning |
---|
651 | * of the suffix. |
---|
652 | */ |
---|
653 | p = comma + 1; |
---|
654 | |
---|
655 | while (isspace ((unsigned char)*p) && *p != '\0') |
---|
656 | p ++; |
---|
657 | |
---|
658 | if (suffix != NULL) { |
---|
659 | char *q; |
---|
660 | |
---|
661 | /* |
---|
662 | * Point q at the beginning of the suffix. |
---|
663 | */ |
---|
664 | q = name->full + strlen (name->full) - strlen (suffix) - 1; |
---|
665 | |
---|
666 | /* |
---|
667 | * Walk backwards until we hit the space which |
---|
668 | * separates the suffix from firstmidnick. |
---|
669 | */ |
---|
670 | while (! isspace ((unsigned char)*q) && q > comma) |
---|
671 | q --; |
---|
672 | |
---|
673 | if ((q - p + 1) > 0) { |
---|
674 | firstmidnick = g_malloc0 (q - p + 1); |
---|
675 | strncpy (firstmidnick, p, q - p); |
---|
676 | } else |
---|
677 | firstmidnick = NULL; |
---|
678 | } else { |
---|
679 | firstmidnick = g_strdup (p); |
---|
680 | } |
---|
681 | |
---|
682 | /* |
---|
683 | * Create our new reordered version of the name. |
---|
684 | */ |
---|
685 | #define NULLSTR(a) ((a) == NULL ? "" : (a)) |
---|
686 | newfull = g_strdup_printf ("%s %s %s %s", NULLSTR (prefix), NULLSTR (firstmidnick), |
---|
687 | NULLSTR (last), NULLSTR (suffix)); |
---|
688 | g_strstrip (newfull); |
---|
689 | g_free (name->full); |
---|
690 | name->full = newfull; |
---|
691 | |
---|
692 | |
---|
693 | g_free (prefix); |
---|
694 | g_free (firstmidnick); |
---|
695 | g_free (last); |
---|
696 | g_free (suffix); |
---|
697 | } |
---|
698 | |
---|
699 | static void |
---|
700 | e_name_western_zap_nil (char **str, int *idx) |
---|
701 | { |
---|
702 | if (*str == NULL) |
---|
703 | return; |
---|
704 | |
---|
705 | if (strlen (*str) != 0) |
---|
706 | return; |
---|
707 | |
---|
708 | *idx = -1; |
---|
709 | g_free (*str); |
---|
710 | *str = NULL; |
---|
711 | } |
---|
712 | |
---|
713 | #define FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION \ |
---|
714 | char *last_start = NULL; \ |
---|
715 | if (name->last) \ |
---|
716 | last_start = strchr (name->last, ' '); \ |
---|
717 | if (last_start) { \ |
---|
718 | char *new_last, *new_first; \ |
---|
719 | \ |
---|
720 | new_last = g_strdup (last_start + 1); \ |
---|
721 | *last_start = '\0'; \ |
---|
722 | \ |
---|
723 | idxs->last_idx += (last_start - name->last) + 1; \ |
---|
724 | \ |
---|
725 | new_first = g_strdup_printf ("%s %s %s", name->first, name->middle, name->last); \ |
---|
726 | \ |
---|
727 | g_free (name->first); \ |
---|
728 | g_free (name->middle); \ |
---|
729 | g_free (name->last); \ |
---|
730 | \ |
---|
731 | name->first = new_first; \ |
---|
732 | name->middle = NULL; \ |
---|
733 | name->last = new_last; \ |
---|
734 | \ |
---|
735 | idxs->middle_idx = -1; \ |
---|
736 | } else { \ |
---|
737 | char *new_first; \ |
---|
738 | \ |
---|
739 | new_first = g_strdup_printf ("%s %s %s", name->first, name->middle, name->last); \ |
---|
740 | \ |
---|
741 | g_free (name->first); \ |
---|
742 | g_free (name->middle); \ |
---|
743 | g_free (name->last); \ |
---|
744 | \ |
---|
745 | name->first = new_first; \ |
---|
746 | name->middle = NULL; \ |
---|
747 | name->last = NULL; \ |
---|
748 | idxs->middle_idx = -1; \ |
---|
749 | idxs->last_idx = -1; \ |
---|
750 | } |
---|
751 | |
---|
752 | #define CHECK_MIDDLE_NAME_FOR_CONJUNCTION(conj) \ |
---|
753 | if (idxs->middle_idx != -1 && !strcmp (name->middle, conj)) { \ |
---|
754 | FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION \ |
---|
755 | } |
---|
756 | |
---|
757 | #define CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE(conj) \ |
---|
758 | if (idxs->middle_idx != -1 && !strcasecmp (name->middle, conj)) { \ |
---|
759 | FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION \ |
---|
760 | } |
---|
761 | |
---|
762 | static void |
---|
763 | e_name_western_fixup (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
764 | { |
---|
765 | /* |
---|
766 | * The middle and last names cannot be the same. |
---|
767 | */ |
---|
768 | if (idxs->middle_idx != -1 && idxs->middle_idx == idxs->last_idx) { |
---|
769 | idxs->middle_idx = -1; |
---|
770 | g_free (name->middle); |
---|
771 | name->middle = NULL; |
---|
772 | } |
---|
773 | |
---|
774 | /* |
---|
775 | * If we have a middle name and no last name, then we mistook |
---|
776 | * the last name for the middle name. |
---|
777 | */ |
---|
778 | if (idxs->last_idx == -1 && idxs->middle_idx != -1) { |
---|
779 | idxs->last_idx = idxs->middle_idx; |
---|
780 | name->last = name->middle; |
---|
781 | name->middle = NULL; |
---|
782 | idxs->middle_idx = -1; |
---|
783 | } |
---|
784 | |
---|
785 | /* |
---|
786 | * Check to see if we accidentally included the suffix in the |
---|
787 | * last name. |
---|
788 | */ |
---|
789 | if (idxs->suffix_idx != -1 && idxs->last_idx != -1 && |
---|
790 | idxs->suffix_idx < (idxs->last_idx + strlen (name->last))) { |
---|
791 | char *sfx; |
---|
792 | |
---|
793 | sfx = name->last + (idxs->suffix_idx - idxs->last_idx); |
---|
794 | if (sfx != NULL) { |
---|
795 | char *newlast; |
---|
796 | char *p; |
---|
797 | |
---|
798 | p = sfx - 1; |
---|
799 | while (isspace ((unsigned char)*p) && p > name->last) |
---|
800 | p --; |
---|
801 | p ++; |
---|
802 | |
---|
803 | newlast = g_malloc0 (p - name->last + 1); |
---|
804 | strncpy (newlast, name->last, p - name->last); |
---|
805 | g_free (name->last); |
---|
806 | name->last = newlast; |
---|
807 | } |
---|
808 | } |
---|
809 | |
---|
810 | /* |
---|
811 | * If we have a prefix and a first name, but no last name, |
---|
812 | * then we need to assign the first name to the last name. |
---|
813 | * This way we get things like "Mr Friedman" correctly. |
---|
814 | */ |
---|
815 | if (idxs->first_idx != -1 && idxs->prefix_idx != -1 && |
---|
816 | idxs->last_idx == -1) { |
---|
817 | name->last = name->first; |
---|
818 | idxs->last_idx = idxs->first_idx; |
---|
819 | idxs->first_idx = -1; |
---|
820 | name->first = NULL; |
---|
821 | } |
---|
822 | |
---|
823 | if (idxs->middle_idx != -1) { |
---|
824 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("&"); |
---|
825 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("*"); |
---|
826 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("|"); |
---|
827 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("^"); |
---|
828 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("&&"); |
---|
829 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("||"); |
---|
830 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("+"); |
---|
831 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("-"); |
---|
832 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("and"); |
---|
833 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("or"); |
---|
834 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("plus"); |
---|
835 | |
---|
836 | /* Spanish */ |
---|
837 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("y"); |
---|
838 | |
---|
839 | /* German */ |
---|
840 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("und"); |
---|
841 | |
---|
842 | /* Italian */ |
---|
843 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("e"); |
---|
844 | |
---|
845 | /* Czech */ |
---|
846 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("a"); |
---|
847 | |
---|
848 | /* Finnish */ |
---|
849 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("ja"); |
---|
850 | |
---|
851 | /* French */ |
---|
852 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("et"); |
---|
853 | |
---|
854 | /* Russian */ |
---|
855 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("\xd0\x98"); /* u+0418 */ |
---|
856 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("\xd0\xb8"); /* u+0438 */ |
---|
857 | } |
---|
858 | |
---|
859 | /* |
---|
860 | * Remove stray spaces and commas (although there don't seem |
---|
861 | * to be any in the test cases, they might show up later). |
---|
862 | */ |
---|
863 | e_name_western_cleanup_string (& name->prefix); |
---|
864 | e_name_western_cleanup_string (& name->first); |
---|
865 | e_name_western_cleanup_string (& name->middle); |
---|
866 | e_name_western_cleanup_string (& name->nick); |
---|
867 | e_name_western_cleanup_string (& name->last); |
---|
868 | e_name_western_cleanup_string (& name->suffix); |
---|
869 | |
---|
870 | /* |
---|
871 | * Make zero-length strings just NULL. |
---|
872 | */ |
---|
873 | e_name_western_zap_nil (& name->prefix, & idxs->prefix_idx); |
---|
874 | e_name_western_zap_nil (& name->first, & idxs->first_idx); |
---|
875 | e_name_western_zap_nil (& name->middle, & idxs->middle_idx); |
---|
876 | e_name_western_zap_nil (& name->nick, & idxs->nick_idx); |
---|
877 | e_name_western_zap_nil (& name->last, & idxs->last_idx); |
---|
878 | e_name_western_zap_nil (& name->suffix, & idxs->suffix_idx); |
---|
879 | } |
---|
880 | |
---|
881 | /** |
---|
882 | * e_name_western_western_parse_fullname: |
---|
883 | * @full_name: A string containing a Western name. |
---|
884 | * |
---|
885 | * Parses @full_name and returns an #ENameWestern object filled with |
---|
886 | * the component parts of the name. |
---|
887 | */ |
---|
888 | ENameWestern * |
---|
889 | e_name_western_parse (const char *full_name) |
---|
890 | { |
---|
891 | ENameWesternIdxs *idxs; |
---|
892 | ENameWestern *wname; |
---|
893 | |
---|
894 | wname = g_new0 (ENameWestern, 1); |
---|
895 | |
---|
896 | wname->full = g_strdup (full_name); |
---|
897 | |
---|
898 | idxs = g_new0 (ENameWesternIdxs, 1); |
---|
899 | |
---|
900 | idxs->prefix_idx = -1; |
---|
901 | idxs->first_idx = -1; |
---|
902 | idxs->middle_idx = -1; |
---|
903 | idxs->nick_idx = -1; |
---|
904 | idxs->last_idx = -1; |
---|
905 | idxs->suffix_idx = -1; |
---|
906 | |
---|
907 | /* |
---|
908 | * An extremely simple algorithm. |
---|
909 | * |
---|
910 | * The goal here is to get it right 95% of the time for |
---|
911 | * Western names. |
---|
912 | * |
---|
913 | * First we check to see if this is an ass-backwards name |
---|
914 | * ("Prefix Last, First Middle Suffix"). These names really |
---|
915 | * suck (imagine "Dr von Johnson, Albert Roderick Jr"), so |
---|
916 | * we reorder them first and then parse them. |
---|
917 | * |
---|
918 | * Next, we grab the most obvious assignments for the various |
---|
919 | * parts of the name. Once this is done, we check for stupid |
---|
920 | * errors and fix them up. |
---|
921 | */ |
---|
922 | e_name_western_reorder_asshole (wname, idxs); |
---|
923 | |
---|
924 | e_name_western_extract_prefix (wname, idxs); |
---|
925 | e_name_western_extract_first (wname, idxs); |
---|
926 | e_name_western_extract_nickname (wname, idxs); |
---|
927 | e_name_western_extract_middle (wname, idxs); |
---|
928 | e_name_western_extract_last (wname, idxs); |
---|
929 | e_name_western_extract_suffix (wname, idxs); |
---|
930 | |
---|
931 | e_name_western_fixup (wname, idxs); |
---|
932 | |
---|
933 | g_free (idxs); |
---|
934 | |
---|
935 | return wname; |
---|
936 | } |
---|
937 | |
---|
938 | /** |
---|
939 | * e_name_western_free: |
---|
940 | * @name: An ENameWestern object which needs to be freed. |
---|
941 | * |
---|
942 | * Deep-frees @name |
---|
943 | */ |
---|
944 | void |
---|
945 | e_name_western_free (ENameWestern *w) |
---|
946 | { |
---|
947 | |
---|
948 | g_free (w->prefix); |
---|
949 | g_free (w->first); |
---|
950 | g_free (w->middle); |
---|
951 | g_free (w->nick); |
---|
952 | g_free (w->last); |
---|
953 | g_free (w->suffix); |
---|
954 | |
---|
955 | g_free (w->full); |
---|
956 | |
---|
957 | g_free (w); |
---|
958 | } |
---|