1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ |
---|
2 | /* |
---|
3 | * A simple Western name parser. |
---|
4 | * |
---|
5 | * <Nat> Jamie, do you know anything about name parsing? |
---|
6 | * <jwz> Are you going down that rat hole? Bring a flashlight. |
---|
7 | * |
---|
8 | * Authors: |
---|
9 | * Nat Friedman <nat@ximian.com> |
---|
10 | * |
---|
11 | * Copyright 1999 - 2001, Ximian, Inc. |
---|
12 | */ |
---|
13 | |
---|
14 | #include <ctype.h> |
---|
15 | #include <string.h> |
---|
16 | #include <glib.h> |
---|
17 | |
---|
18 | #include <ename/e-name-western.h> |
---|
19 | #include <ename/e-name-western-tables.h> |
---|
20 | |
---|
/*
 * Byte offsets into ENameWestern.full at which each parsed component
 * begins.  The parser initialises every field to -1, which means
 * "component not found".
 */
typedef struct {
	int prefix_idx;		/* start of the prefix */
	int first_idx;		/* start of the first name */
	int middle_idx;		/* start of the middle name */
	int nick_idx;		/* position of the nickname's opening quote */
	int last_idx;		/* start of the last name */
	int suffix_idx;		/* start of the suffix */
} ENameWesternIdxs;
---|
29 | |
---|
/*
 * Counts the space-separated words in @str.
 *
 * The result is one more than the number of ' ' characters found after
 * the first character, so "Mr" is 1 and "Rt Hon" is 2.  NULL and the
 * empty string yield 0.
 */
static int
e_name_western_str_count_words (char *str)
{
	int word_count = 0;
	char *p;

	/*
	 * Guard the empty string explicitly: the search below starts at
	 * p + 1, which would be past the terminator.
	 */
	if (str == NULL || *str == '\0')
		return 0;

	for (p = str; p != NULL; p = strchr (p + 1, ' '))
		word_count ++;

	return word_count;
}
---|
45 | |
---|
/*
 * Replaces *@str with a newly allocated copy that has all leading and
 * trailing whitespace and commas removed.  The old string is freed.
 * Does nothing when *@str is NULL.
 */
static void
e_name_western_cleanup_string (char **str)
{
	const char *start;
	char *newstr;
	size_t len;

	if (*str == NULL)
		return;

	/*
	 * Skip any spaces and commas at the start of the string.  The
	 * cast keeps isspace() defined for bytes >= 0x80 (e.g. UTF-8).
	 */
	start = *str;
	while (isspace ((unsigned char) *start) || *start == ',')
		start ++;

	/* Make the copy we're going to return. */
	newstr = g_strdup (start);

	/* Now trim spaces and commas off the back of the copy. */
	len = strlen (newstr);
	while (len > 0 &&
	       (isspace ((unsigned char) newstr [len - 1]) ||
		newstr [len - 1] == ','))
		len --;
	newstr [len] = '\0';

	g_free (*str);
	*str = newstr;
}
---|
78 | |
---|
/*
 * Returns a newly allocated copy of up to @num_words whitespace-
 * separated words of @str, starting at byte offset @idx.
 *
 * The copy ends at the last character of the final word, so it never
 * carries trailing whitespace no matter how many separators follow.
 * Whitespace between the copied words is preserved.  An empty string is
 * returned when @num_words is not positive, when @idx is at the end of
 * @str, or when @str + @idx starts with whitespace.  The caller frees
 * the result with g_free().
 */
static char *
e_name_western_get_words_at_idx (char *str, int idx, int num_words)
{
	const char *start = str + idx;
	const char *end = start;	/* one past the last word consumed */
	const char *p = start;
	size_t words_len;
	int word_count = 0;
	char *words;

	/*
	 * Walk to the end of the words.
	 */
	while (word_count < num_words && *p != '\0') {
		while (*p != '\0' && ! isspace ((unsigned char) *p))
			p ++;

		/* Remember where the word stopped before eating separators. */
		end = p;

		while (isspace ((unsigned char) *p))
			p ++;

		word_count ++;
	}

	words_len = end - start;

	words = g_malloc0 (words_len + 1);
	memcpy (words, start, words_len);

	return words;
}
---|
112 | |
---|
/*
 * Returns the larger of @a and @b.  A plain function used in place of
 * glib's MAX macro.
 */
static int
e_name_western_max (const int a, const int b)
{
	return (a > b) ? a : b;
}
---|
124 | |
---|
125 | static gboolean |
---|
126 | e_name_western_word_is_suffix (char *word) |
---|
127 | { |
---|
128 | int i; |
---|
129 | |
---|
130 | for (i = 0; e_name_western_sfx_table [i] != NULL; i ++) { |
---|
131 | if (g_strcasecmp (word, e_name_western_sfx_table [i])) |
---|
132 | continue; |
---|
133 | |
---|
134 | return TRUE; |
---|
135 | } |
---|
136 | |
---|
137 | return FALSE; |
---|
138 | } |
---|
139 | |
---|
140 | static char * |
---|
141 | e_name_western_get_one_prefix_at_str (char *str) |
---|
142 | { |
---|
143 | char *word; |
---|
144 | int i; |
---|
145 | |
---|
146 | /* |
---|
147 | * Check for prefixes from our table. |
---|
148 | */ |
---|
149 | for (i = 0; e_name_western_pfx_table [i] != NULL; i ++) { |
---|
150 | int pfx_words; |
---|
151 | char *words; |
---|
152 | |
---|
153 | pfx_words = e_name_western_str_count_words (e_name_western_pfx_table [i]); |
---|
154 | words = e_name_western_get_words_at_idx (str, 0, pfx_words); |
---|
155 | |
---|
156 | if (! g_strcasecmp (words, e_name_western_pfx_table [i])) |
---|
157 | return words; |
---|
158 | |
---|
159 | g_free (words); |
---|
160 | } |
---|
161 | |
---|
162 | /* |
---|
163 | * Check for prefixes we don't know about. These are always a |
---|
164 | * sequence of more than one letters followed by a period. |
---|
165 | */ |
---|
166 | word = e_name_western_get_words_at_idx (str, 0, 1); |
---|
167 | |
---|
168 | if (strlen (word) > 2 && |
---|
169 | isalpha ((unsigned char) word [0]) && |
---|
170 | isalpha ((unsigned char) word [1]) && |
---|
171 | word [strlen (word) - 1] == '.') |
---|
172 | return word; |
---|
173 | |
---|
174 | g_free (word); |
---|
175 | |
---|
176 | return NULL; |
---|
177 | } |
---|
178 | |
---|
/*
 * Returns a newly allocated string holding the one or two consecutive
 * prefixes found at the start of @str, or NULL when there is none.
 * With two prefixes the result is copied verbatim from @str, so it
 * includes the whitespace between them.  The caller frees the result
 * with g_free().
 */
static char *
e_name_western_get_prefix_at_str (char *str)
{
	char *pfx;
	char *pfx1;
	char *pfx2;
	char *p;
	int pfx_len;

	/* Get the first prefix. */
	pfx1 = e_name_western_get_one_prefix_at_str (str);

	if (pfx1 == NULL)
		return NULL;

	/* Check for a second prefix after the separating whitespace. */
	p = str + strlen (pfx1);
	while (isspace ((unsigned char) *p))
		p ++;

	pfx2 = e_name_western_get_one_prefix_at_str (p);

	/* Only one prefix: hand it back directly instead of copying it. */
	if (pfx2 == NULL)
		return pfx1;

	pfx_len = (p + strlen (pfx2)) - str;
	pfx = g_malloc0 (pfx_len + 1);
	strncpy (pfx, str, pfx_len);

	g_free (pfx1);
	g_free (pfx2);

	return pfx;
}
---|
215 | |
---|
216 | static void |
---|
217 | e_name_western_extract_prefix (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
218 | { |
---|
219 | char *pfx; |
---|
220 | |
---|
221 | pfx = e_name_western_get_prefix_at_str (name->full); |
---|
222 | |
---|
223 | if (pfx == NULL) |
---|
224 | return; |
---|
225 | |
---|
226 | idxs->prefix_idx = 0; |
---|
227 | name->prefix = pfx; |
---|
228 | } |
---|
229 | |
---|
230 | static gboolean |
---|
231 | e_name_western_is_complex_last_beginning (char *word) |
---|
232 | { |
---|
233 | int i; |
---|
234 | |
---|
235 | for (i = 0; e_name_western_complex_last_table [i] != NULL; i ++) { |
---|
236 | |
---|
237 | if (! g_strcasecmp ( |
---|
238 | word, e_name_western_complex_last_table [i])) |
---|
239 | return TRUE; |
---|
240 | } |
---|
241 | |
---|
242 | return FALSE; |
---|
243 | } |
---|
244 | |
---|
245 | static void |
---|
246 | e_name_western_extract_first (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
247 | { |
---|
248 | /* |
---|
249 | * If there's a prefix, then the first name is right after it. |
---|
250 | */ |
---|
251 | if (idxs->prefix_idx != -1) { |
---|
252 | int first_idx; |
---|
253 | char *p; |
---|
254 | |
---|
255 | first_idx = idxs->prefix_idx + strlen (name->prefix); |
---|
256 | |
---|
257 | /* Skip past white space. */ |
---|
258 | p = name->full + first_idx; |
---|
259 | while (isspace (*p) && *p != '\0') |
---|
260 | p++; |
---|
261 | |
---|
262 | if (*p == '\0') |
---|
263 | return; |
---|
264 | |
---|
265 | idxs->first_idx = p - name->full; |
---|
266 | name->first = e_name_western_get_words_at_idx ( |
---|
267 | name->full, idxs->first_idx, 1); |
---|
268 | |
---|
269 | } else { |
---|
270 | |
---|
271 | /* |
---|
272 | * Otherwise, the first name is probably the first string. |
---|
273 | */ |
---|
274 | idxs->first_idx = 0; |
---|
275 | name->first = e_name_western_get_words_at_idx ( |
---|
276 | name->full, idxs->first_idx, 1); |
---|
277 | } |
---|
278 | |
---|
279 | /* |
---|
280 | * Check that we didn't just assign the beginning of a |
---|
281 | * compound last name to the first name. |
---|
282 | */ |
---|
283 | if (name->first != NULL) { |
---|
284 | if (e_name_western_is_complex_last_beginning (name->first)) { |
---|
285 | g_free (name->first); |
---|
286 | name->first = NULL; |
---|
287 | idxs->first_idx = -1; |
---|
288 | } |
---|
289 | } |
---|
290 | } |
---|
291 | |
---|
292 | static void |
---|
293 | e_name_western_extract_middle (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
294 | { |
---|
295 | char *word; |
---|
296 | int middle_idx; |
---|
297 | |
---|
298 | /* |
---|
299 | * Middle names can only exist if you have a first name. |
---|
300 | */ |
---|
301 | if (idxs->first_idx == -1) |
---|
302 | return; |
---|
303 | |
---|
304 | middle_idx = idxs->first_idx + strlen (name->first) + 1; |
---|
305 | |
---|
306 | if (middle_idx > strlen (name->full)) |
---|
307 | return; |
---|
308 | |
---|
309 | /* |
---|
310 | * Search for the first space (or the terminating \0) |
---|
311 | */ |
---|
312 | while (isspace (name->full [middle_idx]) && |
---|
313 | name->full [middle_idx] != '\0') |
---|
314 | middle_idx ++; |
---|
315 | |
---|
316 | if (name->full [middle_idx] == '\0') |
---|
317 | return; |
---|
318 | |
---|
319 | /* |
---|
320 | * Skip past the nickname, if it's there. |
---|
321 | */ |
---|
322 | if (name->full [middle_idx] == '\"') { |
---|
323 | if (idxs->nick_idx == -1) |
---|
324 | return; |
---|
325 | |
---|
326 | middle_idx = idxs->nick_idx + strlen (name->nick) + 1; |
---|
327 | |
---|
328 | while (isspace (name->full [middle_idx]) && |
---|
329 | name->full [middle_idx] != '\0') |
---|
330 | middle_idx ++; |
---|
331 | |
---|
332 | if (name->full [middle_idx] == '\0') |
---|
333 | return; |
---|
334 | } |
---|
335 | |
---|
336 | /* |
---|
337 | * Make sure this isn't the beginning of a complex last name. |
---|
338 | */ |
---|
339 | word = e_name_western_get_words_at_idx (name->full, middle_idx, 1); |
---|
340 | if (e_name_western_is_complex_last_beginning (word)) { |
---|
341 | g_free (word); |
---|
342 | return; |
---|
343 | } |
---|
344 | |
---|
345 | /* |
---|
346 | * Make sure this isn't a suffix. |
---|
347 | */ |
---|
348 | e_name_western_cleanup_string (& word); |
---|
349 | if (e_name_western_word_is_suffix (word)) { |
---|
350 | g_free (word); |
---|
351 | return; |
---|
352 | } |
---|
353 | |
---|
354 | /* |
---|
355 | * Make sure we didn't just grab a cute nickname. |
---|
356 | */ |
---|
357 | if (word [0] == '\"') { |
---|
358 | g_free (word); |
---|
359 | return; |
---|
360 | } |
---|
361 | |
---|
362 | idxs->middle_idx = middle_idx; |
---|
363 | name->middle = word; |
---|
364 | } |
---|
365 | |
---|
366 | static void |
---|
367 | e_name_western_extract_nickname (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
368 | { |
---|
369 | int idx; |
---|
370 | int start_idx; |
---|
371 | char *str; |
---|
372 | |
---|
373 | if (idxs->first_idx == -1) |
---|
374 | return; |
---|
375 | |
---|
376 | if (idxs->middle_idx > idxs->first_idx) |
---|
377 | idx = idxs->middle_idx + strlen (name->middle); |
---|
378 | else |
---|
379 | idx = idxs->first_idx + strlen (name->first); |
---|
380 | |
---|
381 | while (name->full [idx] != '\"' && name->full [idx] != '\0') |
---|
382 | idx ++; |
---|
383 | |
---|
384 | if (name->full [idx] != '\"') |
---|
385 | return; |
---|
386 | |
---|
387 | start_idx = idx; |
---|
388 | |
---|
389 | /* |
---|
390 | * Advance to the next double quote. |
---|
391 | */ |
---|
392 | idx ++; |
---|
393 | |
---|
394 | while (name->full [idx] != '\"' && name->full [idx] != '\0') |
---|
395 | idx ++; |
---|
396 | |
---|
397 | if (name->full [idx] == '\0') |
---|
398 | return; |
---|
399 | |
---|
400 | str = g_malloc0 (idx - start_idx + 2); |
---|
401 | strncpy (str, name->full + start_idx, idx - start_idx + 1); |
---|
402 | |
---|
403 | name->nick = str; |
---|
404 | idxs->nick_idx = start_idx; |
---|
405 | } |
---|
406 | |
---|
407 | static int |
---|
408 | e_name_western_last_get_max_idx (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
409 | { |
---|
410 | int max_idx = -1; |
---|
411 | |
---|
412 | if (name->prefix != NULL) |
---|
413 | max_idx = e_name_western_max ( |
---|
414 | max_idx, idxs->prefix_idx + strlen (name->prefix)); |
---|
415 | |
---|
416 | if (name->first != NULL) |
---|
417 | max_idx = e_name_western_max ( |
---|
418 | max_idx, idxs->first_idx + strlen (name->first)); |
---|
419 | |
---|
420 | if (name->middle != NULL) |
---|
421 | max_idx = e_name_western_max ( |
---|
422 | max_idx, idxs->middle_idx + strlen (name->middle)); |
---|
423 | |
---|
424 | if (name->nick != NULL) |
---|
425 | max_idx = e_name_western_max ( |
---|
426 | max_idx, idxs->nick_idx + strlen (name->nick)); |
---|
427 | |
---|
428 | return max_idx; |
---|
429 | } |
---|
430 | |
---|
431 | static void |
---|
432 | e_name_western_extract_last (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
433 | { |
---|
434 | char *word; |
---|
435 | int idx = -1; |
---|
436 | |
---|
437 | idx = e_name_western_last_get_max_idx (name, idxs); |
---|
438 | |
---|
439 | /* |
---|
440 | * In the case where there is no preceding name element, the |
---|
441 | * name is either just a first name ("Nat", "John"), is a |
---|
442 | * single-element name ("Cher", which we treat as a first |
---|
443 | * name), or is just a last name. The only time we can |
---|
444 | * differentiate a last name alone from a single-element name |
---|
445 | * or a first name alone is if it's a complex last name ("de |
---|
446 | * Icaza", "van Josephsen"). So if there is no preceding name |
---|
447 | * element, we check to see whether or not the first part of |
---|
448 | * the name is the beginning of a complex name. If it is, |
---|
449 | * we subsume the entire string. If we accidentally subsume |
---|
450 | * the suffix, this will get fixed in the fixup routine. |
---|
451 | */ |
---|
452 | if (idx == -1) { |
---|
453 | word = e_name_western_get_words_at_idx (name->full, 0, 1); |
---|
454 | if (! e_name_western_is_complex_last_beginning (word)) { |
---|
455 | g_free (word); |
---|
456 | return; |
---|
457 | } |
---|
458 | |
---|
459 | name->last = g_strdup (name->full); |
---|
460 | idxs->last_idx = 0; |
---|
461 | return; |
---|
462 | } |
---|
463 | |
---|
464 | /* Skip past the white space. */ |
---|
465 | while (isspace (name->full [idx]) && name->full [idx] != '\0') |
---|
466 | idx ++; |
---|
467 | |
---|
468 | if (name->full [idx] == '\0') |
---|
469 | return; |
---|
470 | |
---|
471 | word = e_name_western_get_words_at_idx (name->full, idx, 1); |
---|
472 | e_name_western_cleanup_string (& word); |
---|
473 | if (e_name_western_word_is_suffix (word)) { |
---|
474 | g_free (word); |
---|
475 | return; |
---|
476 | } |
---|
477 | g_free (word); |
---|
478 | |
---|
479 | /* |
---|
480 | * Subsume the rest of the string into the last name. If we |
---|
481 | * accidentally include the prefix, it will get fixed later. |
---|
482 | * This is the only way to handle things like "Miguel de Icaza |
---|
483 | * Amozorrutia" without dropping data and forcing the user |
---|
484 | * to retype it. |
---|
485 | */ |
---|
486 | name->last = g_strdup (name->full + idx); |
---|
487 | idxs->last_idx = idx; |
---|
488 | } |
---|
489 | |
---|
/*
 * Returns a newly allocated copy of the text that lies between the
 * start of the word preceding offset @idx in @str and @idx itself.
 *
 * The result is never NULL; it is the empty string when no word
 * precedes @idx.  A negative @idx is treated as 0.  The caller frees
 * the result with g_free().
 */
static char *
e_name_western_get_preceding_word (char *str, int idx)
{
	const char *end;
	const char *p;
	size_t word_len;
	char *word;

	if (idx < 0)
		idx = 0;

	end = str + idx;
	p = end;

	/*
	 * Step back over white space, then over the word itself.  The
	 * cast keeps isspace() defined for bytes >= 0x80.
	 */
	while (p > str && isspace ((unsigned char) *p))
		p --;

	while (p > str && ! isspace ((unsigned char) *p))
		p --;

	/* We stopped on the separator in front of the word. */
	if (isspace ((unsigned char) *p))
		p ++;

	/*
	 * With idx at a white space character at the very start of the
	 * string, the step above moves p past idx.  Clamp so the length
	 * cannot go negative.
	 */
	if (p > end)
		p = end;

	word_len = end - p;
	word = g_malloc0 (word_len + 1);
	memcpy (word, p, word_len);

	return word;
}
---|
515 | |
---|
516 | static char * |
---|
517 | e_name_western_get_suffix_at_str_end (char *str) |
---|
518 | { |
---|
519 | char *suffix; |
---|
520 | char *p; |
---|
521 | |
---|
522 | /* |
---|
523 | * Walk backwards till we reach the beginning of the |
---|
524 | * (potentially-comma-separated) list of suffixes. |
---|
525 | */ |
---|
526 | p = str + strlen (str); |
---|
527 | while (1) { |
---|
528 | char *nextp; |
---|
529 | char *word; |
---|
530 | |
---|
531 | word = e_name_western_get_preceding_word (str, p - str); |
---|
532 | nextp = p - strlen (word) - 1; |
---|
533 | |
---|
534 | e_name_western_cleanup_string (& word); |
---|
535 | |
---|
536 | if (e_name_western_word_is_suffix (word)) { |
---|
537 | p = nextp; |
---|
538 | g_free (word); |
---|
539 | } else { |
---|
540 | g_free (word); |
---|
541 | break; |
---|
542 | } |
---|
543 | } |
---|
544 | |
---|
545 | if (p == (str + strlen (str))) |
---|
546 | return NULL; |
---|
547 | |
---|
548 | suffix = g_strdup (p); |
---|
549 | e_name_western_cleanup_string (& suffix); |
---|
550 | |
---|
551 | if (strlen (suffix) == 0) { |
---|
552 | g_free (suffix); |
---|
553 | return NULL; |
---|
554 | } |
---|
555 | |
---|
556 | return suffix; |
---|
557 | } |
---|
558 | |
---|
559 | static void |
---|
560 | e_name_western_extract_suffix (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
561 | { |
---|
562 | |
---|
563 | name->suffix = e_name_western_get_suffix_at_str_end (name->full); |
---|
564 | |
---|
565 | if (name->suffix == NULL) |
---|
566 | return; |
---|
567 | |
---|
568 | idxs->suffix_idx = strlen (name->full) - strlen (name->suffix); |
---|
569 | } |
---|
570 | |
---|
571 | static gboolean |
---|
572 | e_name_western_detect_backwards (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
573 | { |
---|
574 | char *comma; |
---|
575 | char *word; |
---|
576 | |
---|
577 | comma = strchr (name->full, ','); |
---|
578 | |
---|
579 | if (comma == NULL) |
---|
580 | return FALSE; |
---|
581 | |
---|
582 | /* |
---|
583 | * If there's a comma, we need to detect whether it's |
---|
584 | * separating the last name from the first or just separating |
---|
585 | * suffixes. So we grab the word which comes before the |
---|
586 | * comma and check if it's a suffix. |
---|
587 | */ |
---|
588 | word = e_name_western_get_preceding_word (name->full, comma - name->full); |
---|
589 | |
---|
590 | if (e_name_western_word_is_suffix (word)) { |
---|
591 | g_free (word); |
---|
592 | return FALSE; |
---|
593 | } |
---|
594 | |
---|
595 | g_free (word); |
---|
596 | return TRUE; |
---|
597 | } |
---|
598 | |
---|
599 | static void |
---|
600 | e_name_western_reorder_asshole (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
601 | { |
---|
602 | char *prefix; |
---|
603 | char *last; |
---|
604 | char *suffix; |
---|
605 | char *firstmidnick; |
---|
606 | char *newfull; |
---|
607 | |
---|
608 | char *comma; |
---|
609 | char *p; |
---|
610 | |
---|
611 | if (! e_name_western_detect_backwards (name, idxs)) |
---|
612 | return; |
---|
613 | |
---|
614 | /* |
---|
615 | * Convert |
---|
616 | * <Prefix> <Last name>, <First name> <Middle[+nick] name> <Suffix> |
---|
617 | * to |
---|
618 | * <Prefix> <First name> <Middle[+nick] name> <Last name> <Suffix> |
---|
619 | */ |
---|
620 | |
---|
621 | /* |
---|
622 | * Grab the prefix from the beginning. |
---|
623 | */ |
---|
624 | prefix = e_name_western_get_prefix_at_str (name->full); |
---|
625 | |
---|
626 | /* |
---|
627 | * Everything from the end of the prefix to the comma is the |
---|
628 | * last name. |
---|
629 | */ |
---|
630 | comma = strchr (name->full, ','); |
---|
631 | if (comma == NULL) |
---|
632 | return; |
---|
633 | |
---|
634 | p = name->full + (prefix == NULL ? 0 : strlen (prefix)); |
---|
635 | |
---|
636 | while (isspace (*p) && *p != '\0') |
---|
637 | p ++; |
---|
638 | |
---|
639 | last = g_malloc0 (comma - p + 1); |
---|
640 | strncpy (last, p, comma - p); |
---|
641 | |
---|
642 | /* |
---|
643 | * Get the suffix off the end. |
---|
644 | */ |
---|
645 | suffix = e_name_western_get_suffix_at_str_end (name->full); |
---|
646 | |
---|
647 | /* |
---|
648 | * Firstmidnick is everything from the comma to the beginning |
---|
649 | * of the suffix. |
---|
650 | */ |
---|
651 | p = comma + 1; |
---|
652 | |
---|
653 | while (isspace (*p) && *p != '\0') |
---|
654 | p ++; |
---|
655 | |
---|
656 | if (suffix != NULL) { |
---|
657 | char *q; |
---|
658 | |
---|
659 | /* |
---|
660 | * Point q at the beginning of the suffix. |
---|
661 | */ |
---|
662 | q = name->full + strlen (name->full) - strlen (suffix) - 1; |
---|
663 | |
---|
664 | /* |
---|
665 | * Walk backwards until we hit the space which |
---|
666 | * separates the suffix from firstmidnick. |
---|
667 | */ |
---|
668 | while (! isspace (*q) && q > comma) |
---|
669 | q --; |
---|
670 | |
---|
671 | if ((q - p + 1) > 0) { |
---|
672 | firstmidnick = g_malloc0 (q - p + 1); |
---|
673 | strncpy (firstmidnick, p, q - p); |
---|
674 | } else |
---|
675 | firstmidnick = NULL; |
---|
676 | } else { |
---|
677 | firstmidnick = g_strdup (p); |
---|
678 | } |
---|
679 | |
---|
680 | /* |
---|
681 | * Create our new reordered version of the name. |
---|
682 | */ |
---|
683 | #define NULLSTR(a) ((a) == NULL ? "" : (a)) |
---|
684 | newfull = g_strdup_printf ("%s %s %s %s", NULLSTR (prefix), NULLSTR (firstmidnick), |
---|
685 | NULLSTR (last), NULLSTR (suffix)); |
---|
686 | g_strstrip (newfull); |
---|
687 | g_free (name->full); |
---|
688 | name->full = newfull; |
---|
689 | |
---|
690 | |
---|
691 | g_free (prefix); |
---|
692 | g_free (firstmidnick); |
---|
693 | g_free (last); |
---|
694 | g_free (suffix); |
---|
695 | } |
---|
696 | |
---|
/*
 * Turns an empty string into "no value": when *@str is a zero-length
 * string it is freed and set to NULL, and *@idx is reset to -1.  NULL
 * and non-empty strings are left untouched.
 */
static void
e_name_western_zap_nil (char **str, int *idx)
{
	if (*str == NULL || (*str) [0] != '\0')
		return;

	g_free (*str);
	*str = NULL;
	*idx = -1;
}
---|
710 | |
---|
/*
 * Statement list shared by the CHECK_MIDDLE_NAME_FOR_CONJUNCTION*
 * macros.  It expects `name' and `idxs' to be in scope and has to be
 * expanded at the top of a brace-enclosed block, because it begins
 * with declarations.
 *
 * The middle name has turned out to be a conjunction, so the first
 * name becomes "<first> <middle> <first word of last>".  Whatever
 * follows the first space of the last name stays as the last name;
 * when the last name has no space there is no last name left.
 */
#define FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION					\
	char *conj_split = NULL;							\
	char *conj_new_last = NULL;							\
	char *conj_new_first;								\
											\
	if (name->last)									\
		conj_split = strchr (name->last, ' ');					\
											\
	if (conj_split) {								\
		conj_new_last = g_strdup (conj_split + 1);				\
		*conj_split = '\0';							\
		idxs->last_idx += (conj_split - name->last) + 1;			\
	} else {									\
		idxs->last_idx = -1;							\
	}										\
											\
	conj_new_first = g_strdup_printf ("%s %s %s", name->first, name->middle, name->last); \
											\
	g_free (name->first);								\
	g_free (name->middle);								\
	g_free (name->last);								\
											\
	name->first = conj_new_first;							\
	name->middle = NULL;								\
	name->last = conj_new_last;							\
											\
	idxs->middle_idx = -1;

/* Merges the names when the middle name is exactly @conj. */
#define CHECK_MIDDLE_NAME_FOR_CONJUNCTION(conj)						\
	if (idxs->middle_idx != -1 && strcmp (name->middle, conj) == 0) {		\
		FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION				\
	}

/* Merges the names when the middle name is @conj, ignoring case. */
#define CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE(conj)					\
	if (idxs->middle_idx != -1 && strcasecmp (name->middle, conj) == 0) {		\
		FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION				\
	}
---|
759 | |
---|
760 | static void |
---|
761 | e_name_western_fixup (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
762 | { |
---|
763 | /* |
---|
764 | * The middle and last names cannot be the same. |
---|
765 | */ |
---|
766 | if (idxs->middle_idx != -1 && idxs->middle_idx == idxs->last_idx) { |
---|
767 | idxs->middle_idx = -1; |
---|
768 | g_free (name->middle); |
---|
769 | name->middle = NULL; |
---|
770 | } |
---|
771 | |
---|
772 | /* |
---|
773 | * If we have a middle name and no last name, then we mistook |
---|
774 | * the last name for the middle name. |
---|
775 | */ |
---|
776 | if (idxs->last_idx == -1 && idxs->middle_idx != -1) { |
---|
777 | idxs->last_idx = idxs->middle_idx; |
---|
778 | name->last = name->middle; |
---|
779 | name->middle = NULL; |
---|
780 | idxs->middle_idx = -1; |
---|
781 | } |
---|
782 | |
---|
783 | /* |
---|
784 | * Check to see if we accidentally included the suffix in the |
---|
785 | * last name. |
---|
786 | */ |
---|
787 | if (idxs->suffix_idx != -1 && idxs->last_idx != -1 && |
---|
788 | idxs->suffix_idx < (idxs->last_idx + strlen (name->last))) { |
---|
789 | char *sfx; |
---|
790 | |
---|
791 | sfx = name->last + (idxs->suffix_idx - idxs->last_idx); |
---|
792 | if (sfx != NULL) { |
---|
793 | char *newlast; |
---|
794 | char *p; |
---|
795 | |
---|
796 | p = sfx - 1; |
---|
797 | while (isspace (*p) && p > name->last) |
---|
798 | p --; |
---|
799 | p ++; |
---|
800 | |
---|
801 | newlast = g_malloc0 (p - name->last + 1); |
---|
802 | strncpy (newlast, name->last, p - name->last); |
---|
803 | g_free (name->last); |
---|
804 | name->last = newlast; |
---|
805 | } |
---|
806 | } |
---|
807 | |
---|
808 | /* |
---|
809 | * If we have a prefix and a first name, but no last name, |
---|
810 | * then we need to assign the first name to the last name. |
---|
811 | * This way we get things like "Mr Friedman" correctly. |
---|
812 | */ |
---|
813 | if (idxs->first_idx != -1 && idxs->prefix_idx != -1 && |
---|
814 | idxs->last_idx == -1) { |
---|
815 | name->last = name->first; |
---|
816 | idxs->last_idx = idxs->first_idx; |
---|
817 | idxs->first_idx = -1; |
---|
818 | name->first = NULL; |
---|
819 | } |
---|
820 | |
---|
821 | if (idxs->middle_idx != -1) { |
---|
822 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("&"); |
---|
823 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("*"); |
---|
824 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("|"); |
---|
825 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("^"); |
---|
826 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("&&"); |
---|
827 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("||"); |
---|
828 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("+"); |
---|
829 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("-"); |
---|
830 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("and"); |
---|
831 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("or"); |
---|
832 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("plus"); |
---|
833 | |
---|
834 | /* Spanish */ |
---|
835 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("y"); |
---|
836 | |
---|
837 | /* German */ |
---|
838 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("und"); |
---|
839 | |
---|
840 | /* Italian */ |
---|
841 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("e"); |
---|
842 | |
---|
843 | /* Czech */ |
---|
844 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("a"); |
---|
845 | |
---|
846 | /* Finnish */ |
---|
847 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("ja"); |
---|
848 | |
---|
849 | /* French */ |
---|
850 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("et"); |
---|
851 | |
---|
852 | /* Russian */ |
---|
853 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("\xd0\x98"); /* u+0418 */ |
---|
854 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("\xd0\xb8"); /* u+0438 */ |
---|
855 | } |
---|
856 | |
---|
857 | /* |
---|
858 | * Remove stray spaces and commas (although there don't seem |
---|
859 | * to be any in the test cases, they might show up later). |
---|
860 | */ |
---|
861 | e_name_western_cleanup_string (& name->prefix); |
---|
862 | e_name_western_cleanup_string (& name->first); |
---|
863 | e_name_western_cleanup_string (& name->middle); |
---|
864 | e_name_western_cleanup_string (& name->nick); |
---|
865 | e_name_western_cleanup_string (& name->last); |
---|
866 | e_name_western_cleanup_string (& name->suffix); |
---|
867 | |
---|
868 | /* |
---|
869 | * Make zero-length strings just NULL. |
---|
870 | */ |
---|
871 | e_name_western_zap_nil (& name->prefix, & idxs->prefix_idx); |
---|
872 | e_name_western_zap_nil (& name->first, & idxs->first_idx); |
---|
873 | e_name_western_zap_nil (& name->middle, & idxs->middle_idx); |
---|
874 | e_name_western_zap_nil (& name->nick, & idxs->nick_idx); |
---|
875 | e_name_western_zap_nil (& name->last, & idxs->last_idx); |
---|
876 | e_name_western_zap_nil (& name->suffix, & idxs->suffix_idx); |
---|
877 | } |
---|
878 | |
---|
879 | /** |
---|
880 | * e_name_western_western_parse_fullname: |
---|
881 | * @full_name: A string containing a Western name. |
---|
882 | * |
---|
883 | * Parses @full_name and returns an #ENameWestern object filled with |
---|
884 | * the component parts of the name. |
---|
885 | */ |
---|
886 | ENameWestern * |
---|
887 | e_name_western_parse (const char *full_name) |
---|
888 | { |
---|
889 | ENameWesternIdxs *idxs; |
---|
890 | ENameWestern *wname; |
---|
891 | |
---|
892 | wname = g_new0 (ENameWestern, 1); |
---|
893 | |
---|
894 | wname->full = g_strdup (full_name); |
---|
895 | |
---|
896 | idxs = g_new0 (ENameWesternIdxs, 1); |
---|
897 | |
---|
898 | idxs->prefix_idx = -1; |
---|
899 | idxs->first_idx = -1; |
---|
900 | idxs->middle_idx = -1; |
---|
901 | idxs->nick_idx = -1; |
---|
902 | idxs->last_idx = -1; |
---|
903 | idxs->suffix_idx = -1; |
---|
904 | |
---|
905 | /* |
---|
906 | * An extremely simple algorithm. |
---|
907 | * |
---|
908 | * The goal here is to get it right 95% of the time for |
---|
909 | * Western names. |
---|
910 | * |
---|
911 | * First we check to see if this is an ass-backwards name |
---|
912 | * ("Prefix Last, First Middle Suffix"). These names really |
---|
913 | * suck (imagine "Dr von Johnson, Albert Roderick Jr"), so |
---|
914 | * we reorder them first and then parse them. |
---|
915 | * |
---|
916 | * Next, we grab the most obvious assignments for the various |
---|
917 | * parts of the name. Once this is done, we check for stupid |
---|
918 | * errors and fix them up. |
---|
919 | */ |
---|
920 | e_name_western_reorder_asshole (wname, idxs); |
---|
921 | |
---|
922 | e_name_western_extract_prefix (wname, idxs); |
---|
923 | e_name_western_extract_first (wname, idxs); |
---|
924 | e_name_western_extract_nickname (wname, idxs); |
---|
925 | e_name_western_extract_middle (wname, idxs); |
---|
926 | e_name_western_extract_last (wname, idxs); |
---|
927 | e_name_western_extract_suffix (wname, idxs); |
---|
928 | |
---|
929 | e_name_western_fixup (wname, idxs); |
---|
930 | |
---|
931 | g_free (idxs); |
---|
932 | |
---|
933 | return wname; |
---|
934 | } |
---|
935 | |
---|
936 | /** |
---|
937 | * e_name_western_free: |
---|
938 | * @name: An ENameWestern object which needs to be freed. |
---|
939 | * |
---|
940 | * Deep-frees @name |
---|
941 | */ |
---|
942 | void |
---|
943 | e_name_western_free (ENameWestern *w) |
---|
944 | { |
---|
945 | |
---|
946 | g_free (w->prefix); |
---|
947 | g_free (w->first); |
---|
948 | g_free (w->middle); |
---|
949 | g_free (w->nick); |
---|
950 | g_free (w->last); |
---|
951 | g_free (w->suffix); |
---|
952 | |
---|
953 | g_free (w->full); |
---|
954 | |
---|
955 | g_free (w); |
---|
956 | } |
---|