[16786] | 1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ |
---|
[16769] | 2 | /* |
---|
| 3 | * A simple Western name parser. |
---|
| 4 | * |
---|
| 5 | * <Nat> Jamie, do you know anything about name parsing? |
---|
| 6 | * <jwz> Are you going down that rat hole? Bring a flashlight. |
---|
| 7 | * |
---|
[16786] | 8 | * Authors: |
---|
| 9 | * Nat Friedman <nat@ximian.com> |
---|
[16769] | 10 | * |
---|
[16786] | 11 | * Copyright 1999 - 2001, Ximian, Inc. |
---|
[16769] | 12 | */ |
---|
| 13 | |
---|
| 14 | #include <ctype.h> |
---|
| 15 | #include <string.h> |
---|
| 16 | #include <glib.h> |
---|
| 17 | |
---|
| 18 | #include <ename/e-name-western.h> |
---|
| 19 | #include <ename/e-name-western-tables.h> |
---|
| 20 | |
---|
/*
 * Offsets into ENameWestern.full at which each parsed component
 * begins.  A value of -1 means the component has not been located.
 */
typedef struct {
	int prefix_idx;		/* start of the prefix */
	int first_idx;		/* start of the first name */
	int middle_idx;		/* start of the middle name */
	int nick_idx;		/* start of the quoted nickname */
	int last_idx;		/* start of the last name */
	int suffix_idx;		/* start of the suffix */
} ENameWesternIdxs;
---|
| 29 | |
---|
/*
 * Counts the space-separated words in @str.
 *
 * The result is one plus the number of ' ' characters found after the
 * first character, so runs of spaces are not collapsed and an empty
 * string counts as a single (empty) word.  Returns 0 if @str is NULL.
 */
static int
e_name_western_str_count_words (char *str)
{
	int word_count;
	char *p;

	if (str == NULL)
		return 0;

	word_count = 1;

	/*
	 * Nothing to scan past the first character of an empty string;
	 * stepping over the terminator would read out of bounds.
	 */
	if (*str == '\0')
		return word_count;

	for (p = strchr (str + 1, ' '); p != NULL; p = strchr (p + 1, ' '))
		word_count ++;

	return word_count;
}
---|
| 45 | |
---|
/*
 * Replaces *@str with a newly allocated copy from which all leading
 * and trailing whitespace and commas have been removed.  The old
 * string is freed.  Does nothing if *@str is NULL.
 */
static void
e_name_western_cleanup_string (char **str)
{
	char *newstr;
	char *p;
	size_t len;

	if (*str == NULL)
		return;

	/*
	 * Skip any spaces and commas at the start of the string.  The
	 * cast keeps isspace() defined for bytes >= 0x80 (UTF-8 names).
	 */
	p = *str;
	while (isspace ((unsigned char) *p) || *p == ',')
		p ++;

	/* make the copy we're going to return */
	newstr = g_strdup (p);

	/* now chop spaces and commas off the end of the copy */
	len = strlen (newstr);
	while (len > 0 &&
	       (isspace ((unsigned char) newstr [len - 1]) ||
		newstr [len - 1] == ','))
		len --;
	newstr [len] = '\0';

	g_free (*str);
	*str = newstr;
}
---|
| 78 | |
---|
/*
 * Returns a newly allocated copy of up to @num_words
 * whitespace-separated words of @str, starting at byte offset @idx.
 *
 * If the walk stops at the end of the string everything up to the
 * terminator is copied; otherwise the single whitespace character
 * immediately before the next word is dropped.  Callers rely on the
 * length of the returned string to compute offsets, so this trimming
 * rule is deliberately left unchanged.  The caller frees the result.
 */
static char *
e_name_western_get_words_at_idx (char *str, int idx, int num_words)
{
	char *start;
	char *words;
	char *p;
	int word_count;
	int words_len;

	/*
	 * Walk to the end of the words.
	 */
	word_count = 0;
	start = str + idx;
	p = start;
	while (word_count < num_words && *p != '\0') {
		while (*p != '\0' && ! isspace ((unsigned char) *p))
			p ++;

		while (isspace ((unsigned char) *p))
			p ++;

		word_count ++;
	}

	words_len = p - start;

	/* Not at the end: drop the separator preceding the next word. */
	if (*p != '\0')
		words_len --;

	/*
	 * With num_words <= 0 nothing was walked and the length would be
	 * -1, which used to yield a NULL buffer and a huge copy.
	 */
	if (words_len < 0)
		words_len = 0;

	words = g_malloc0 (1 + words_len);
	memcpy (words, start, words_len);

	return words;
}
---|
| 112 | |
---|
/*
 * Returns the larger of @a and @b.  A plain function is used here in
 * place of glib's MAX macro.
 */
static int
e_name_western_max (const int a, const int b)
{
	return (a > b) ? a : b;
}
---|
| 124 | |
---|
| 125 | static gboolean |
---|
| 126 | e_name_western_word_is_suffix (char *word) |
---|
| 127 | { |
---|
| 128 | int i; |
---|
| 129 | |
---|
| 130 | for (i = 0; e_name_western_sfx_table [i] != NULL; i ++) { |
---|
| 131 | if (g_strcasecmp (word, e_name_western_sfx_table [i])) |
---|
| 132 | continue; |
---|
| 133 | |
---|
| 134 | return TRUE; |
---|
| 135 | } |
---|
| 136 | |
---|
| 137 | return FALSE; |
---|
| 138 | } |
---|
| 139 | |
---|
| 140 | static char * |
---|
| 141 | e_name_western_get_one_prefix_at_str (char *str) |
---|
| 142 | { |
---|
| 143 | char *word; |
---|
| 144 | int i; |
---|
| 145 | |
---|
| 146 | /* |
---|
| 147 | * Check for prefixes from our table. |
---|
| 148 | */ |
---|
| 149 | for (i = 0; e_name_western_pfx_table [i] != NULL; i ++) { |
---|
| 150 | int pfx_words; |
---|
| 151 | char *words; |
---|
| 152 | |
---|
| 153 | pfx_words = e_name_western_str_count_words (e_name_western_pfx_table [i]); |
---|
| 154 | words = e_name_western_get_words_at_idx (str, 0, pfx_words); |
---|
| 155 | |
---|
| 156 | if (! g_strcasecmp (words, e_name_western_pfx_table [i])) |
---|
| 157 | return words; |
---|
| 158 | |
---|
| 159 | g_free (words); |
---|
| 160 | } |
---|
| 161 | |
---|
| 162 | /* |
---|
| 163 | * Check for prefixes we don't know about. These are always a |
---|
| 164 | * sequence of more than one letters followed by a period. |
---|
| 165 | */ |
---|
| 166 | word = e_name_western_get_words_at_idx (str, 0, 1); |
---|
| 167 | |
---|
| 168 | if (strlen (word) > 2 && |
---|
| 169 | isalpha ((unsigned char) word [0]) && |
---|
| 170 | isalpha ((unsigned char) word [1]) && |
---|
| 171 | word [strlen (word) - 1] == '.') |
---|
| 172 | return word; |
---|
| 173 | |
---|
| 174 | g_free (word); |
---|
| 175 | |
---|
| 176 | return NULL; |
---|
| 177 | } |
---|
| 178 | |
---|
/*
 * Returns a newly allocated copy of the prefix at the start of @str,
 * or NULL if there is none.  Up to two consecutive prefixes are
 * recognised; when a second one follows the first, the returned
 * string spans both, including the whitespace between them exactly as
 * it appears in @str.
 */
static char *
e_name_western_get_prefix_at_str (char *str)
{
	char *pfx;
	char *pfx1;
	char *pfx2;
	char *p;

	/* Get the first prefix. */
	pfx1 = e_name_western_get_one_prefix_at_str (str);

	if (pfx1 == NULL)
		return NULL;

	/*
	 * Check for a second prefix.  The cast keeps isspace() defined
	 * for bytes >= 0x80.
	 */
	p = str + strlen (pfx1);
	while (isspace ((unsigned char) *p))
		p ++;

	pfx2 = e_name_western_get_one_prefix_at_str (p);

	if (pfx2 != NULL) {
		int pfx_len;

		/* Copy straight from str so the separator is preserved. */
		pfx_len = (p + strlen (pfx2)) - str;
		pfx = g_malloc0 (pfx_len + 1);
		memcpy (pfx, str, pfx_len);
	} else {
		pfx = g_strdup (pfx1);
	}

	g_free (pfx1);
	g_free (pfx2);

	return pfx;
}
---|
| 215 | |
---|
| 216 | static void |
---|
| 217 | e_name_western_extract_prefix (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 218 | { |
---|
| 219 | char *pfx; |
---|
| 220 | |
---|
| 221 | pfx = e_name_western_get_prefix_at_str (name->full); |
---|
| 222 | |
---|
| 223 | if (pfx == NULL) |
---|
| 224 | return; |
---|
| 225 | |
---|
| 226 | idxs->prefix_idx = 0; |
---|
| 227 | name->prefix = pfx; |
---|
| 228 | } |
---|
| 229 | |
---|
| 230 | static gboolean |
---|
| 231 | e_name_western_is_complex_last_beginning (char *word) |
---|
| 232 | { |
---|
| 233 | int i; |
---|
| 234 | |
---|
| 235 | for (i = 0; e_name_western_complex_last_table [i] != NULL; i ++) { |
---|
| 236 | |
---|
| 237 | if (! g_strcasecmp ( |
---|
| 238 | word, e_name_western_complex_last_table [i])) |
---|
| 239 | return TRUE; |
---|
| 240 | } |
---|
| 241 | |
---|
| 242 | return FALSE; |
---|
| 243 | } |
---|
| 244 | |
---|
| 245 | static void |
---|
| 246 | e_name_western_extract_first (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 247 | { |
---|
| 248 | /* |
---|
| 249 | * If there's a prefix, then the first name is right after it. |
---|
| 250 | */ |
---|
| 251 | if (idxs->prefix_idx != -1) { |
---|
| 252 | int first_idx; |
---|
| 253 | char *p; |
---|
| 254 | |
---|
| 255 | first_idx = idxs->prefix_idx + strlen (name->prefix); |
---|
| 256 | |
---|
| 257 | /* Skip past white space. */ |
---|
| 258 | p = name->full + first_idx; |
---|
| 259 | while (isspace (*p) && *p != '\0') |
---|
| 260 | p++; |
---|
| 261 | |
---|
| 262 | if (*p == '\0') |
---|
| 263 | return; |
---|
| 264 | |
---|
| 265 | idxs->first_idx = p - name->full; |
---|
| 266 | name->first = e_name_western_get_words_at_idx ( |
---|
| 267 | name->full, idxs->first_idx, 1); |
---|
| 268 | |
---|
| 269 | } else { |
---|
| 270 | |
---|
| 271 | /* |
---|
| 272 | * Otherwise, the first name is probably the first string. |
---|
| 273 | */ |
---|
| 274 | idxs->first_idx = 0; |
---|
| 275 | name->first = e_name_western_get_words_at_idx ( |
---|
| 276 | name->full, idxs->first_idx, 1); |
---|
| 277 | } |
---|
| 278 | |
---|
| 279 | /* |
---|
| 280 | * Check that we didn't just assign the beginning of a |
---|
| 281 | * compound last name to the first name. |
---|
| 282 | */ |
---|
| 283 | if (name->first != NULL) { |
---|
| 284 | if (e_name_western_is_complex_last_beginning (name->first)) { |
---|
| 285 | g_free (name->first); |
---|
| 286 | name->first = NULL; |
---|
| 287 | idxs->first_idx = -1; |
---|
| 288 | } |
---|
| 289 | } |
---|
| 290 | } |
---|
| 291 | |
---|
| 292 | static void |
---|
| 293 | e_name_western_extract_middle (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 294 | { |
---|
| 295 | char *word; |
---|
| 296 | int middle_idx; |
---|
| 297 | |
---|
| 298 | /* |
---|
| 299 | * Middle names can only exist if you have a first name. |
---|
| 300 | */ |
---|
| 301 | if (idxs->first_idx == -1) |
---|
| 302 | return; |
---|
| 303 | |
---|
| 304 | middle_idx = idxs->first_idx + strlen (name->first) + 1; |
---|
| 305 | |
---|
| 306 | if (middle_idx > strlen (name->full)) |
---|
| 307 | return; |
---|
| 308 | |
---|
| 309 | /* |
---|
| 310 | * Search for the first space (or the terminating \0) |
---|
| 311 | */ |
---|
| 312 | while (isspace (name->full [middle_idx]) && |
---|
| 313 | name->full [middle_idx] != '\0') |
---|
| 314 | middle_idx ++; |
---|
| 315 | |
---|
| 316 | if (name->full [middle_idx] == '\0') |
---|
| 317 | return; |
---|
| 318 | |
---|
| 319 | /* |
---|
| 320 | * Skip past the nickname, if it's there. |
---|
| 321 | */ |
---|
| 322 | if (name->full [middle_idx] == '\"') { |
---|
| 323 | if (idxs->nick_idx == -1) |
---|
| 324 | return; |
---|
| 325 | |
---|
| 326 | middle_idx = idxs->nick_idx + strlen (name->nick) + 1; |
---|
| 327 | |
---|
| 328 | while (isspace (name->full [middle_idx]) && |
---|
| 329 | name->full [middle_idx] != '\0') |
---|
| 330 | middle_idx ++; |
---|
| 331 | |
---|
| 332 | if (name->full [middle_idx] == '\0') |
---|
| 333 | return; |
---|
| 334 | } |
---|
| 335 | |
---|
| 336 | /* |
---|
| 337 | * Make sure this isn't the beginning of a complex last name. |
---|
| 338 | */ |
---|
| 339 | word = e_name_western_get_words_at_idx (name->full, middle_idx, 1); |
---|
| 340 | if (e_name_western_is_complex_last_beginning (word)) { |
---|
| 341 | g_free (word); |
---|
| 342 | return; |
---|
| 343 | } |
---|
| 344 | |
---|
| 345 | /* |
---|
| 346 | * Make sure this isn't a suffix. |
---|
| 347 | */ |
---|
| 348 | e_name_western_cleanup_string (& word); |
---|
| 349 | if (e_name_western_word_is_suffix (word)) { |
---|
| 350 | g_free (word); |
---|
| 351 | return; |
---|
| 352 | } |
---|
| 353 | |
---|
| 354 | /* |
---|
| 355 | * Make sure we didn't just grab a cute nickname. |
---|
| 356 | */ |
---|
| 357 | if (word [0] == '\"') { |
---|
| 358 | g_free (word); |
---|
| 359 | return; |
---|
| 360 | } |
---|
| 361 | |
---|
| 362 | idxs->middle_idx = middle_idx; |
---|
| 363 | name->middle = word; |
---|
| 364 | } |
---|
| 365 | |
---|
| 366 | static void |
---|
| 367 | e_name_western_extract_nickname (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 368 | { |
---|
| 369 | int idx; |
---|
| 370 | int start_idx; |
---|
| 371 | char *str; |
---|
| 372 | |
---|
| 373 | if (idxs->first_idx == -1) |
---|
| 374 | return; |
---|
| 375 | |
---|
| 376 | if (idxs->middle_idx > idxs->first_idx) |
---|
| 377 | idx = idxs->middle_idx + strlen (name->middle); |
---|
| 378 | else |
---|
| 379 | idx = idxs->first_idx + strlen (name->first); |
---|
| 380 | |
---|
| 381 | while (name->full [idx] != '\"' && name->full [idx] != '\0') |
---|
| 382 | idx ++; |
---|
| 383 | |
---|
| 384 | if (name->full [idx] != '\"') |
---|
| 385 | return; |
---|
| 386 | |
---|
| 387 | start_idx = idx; |
---|
| 388 | |
---|
| 389 | /* |
---|
| 390 | * Advance to the next double quote. |
---|
| 391 | */ |
---|
| 392 | idx ++; |
---|
| 393 | |
---|
| 394 | while (name->full [idx] != '\"' && name->full [idx] != '\0') |
---|
| 395 | idx ++; |
---|
| 396 | |
---|
| 397 | if (name->full [idx] == '\0') |
---|
| 398 | return; |
---|
| 399 | |
---|
| 400 | str = g_malloc0 (idx - start_idx + 2); |
---|
| 401 | strncpy (str, name->full + start_idx, idx - start_idx + 1); |
---|
| 402 | |
---|
| 403 | name->nick = str; |
---|
| 404 | idxs->nick_idx = start_idx; |
---|
| 405 | } |
---|
| 406 | |
---|
| 407 | static int |
---|
| 408 | e_name_western_last_get_max_idx (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 409 | { |
---|
| 410 | int max_idx = -1; |
---|
| 411 | |
---|
| 412 | if (name->prefix != NULL) |
---|
| 413 | max_idx = e_name_western_max ( |
---|
| 414 | max_idx, idxs->prefix_idx + strlen (name->prefix)); |
---|
| 415 | |
---|
| 416 | if (name->first != NULL) |
---|
| 417 | max_idx = e_name_western_max ( |
---|
| 418 | max_idx, idxs->first_idx + strlen (name->first)); |
---|
| 419 | |
---|
| 420 | if (name->middle != NULL) |
---|
| 421 | max_idx = e_name_western_max ( |
---|
| 422 | max_idx, idxs->middle_idx + strlen (name->middle)); |
---|
| 423 | |
---|
| 424 | if (name->nick != NULL) |
---|
| 425 | max_idx = e_name_western_max ( |
---|
| 426 | max_idx, idxs->nick_idx + strlen (name->nick)); |
---|
| 427 | |
---|
| 428 | return max_idx; |
---|
| 429 | } |
---|
| 430 | |
---|
| 431 | static void |
---|
| 432 | e_name_western_extract_last (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 433 | { |
---|
| 434 | char *word; |
---|
| 435 | int idx = -1; |
---|
| 436 | |
---|
| 437 | idx = e_name_western_last_get_max_idx (name, idxs); |
---|
| 438 | |
---|
| 439 | /* |
---|
| 440 | * In the case where there is no preceding name element, the |
---|
| 441 | * name is either just a first name ("Nat", "John"), is a |
---|
| 442 | * single-element name ("Cher", which we treat as a first |
---|
| 443 | * name), or is just a last name. The only time we can |
---|
| 444 | * differentiate a last name alone from a single-element name |
---|
| 445 | * or a first name alone is if it's a complex last name ("de |
---|
| 446 | * Icaza", "van Josephsen"). So if there is no preceding name |
---|
| 447 | * element, we check to see whether or not the first part of |
---|
| 448 | * the name is the beginning of a complex name. If it is, |
---|
| 449 | * we subsume the entire string. If we accidentally subsume |
---|
| 450 | * the suffix, this will get fixed in the fixup routine. |
---|
| 451 | */ |
---|
| 452 | if (idx == -1) { |
---|
| 453 | word = e_name_western_get_words_at_idx (name->full, 0, 1); |
---|
| 454 | if (! e_name_western_is_complex_last_beginning (word)) { |
---|
| 455 | g_free (word); |
---|
| 456 | return; |
---|
| 457 | } |
---|
| 458 | |
---|
| 459 | name->last = g_strdup (name->full); |
---|
| 460 | idxs->last_idx = 0; |
---|
| 461 | return; |
---|
| 462 | } |
---|
| 463 | |
---|
| 464 | /* Skip past the white space. */ |
---|
| 465 | while (isspace (name->full [idx]) && name->full [idx] != '\0') |
---|
| 466 | idx ++; |
---|
| 467 | |
---|
| 468 | if (name->full [idx] == '\0') |
---|
| 469 | return; |
---|
| 470 | |
---|
| 471 | word = e_name_western_get_words_at_idx (name->full, idx, 1); |
---|
| 472 | e_name_western_cleanup_string (& word); |
---|
| 473 | if (e_name_western_word_is_suffix (word)) { |
---|
| 474 | g_free (word); |
---|
| 475 | return; |
---|
| 476 | } |
---|
| 477 | g_free (word); |
---|
| 478 | |
---|
| 479 | /* |
---|
| 480 | * Subsume the rest of the string into the last name. If we |
---|
| 481 | * accidentally include the prefix, it will get fixed later. |
---|
| 482 | * This is the only way to handle things like "Miguel de Icaza |
---|
| 483 | * Amozorrutia" without dropping data and forcing the user |
---|
| 484 | * to retype it. |
---|
| 485 | */ |
---|
| 486 | name->last = g_strdup (name->full + idx); |
---|
| 487 | idxs->last_idx = idx; |
---|
| 488 | } |
---|
| 489 | |
---|
/*
 * Returns a newly allocated copy of the text between the start of the
 * word that precedes offset @idx in @str and @idx itself.  The result
 * is never NULL; it is the empty string when nothing precedes @idx.
 * The caller frees the result.
 */
static char *
e_name_western_get_preceding_word (char *str, int idx)
{
	int word_len;
	char *word;
	char *p;

	/* A negative offset would point before the string. */
	if (idx < 0)
		idx = 0;

	p = str + idx;

	/* Step back over white space, then over the word itself.  The
	   casts keep isspace() defined for bytes >= 0x80. */
	while (p > str && isspace ((unsigned char) *p))
		p --;

	while (p > str && ! isspace ((unsigned char) *p))
		p --;

	/* We stopped on the separator in front of the word; step off it. */
	if (isspace ((unsigned char) *p))
		p ++;

	/*
	 * With idx == 0 and leading white space p ends up past idx;
	 * clamp so that we return "" instead of a NULL buffer.
	 */
	word_len = (str + idx) - p;
	if (word_len < 0)
		word_len = 0;

	word = g_malloc0 (word_len + 1);
	if (word_len > 0)
		memcpy (word, p, word_len);

	return word;
}
---|
| 515 | |
---|
| 516 | static char * |
---|
| 517 | e_name_western_get_suffix_at_str_end (char *str) |
---|
| 518 | { |
---|
| 519 | char *suffix; |
---|
| 520 | char *p; |
---|
| 521 | |
---|
| 522 | /* |
---|
| 523 | * Walk backwards till we reach the beginning of the |
---|
| 524 | * (potentially-comma-separated) list of suffixes. |
---|
| 525 | */ |
---|
| 526 | p = str + strlen (str); |
---|
| 527 | while (1) { |
---|
| 528 | char *nextp; |
---|
| 529 | char *word; |
---|
| 530 | |
---|
| 531 | word = e_name_western_get_preceding_word (str, p - str); |
---|
| 532 | nextp = p - strlen (word) - 1; |
---|
| 533 | |
---|
| 534 | e_name_western_cleanup_string (& word); |
---|
| 535 | |
---|
| 536 | if (e_name_western_word_is_suffix (word)) { |
---|
| 537 | p = nextp; |
---|
| 538 | g_free (word); |
---|
| 539 | } else { |
---|
| 540 | g_free (word); |
---|
| 541 | break; |
---|
| 542 | } |
---|
| 543 | } |
---|
| 544 | |
---|
| 545 | if (p == (str + strlen (str))) |
---|
| 546 | return NULL; |
---|
| 547 | |
---|
| 548 | suffix = g_strdup (p); |
---|
| 549 | e_name_western_cleanup_string (& suffix); |
---|
| 550 | |
---|
| 551 | if (strlen (suffix) == 0) { |
---|
| 552 | g_free (suffix); |
---|
| 553 | return NULL; |
---|
| 554 | } |
---|
| 555 | |
---|
| 556 | return suffix; |
---|
| 557 | } |
---|
| 558 | |
---|
| 559 | static void |
---|
| 560 | e_name_western_extract_suffix (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 561 | { |
---|
| 562 | |
---|
| 563 | name->suffix = e_name_western_get_suffix_at_str_end (name->full); |
---|
| 564 | |
---|
| 565 | if (name->suffix == NULL) |
---|
| 566 | return; |
---|
| 567 | |
---|
| 568 | idxs->suffix_idx = strlen (name->full) - strlen (name->suffix); |
---|
| 569 | } |
---|
| 570 | |
---|
| 571 | static gboolean |
---|
| 572 | e_name_western_detect_backwards (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 573 | { |
---|
| 574 | char *comma; |
---|
| 575 | char *word; |
---|
| 576 | |
---|
| 577 | comma = strchr (name->full, ','); |
---|
| 578 | |
---|
| 579 | if (comma == NULL) |
---|
| 580 | return FALSE; |
---|
| 581 | |
---|
| 582 | /* |
---|
| 583 | * If there's a comma, we need to detect whether it's |
---|
| 584 | * separating the last name from the first or just separating |
---|
| 585 | * suffixes. So we grab the word which comes before the |
---|
| 586 | * comma and check if it's a suffix. |
---|
| 587 | */ |
---|
| 588 | word = e_name_western_get_preceding_word (name->full, comma - name->full); |
---|
| 589 | |
---|
| 590 | if (e_name_western_word_is_suffix (word)) { |
---|
| 591 | g_free (word); |
---|
| 592 | return FALSE; |
---|
| 593 | } |
---|
| 594 | |
---|
| 595 | g_free (word); |
---|
| 596 | return TRUE; |
---|
| 597 | } |
---|
| 598 | |
---|
| 599 | static void |
---|
| 600 | e_name_western_reorder_asshole (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 601 | { |
---|
| 602 | char *prefix; |
---|
| 603 | char *last; |
---|
| 604 | char *suffix; |
---|
| 605 | char *firstmidnick; |
---|
| 606 | char *newfull; |
---|
| 607 | |
---|
| 608 | char *comma; |
---|
| 609 | char *p; |
---|
| 610 | |
---|
| 611 | if (! e_name_western_detect_backwards (name, idxs)) |
---|
| 612 | return; |
---|
| 613 | |
---|
| 614 | /* |
---|
| 615 | * Convert |
---|
| 616 | * <Prefix> <Last name>, <First name> <Middle[+nick] name> <Suffix> |
---|
| 617 | * to |
---|
| 618 | * <Prefix> <First name> <Middle[+nick] name> <Last name> <Suffix> |
---|
| 619 | */ |
---|
| 620 | |
---|
| 621 | /* |
---|
| 622 | * Grab the prefix from the beginning. |
---|
| 623 | */ |
---|
| 624 | prefix = e_name_western_get_prefix_at_str (name->full); |
---|
| 625 | |
---|
| 626 | /* |
---|
| 627 | * Everything from the end of the prefix to the comma is the |
---|
| 628 | * last name. |
---|
| 629 | */ |
---|
| 630 | comma = strchr (name->full, ','); |
---|
| 631 | if (comma == NULL) |
---|
| 632 | return; |
---|
| 633 | |
---|
| 634 | p = name->full + (prefix == NULL ? 0 : strlen (prefix)); |
---|
| 635 | |
---|
| 636 | while (isspace (*p) && *p != '\0') |
---|
| 637 | p ++; |
---|
| 638 | |
---|
| 639 | last = g_malloc0 (comma - p + 1); |
---|
| 640 | strncpy (last, p, comma - p); |
---|
| 641 | |
---|
| 642 | /* |
---|
| 643 | * Get the suffix off the end. |
---|
| 644 | */ |
---|
| 645 | suffix = e_name_western_get_suffix_at_str_end (name->full); |
---|
| 646 | |
---|
| 647 | /* |
---|
| 648 | * Firstmidnick is everything from the comma to the beginning |
---|
| 649 | * of the suffix. |
---|
| 650 | */ |
---|
| 651 | p = comma + 1; |
---|
| 652 | |
---|
| 653 | while (isspace (*p) && *p != '\0') |
---|
| 654 | p ++; |
---|
| 655 | |
---|
| 656 | if (suffix != NULL) { |
---|
| 657 | char *q; |
---|
| 658 | |
---|
| 659 | /* |
---|
| 660 | * Point q at the beginning of the suffix. |
---|
| 661 | */ |
---|
| 662 | q = name->full + strlen (name->full) - strlen (suffix) - 1; |
---|
| 663 | |
---|
| 664 | /* |
---|
| 665 | * Walk backwards until we hit the space which |
---|
| 666 | * separates the suffix from firstmidnick. |
---|
| 667 | */ |
---|
| 668 | while (! isspace (*q) && q > comma) |
---|
| 669 | q --; |
---|
| 670 | |
---|
| 671 | if ((q - p + 1) > 0) { |
---|
| 672 | firstmidnick = g_malloc0 (q - p + 1); |
---|
| 673 | strncpy (firstmidnick, p, q - p); |
---|
| 674 | } else |
---|
| 675 | firstmidnick = NULL; |
---|
| 676 | } else { |
---|
| 677 | firstmidnick = g_strdup (p); |
---|
| 678 | } |
---|
| 679 | |
---|
| 680 | /* |
---|
| 681 | * Create our new reordered version of the name. |
---|
| 682 | */ |
---|
| 683 | #define NULLSTR(a) ((a) == NULL ? "" : (a)) |
---|
| 684 | newfull = g_strdup_printf ("%s %s %s %s", NULLSTR (prefix), NULLSTR (firstmidnick), |
---|
| 685 | NULLSTR (last), NULLSTR (suffix)); |
---|
| 686 | g_strstrip (newfull); |
---|
| 687 | g_free (name->full); |
---|
| 688 | name->full = newfull; |
---|
| 689 | |
---|
| 690 | |
---|
| 691 | g_free (prefix); |
---|
| 692 | g_free (firstmidnick); |
---|
| 693 | g_free (last); |
---|
| 694 | g_free (suffix); |
---|
| 695 | } |
---|
| 696 | |
---|
| 697 | static void |
---|
| 698 | e_name_western_zap_nil (char **str, int *idx) |
---|
| 699 | { |
---|
| 700 | if (*str == NULL) |
---|
| 701 | return; |
---|
| 702 | |
---|
| 703 | if (strlen (*str) != 0) |
---|
| 704 | return; |
---|
| 705 | |
---|
| 706 | *idx = -1; |
---|
| 707 | g_free (*str); |
---|
| 708 | *str = NULL; |
---|
| 709 | } |
---|
| 710 | |
---|
/*
 * Body of the conjunction fixup.  It expands to declarations followed
 * by statements, so it has to be the first thing inside a block, and
 * it expects `name' (ENameWestern *) and `idxs' (ENameWesternIdxs *)
 * to be in scope.
 *
 * The first name, the middle name and the first word of the last name
 * are joined into the new first name, and the middle name is cleared:
 * first "Bill", middle "and", last "Mary Smith" becomes first
 * "Bill and Mary", last "Smith".  If the last name is a single word it
 * is absorbed completely and the last name becomes NULL.
 */
#define FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION \
	char *last_start = NULL; \
	char *new_last = NULL; \
	char *new_first; \
	\
	if (name->last) \
		last_start = strchr (name->last, ' '); \
	\
	if (last_start) { \
		new_last = g_strdup (last_start + 1); \
		*last_start = '\0'; \
		idxs->last_idx += (last_start - name->last) + 1; \
	} else { \
		idxs->last_idx = -1; \
	} \
	\
	new_first = g_strdup_printf ("%s %s %s", name->first, name->middle, name->last); \
	\
	g_free (name->first); \
	g_free (name->middle); \
	g_free (name->last); \
	\
	name->first = new_first; \
	name->middle = NULL; \
	name->last = new_last; \
	\
	idxs->middle_idx = -1;

/*
 * Applies the conjunction fixup when the middle name is exactly @conj.
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement and must be followed by a semicolon.
 */
#define CHECK_MIDDLE_NAME_FOR_CONJUNCTION(conj) \
	do { \
		if (idxs->middle_idx != -1 && !strcmp (name->middle, (conj))) { \
			FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION \
		} \
	} while (0)

/*
 * Same as CHECK_MIDDLE_NAME_FOR_CONJUNCTION, but the comparison with
 * @conj ignores case.
 */
#define CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE(conj) \
	do { \
		if (idxs->middle_idx != -1 && !strcasecmp (name->middle, (conj))) { \
			FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION \
		} \
	} while (0)
---|
| 759 | |
---|
[16769] | 760 | static void |
---|
| 761 | e_name_western_fixup (ENameWestern *name, ENameWesternIdxs *idxs) |
---|
| 762 | { |
---|
| 763 | /* |
---|
| 764 | * The middle and last names cannot be the same. |
---|
| 765 | */ |
---|
| 766 | if (idxs->middle_idx != -1 && idxs->middle_idx == idxs->last_idx) { |
---|
| 767 | idxs->middle_idx = -1; |
---|
| 768 | g_free (name->middle); |
---|
| 769 | name->middle = NULL; |
---|
| 770 | } |
---|
| 771 | |
---|
| 772 | /* |
---|
| 773 | * If we have a middle name and no last name, then we mistook |
---|
| 774 | * the last name for the middle name. |
---|
| 775 | */ |
---|
| 776 | if (idxs->last_idx == -1 && idxs->middle_idx != -1) { |
---|
| 777 | idxs->last_idx = idxs->middle_idx; |
---|
| 778 | name->last = name->middle; |
---|
| 779 | name->middle = NULL; |
---|
| 780 | idxs->middle_idx = -1; |
---|
| 781 | } |
---|
| 782 | |
---|
| 783 | /* |
---|
| 784 | * Check to see if we accidentally included the suffix in the |
---|
| 785 | * last name. |
---|
| 786 | */ |
---|
| 787 | if (idxs->suffix_idx != -1 && idxs->last_idx != -1 && |
---|
| 788 | idxs->suffix_idx < (idxs->last_idx + strlen (name->last))) { |
---|
| 789 | char *sfx; |
---|
| 790 | |
---|
| 791 | sfx = name->last + (idxs->suffix_idx - idxs->last_idx); |
---|
| 792 | if (sfx != NULL) { |
---|
| 793 | char *newlast; |
---|
| 794 | char *p; |
---|
| 795 | |
---|
| 796 | p = sfx - 1; |
---|
| 797 | while (isspace (*p) && p > name->last) |
---|
| 798 | p --; |
---|
| 799 | p ++; |
---|
| 800 | |
---|
| 801 | newlast = g_malloc0 (p - name->last + 1); |
---|
| 802 | strncpy (newlast, name->last, p - name->last); |
---|
| 803 | g_free (name->last); |
---|
| 804 | name->last = newlast; |
---|
| 805 | } |
---|
| 806 | } |
---|
| 807 | |
---|
| 808 | /* |
---|
| 809 | * If we have a prefix and a first name, but no last name, |
---|
| 810 | * then we need to assign the first name to the last name. |
---|
| 811 | * This way we get things like "Mr Friedman" correctly. |
---|
| 812 | */ |
---|
| 813 | if (idxs->first_idx != -1 && idxs->prefix_idx != -1 && |
---|
| 814 | idxs->last_idx == -1) { |
---|
| 815 | name->last = name->first; |
---|
| 816 | idxs->last_idx = idxs->first_idx; |
---|
| 817 | idxs->first_idx = -1; |
---|
| 818 | name->first = NULL; |
---|
| 819 | } |
---|
| 820 | |
---|
[16786] | 821 | if (idxs->middle_idx != -1) { |
---|
| 822 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("&"); |
---|
| 823 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("*"); |
---|
| 824 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("|"); |
---|
| 825 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("^"); |
---|
| 826 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("&&"); |
---|
| 827 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("||"); |
---|
| 828 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("+"); |
---|
| 829 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("-"); |
---|
| 830 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("and"); |
---|
| 831 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("or"); |
---|
| 832 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("plus"); |
---|
| 833 | |
---|
| 834 | /* Spanish */ |
---|
| 835 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("y"); |
---|
| 836 | |
---|
| 837 | /* German */ |
---|
| 838 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("und"); |
---|
| 839 | |
---|
| 840 | /* Italian */ |
---|
| 841 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("e"); |
---|
| 842 | |
---|
| 843 | /* Czech */ |
---|
| 844 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("a"); |
---|
| 845 | |
---|
| 846 | /* Finnish */ |
---|
| 847 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("ja"); |
---|
| 848 | |
---|
| 849 | /* French */ |
---|
| 850 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("et"); |
---|
| 851 | |
---|
| 852 | /* Russian */ |
---|
| 853 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("\xd0\x98"); /* u+0418 */ |
---|
| 854 | CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("\xd0\xb8"); /* u+0438 */ |
---|
| 855 | } |
---|
| 856 | |
---|
[16769] | 857 | /* |
---|
| 858 | * Remove stray spaces and commas (although there don't seem |
---|
| 859 | * to be any in the test cases, they might show up later). |
---|
| 860 | */ |
---|
| 861 | e_name_western_cleanup_string (& name->prefix); |
---|
| 862 | e_name_western_cleanup_string (& name->first); |
---|
| 863 | e_name_western_cleanup_string (& name->middle); |
---|
| 864 | e_name_western_cleanup_string (& name->nick); |
---|
| 865 | e_name_western_cleanup_string (& name->last); |
---|
| 866 | e_name_western_cleanup_string (& name->suffix); |
---|
| 867 | |
---|
| 868 | /* |
---|
| 869 | * Make zero-length strings just NULL. |
---|
| 870 | */ |
---|
| 871 | e_name_western_zap_nil (& name->prefix, & idxs->prefix_idx); |
---|
| 872 | e_name_western_zap_nil (& name->first, & idxs->first_idx); |
---|
| 873 | e_name_western_zap_nil (& name->middle, & idxs->middle_idx); |
---|
| 874 | e_name_western_zap_nil (& name->nick, & idxs->nick_idx); |
---|
| 875 | e_name_western_zap_nil (& name->last, & idxs->last_idx); |
---|
| 876 | e_name_western_zap_nil (& name->suffix, & idxs->suffix_idx); |
---|
| 877 | } |
---|
| 878 | |
---|
| 879 | /** |
---|
| 880 | * e_name_western_western_parse_fullname: |
---|
| 881 | * @full_name: A string containing a Western name. |
---|
| 882 | * |
---|
| 883 | * Parses @full_name and returns an #ENameWestern object filled with |
---|
| 884 | * the component parts of the name. |
---|
| 885 | */ |
---|
| 886 | ENameWestern * |
---|
| 887 | e_name_western_parse (const char *full_name) |
---|
| 888 | { |
---|
| 889 | ENameWesternIdxs *idxs; |
---|
| 890 | ENameWestern *wname; |
---|
| 891 | |
---|
| 892 | wname = g_new0 (ENameWestern, 1); |
---|
| 893 | |
---|
| 894 | wname->full = g_strdup (full_name); |
---|
| 895 | |
---|
| 896 | idxs = g_new0 (ENameWesternIdxs, 1); |
---|
| 897 | |
---|
| 898 | idxs->prefix_idx = -1; |
---|
| 899 | idxs->first_idx = -1; |
---|
| 900 | idxs->middle_idx = -1; |
---|
| 901 | idxs->nick_idx = -1; |
---|
| 902 | idxs->last_idx = -1; |
---|
| 903 | idxs->suffix_idx = -1; |
---|
| 904 | |
---|
| 905 | /* |
---|
| 906 | * An extremely simple algorithm. |
---|
| 907 | * |
---|
| 908 | * The goal here is to get it right 95% of the time for |
---|
| 909 | * Western names. |
---|
| 910 | * |
---|
| 911 | * First we check to see if this is an ass-backwards name |
---|
| 912 | * ("Prefix Last, First Middle Suffix"). These names really |
---|
| 913 | * suck (imagine "Dr von Johnson, Albert Roderick Jr"), so |
---|
| 914 | * we reorder them first and then parse them. |
---|
| 915 | * |
---|
| 916 | * Next, we grab the most obvious assignments for the various |
---|
| 917 | * parts of the name. Once this is done, we check for stupid |
---|
| 918 | * errors and fix them up. |
---|
| 919 | */ |
---|
| 920 | e_name_western_reorder_asshole (wname, idxs); |
---|
| 921 | |
---|
| 922 | e_name_western_extract_prefix (wname, idxs); |
---|
| 923 | e_name_western_extract_first (wname, idxs); |
---|
| 924 | e_name_western_extract_nickname (wname, idxs); |
---|
| 925 | e_name_western_extract_middle (wname, idxs); |
---|
| 926 | e_name_western_extract_last (wname, idxs); |
---|
| 927 | e_name_western_extract_suffix (wname, idxs); |
---|
| 928 | |
---|
| 929 | e_name_western_fixup (wname, idxs); |
---|
| 930 | |
---|
| 931 | g_free (idxs); |
---|
| 932 | |
---|
| 933 | return wname; |
---|
| 934 | } |
---|
| 935 | |
---|
| 936 | /** |
---|
| 937 | * e_name_western_free: |
---|
| 938 | * @name: An ENameWestern object which needs to be freed. |
---|
| 939 | * |
---|
| 940 | * Deep-frees @name |
---|
| 941 | */ |
---|
| 942 | void |
---|
| 943 | e_name_western_free (ENameWestern *w) |
---|
| 944 | { |
---|
| 945 | |
---|
| 946 | g_free (w->prefix); |
---|
| 947 | g_free (w->first); |
---|
| 948 | g_free (w->middle); |
---|
| 949 | g_free (w->nick); |
---|
| 950 | g_free (w->last); |
---|
| 951 | g_free (w->suffix); |
---|
| 952 | |
---|
| 953 | g_free (w->full); |
---|
| 954 | |
---|
| 955 | g_free (w); |
---|
| 956 | } |
---|