source: trunk/third/perl/utf8.c @ 18450

Revision 18450, 48.8 KB checked in by zacheiss, 21 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18449, which included commits to RCS files with non-trunk default branches.
Line 
1/*    utf8.c
2 *
3 *    Copyright (c) 1998-2002, Larry Wall
4 *
5 *    You may distribute under the terms of either the GNU General Public
6 *    License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to!  And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
24#define PERL_IN_UTF8_C
25#include "perl.h"
26
27static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
29/*
30=head1 Unicode Support
31
32=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
33
34Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
35of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36bytes available. The return value is the pointer to the byte after the
37end of the new character. In other words,
38
39    d = uvuni_to_utf8_flags(d, uv, flags);
40
41or, in most cases,
42
43    d = uvuni_to_utf8(d, uv);
44
45(which is equivalent to)
46
47    d = uvuni_to_utf8_flags(d, uv, 0);
48
49is the recommended Unicode-aware way of saying
50
51    *(d++) = uv;
52
53=cut
54*/
55
56U8 *
57Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
58{
59    if (ckWARN(WARN_UTF8)) {
60         if (UNICODE_IS_SURROGATE(uv) &&
61             !(flags & UNICODE_ALLOW_SURROGATE))
62              Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
63         else if (
64                  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65                    !(flags & UNICODE_ALLOW_FDD0))
66                   ||
67                   ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
68                    !(flags & UNICODE_ALLOW_FFFF))) &&
69                  /* UNICODE_ALLOW_SUPER includes
70                   * FFFEs and FFFFs beyond 0x10FFFF. */
71                  ((uv <= PERL_UNICODE_MAX) ||
72                   !(flags & UNICODE_ALLOW_SUPER))
73                  )
74              Perl_warner(aTHX_ packWARN(WARN_UTF8),
75                         "Unicode character 0x%04"UVxf" is illegal", uv);
76    }
77    if (UNI_IS_INVARIANT(uv)) {
78        *d++ = (U8)UTF_TO_NATIVE(uv);
79        return d;
80    }
81#if defined(EBCDIC)
82    else {
83        STRLEN len  = UNISKIP(uv);
84        U8 *p = d+len-1;
85        while (p > d) {
86            *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
87            uv >>= UTF_ACCUMULATION_SHIFT;
88        }
89        *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
90        return d+len;
91    }
92#else /* Non loop style */
93    if (uv < 0x800) {
94        *d++ = (U8)(( uv >>  6)         | 0xc0);
95        *d++ = (U8)(( uv        & 0x3f) | 0x80);
96        return d;
97    }
98    if (uv < 0x10000) {
99        *d++ = (U8)(( uv >> 12)         | 0xe0);
100        *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
101        *d++ = (U8)(( uv        & 0x3f) | 0x80);
102        return d;
103    }
104    if (uv < 0x200000) {
105        *d++ = (U8)(( uv >> 18)         | 0xf0);
106        *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
107        *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
108        *d++ = (U8)(( uv        & 0x3f) | 0x80);
109        return d;
110    }
111    if (uv < 0x4000000) {
112        *d++ = (U8)(( uv >> 24)         | 0xf8);
113        *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
114        *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115        *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
116        *d++ = (U8)(( uv        & 0x3f) | 0x80);
117        return d;
118    }
119    if (uv < 0x80000000) {
120        *d++ = (U8)(( uv >> 30)         | 0xfc);
121        *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
122        *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
123        *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124        *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
125        *d++ = (U8)(( uv        & 0x3f) | 0x80);
126        return d;
127    }
128#ifdef HAS_QUAD
129    if (uv < UTF8_QUAD_MAX)
130#endif
131    {
132        *d++ =                            0xfe; /* Can't match U+FEFF! */
133        *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
134        *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
135        *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
136        *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
137        *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
138        *d++ = (U8)(( uv        & 0x3f) | 0x80);
139        return d;
140    }
141#ifdef HAS_QUAD
142    {
143        *d++ =                            0xff;         /* Can't match U+FFFE! */
144        *d++ =                            0x80;         /* 6 Reserved bits */
145        *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
146        *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
147        *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
148        *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
149        *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
150        *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151        *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152        *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153        *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154        *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
155        *d++ = (U8)(( uv        & 0x3f) | 0x80);
156        return d;
157    }
158#endif
159#endif /* Loop style */
160}
161 
162U8 *
163Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
164{
165    return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
166}
167
168
169/*
170=for apidoc A|STRLEN|is_utf8_char|U8 *s
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character.  Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character.  The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177=cut */
178STRLEN
179Perl_is_utf8_char(pTHX_ U8 *s)
180{
181    U8 u = *s;
182    STRLEN slen, len;
183    UV uv, ouv;
184
185    if (UTF8_IS_INVARIANT(u))
186        return 1;
187
188    if (!UTF8_IS_START(u))
189        return 0;
190
191    len = UTF8SKIP(s);
192
193    if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
194        return 0;
195
196    slen = len - 1;
197    s++;
198    u &= UTF_START_MASK(len);
199    uv  = u;
200    ouv = uv;
201    while (slen--) {
202        if (!UTF8_IS_CONTINUATION(*s))
203            return 0;
204        uv = UTF8_ACCUMULATE(uv, *s);
205        if (uv < ouv)
206            return 0;
207        ouv = uv;
208        s++;
209    }
210
211    if ((STRLEN)UNISKIP(uv) < len)
212        return 0;
213
214    return len;
215}
216
217/*
218=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
219
220Returns true if first C<len> bytes of the given string form a valid UTF8
221string, false otherwise.  Note that 'a valid UTF8 string' does not mean
222'a string that contains UTF8' because a valid ASCII string is a valid
223UTF8 string.
224
225=cut
226*/
227
228bool
229Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
230{
231    U8* x = s;
232    U8* send;
233    STRLEN c;
234
235    if (!len)
236        len = strlen((char *)s);
237    send = s + len;
238
239    while (x < send) {
240         /* Inline the easy bits of is_utf8_char() here for speed... */
241         if (UTF8_IS_INVARIANT(*x))
242              c = 1;
243         else if (!UTF8_IS_START(*x))
244              return FALSE;
245         else {
246              /* ... and call is_utf8_char() only if really needed. */
247              c = is_utf8_char(x);
248              if (!c)
249                   return FALSE;
250         }
251        x += c;
252    }
253    if (x != send)
254        return FALSE;
255
256    return TRUE;
257}
258
259/*
260=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
261
262Bottom level UTF-8 decode routine.
263Returns the unicode code point value of the first character in the string C<s>
264which is assumed to be in UTF8 encoding and no longer than C<curlen>;
265C<retlen> will be set to the length, in bytes, of that character.
266
267If C<s> does not point to a well-formed UTF8 character, the behaviour
268is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
269it is assumed that the caller will raise a warning, and this function
270will silently just set C<retlen> to C<-1> and return zero.  If the
271C<flags> does not contain UTF8_CHECK_ONLY, warnings about
272malformations will be given, C<retlen> will be set to the expected
273length of the UTF-8 character in bytes, and zero will be returned.
274
275The C<flags> can also contain various flags to allow deviations from
276the strict UTF-8 encoding (see F<utf8.h>).
277
278Most code should use utf8_to_uvchr() rather than call this directly.
279
280=cut
281*/
282
283UV
284Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
285{
286    U8 *s0 = s;
287    UV uv = *s, ouv = 0;
288    STRLEN len = 1;
289    bool dowarn = ckWARN_d(WARN_UTF8);
290    UV startbyte = *s;
291    STRLEN expectlen = 0;
292    U32 warning = 0;
293
294/* This list is a superset of the UTF8_ALLOW_XXX. */
295
296#define UTF8_WARN_EMPTY                          1
297#define UTF8_WARN_CONTINUATION                   2
298#define UTF8_WARN_NON_CONTINUATION               3
299#define UTF8_WARN_FE_FF                          4
300#define UTF8_WARN_SHORT                          5
301#define UTF8_WARN_OVERFLOW                       6
302#define UTF8_WARN_SURROGATE                      7
303#define UTF8_WARN_LONG                           8
304#define UTF8_WARN_FFFF                           9 /* Also FFFE. */
305
306    if (curlen == 0 &&
307        !(flags & UTF8_ALLOW_EMPTY)) {
308        warning = UTF8_WARN_EMPTY;
309        goto malformed;
310    }
311
312    if (UTF8_IS_INVARIANT(uv)) {
313        if (retlen)
314            *retlen = 1;
315        return (UV) (NATIVE_TO_UTF(*s));
316    }
317
318    if (UTF8_IS_CONTINUATION(uv) &&
319        !(flags & UTF8_ALLOW_CONTINUATION)) {
320        warning = UTF8_WARN_CONTINUATION;
321        goto malformed;
322    }
323
324    if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
325        !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
326        warning = UTF8_WARN_NON_CONTINUATION;
327        goto malformed;
328    }
329
330#ifdef EBCDIC
331    uv = NATIVE_TO_UTF(uv);
332#else
333    if ((uv == 0xfe || uv == 0xff) &&
334        !(flags & UTF8_ALLOW_FE_FF)) {
335        warning = UTF8_WARN_FE_FF;
336        goto malformed;
337    }
338#endif
339
340    if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
341    else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
342    else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
343    else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
344#ifdef EBCDIC
345    else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
346    else                        { len =  7; uv &= 0x01; }
347#else
348    else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
349    else if (!(uv & 0x01))      { len =  7; uv = 0; }
350    else                        { len = 13; uv = 0; } /* whoa! */
351#endif
352
353    if (retlen)
354        *retlen = len;
355
356    expectlen = len;
357
358    if ((curlen < expectlen) &&
359        !(flags & UTF8_ALLOW_SHORT)) {
360        warning = UTF8_WARN_SHORT;
361        goto malformed;
362    }
363
364    len--;
365    s++;
366    ouv = uv;
367
368    while (len--) {
369        if (!UTF8_IS_CONTINUATION(*s) &&
370            !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
371            s--;
372            warning = UTF8_WARN_NON_CONTINUATION;
373            goto malformed;
374        }
375        else
376            uv = UTF8_ACCUMULATE(uv, *s);
377        if (!(uv > ouv)) {
378            /* These cannot be allowed. */
379            if (uv == ouv) {
380                if (!(flags & UTF8_ALLOW_LONG)) {
381                    warning = UTF8_WARN_LONG;
382                    goto malformed;
383                }
384            }
385            else { /* uv < ouv */
386                /* This cannot be allowed. */
387                warning = UTF8_WARN_OVERFLOW;
388                goto malformed;
389            }
390        }
391        s++;
392        ouv = uv;
393    }
394
395    if (UNICODE_IS_SURROGATE(uv) &&
396        !(flags & UTF8_ALLOW_SURROGATE)) {
397        warning = UTF8_WARN_SURROGATE;
398        goto malformed;
399    } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
400               !(flags & UTF8_ALLOW_LONG)) {
401        warning = UTF8_WARN_LONG;
402        goto malformed;
403    } else if (UNICODE_IS_ILLEGAL(uv) &&
404               !(flags & UTF8_ALLOW_FFFF)) {
405        warning = UTF8_WARN_FFFF;
406        goto malformed;
407    }
408
409    return uv;
410
411malformed:
412
413    if (flags & UTF8_CHECK_ONLY) {
414        if (retlen)
415            *retlen = -1;
416        return 0;
417    }
418
419    if (dowarn) {
420        SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
421
422        switch (warning) {
423        case 0: /* Intentionally empty. */ break;
424        case UTF8_WARN_EMPTY:
425            Perl_sv_catpvf(aTHX_ sv, "(empty string)");
426            break;
427        case UTF8_WARN_CONTINUATION:
428            Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
429            break;
430        case UTF8_WARN_NON_CONTINUATION:
431            if (s == s0)
432                Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
433                           (UV)s[1], startbyte);
434            else
435                Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
436                           (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
437             
438            break;
439        case UTF8_WARN_FE_FF:
440            Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
441            break;
442        case UTF8_WARN_SHORT:
443            Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
444                           curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
445            expectlen = curlen;         /* distance for caller to skip */
446            break;
447        case UTF8_WARN_OVERFLOW:
448            Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
449                           ouv, *s, startbyte);
450            break;
451        case UTF8_WARN_SURROGATE:
452            Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
453            break;
454        case UTF8_WARN_LONG:
455            Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
456                           expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
457            break;
458        case UTF8_WARN_FFFF:
459            Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
460            break;
461        default:
462            Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
463            break;
464        }
465       
466        if (warning) {
467            char *s = SvPVX(sv);
468
469            if (PL_op)
470                Perl_warner(aTHX_ packWARN(WARN_UTF8),
471                            "%s in %s", s,  OP_DESC(PL_op));
472            else
473                Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
474        }
475    }
476
477    if (retlen)
478        *retlen = expectlen ? expectlen : len;
479
480    return 0;
481}
482
483/*
484=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
485
486Returns the native character value of the first character in the string C<s>
487which is assumed to be in UTF8 encoding; C<retlen> will be set to the
488length, in bytes, of that character.
489
490If C<s> does not point to a well-formed UTF8 character, zero is
491returned and retlen is set, if possible, to -1.
492
493=cut
494*/
495
496UV
497Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
498{
499    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
500                               ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
501}
502
503/*
504=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
505
506Returns the Unicode code point of the first character in the string C<s>
507which is assumed to be in UTF8 encoding; C<retlen> will be set to the
508length, in bytes, of that character.
509
510This function should only be used when returned UV is considered
511an index into the Unicode semantic tables (e.g. swashes).
512
513If C<s> does not point to a well-formed UTF8 character, zero is
514returned and retlen is set, if possible, to -1.
515
516=cut
517*/
518
519UV
520Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
521{
522    /* Call the low level routine asking for checks */
523    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
524                               ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
525}
526
527/*
528=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
529
530Return the length of the UTF-8 char encoded string C<s> in characters.
531Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
532up past C<e>, croaks.
533
534=cut
535*/
536
537STRLEN
538Perl_utf8_length(pTHX_ U8 *s, U8 *e)
539{
540    STRLEN len = 0;
541
542    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
543     * the bitops (especially ~) can create illegal UTF-8.
544     * In other words: in Perl UTF-8 is not just for Unicode. */
545
546    if (e < s) {
547        if (ckWARN_d(WARN_UTF8)) {
548            if (PL_op)
549                Perl_warner(aTHX_ packWARN(WARN_UTF8),
550                            "%s in %s", unees, OP_DESC(PL_op));
551            else
552                Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
553        }
554        return 0;
555    }
556    while (s < e) {
557        U8 t = UTF8SKIP(s);
558
559        if (e - s < t) {
560            if (ckWARN_d(WARN_UTF8)) {
561                if (PL_op)
562                    Perl_warner(aTHX_ packWARN(WARN_UTF8),
563                                unees, OP_DESC(PL_op));
564                else
565                    Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
566            }
567            return len;
568        }
569        s += t;
570        len++;
571    }
572
573    return len;
574}
575
576/*
577=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
578
579Returns the number of UTF8 characters between the UTF-8 pointers C<a>
580and C<b>.
581
582WARNING: use only if you *know* that the pointers point inside the
583same UTF-8 buffer.
584
585=cut
586*/
587
588IV
589Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
590{
591    IV off = 0;
592
593    /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
594     * the bitops (especially ~) can create illegal UTF-8.
595     * In other words: in Perl UTF-8 is not just for Unicode. */
596
597    if (a < b) {
598        while (a < b) {
599            U8 c = UTF8SKIP(a);
600
601            if (b - a < c) {
602                if (ckWARN_d(WARN_UTF8)) {
603                    if (PL_op)
604                        Perl_warner(aTHX_ packWARN(WARN_UTF8),
605                                    "%s in %s", unees, OP_DESC(PL_op));
606                    else
607                        Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
608                }
609                return off;
610            }
611            a += c;
612            off--;
613        }
614    }
615    else {
616        while (b < a) {
617            U8 c = UTF8SKIP(b);
618
619            if (a - b < c) {
620                if (ckWARN_d(WARN_UTF8)) {
621                    if (PL_op)
622                        Perl_warner(aTHX_ packWARN(WARN_UTF8),
623                                    "%s in %s", unees, OP_DESC(PL_op));
624                    else
625                        Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
626                }
627                return off;
628            }
629            b += c;
630            off++;
631        }
632    }
633
634    return off;
635}
636
637/*
638=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
639
640Return the UTF-8 pointer C<s> displaced by C<off> characters, either
641forward or backward.
642
643WARNING: do not use the following unless you *know* C<off> is within
644the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
645on the first byte of character or just after the last byte of a character.
646
647=cut
648*/
649
650U8 *
651Perl_utf8_hop(pTHX_ U8 *s, I32 off)
652{
653    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
654     * the bitops (especially ~) can create illegal UTF-8.
655     * In other words: in Perl UTF-8 is not just for Unicode. */
656
657    if (off >= 0) {
658        while (off--)
659            s += UTF8SKIP(s);
660    }
661    else {
662        while (off++) {
663            s--;
664            while (UTF8_IS_CONTINUATION(*s))
665                s--;
666        }
667    }
668    return s;
669}
670
671/*
672=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
673
674Converts a string C<s> of length C<len> from UTF8 into byte encoding.
675Unlike C<bytes_to_utf8>, this over-writes the original string, and
676updates len to contain the new length.
677Returns zero on failure, setting C<len> to -1.
678
679=cut
680*/
681
682U8 *
683Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
684{
685    U8 *send;
686    U8 *d;
687    U8 *save = s;
688
689    /* ensure valid UTF8 and chars < 256 before updating string */
690    for (send = s + *len; s < send; ) {
691        U8 c = *s++;
692
693        if (!UTF8_IS_INVARIANT(c) &&
694            (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
695             || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
696            *len = -1;
697            return 0;
698        }
699    }
700
701    d = s = save;
702    while (s < send) {
703        STRLEN ulen;
704        *d++ = (U8)utf8_to_uvchr(s, &ulen);
705        s += ulen;
706    }
707    *d = '\0';
708    *len = d - save;
709    return save;
710}
711
712/*
713=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
714
715Converts a string C<s> of length C<len> from UTF8 into byte encoding.
716Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
717the newly-created string, and updates C<len> to contain the new
718length.  Returns the original string if no conversion occurs, C<len>
719is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7200 if C<s> is converted or contains all 7bit characters.
721
722=cut
723*/
724
725U8 *
726Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
727{
728    U8 *d;
729    U8 *start = s;
730    U8 *send;
731    I32 count = 0;
732
733    if (!*is_utf8)
734        return start;
735
736    /* ensure valid UTF8 and chars < 256 before converting string */
737    for (send = s + *len; s < send;) {
738        U8 c = *s++;
739        if (!UTF8_IS_INVARIANT(c)) {
740            if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
741                (c = *s++) && UTF8_IS_CONTINUATION(c))
742                count++;
743            else
744                return start;
745        }
746    }
747
748    *is_utf8 = 0;               
749
750    Newz(801, d, (*len) - count + 1, U8);
751    s = start; start = d;
752    while (s < send) {
753        U8 c = *s++;
754        if (!UTF8_IS_INVARIANT(c)) {
755            /* Then it is two-byte encoded */
756            c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
757            c = ASCII_TO_NATIVE(c);
758        }
759        *d++ = c;
760    }
761    *d = '\0';
762    *len = d - start;
763    return start;
764}
765
766/*
767=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
768
769Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
770Returns a pointer to the newly-created string, and sets C<len> to
771reflect the new length.
772
773=cut
774*/
775
776U8*
777Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
778{
779    U8 *send;
780    U8 *d;
781    U8 *dst;
782    send = s + (*len);
783
784    Newz(801, d, (*len) * 2 + 1, U8);
785    dst = d;
786
787    while (s < send) {
788        UV uv = NATIVE_TO_ASCII(*s++);
789        if (UNI_IS_INVARIANT(uv))
790            *d++ = (U8)UTF_TO_NATIVE(uv);
791        else {
792            *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
793            *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
794        }
795    }
796    *d = '\0';
797    *len = d-dst;
798    return dst;
799}
800
801/*
802 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
803 *
804 * Destination must be pre-extended to 3/2 source.  Do not use in-place.
805 * We optimize for native, for obvious reasons. */
806
807U8*
808Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
809{
810    U8* pend;
811    U8* dstart = d;
812
813    if (bytelen & 1)
814        Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
815
816    pend = p + bytelen;
817
818    while (p < pend) {
819        UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
820        p += 2;
821        if (uv < 0x80) {
822            *d++ = (U8)uv;
823            continue;
824        }
825        if (uv < 0x800) {
826            *d++ = (U8)(( uv >>  6)         | 0xc0);
827            *d++ = (U8)(( uv        & 0x3f) | 0x80);
828            continue;
829        }
830        if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
831            UV low = *p++;
832            if (low < 0xdc00 || low >= 0xdfff)
833                Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
834            uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
835        }
836        if (uv < 0x10000) {
837            *d++ = (U8)(( uv >> 12)         | 0xe0);
838            *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
839            *d++ = (U8)(( uv        & 0x3f) | 0x80);
840            continue;
841        }
842        else {
843            *d++ = (U8)(( uv >> 18)         | 0xf0);
844            *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
845            *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
846            *d++ = (U8)(( uv        & 0x3f) | 0x80);
847            continue;
848        }
849    }
850    *newlen = d - dstart;
851    return d;
852}
853
854/* Note: this one is slightly destructive of the source. */
855
856U8*
857Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
858{
859    U8* s = (U8*)p;
860    U8* send = s + bytelen;
861    while (s < send) {
862        U8 tmp = s[0];
863        s[0] = s[1];
864        s[1] = tmp;
865        s += 2;
866    }
867    return utf16_to_utf8(p, d, bytelen, newlen);
868}
869
870/* for now these are all defined (inefficiently) in terms of the utf8 versions */
871
872bool
873Perl_is_uni_alnum(pTHX_ UV c)
874{
875    U8 tmpbuf[UTF8_MAXLEN+1];
876    uvchr_to_utf8(tmpbuf, c);
877    return is_utf8_alnum(tmpbuf);
878}
879
880bool
881Perl_is_uni_alnumc(pTHX_ UV c)
882{
883    U8 tmpbuf[UTF8_MAXLEN+1];
884    uvchr_to_utf8(tmpbuf, c);
885    return is_utf8_alnumc(tmpbuf);
886}
887
888bool
889Perl_is_uni_idfirst(pTHX_ UV c)
890{
891    U8 tmpbuf[UTF8_MAXLEN+1];
892    uvchr_to_utf8(tmpbuf, c);
893    return is_utf8_idfirst(tmpbuf);
894}
895
896bool
897Perl_is_uni_alpha(pTHX_ UV c)
898{
899    U8 tmpbuf[UTF8_MAXLEN+1];
900    uvchr_to_utf8(tmpbuf, c);
901    return is_utf8_alpha(tmpbuf);
902}
903
904bool
905Perl_is_uni_ascii(pTHX_ UV c)
906{
907    U8 tmpbuf[UTF8_MAXLEN+1];
908    uvchr_to_utf8(tmpbuf, c);
909    return is_utf8_ascii(tmpbuf);
910}
911
912bool
913Perl_is_uni_space(pTHX_ UV c)
914{
915    U8 tmpbuf[UTF8_MAXLEN+1];
916    uvchr_to_utf8(tmpbuf, c);
917    return is_utf8_space(tmpbuf);
918}
919
920bool
921Perl_is_uni_digit(pTHX_ UV c)
922{
923    U8 tmpbuf[UTF8_MAXLEN+1];
924    uvchr_to_utf8(tmpbuf, c);
925    return is_utf8_digit(tmpbuf);
926}
927
928bool
929Perl_is_uni_upper(pTHX_ UV c)
930{
931    U8 tmpbuf[UTF8_MAXLEN+1];
932    uvchr_to_utf8(tmpbuf, c);
933    return is_utf8_upper(tmpbuf);
934}
935
936bool
937Perl_is_uni_lower(pTHX_ UV c)
938{
939    U8 tmpbuf[UTF8_MAXLEN+1];
940    uvchr_to_utf8(tmpbuf, c);
941    return is_utf8_lower(tmpbuf);
942}
943
944bool
945Perl_is_uni_cntrl(pTHX_ UV c)
946{
947    U8 tmpbuf[UTF8_MAXLEN+1];
948    uvchr_to_utf8(tmpbuf, c);
949    return is_utf8_cntrl(tmpbuf);
950}
951
952bool
953Perl_is_uni_graph(pTHX_ UV c)
954{
955    U8 tmpbuf[UTF8_MAXLEN+1];
956    uvchr_to_utf8(tmpbuf, c);
957    return is_utf8_graph(tmpbuf);
958}
959
960bool
961Perl_is_uni_print(pTHX_ UV c)
962{
963    U8 tmpbuf[UTF8_MAXLEN+1];
964    uvchr_to_utf8(tmpbuf, c);
965    return is_utf8_print(tmpbuf);
966}
967
968bool
969Perl_is_uni_punct(pTHX_ UV c)
970{
971    U8 tmpbuf[UTF8_MAXLEN+1];
972    uvchr_to_utf8(tmpbuf, c);
973    return is_utf8_punct(tmpbuf);
974}
975
976bool
977Perl_is_uni_xdigit(pTHX_ UV c)
978{
979    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
980    uvchr_to_utf8(tmpbuf, c);
981    return is_utf8_xdigit(tmpbuf);
982}
983
984UV
985Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
986{
987    uvchr_to_utf8(p, c);
988    return to_utf8_upper(p, p, lenp);
989}
990
991UV
992Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
993{
994    uvchr_to_utf8(p, c);
995    return to_utf8_title(p, p, lenp);
996}
997
998UV
999Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1000{
1001    uvchr_to_utf8(p, c);
1002    return to_utf8_lower(p, p, lenp);
1003}
1004
1005UV
1006Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1007{
1008    uvchr_to_utf8(p, c);
1009    return to_utf8_fold(p, p, lenp);
1010}
1011
1012/* for now these all assume no locale info available for Unicode > 255 */
1013
1014bool
1015Perl_is_uni_alnum_lc(pTHX_ UV c)
1016{
1017    return is_uni_alnum(c);     /* XXX no locale support yet */
1018}
1019
1020bool
1021Perl_is_uni_alnumc_lc(pTHX_ UV c)
1022{
1023    return is_uni_alnumc(c);    /* XXX no locale support yet */
1024}
1025
1026bool
1027Perl_is_uni_idfirst_lc(pTHX_ UV c)
1028{
1029    return is_uni_idfirst(c);   /* XXX no locale support yet */
1030}
1031
1032bool
1033Perl_is_uni_alpha_lc(pTHX_ UV c)
1034{
1035    return is_uni_alpha(c);     /* XXX no locale support yet */
1036}
1037
1038bool
1039Perl_is_uni_ascii_lc(pTHX_ UV c)
1040{
1041    return is_uni_ascii(c);     /* XXX no locale support yet */
1042}
1043
1044bool
1045Perl_is_uni_space_lc(pTHX_ UV c)
1046{
1047    return is_uni_space(c);     /* XXX no locale support yet */
1048}
1049
1050bool
1051Perl_is_uni_digit_lc(pTHX_ UV c)
1052{
1053    return is_uni_digit(c);     /* XXX no locale support yet */
1054}
1055
1056bool
1057Perl_is_uni_upper_lc(pTHX_ UV c)
1058{
1059    return is_uni_upper(c);     /* XXX no locale support yet */
1060}
1061
1062bool
1063Perl_is_uni_lower_lc(pTHX_ UV c)
1064{
1065    return is_uni_lower(c);     /* XXX no locale support yet */
1066}
1067
1068bool
1069Perl_is_uni_cntrl_lc(pTHX_ UV c)
1070{
1071    return is_uni_cntrl(c);     /* XXX no locale support yet */
1072}
1073
1074bool
1075Perl_is_uni_graph_lc(pTHX_ UV c)
1076{
1077    return is_uni_graph(c);     /* XXX no locale support yet */
1078}
1079
1080bool
1081Perl_is_uni_print_lc(pTHX_ UV c)
1082{
1083    return is_uni_print(c);     /* XXX no locale support yet */
1084}
1085
1086bool
1087Perl_is_uni_punct_lc(pTHX_ UV c)
1088{
1089    return is_uni_punct(c);     /* XXX no locale support yet */
1090}
1091
1092bool
1093Perl_is_uni_xdigit_lc(pTHX_ UV c)
1094{
1095    return is_uni_xdigit(c);    /* XXX no locale support yet */
1096}
1097
1098U32
1099Perl_to_uni_upper_lc(pTHX_ U32 c)
1100{
1101    /* XXX returns only the first character -- do not use XXX */
1102    /* XXX no locale support yet */
1103    STRLEN len;
1104    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1105    return (U32)to_uni_upper(c, tmpbuf, &len);
1106}
1107
1108U32
1109Perl_to_uni_title_lc(pTHX_ U32 c)
1110{
1111    /* XXX returns only the first character XXX -- do not use XXX */
1112    /* XXX no locale support yet */
1113    STRLEN len;
1114    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1115    return (U32)to_uni_title(c, tmpbuf, &len);
1116}
1117
1118U32
1119Perl_to_uni_lower_lc(pTHX_ U32 c)
1120{
1121    /* XXX returns only the first character -- do not use XXX */
1122    /* XXX no locale support yet */
1123    STRLEN len;
1124    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1125    return (U32)to_uni_lower(c, tmpbuf, &len);
1126}
1127
1128bool
1129Perl_is_utf8_alnum(pTHX_ U8 *p)
1130{
1131    if (!is_utf8_char(p))
1132        return FALSE;
1133    if (!PL_utf8_alnum)
1134        /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1135         * descendant of isalnum(3), in other words, it doesn't
1136         * contain the '_'. --jhi */
1137        PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1138    return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1139/*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1140#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1141    if (!PL_utf8_alnum)
1142        PL_utf8_alnum = swash_init("utf8", "",
1143            sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1144    return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1145#endif
1146}
1147
1148bool
1149Perl_is_utf8_alnumc(pTHX_ U8 *p)
1150{
1151    if (!is_utf8_char(p))
1152        return FALSE;
1153    if (!PL_utf8_alnum)
1154        PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1155    return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1156/*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1157#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1158    if (!PL_utf8_alnum)
1159        PL_utf8_alnum = swash_init("utf8", "",
1160            sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1161    return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1162#endif
1163}
1164
1165bool
1166Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1167{
1168    if (*p == '_')
1169        return TRUE;
1170    if (!is_utf8_char(p))
1171        return FALSE;
1172    if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1173        PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1174    return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1175}
1176
1177bool
1178Perl_is_utf8_idcont(pTHX_ U8 *p)
1179{
1180    if (*p == '_')
1181        return TRUE;
1182    if (!is_utf8_char(p))
1183        return FALSE;
1184    if (!PL_utf8_idcont)
1185        PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1186    return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1187}
1188
1189bool
1190Perl_is_utf8_alpha(pTHX_ U8 *p)
1191{
1192    if (!is_utf8_char(p))
1193        return FALSE;
1194    if (!PL_utf8_alpha)
1195        PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1196    return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1197}
1198
1199bool
1200Perl_is_utf8_ascii(pTHX_ U8 *p)
1201{
1202    if (!is_utf8_char(p))
1203        return FALSE;
1204    if (!PL_utf8_ascii)
1205        PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1206    return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1207}
1208
1209bool
1210Perl_is_utf8_space(pTHX_ U8 *p)
1211{
1212    if (!is_utf8_char(p))
1213        return FALSE;
1214    if (!PL_utf8_space)
1215        PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1216    return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1217}
1218
1219bool
1220Perl_is_utf8_digit(pTHX_ U8 *p)
1221{
1222    if (!is_utf8_char(p))
1223        return FALSE;
1224    if (!PL_utf8_digit)
1225        PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1226    return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1227}
1228
1229bool
1230Perl_is_utf8_upper(pTHX_ U8 *p)
1231{
1232    if (!is_utf8_char(p))
1233        return FALSE;
1234    if (!PL_utf8_upper)
1235        PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1236    return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1237}
1238
1239bool
1240Perl_is_utf8_lower(pTHX_ U8 *p)
1241{
1242    if (!is_utf8_char(p))
1243        return FALSE;
1244    if (!PL_utf8_lower)
1245        PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1246    return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1247}
1248
1249bool
1250Perl_is_utf8_cntrl(pTHX_ U8 *p)
1251{
1252    if (!is_utf8_char(p))
1253        return FALSE;
1254    if (!PL_utf8_cntrl)
1255        PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1256    return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1257}
1258
1259bool
1260Perl_is_utf8_graph(pTHX_ U8 *p)
1261{
1262    if (!is_utf8_char(p))
1263        return FALSE;
1264    if (!PL_utf8_graph)
1265        PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1266    return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1267}
1268
1269bool
1270Perl_is_utf8_print(pTHX_ U8 *p)
1271{
1272    if (!is_utf8_char(p))
1273        return FALSE;
1274    if (!PL_utf8_print)
1275        PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1276    return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1277}
1278
1279bool
1280Perl_is_utf8_punct(pTHX_ U8 *p)
1281{
1282    if (!is_utf8_char(p))
1283        return FALSE;
1284    if (!PL_utf8_punct)
1285        PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1286    return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1287}
1288
1289bool
1290Perl_is_utf8_xdigit(pTHX_ U8 *p)
1291{
1292    if (!is_utf8_char(p))
1293        return FALSE;
1294    if (!PL_utf8_xdigit)
1295        PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1296    return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1297}
1298
1299bool
1300Perl_is_utf8_mark(pTHX_ U8 *p)
1301{
1302    if (!is_utf8_char(p))
1303        return FALSE;
1304    if (!PL_utf8_mark)
1305        PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1306    return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1307}
1308
1309/*
1310=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1311
1312The "p" contains the pointer to the UTF-8 string encoding
1313the character that is being converted.
1314
1315The "ustrp" is a pointer to the character buffer to put the
1316conversion result to.  The "lenp" is a pointer to the length
1317of the result.
1318
1319The "swashp" is a pointer to the swash to use.
1320
1321Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1322and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1323but not always, a multicharacter mapping), is tried first.
1324
1325The "special" is a string like "utf8::ToSpecLower", which means the
1326hash %utf8::ToSpecLower.  The access to the hash is through
1327Perl_to_utf8_case().
1328
1329The "normal" is a string like "ToLower" which means the swash
1330%utf8::ToLower.
1331
1332=cut */
1333
1334UV
1335Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1336{
1337    UV uv0, uv1;
1338    U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1339    STRLEN len = 0;
1340
1341    uv0 = utf8_to_uvchr(p, 0);
1342    /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1343     * are necessary in EBCDIC, they are redundant no-ops
1344     * in ASCII-ish platforms, and hopefully optimized away. */
1345    uv1 = NATIVE_TO_UNI(uv0);
1346    uvuni_to_utf8(tmpbuf, uv1);
1347
1348    if (!*swashp) /* load on-demand */
1349         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1350
1351    if (special) {
1352         /* It might be "special" (sometimes, but not always,
1353          * a multicharacter mapping) */
1354         HV *hv;
1355         SV *keysv;
1356         HE *he;
1357         SV *val;
1358       
1359         if ((hv    = get_hv(special, FALSE)) &&
1360             (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1361             (he    = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1362             (val   = HeVAL(he))) {
1363             char *s;
1364
1365              s = SvPV(val, len);
1366              if (len == 1)
1367                   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1368              else {
1369#ifdef EBCDIC
1370                   /* If we have EBCDIC we need to remap the characters
1371                    * since any characters in the low 256 are Unicode
1372                    * code points, not EBCDIC. */
1373                   U8 *t = (U8*)s, *tend = t + len, *d;
1374               
1375                   d = tmpbuf;
1376                   if (SvUTF8(val)) {
1377                        STRLEN tlen = 0;
1378                       
1379                        while (t < tend) {
1380                             UV c = utf8_to_uvchr(t, &tlen);
1381                             if (tlen > 0) {
1382                                  d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1383                                  t += tlen;
1384                             }
1385                             else
1386                                  break;
1387                        }
1388                   }
1389                   else {
1390                        while (t < tend) {
1391                             d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1392                             t++;
1393                        }
1394                   }
1395                   len = d - tmpbuf;
1396                   Copy(tmpbuf, ustrp, len, U8);
1397#else
1398                   Copy(s, ustrp, len, U8);
1399#endif
1400              }
1401         }
1402    }
1403
1404    if (!len && *swashp) {
1405         UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1406         
1407         if (uv2) {
1408              /* It was "normal" (a single character mapping). */
1409              UV uv3 = UNI_TO_NATIVE(uv2);
1410             
1411              len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1412         }
1413    }
1414
1415    if (!len) /* Neither: just copy. */
1416         len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1417
1418    if (lenp)
1419         *lenp = len;
1420
1421    return len ? utf8_to_uvchr(ustrp, 0) : 0;
1422}
1423
1424/*
1425=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1426
1427Convert the UTF-8 encoded character at p to its uppercase version and
1428store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1429that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1430uppercase version may be longer than the original character (up to two
1431characters).
1432
1433The first character of the uppercased version is returned
1434(but note, as explained above, that there may be more.)
1435
1436=cut */
1437
1438UV
1439Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1440{
1441    return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1442                             &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1443}
1444
1445/*
1446=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1447
1448Convert the UTF-8 encoded character at p to its titlecase version and
1449store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1450that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1451titlecase version may be longer than the original character (up to two
1452characters).
1453
1454The first character of the titlecased version is returned
1455(but note, as explained above, that there may be more.)
1456
1457=cut */
1458
1459UV
1460Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1461{
1462    return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1463                             &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1464}
1465
1466/*
1467=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1468
1469Convert the UTF-8 encoded character at p to its lowercase version and
1470store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1471that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1472lowercase version may be longer than the original character (up to two
1473characters).
1474
1475The first character of the lowercased version is returned
1476(but note, as explained above, that there may be more.)
1477
1478=cut */
1479
1480UV
1481Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1482{
1483    return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1484                             &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1485}
1486
1487/*
1488=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1489
1490Convert the UTF-8 encoded character at p to its foldcase version and
1491store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1492that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1493foldcase version may be longer than the original character (up to
1494three characters).
1495
1496The first character of the foldcased version is returned
1497(but note, as explained above, that there may be more.)
1498
1499=cut */
1500
1501UV
1502Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1503{
1504    return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1505                             &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1506}
1507
1508/* a "swash" is a swatch hash */
1509
1510SV*
1511Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1512{
1513    SV* retval;
1514    SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1515    dSP;
1516    HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1517    SV* errsv_save;
1518
1519    if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1520        ENTER;
1521        errsv_save = newSVsv(ERRSV);
1522        Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1523        if (!SvTRUE(ERRSV))
1524            sv_setsv(ERRSV, errsv_save);
1525        SvREFCNT_dec(errsv_save);
1526        LEAVE;
1527    }
1528    SPAGAIN;
1529    PUSHSTACKi(PERLSI_MAGIC);
1530    PUSHMARK(SP);
1531    EXTEND(SP,5);
1532    PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1533    PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1534    PUSHs(listsv);
1535    PUSHs(sv_2mortal(newSViv(minbits)));
1536    PUSHs(sv_2mortal(newSViv(none)));
1537    PUTBACK;
1538    ENTER;
1539    SAVEI32(PL_hints);
1540    PL_hints = 0;
1541    save_re_context();
1542    if (PL_curcop == &PL_compiling) {
1543        /* XXX ought to be handled by lex_start */
1544        SAVEI32(PL_in_my);
1545        sv_setpv(tokenbufsv, PL_tokenbuf);
1546    }
1547    errsv_save = newSVsv(ERRSV);
1548    if (call_method("SWASHNEW", G_SCALAR))
1549        retval = newSVsv(*PL_stack_sp--);
1550    else
1551        retval = &PL_sv_undef;
1552    if (!SvTRUE(ERRSV))
1553        sv_setsv(ERRSV, errsv_save);
1554    SvREFCNT_dec(errsv_save);
1555    LEAVE;
1556    POPSTACK;
1557    if (PL_curcop == &PL_compiling) {
1558        STRLEN len;
1559        char* pv = SvPV(tokenbufsv, len);
1560
1561        Copy(pv, PL_tokenbuf, len+1, char);
1562        PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1563    }
1564    if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1565        if (SvPOK(retval))
1566            Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1567                       SvPV_nolen(retval));
1568        Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1569    }
1570    return retval;
1571}
1572
1573
1574/* This API is wrong for special case conversions since we may need to
1575 * return several Unicode characters for a single Unicode character
1576 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1577 * the lower-level routine, and it is similarly broken for returning
1578 * multiple values.  --jhi */
1579UV
1580Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1581{
1582    HV* hv = (HV*)SvRV(sv);
1583    U32 klen;
1584    U32 off;
1585    STRLEN slen;
1586    STRLEN needents;
1587    U8 *tmps = NULL;
1588    U32 bit;
1589    SV *retval;
1590    U8 tmputf8[2];
1591    UV c = NATIVE_TO_ASCII(*ptr);
1592
1593    if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1594        tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1595        tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1596        ptr = tmputf8;
1597    }
1598    /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1599     * then the "swatch" is a vec() for al the chars which start
1600     * with 0xAA..0xYY
1601     * So the key in the hash (klen) is length of encoded char -1
1602     */
1603    klen = UTF8SKIP(ptr) - 1;
1604    off  = ptr[klen];
1605
1606    if (klen == 0)
1607     {
1608      /* If char in invariant then swatch is for all the invariant chars
1609       * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1610       */
1611      needents = UTF_CONTINUATION_MARK;
1612      off      = NATIVE_TO_UTF(ptr[klen]);
1613     }
1614    else
1615     {
1616      /* If char is encoded then swatch is for the prefix */
1617      needents = (1 << UTF_ACCUMULATION_SHIFT);
1618      off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1619     }
1620
1621    /*
1622     * This single-entry cache saves about 1/3 of the utf8 overhead in test
1623     * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1624     * it's nothing to sniff at.)  Pity we usually come through at least
1625     * two function calls to get here...
1626     *
1627     * NB: this code assumes that swatches are never modified, once generated!
1628     */
1629
1630    if (hv   == PL_last_swash_hv &&
1631        klen == PL_last_swash_klen &&
1632        (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1633    {
1634        tmps = PL_last_swash_tmps;
1635        slen = PL_last_swash_slen;
1636    }
1637    else {
1638        /* Try our second-level swatch cache, kept in a hash. */
1639        SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1640
1641        /* If not cached, generate it via utf8::SWASHGET */
1642        if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1643            dSP;
1644            /* We use utf8n_to_uvuni() as we want an index into
1645               Unicode tables, not a native character number.
1646             */
1647            UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1648                                           ckWARN(WARN_UTF8) ?
1649                                           0 : UTF8_ALLOW_ANY);
1650            SV *errsv_save;
1651            ENTER;
1652            SAVETMPS;
1653            save_re_context();
1654            PUSHSTACKi(PERLSI_MAGIC);
1655            PUSHMARK(SP);
1656            EXTEND(SP,3);
1657            PUSHs((SV*)sv);
1658            /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1659            PUSHs(sv_2mortal(newSViv((klen) ?
1660                                     (code_point & ~(needents - 1)) : 0)));
1661            PUSHs(sv_2mortal(newSViv(needents)));
1662            PUTBACK;
1663            errsv_save = newSVsv(ERRSV);
1664            if (call_method("SWASHGET", G_SCALAR))
1665                retval = newSVsv(*PL_stack_sp--);
1666            else
1667                retval = &PL_sv_undef;
1668            if (!SvTRUE(ERRSV))
1669                sv_setsv(ERRSV, errsv_save);
1670            SvREFCNT_dec(errsv_save);
1671            POPSTACK;
1672            FREETMPS;
1673            LEAVE;
1674            if (PL_curcop == &PL_compiling)
1675                PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1676
1677            svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1678
1679            if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1680                Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1681        }
1682
1683        PL_last_swash_hv = hv;
1684        PL_last_swash_klen = klen;
1685        PL_last_swash_tmps = tmps;
1686        PL_last_swash_slen = slen;
1687        if (klen)
1688            Copy(ptr, PL_last_swash_key, klen, U8);
1689    }
1690
1691    switch ((int)((slen << 3) / needents)) {
1692    case 1:
1693        bit = 1 << (off & 7);
1694        off >>= 3;
1695        return (tmps[off] & bit) != 0;
1696    case 8:
1697        return tmps[off];
1698    case 16:
1699        off <<= 1;
1700        return (tmps[off] << 8) + tmps[off + 1] ;
1701    case 32:
1702        off <<= 2;
1703        return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1704    }
1705    Perl_croak(aTHX_ "panic: swash_fetch");
1706    return 0;
1707}
1708
1709
1710/*
1711=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1712
1713Adds the UTF8 representation of the Native codepoint C<uv> to the end
1714of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1715bytes available. The return value is the pointer to the byte after the
1716end of the new character. In other words,
1717
1718    d = uvchr_to_utf8(d, uv);
1719
1720is the recommended wide native character-aware way of saying
1721
1722    *(d++) = uv;
1723
1724=cut
1725*/
1726
1727/* On ASCII machines this is normally a macro but we want a
1728   real function in case XS code wants it
1729*/
1730#undef Perl_uvchr_to_utf8
1731U8 *
1732Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1733{
1734    return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1735}
1736
1737U8 *
1738Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1739{
1740    return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1741}
1742
1743/*
1744=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1745
1746Returns the native character value of the first character in the string C<s>
1747which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1748length, in bytes, of that character.
1749
1750Allows length and flags to be passed to low level routine.
1751
1752=cut
1753*/
1754/* On ASCII machines this is normally a macro but we want
1755   a real function in case XS code wants it
1756*/
1757#undef Perl_utf8n_to_uvchr
1758UV
1759Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1760{
1761    UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1762    return UNI_TO_NATIVE(uv);
1763}
1764
1765/*
1766=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1767
1768Build to the scalar dsv a displayable version of the string spv,
1769length len, the displayable version being at most pvlim bytes long
1770(if longer, the rest is truncated and "..." will be appended).
1771
1772The flags argument can have UNI_DISPLAY_ISPRINT set to display
1773isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1774to display the \\[nrfta\\] as the backslashed versions (like '\n')
1775(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1776UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1777UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1778
1779The pointer to the PV of the dsv is returned.
1780
1781=cut */
1782char *
1783Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1784{
1785    int truncated = 0;
1786    char *s, *e;
1787
1788    sv_setpvn(dsv, "", 0);
1789    for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1790         UV u;
1791         bool ok = FALSE;
1792
1793         if (pvlim && SvCUR(dsv) >= pvlim) {
1794              truncated++;
1795              break;
1796         }
1797         u = utf8_to_uvchr((U8*)s, 0);
1798         if (u < 256) {
1799             if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1800                 switch (u & 0xFF) {
1801                 case '\n':
1802                     Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1803                 case '\r':
1804                     Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1805                 case '\t':
1806                     Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1807                 case '\f':
1808                     Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1809                 case '\a':
1810                     Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1811                 case '\\':
1812                     Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1813                 default: break;
1814                 }
1815             }
1816             /* isPRINT() is the locale-blind version. */
1817             if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1818                 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1819                 ok = TRUE;
1820             }
1821         }
1822         if (!ok)
1823             Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1824    }
1825    if (truncated)
1826         sv_catpvn(dsv, "...", 3);
1827   
1828    return SvPVX(dsv);
1829}
1830
1831/*
1832=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1833
1834Build to the scalar dsv a displayable version of the scalar sv,
1835the displayable version being at most pvlim bytes long
1836(if longer, the rest is truncated and "..." will be appended).
1837
1838The flags argument is as in pv_uni_display().
1839
1840The pointer to the PV of the dsv is returned.
1841
1842=cut */
1843char *
1844Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1845{
1846     return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1847                                pvlim, flags);
1848}
1849
1850/*
1851=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1852
1853Return true if the strings s1 and s2 differ case-insensitively, false
1854if not (if they are equal case-insensitively).  If u1 is true, the
1855string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1856the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1857are false, the respective string is assumed to be in native 8-bit
1858encoding.
1859
1860If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1861in there (they will point at the beginning of the I<next> character).
1862If the pointers behind pe1 or pe2 are non-NULL, they are the end
1863pointers beyond which scanning will not continue under any
1864circustances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1865s2+l2 will be used as goal end pointers that will also stop the scan,
1866and which qualify towards defining a successful match: all the scans
1867that define an explicit length must reach their goal pointers for
1868a match to succeed).
1869
1870For case-insensitiveness, the "casefolding" of Unicode is used
1871instead of upper/lowercasing both the characters, see
1872http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1873
1874=cut */
1875I32
1876Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1877{
1878     register U8 *p1  = (U8*)s1;
1879     register U8 *p2  = (U8*)s2;
1880     register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1881     register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1882     STRLEN n1 = 0, n2 = 0;
1883     U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1884     U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1885     U8 natbuf[1+1];
1886     STRLEN foldlen1, foldlen2;
1887     bool match;
1888     
1889     if (pe1)
1890          e1 = *(U8**)pe1;
1891     if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1892          f1 = (U8*)s1 + l1;
1893     if (pe2)
1894          e2 = *(U8**)pe2;
1895     if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1896          f2 = (U8*)s2 + l2;
1897
1898     if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1899          return 1; /* mismatch; possible infinite loop or false positive */
1900
1901     if (!u1 || !u2)
1902          natbuf[1] = 0; /* Need to terminate the buffer. */
1903
1904     while ((e1 == 0 || p1 < e1) &&
1905            (f1 == 0 || p1 < f1) &&
1906            (e2 == 0 || p2 < e2) &&
1907            (f2 == 0 || p2 < f2)) {
1908          if (n1 == 0) {
1909               if (u1)
1910                    to_utf8_fold(p1, foldbuf1, &foldlen1);
1911               else {
1912                    natbuf[0] = *p1;
1913                    to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1914               }
1915               q1 = foldbuf1;
1916               n1 = foldlen1;
1917          }
1918          if (n2 == 0) {
1919               if (u2)
1920                    to_utf8_fold(p2, foldbuf2, &foldlen2);
1921               else {
1922                    natbuf[0] = *p2;
1923                    to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1924               }
1925               q2 = foldbuf2;
1926               n2 = foldlen2;
1927          }
1928          while (n1 && n2) {
1929               if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1930                   (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1931                    memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1932                   return 1; /* mismatch */
1933               n1 -= UTF8SKIP(q1);
1934               q1 += UTF8SKIP(q1);
1935               n2 -= UTF8SKIP(q2);
1936               q2 += UTF8SKIP(q2);
1937          }
1938          if (n1 == 0)
1939               p1 += u1 ? UTF8SKIP(p1) : 1;
1940          if (n2 == 0)
1941               p2 += u2 ? UTF8SKIP(p2) : 1;
1942
1943     }
1944
1945     /* A match is defined by all the scans that specified
1946      * an explicit length reaching their final goals. */
1947     match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1948
1949     if (match) {
1950          if (pe1)
1951               *pe1 = (char*)p1;
1952          if (pe2)
1953               *pe2 = (char*)p2;
1954     }
1955
1956     return match ? 0 : 1; /* 0 match, 1 mismatch */
1957}
1958
Note: See TracBrowser for help on using the repository browser.