source: trunk/third/perl/utf8.c @ 17035

Revision 17035, 29.3 KB checked in by zacheiss, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r17034, which included commits to RCS files with non-trunk default branches.
Line 
1/*    utf8.c
2 *
3 *    Copyright (c) 1998-2001, Larry Wall
4 *
5 *    You may distribute under the terms of either the GNU General Public
6 *    License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to!  And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
24#define PERL_IN_UTF8_C
25#include "perl.h"
26
27/* Unicode support */
28
29/*
30=for apidoc A|U8*|uv_to_utf8|U8 *d|UV uv
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
35end of the new character. In other words,
36
37    d = uv_to_utf8(d, uv);
38
39is the recommended Unicode-aware way of saying
40
41    *(d++) = uv;
42
43=cut
44*/
45
46U8 *
47Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
48{
49    if (uv < 0x80) {
50        *d++ = uv;
51        return d;
52    }
53    if (uv < 0x800) {
54        *d++ = (( uv >>  6)         | 0xc0);
55        *d++ = (( uv        & 0x3f) | 0x80);
56        return d;
57    }
58    if (uv < 0x10000) {
59        *d++ = (( uv >> 12)         | 0xe0);
60        *d++ = (((uv >>  6) & 0x3f) | 0x80);
61        *d++ = (( uv        & 0x3f) | 0x80);
62        return d;
63    }
64    if (uv < 0x200000) {
65        *d++ = (( uv >> 18)         | 0xf0);
66        *d++ = (((uv >> 12) & 0x3f) | 0x80);
67        *d++ = (((uv >>  6) & 0x3f) | 0x80);
68        *d++ = (( uv        & 0x3f) | 0x80);
69        return d;
70    }
71    if (uv < 0x4000000) {
72        *d++ = (( uv >> 24)         | 0xf8);
73        *d++ = (((uv >> 18) & 0x3f) | 0x80);
74        *d++ = (((uv >> 12) & 0x3f) | 0x80);
75        *d++ = (((uv >>  6) & 0x3f) | 0x80);
76        *d++ = (( uv        & 0x3f) | 0x80);
77        return d;
78    }
79    if (uv < 0x80000000) {
80        *d++ = (( uv >> 30)         | 0xfc);
81        *d++ = (((uv >> 24) & 0x3f) | 0x80);
82        *d++ = (((uv >> 18) & 0x3f) | 0x80);
83        *d++ = (((uv >> 12) & 0x3f) | 0x80);
84        *d++ = (((uv >>  6) & 0x3f) | 0x80);
85        *d++ = (( uv        & 0x3f) | 0x80);
86        return d;
87    }
88#ifdef HAS_QUAD
89    if (uv < UTF8_QUAD_MAX)
90#endif
91    {
92        *d++ =                        0xfe;     /* Can't match U+FEFF! */
93        *d++ = (((uv >> 30) & 0x3f) | 0x80);
94        *d++ = (((uv >> 24) & 0x3f) | 0x80);
95        *d++ = (((uv >> 18) & 0x3f) | 0x80);
96        *d++ = (((uv >> 12) & 0x3f) | 0x80);
97        *d++ = (((uv >>  6) & 0x3f) | 0x80);
98        *d++ = (( uv        & 0x3f) | 0x80);
99        return d;
100    }
101#ifdef HAS_QUAD
102    {
103        *d++ =                        0xff;     /* Can't match U+FFFE! */
104        *d++ =                        0x80;     /* 6 Reserved bits */
105        *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
106        *d++ = (((uv >> 54) & 0x3f) | 0x80);
107        *d++ = (((uv >> 48) & 0x3f) | 0x80);
108        *d++ = (((uv >> 42) & 0x3f) | 0x80);
109        *d++ = (((uv >> 36) & 0x3f) | 0x80);
110        *d++ = (((uv >> 30) & 0x3f) | 0x80);
111        *d++ = (((uv >> 24) & 0x3f) | 0x80);
112        *d++ = (((uv >> 18) & 0x3f) | 0x80);
113        *d++ = (((uv >> 12) & 0x3f) | 0x80);
114        *d++ = (((uv >>  6) & 0x3f) | 0x80);
115        *d++ = (( uv        & 0x3f) | 0x80);
116        return d;
117    }
118#endif
119}
120
121/*
122=for apidoc A|STRLEN|is_utf8_char|U8 *s
123
124Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
125The actual number of bytes in the UTF-8 character will be returned if it
126is valid, otherwise 0.
127 
128=cut
129*/
130STRLEN
131Perl_is_utf8_char(pTHX_ U8 *s)
132{
133    U8 u = *s;
134    STRLEN slen, len;
135    UV uv, ouv;
136
137    if (UTF8_IS_ASCII(u))
138        return 1;
139
140    if (!UTF8_IS_START(u))
141        return 0;
142
143    len = UTF8SKIP(s);
144
145    if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
146        return 0;
147
148    slen = len - 1;
149    s++;
150    uv = u;
151    ouv = uv;
152    while (slen--) {
153        if (!UTF8_IS_CONTINUATION(*s))
154            return 0;
155        uv = UTF8_ACCUMULATE(uv, *s);
156        if (uv < ouv)
157            return 0;
158        ouv = uv;
159        s++;
160    }
161
162    if (UNISKIP(uv) < len)
163        return 0;
164
165    return len;
166}
167
168/*
169=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
170
171Returns true if first C<len> bytes of the given string form valid a UTF8
172string, false otherwise.
173
174=cut
175*/
176
177bool
178Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
179{
180    U8* x = s;
181    U8* send;
182    STRLEN c;
183
184    if (!len)
185        len = strlen((char *)s);
186    send = s + len;
187
188    while (x < send) {
189        c = is_utf8_char(x);
190        if (!c)
191            return FALSE;
192        x += c;
193    }
194    if (x != send)
195        return FALSE;
196
197    return TRUE;
198}
199
200/*
201=for apidoc A|U8* s|utf8_to_uv|STRLEN curlen|STRLEN *retlen|U32 flags
202
203Returns the character value of the first character in the string C<s>
204which is assumed to be in UTF8 encoding and no longer than C<curlen>;
205C<retlen> will be set to the length, in bytes, of that character.
206
207If C<s> does not point to a well-formed UTF8 character, the behaviour
208is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
209it is assumed that the caller will raise a warning, and this function
210will silently just set C<retlen> to C<-1> and return zero.  If the
211C<flags> does not contain UTF8_CHECK_ONLY, warnings about
212malformations will be given, C<retlen> will be set to the expected
213length of the UTF-8 character in bytes, and zero will be returned.
214
215The C<flags> can also contain various flags to allow deviations from
216the strict UTF-8 encoding (see F<utf8.h>).
217
218=cut */
219
220UV
221Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
222{
223    UV uv = *s, ouv;
224    STRLEN len = 1;
225#ifdef EBCDIC
226    bool dowarn = 0;
227#else
228    bool dowarn = ckWARN_d(WARN_UTF8);
229#endif
230    STRLEN expectlen = 0;
231    U32 warning = 0;
232
233/* This list is a superset of the UTF8_ALLOW_XXX. */
234
235#define UTF8_WARN_EMPTY                          1
236#define UTF8_WARN_CONTINUATION                   2
237#define UTF8_WARN_NON_CONTINUATION               3
238#define UTF8_WARN_FE_FF                          4
239#define UTF8_WARN_SHORT                          5
240#define UTF8_WARN_OVERFLOW                       6
241#define UTF8_WARN_SURROGATE                      7
242#define UTF8_WARN_BOM                            8
243#define UTF8_WARN_LONG                           9
244#define UTF8_WARN_FFFF                          10
245
246    if (curlen == 0 &&
247        !(flags & UTF8_ALLOW_EMPTY)) {
248        warning = UTF8_WARN_EMPTY;
249        goto malformed;
250    }
251
252    if (UTF8_IS_ASCII(uv)) {
253        if (retlen)
254            *retlen = 1;
255        return *s;
256    }
257
258    if (UTF8_IS_CONTINUATION(uv) &&
259        !(flags & UTF8_ALLOW_CONTINUATION)) {
260        warning = UTF8_WARN_CONTINUATION;
261        goto malformed;
262    }
263
264    if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
265        !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
266        warning = UTF8_WARN_NON_CONTINUATION;
267        goto malformed;
268    }
269   
270    if ((uv == 0xfe || uv == 0xff) &&
271        !(flags & UTF8_ALLOW_FE_FF)) {
272        warning = UTF8_WARN_FE_FF;
273        goto malformed;
274    }
275       
276    if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
277    else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
278    else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
279    else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
280    else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
281    else if (!(uv & 0x01))      { len =  7; uv = 0; }
282    else                        { len = 13; uv = 0; } /* whoa! */
283       
284    if (retlen)
285        *retlen = len;
286   
287    expectlen = len;
288
289    if ((curlen < expectlen) &&
290        !(flags & UTF8_ALLOW_SHORT)) {
291        warning = UTF8_WARN_SHORT;
292        goto malformed;
293    }
294
295    len--;
296    s++;
297    ouv = uv;
298
299    while (len--) {
300        if (!UTF8_IS_CONTINUATION(*s) &&
301            !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
302            s--;
303            warning = UTF8_WARN_NON_CONTINUATION;
304            goto malformed;
305        }
306        else
307            uv = UTF8_ACCUMULATE(uv, *s);
308        if (!(uv > ouv)) {
309            /* These cannot be allowed. */
310            if (uv == ouv) {
311                if (!(flags & UTF8_ALLOW_LONG)) {
312                    warning = UTF8_WARN_LONG;
313                    goto malformed;
314                }
315            }
316            else { /* uv < ouv */
317                /* This cannot be allowed. */
318                warning = UTF8_WARN_OVERFLOW;
319                goto malformed;
320            }
321        }
322        s++;
323        ouv = uv;
324    }
325
326    if (UNICODE_IS_SURROGATE(uv) &&
327        !(flags & UTF8_ALLOW_SURROGATE)) {
328        warning = UTF8_WARN_SURROGATE;
329        goto malformed;
330    } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
331               !(flags & UTF8_ALLOW_BOM)) {
332        warning = UTF8_WARN_BOM;
333        goto malformed;
334    } else if ((expectlen > UNISKIP(uv)) &&
335               !(flags & UTF8_ALLOW_LONG)) {
336        warning = UTF8_WARN_LONG;
337        goto malformed;
338    } else if (UNICODE_IS_ILLEGAL(uv) &&
339               !(flags & UTF8_ALLOW_FFFF)) {
340        warning = UTF8_WARN_FFFF;
341        goto malformed;
342    }
343
344    return uv;
345
346malformed:
347
348    if (flags & UTF8_CHECK_ONLY) {
349        if (retlen)
350            *retlen = -1;
351        return 0;
352    }
353
354    if (dowarn) {
355        SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
356
357        switch (warning) {
358        case 0: /* Intentionally empty. */ break;
359        case UTF8_WARN_EMPTY:
360            Perl_sv_catpvf(aTHX_ sv, "(empty string)");
361            break;
362        case UTF8_WARN_CONTINUATION:
363            Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
364            break;
365        case UTF8_WARN_NON_CONTINUATION:
366            Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
367                           (UV)s[1], uv);
368            break;
369        case UTF8_WARN_FE_FF:
370            Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
371            break;
372        case UTF8_WARN_SHORT:
373            Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
374                           curlen, curlen == 1 ? "" : "s", expectlen);
375            break;
376        case UTF8_WARN_OVERFLOW:
377            Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
378                           ouv, *s);
379            break;
380        case UTF8_WARN_SURROGATE:
381            Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
382            break;
383        case UTF8_WARN_BOM:
384            Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
385            break;
386        case UTF8_WARN_LONG:
387            Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
388                           expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
389            break;
390        case UTF8_WARN_FFFF:
391            Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
392            break;
393        default:
394            Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
395            break;
396        }
397       
398        if (warning) {
399            char *s = SvPVX(sv);
400
401            if (PL_op)
402                Perl_warner(aTHX_ WARN_UTF8,
403                            "%s in %s", s,  PL_op_desc[PL_op->op_type]);
404            else
405                Perl_warner(aTHX_ WARN_UTF8, "%s", s);
406        }
407    }
408
409    if (retlen)
410        *retlen = expectlen ? expectlen : len;
411
412    return 0;
413}
414
415/*
416=for apidoc A|U8* s|utf8_to_uv_simple|STRLEN *retlen
417
418Returns the character value of the first character in the string C<s>
419which is assumed to be in UTF8 encoding; C<retlen> will be set to the
420length, in bytes, of that character.
421
422If C<s> does not point to a well-formed UTF8 character, zero is
423returned and retlen is set, if possible, to -1.
424
425=cut
426*/
427
428UV
429Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen)
430{
431    return Perl_utf8_to_uv(aTHX_ s, UTF8_MAXLEN, retlen, 0);
432}
433
434/*
435=for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
436
437Return the length of the UTF-8 char encoded string C<s> in characters.
438Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
439up past C<e>, croaks.
440
441=cut
442*/
443
444STRLEN
445Perl_utf8_length(pTHX_ U8* s, U8* e)
446{
447    STRLEN len = 0;
448
449    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
450     * the bitops (especially ~) can create illegal UTF-8.
451     * In other words: in Perl UTF-8 is not just for Unicode. */
452
453    if (e < s)
454        Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
455    while (s < e) {
456        U8 t = UTF8SKIP(s);
457
458        if (e - s < t)
459            Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
460        s += t;
461        len++;
462    }
463
464    return len;
465}
466
467/*
468=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
469
470Returns the number of UTF8 characters between the UTF-8 pointers C<a>
471and C<b>.
472
473WARNING: use only if you *know* that the pointers point inside the
474same UTF-8 buffer.
475
476=cut */
477
478IV
479Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
480{
481    IV off = 0;
482
483    /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
484     * the bitops (especially ~) can create illegal UTF-8.
485     * In other words: in Perl UTF-8 is not just for Unicode. */
486
487    if (a < b) {
488        while (a < b) {
489            U8 c = UTF8SKIP(a);
490
491            if (b - a < c)
492                Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
493            a += c;
494            off--;
495        }
496    }
497    else {
498        while (b < a) {
499            U8 c = UTF8SKIP(b);
500
501            if (a - b < c)
502                Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
503            b += c;
504            off++;
505        }
506    }
507
508    return off;
509}
510
511/*
512=for apidoc A|U8*|utf8_hop|U8 *s|I32 off
513
514Return the UTF-8 pointer C<s> displaced by C<off> characters, either
515forward or backward.
516
517WARNING: do not use the following unless you *know* C<off> is within
518the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
519on the first byte of character or just after the last byte of a character.
520
521=cut */
522
523U8 *
524Perl_utf8_hop(pTHX_ U8 *s, I32 off)
525{
526    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
527     * the bitops (especially ~) can create illegal UTF-8.
528     * In other words: in Perl UTF-8 is not just for Unicode. */
529
530    if (off >= 0) {
531        while (off--)
532            s += UTF8SKIP(s);
533    }
534    else {
535        while (off++) {
536            s--;
537            while (UTF8_IS_CONTINUATION(*s))
538                s--;
539        }
540    }
541    return s;
542}
543
544/*
545=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
546
547Converts a string C<s> of length C<len> from UTF8 into byte encoding.
548Unlike C<bytes_to_utf8>, this over-writes the original string, and
549updates len to contain the new length.
550Returns zero on failure, setting C<len> to -1.
551
552=cut
553*/
554
555U8 *
556Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
557{
558    U8 *send;
559    U8 *d;
560    U8 *save = s;
561
562    /* ensure valid UTF8 and chars < 256 before updating string */
563    for (send = s + *len; s < send; ) {
564        U8 c = *s++;
565
566        if (c >= 0x80 &&
567            ((s >= send) ||
568             ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
569            *len = -1;
570            return 0;
571        }
572    }
573
574    d = s = save;
575    while (s < send) {
576        if (UTF8_IS_ASCII(*s)) {
577            *d++ = *s++;
578        }
579        else {
580            STRLEN ulen;
581            *d++ = (U8)utf8_to_uv_simple(s, &ulen);
582            s += ulen;
583        }
584    }
585    *d = '\0';
586    *len = d - save;
587    return save;
588}
589
590/*
591=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
592
593Converts a string C<s> of length C<len> from UTF8 into byte encoding.
594Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
595the newly-created string, and updates C<len> to contain the new
596length.  Returns the original string if no conversion occurs, C<len>
597is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
5980 if C<s> is converted or contains all 7bit characters.
599
600=cut */
601
602U8 *
603Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
604{
605    U8 *send;
606    U8 *d;
607    U8 *start = s;
608    I32 count = 0;
609
610    if (!*is_utf8)
611        return start;
612
613    /* ensure valid UTF8 and chars < 256 before converting string */
614    for (send = s + *len; s < send;) {
615        U8 c = *s++;
616        if (!UTF8_IS_ASCII(c)) {
617            if (UTF8_IS_CONTINUATION(c) || s >= send ||
618                !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c))
619                return start;
620            s++, count++;
621        }
622    }
623
624    *is_utf8 = 0;               
625
626    if (!count)
627        return start;
628
629    Newz(801, d, (*len) - count + 1, U8);
630    s = start; start = d;
631    while (s < send) {
632        U8 c = *s++;
633
634        if (UTF8_IS_ASCII(c))
635            *d++ = c;
636        else
637            *d++ = UTF8_ACCUMULATE(c, *s++);
638    }
639    *d = '\0';
640    *len = d - start;
641    return start;
642}
643
644/*
645=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
646
647Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
648Returns a pointer to the newly-created string, and sets C<len> to
649reflect the new length.
650
651=cut
652*/
653
654U8*
655Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
656{
657    U8 *send;
658    U8 *d;
659    U8 *dst;
660    send = s + (*len);
661
662    Newz(801, d, (*len) * 2 + 1, U8);
663    dst = d;
664
665    while (s < send) {
666        if (UTF8_IS_ASCII(*s))
667            *d++ = *s++;
668        else {
669            UV uv = *s++;
670
671            *d++ = UTF8_EIGHT_BIT_HI(uv);
672            *d++ = UTF8_EIGHT_BIT_LO(uv);
673        }
674    }
675    *d = '\0';
676    *len = d-dst;
677    return dst;
678}
679
680/*
681 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
682 *
683 * Destination must be pre-extended to 3/2 source.  Do not use in-place.
684 * We optimize for native, for obvious reasons. */
685
686U8*
687Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
688{
689    U8* pend;
690    U8* dstart = d;
691
692    if (bytelen & 1)
693        Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
694
695    pend = p + bytelen;
696
697    while (p < pend) {
698        UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
699        p += 2;
700        if (uv < 0x80) {
701            *d++ = uv;
702            continue;
703        }
704        if (uv < 0x800) {
705            *d++ = (( uv >>  6)         | 0xc0);
706            *d++ = (( uv        & 0x3f) | 0x80);
707            continue;
708        }
709        if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
710            UV low = *p++;
711            if (low < 0xdc00 || low >= 0xdfff)
712                Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
713            uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
714        }
715        if (uv < 0x10000) {
716            *d++ = (( uv >> 12)         | 0xe0);
717            *d++ = (((uv >>  6) & 0x3f) | 0x80);
718            *d++ = (( uv        & 0x3f) | 0x80);
719            continue;
720        }
721        else {
722            *d++ = (( uv >> 18)         | 0xf0);
723            *d++ = (((uv >> 12) & 0x3f) | 0x80);
724            *d++ = (((uv >>  6) & 0x3f) | 0x80);
725            *d++ = (( uv        & 0x3f) | 0x80);
726            continue;
727        }
728    }
729    *newlen = d - dstart;
730    return d;
731}
732
733/* Note: this one is slightly destructive of the source. */
734
735U8*
736Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
737{
738    U8* s = (U8*)p;
739    U8* send = s + bytelen;
740    while (s < send) {
741        U8 tmp = s[0];
742        s[0] = s[1];
743        s[1] = tmp;
744        s += 2;
745    }
746    return utf16_to_utf8(p, d, bytelen, newlen);
747}
748
749/* for now these are all defined (inefficiently) in terms of the utf8 versions */
750
751bool
752Perl_is_uni_alnum(pTHX_ U32 c)
753{
754    U8 tmpbuf[UTF8_MAXLEN+1];
755    uv_to_utf8(tmpbuf, (UV)c);
756    return is_utf8_alnum(tmpbuf);
757}
758
759bool
760Perl_is_uni_alnumc(pTHX_ U32 c)
761{
762    U8 tmpbuf[UTF8_MAXLEN+1];
763    uv_to_utf8(tmpbuf, (UV)c);
764    return is_utf8_alnumc(tmpbuf);
765}
766
767bool
768Perl_is_uni_idfirst(pTHX_ U32 c)
769{
770    U8 tmpbuf[UTF8_MAXLEN+1];
771    uv_to_utf8(tmpbuf, (UV)c);
772    return is_utf8_idfirst(tmpbuf);
773}
774
775bool
776Perl_is_uni_alpha(pTHX_ U32 c)
777{
778    U8 tmpbuf[UTF8_MAXLEN+1];
779    uv_to_utf8(tmpbuf, (UV)c);
780    return is_utf8_alpha(tmpbuf);
781}
782
783bool
784Perl_is_uni_ascii(pTHX_ U32 c)
785{
786    U8 tmpbuf[UTF8_MAXLEN+1];
787    uv_to_utf8(tmpbuf, (UV)c);
788    return is_utf8_ascii(tmpbuf);
789}
790
791bool
792Perl_is_uni_space(pTHX_ U32 c)
793{
794    U8 tmpbuf[UTF8_MAXLEN+1];
795    uv_to_utf8(tmpbuf, (UV)c);
796    return is_utf8_space(tmpbuf);
797}
798
799bool
800Perl_is_uni_digit(pTHX_ U32 c)
801{
802    U8 tmpbuf[UTF8_MAXLEN+1];
803    uv_to_utf8(tmpbuf, (UV)c);
804    return is_utf8_digit(tmpbuf);
805}
806
807bool
808Perl_is_uni_upper(pTHX_ U32 c)
809{
810    U8 tmpbuf[UTF8_MAXLEN+1];
811    uv_to_utf8(tmpbuf, (UV)c);
812    return is_utf8_upper(tmpbuf);
813}
814
815bool
816Perl_is_uni_lower(pTHX_ U32 c)
817{
818    U8 tmpbuf[UTF8_MAXLEN+1];
819    uv_to_utf8(tmpbuf, (UV)c);
820    return is_utf8_lower(tmpbuf);
821}
822
823bool
824Perl_is_uni_cntrl(pTHX_ U32 c)
825{
826    U8 tmpbuf[UTF8_MAXLEN+1];
827    uv_to_utf8(tmpbuf, (UV)c);
828    return is_utf8_cntrl(tmpbuf);
829}
830
831bool
832Perl_is_uni_graph(pTHX_ U32 c)
833{
834    U8 tmpbuf[UTF8_MAXLEN+1];
835    uv_to_utf8(tmpbuf, (UV)c);
836    return is_utf8_graph(tmpbuf);
837}
838
839bool
840Perl_is_uni_print(pTHX_ U32 c)
841{
842    U8 tmpbuf[UTF8_MAXLEN+1];
843    uv_to_utf8(tmpbuf, (UV)c);
844    return is_utf8_print(tmpbuf);
845}
846
847bool
848Perl_is_uni_punct(pTHX_ U32 c)
849{
850    U8 tmpbuf[UTF8_MAXLEN+1];
851    uv_to_utf8(tmpbuf, (UV)c);
852    return is_utf8_punct(tmpbuf);
853}
854
855bool
856Perl_is_uni_xdigit(pTHX_ U32 c)
857{
858    U8 tmpbuf[UTF8_MAXLEN+1];
859    uv_to_utf8(tmpbuf, (UV)c);
860    return is_utf8_xdigit(tmpbuf);
861}
862
863U32
864Perl_to_uni_upper(pTHX_ U32 c)
865{
866    U8 tmpbuf[UTF8_MAXLEN+1];
867    uv_to_utf8(tmpbuf, (UV)c);
868    return to_utf8_upper(tmpbuf);
869}
870
871U32
872Perl_to_uni_title(pTHX_ U32 c)
873{
874    U8 tmpbuf[UTF8_MAXLEN+1];
875    uv_to_utf8(tmpbuf, (UV)c);
876    return to_utf8_title(tmpbuf);
877}
878
879U32
880Perl_to_uni_lower(pTHX_ U32 c)
881{
882    U8 tmpbuf[UTF8_MAXLEN+1];
883    uv_to_utf8(tmpbuf, (UV)c);
884    return to_utf8_lower(tmpbuf);
885}
886
887/* for now these all assume no locale info available for Unicode > 255 */
888
889bool
890Perl_is_uni_alnum_lc(pTHX_ U32 c)
891{
892    return is_uni_alnum(c);     /* XXX no locale support yet */
893}
894
895bool
896Perl_is_uni_alnumc_lc(pTHX_ U32 c)
897{
898    return is_uni_alnumc(c);    /* XXX no locale support yet */
899}
900
901bool
902Perl_is_uni_idfirst_lc(pTHX_ U32 c)
903{
904    return is_uni_idfirst(c);   /* XXX no locale support yet */
905}
906
907bool
908Perl_is_uni_alpha_lc(pTHX_ U32 c)
909{
910    return is_uni_alpha(c);     /* XXX no locale support yet */
911}
912
913bool
914Perl_is_uni_ascii_lc(pTHX_ U32 c)
915{
916    return is_uni_ascii(c);     /* XXX no locale support yet */
917}
918
919bool
920Perl_is_uni_space_lc(pTHX_ U32 c)
921{
922    return is_uni_space(c);     /* XXX no locale support yet */
923}
924
925bool
926Perl_is_uni_digit_lc(pTHX_ U32 c)
927{
928    return is_uni_digit(c);     /* XXX no locale support yet */
929}
930
931bool
932Perl_is_uni_upper_lc(pTHX_ U32 c)
933{
934    return is_uni_upper(c);     /* XXX no locale support yet */
935}
936
937bool
938Perl_is_uni_lower_lc(pTHX_ U32 c)
939{
940    return is_uni_lower(c);     /* XXX no locale support yet */
941}
942
943bool
944Perl_is_uni_cntrl_lc(pTHX_ U32 c)
945{
946    return is_uni_cntrl(c);     /* XXX no locale support yet */
947}
948
949bool
950Perl_is_uni_graph_lc(pTHX_ U32 c)
951{
952    return is_uni_graph(c);     /* XXX no locale support yet */
953}
954
955bool
956Perl_is_uni_print_lc(pTHX_ U32 c)
957{
958    return is_uni_print(c);     /* XXX no locale support yet */
959}
960
961bool
962Perl_is_uni_punct_lc(pTHX_ U32 c)
963{
964    return is_uni_punct(c);     /* XXX no locale support yet */
965}
966
967bool
968Perl_is_uni_xdigit_lc(pTHX_ U32 c)
969{
970    return is_uni_xdigit(c);    /* XXX no locale support yet */
971}
972
973U32
974Perl_to_uni_upper_lc(pTHX_ U32 c)
975{
976    return to_uni_upper(c);     /* XXX no locale support yet */
977}
978
979U32
980Perl_to_uni_title_lc(pTHX_ U32 c)
981{
982    return to_uni_title(c);     /* XXX no locale support yet */
983}
984
985U32
986Perl_to_uni_lower_lc(pTHX_ U32 c)
987{
988    return to_uni_lower(c);     /* XXX no locale support yet */
989}
990
991bool
992Perl_is_utf8_alnum(pTHX_ U8 *p)
993{
994    if (!is_utf8_char(p))
995        return FALSE;
996    if (!PL_utf8_alnum)
997        /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
998         * descendant of isalnum(3), in other words, it doesn't
999         * contain the '_'. --jhi */
1000        PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1001    return swash_fetch(PL_utf8_alnum, p);
1002/*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1003#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1004    if (!PL_utf8_alnum)
1005        PL_utf8_alnum = swash_init("utf8", "",
1006            sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1007    return swash_fetch(PL_utf8_alnum, p);
1008#endif
1009}
1010
1011bool
1012Perl_is_utf8_alnumc(pTHX_ U8 *p)
1013{
1014    if (!is_utf8_char(p))
1015        return FALSE;
1016    if (!PL_utf8_alnum)
1017        PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1018    return swash_fetch(PL_utf8_alnum, p);
1019/*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1020#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1021    if (!PL_utf8_alnum)
1022        PL_utf8_alnum = swash_init("utf8", "",
1023            sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1024    return swash_fetch(PL_utf8_alnum, p);
1025#endif
1026}
1027
1028bool
1029Perl_is_utf8_idfirst(pTHX_ U8 *p)
1030{
1031    return *p == '_' || is_utf8_alpha(p);
1032}
1033
1034bool
1035Perl_is_utf8_alpha(pTHX_ U8 *p)
1036{
1037    if (!is_utf8_char(p))
1038        return FALSE;
1039    if (!PL_utf8_alpha)
1040        PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1041    return swash_fetch(PL_utf8_alpha, p);
1042}
1043
1044bool
1045Perl_is_utf8_ascii(pTHX_ U8 *p)
1046{
1047    if (!is_utf8_char(p))
1048        return FALSE;
1049    if (!PL_utf8_ascii)
1050        PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1051    return swash_fetch(PL_utf8_ascii, p);
1052}
1053
1054bool
1055Perl_is_utf8_space(pTHX_ U8 *p)
1056{
1057    if (!is_utf8_char(p))
1058        return FALSE;
1059    if (!PL_utf8_space)
1060        PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1061    return swash_fetch(PL_utf8_space, p);
1062}
1063
1064bool
1065Perl_is_utf8_digit(pTHX_ U8 *p)
1066{
1067    if (!is_utf8_char(p))
1068        return FALSE;
1069    if (!PL_utf8_digit)
1070        PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1071    return swash_fetch(PL_utf8_digit, p);
1072}
1073
1074bool
1075Perl_is_utf8_upper(pTHX_ U8 *p)
1076{
1077    if (!is_utf8_char(p))
1078        return FALSE;
1079    if (!PL_utf8_upper)
1080        PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1081    return swash_fetch(PL_utf8_upper, p);
1082}
1083
1084bool
1085Perl_is_utf8_lower(pTHX_ U8 *p)
1086{
1087    if (!is_utf8_char(p))
1088        return FALSE;
1089    if (!PL_utf8_lower)
1090        PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1091    return swash_fetch(PL_utf8_lower, p);
1092}
1093
1094bool
1095Perl_is_utf8_cntrl(pTHX_ U8 *p)
1096{
1097    if (!is_utf8_char(p))
1098        return FALSE;
1099    if (!PL_utf8_cntrl)
1100        PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1101    return swash_fetch(PL_utf8_cntrl, p);
1102}
1103
1104bool
1105Perl_is_utf8_graph(pTHX_ U8 *p)
1106{
1107    if (!is_utf8_char(p))
1108        return FALSE;
1109    if (!PL_utf8_graph)
1110        PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1111    return swash_fetch(PL_utf8_graph, p);
1112}
1113
1114bool
1115Perl_is_utf8_print(pTHX_ U8 *p)
1116{
1117    if (!is_utf8_char(p))
1118        return FALSE;
1119    if (!PL_utf8_print)
1120        PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1121    return swash_fetch(PL_utf8_print, p);
1122}
1123
1124bool
1125Perl_is_utf8_punct(pTHX_ U8 *p)
1126{
1127    if (!is_utf8_char(p))
1128        return FALSE;
1129    if (!PL_utf8_punct)
1130        PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1131    return swash_fetch(PL_utf8_punct, p);
1132}
1133
1134bool
1135Perl_is_utf8_xdigit(pTHX_ U8 *p)
1136{
1137    if (!is_utf8_char(p))
1138        return FALSE;
1139    if (!PL_utf8_xdigit)
1140        PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1141    return swash_fetch(PL_utf8_xdigit, p);
1142}
1143
1144bool
1145Perl_is_utf8_mark(pTHX_ U8 *p)
1146{
1147    if (!is_utf8_char(p))
1148        return FALSE;
1149    if (!PL_utf8_mark)
1150        PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1151    return swash_fetch(PL_utf8_mark, p);
1152}
1153
1154UV
1155Perl_to_utf8_upper(pTHX_ U8 *p)
1156{
1157    UV uv;
1158
1159    if (!PL_utf8_toupper)
1160        PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1161    uv = swash_fetch(PL_utf8_toupper, p);
1162    return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
1163}
1164
1165UV
1166Perl_to_utf8_title(pTHX_ U8 *p)
1167{
1168    UV uv;
1169
1170    if (!PL_utf8_totitle)
1171        PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1172    uv = swash_fetch(PL_utf8_totitle, p);
1173    return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
1174}
1175
1176UV
1177Perl_to_utf8_lower(pTHX_ U8 *p)
1178{
1179    UV uv;
1180
1181    if (!PL_utf8_tolower)
1182        PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1183    uv = swash_fetch(PL_utf8_tolower, p);
1184    return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
1185}
1186
1187/* a "swash" is a swatch hash */
1188
1189SV*
1190Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1191{
1192    SV* retval;
1193    SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1194    dSP;
1195
1196    if (!gv_stashpv(pkg, 0)) {  /* demand load utf8 */
1197        ENTER;
1198        Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1199        LEAVE;
1200    }
1201    SPAGAIN;
1202    PUSHSTACKi(PERLSI_MAGIC);
1203    PUSHMARK(SP);
1204    EXTEND(SP,5);
1205    PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1206    PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1207    PUSHs(listsv);
1208    PUSHs(sv_2mortal(newSViv(minbits)));
1209    PUSHs(sv_2mortal(newSViv(none)));
1210    PUTBACK;
1211    ENTER;
1212    SAVEI32(PL_hints);
1213    PL_hints = 0;
1214    save_re_context();
1215    if (PL_curcop == &PL_compiling)
1216        /* XXX ought to be handled by lex_start */
1217        sv_setpv(tokenbufsv, PL_tokenbuf);
1218    if (call_method("SWASHNEW", G_SCALAR))
1219        retval = newSVsv(*PL_stack_sp--);
1220    else
1221        retval = &PL_sv_undef;
1222    LEAVE;
1223    POPSTACK;
1224    if (PL_curcop == &PL_compiling) {
1225        STRLEN len;
1226        char* pv = SvPV(tokenbufsv, len);
1227
1228        Copy(pv, PL_tokenbuf, len+1, char);
1229        PL_curcop->op_private = PL_hints;
1230    }
1231    if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1232        Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1233    return retval;
1234}
1235
1236UV
1237Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1238{
1239    HV* hv = (HV*)SvRV(sv);
1240    U32 klen = UTF8SKIP(ptr) - 1;
1241    U32 off = ptr[klen] & 127;  /* NB: 64 bit always 0 when len > 1 */
1242    STRLEN slen;
1243    STRLEN needents = (klen ? 64 : 128);
1244    U8 *tmps;
1245    U32 bit;
1246    SV *retval;
1247
1248    /*
1249     * This single-entry cache saves about 1/3 of the utf8 overhead in test
1250     * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1251     * it's nothing to sniff at.)  Pity we usually come through at least
1252     * two function calls to get here...
1253     *
1254     * NB: this code assumes that swatches are never modified, once generated!
1255     */
1256
1257    if (hv == PL_last_swash_hv &&
1258        klen == PL_last_swash_klen &&
1259        (!klen || memEQ(ptr,PL_last_swash_key,klen)) )
1260    {
1261        tmps = PL_last_swash_tmps;
1262        slen = PL_last_swash_slen;
1263    }
1264    else {
1265        /* Try our second-level swatch cache, kept in a hash. */
1266        SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1267
1268        /* If not cached, generate it via utf8::SWASHGET */
1269        if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1270            dSP;
1271            ENTER;
1272            SAVETMPS;
1273            save_re_context();
1274            PUSHSTACKi(PERLSI_MAGIC);
1275            PUSHMARK(SP);
1276            EXTEND(SP,3);
1277            PUSHs((SV*)sv);
1278            PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, UTF8_MAXLEN, 0, 0) & ~(needents - 1))));
1279            PUSHs(sv_2mortal(newSViv(needents)));
1280            PUTBACK;
1281            if (call_method("SWASHGET", G_SCALAR))
1282                retval = newSVsv(*PL_stack_sp--);
1283            else
1284                retval = &PL_sv_undef;
1285            POPSTACK;
1286            FREETMPS;
1287            LEAVE;
1288            if (PL_curcop == &PL_compiling)
1289                PL_curcop->op_private = PL_hints;
1290
1291            svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1292
1293            if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
1294                Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1295        }
1296
1297        PL_last_swash_hv = hv;
1298        PL_last_swash_klen = klen;
1299        PL_last_swash_tmps = tmps;
1300        PL_last_swash_slen = slen;
1301        if (klen)
1302            Copy(ptr, PL_last_swash_key, klen, U8);
1303    }
1304
1305    switch ((int)((slen << 3) / needents)) {
1306    case 1:
1307        bit = 1 << (off & 7);
1308        off >>= 3;
1309        return (tmps[off] & bit) != 0;
1310    case 8:
1311        return tmps[off];
1312    case 16:
1313        off <<= 1;
1314        return (tmps[off] << 8) + tmps[off + 1] ;
1315    case 32:
1316        off <<= 2;
1317        return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1318    }
1319    Perl_croak(aTHX_ "panic: swash_fetch");
1320    return 0;
1321}
Note: See TracBrowser for help on using the repository browser.