Context Navigation

source: trunk/third/perl/utf8.c @ 17035

Visit:

Revision 17035, 29.3 KB checked in by zacheiss, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r17034, which included commits to RCS files with non-trunk default branches.

Line
1	/* utf8.c
2	*
3	* Copyright (c) 1998-2001, Larry Wall
4	*
5	* You may distribute under the terms of either the GNU General Public
6	* License or the Artistic License, as specified in the README file.
7	*
8	*/
9
10	/*
11	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12	* heard of that we don't want to see any closer; and that's the one place
13	* we're trying to get to! And that's just where we can't get, nohow.'
14	*
15	* 'Well do I understand your speech,' he answered in the same language;
16	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17	* as is the custom in the West, if you wish to be answered?'
18	*
19	* ...the travellers perceived that the floor was paved with stones of many
20	* hues; branching runes and strange devices intertwined beneath their feet.
21	*/
22
23	#include "EXTERN.h"
24	#define PERL_IN_UTF8_C
25	#include "perl.h"
26
27	/* Unicode support */
28
29	/*
30	=for apidoc A\|U8\|uv_to_utf8\|U8 d\|UV uv
31
32	Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34	bytes available. The return value is the pointer to the byte after the
35	end of the new character. In other words,
36
37	d = uv_to_utf8(d, uv);
38
39	is the recommended Unicode-aware way of saying
40
41	*(d++) = uv;
42
43	=cut
44	*/
45
46	U8 *
47	Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
48	{
49	if (uv < 0x80) {
50	*d++ = uv;
51	return d;
52	}
53	if (uv < 0x800) {
54	*d++ = (( uv >> 6) \| 0xc0);
55	*d++ = (( uv & 0x3f) \| 0x80);
56	return d;
57	}
58	if (uv < 0x10000) {
59	*d++ = (( uv >> 12) \| 0xe0);
60	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
61	*d++ = (( uv & 0x3f) \| 0x80);
62	return d;
63	}
64	if (uv < 0x200000) {
65	*d++ = (( uv >> 18) \| 0xf0);
66	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
67	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
68	*d++ = (( uv & 0x3f) \| 0x80);
69	return d;
70	}
71	if (uv < 0x4000000) {
72	*d++ = (( uv >> 24) \| 0xf8);
73	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
74	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
75	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
76	*d++ = (( uv & 0x3f) \| 0x80);
77	return d;
78	}
79	if (uv < 0x80000000) {
80	*d++ = (( uv >> 30) \| 0xfc);
81	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
82	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
83	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
84	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
85	*d++ = (( uv & 0x3f) \| 0x80);
86	return d;
87	}
88	#ifdef HAS_QUAD
89	if (uv < UTF8_QUAD_MAX)
90	#endif
91	{
92	d++ = 0xfe; / Can't match U+FEFF! */
93	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
94	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
95	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
96	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
97	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
98	*d++ = (( uv & 0x3f) \| 0x80);
99	return d;
100	}
101	#ifdef HAS_QUAD
102	{
103	d++ = 0xff; / Can't match U+FFFE! */
104	d++ = 0x80; / 6 Reserved bits */
105	d++ = (((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
106	*d++ = (((uv >> 54) & 0x3f) \| 0x80);
107	*d++ = (((uv >> 48) & 0x3f) \| 0x80);
108	*d++ = (((uv >> 42) & 0x3f) \| 0x80);
109	*d++ = (((uv >> 36) & 0x3f) \| 0x80);
110	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
111	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
112	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
113	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
114	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
115	*d++ = (( uv & 0x3f) \| 0x80);
116	return d;
117	}
118	#endif
119	}
120
121	/*
122	=for apidoc A\|STRLEN\|is_utf8_char\|U8 *s
123
124	Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
125	The actual number of bytes in the UTF-8 character will be returned if it
126	is valid, otherwise 0.
127
128	=cut
129	*/
130	STRLEN
131	Perl_is_utf8_char(pTHX_ U8 *s)
132	{
133	U8 u = *s;
134	STRLEN slen, len;
135	UV uv, ouv;
136
137	if (UTF8_IS_ASCII(u))
138	return 1;
139
140	if (!UTF8_IS_START(u))
141	return 0;
142
143	len = UTF8SKIP(s);
144
145	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
146	return 0;
147
148	slen = len - 1;
149	s++;
150	uv = u;
151	ouv = uv;
152	while (slen--) {
153	if (!UTF8_IS_CONTINUATION(*s))
154	return 0;
155	uv = UTF8_ACCUMULATE(uv, *s);
156	if (uv < ouv)
157	return 0;
158	ouv = uv;
159	s++;
160	}
161
162	if (UNISKIP(uv) < len)
163	return 0;
164
165	return len;
166	}
167
168	/*
169	=for apidoc A\|bool\|is_utf8_string\|U8 *s\|STRLEN len
170
171	Returns true if first C<len> bytes of the given string form valid a UTF8
172	string, false otherwise.
173
174	=cut
175	*/
176
177	bool
178	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
179	{
180	U8* x = s;
181	U8* send;
182	STRLEN c;
183
184	if (!len)
185	len = strlen((char *)s);
186	send = s + len;
187
188	while (x < send) {
189	c = is_utf8_char(x);
190	if (!c)
191	return FALSE;
192	x += c;
193	}
194	if (x != send)
195	return FALSE;
196
197	return TRUE;
198	}
199
200	/*
201	=for apidoc A\|U8* s\|utf8_to_uv\|STRLEN curlen\|STRLEN *retlen\|U32 flags
202
203	Returns the character value of the first character in the string C<s>
204	which is assumed to be in UTF8 encoding and no longer than C<curlen>;
205	C<retlen> will be set to the length, in bytes, of that character.
206
207	If C<s> does not point to a well-formed UTF8 character, the behaviour
208	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
209	it is assumed that the caller will raise a warning, and this function
210	will silently just set C<retlen> to C<-1> and return zero. If the
211	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
212	malformations will be given, C<retlen> will be set to the expected
213	length of the UTF-8 character in bytes, and zero will be returned.
214
215	The C<flags> can also contain various flags to allow deviations from
216	the strict UTF-8 encoding (see F<utf8.h>).
217
218	=cut */
219
220	UV
221	Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
222	{
223	UV uv = *s, ouv;
224	STRLEN len = 1;
225	#ifdef EBCDIC
226	bool dowarn = 0;
227	#else
228	bool dowarn = ckWARN_d(WARN_UTF8);
229	#endif
230	STRLEN expectlen = 0;
231	U32 warning = 0;
232
233	/* This list is a superset of the UTF8_ALLOW_XXX. */
234
235	#define UTF8_WARN_EMPTY 1
236	#define UTF8_WARN_CONTINUATION 2
237	#define UTF8_WARN_NON_CONTINUATION 3
238	#define UTF8_WARN_FE_FF 4
239	#define UTF8_WARN_SHORT 5
240	#define UTF8_WARN_OVERFLOW 6
241	#define UTF8_WARN_SURROGATE 7
242	#define UTF8_WARN_BOM 8
243	#define UTF8_WARN_LONG 9
244	#define UTF8_WARN_FFFF 10
245
246	if (curlen == 0 &&
247	!(flags & UTF8_ALLOW_EMPTY)) {
248	warning = UTF8_WARN_EMPTY;
249	goto malformed;
250	}
251
252	if (UTF8_IS_ASCII(uv)) {
253	if (retlen)
254	*retlen = 1;
255	return *s;
256	}
257
258	if (UTF8_IS_CONTINUATION(uv) &&
259	!(flags & UTF8_ALLOW_CONTINUATION)) {
260	warning = UTF8_WARN_CONTINUATION;
261	goto malformed;
262	}
263
264	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
265	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
266	warning = UTF8_WARN_NON_CONTINUATION;
267	goto malformed;
268	}
269
270	if ((uv == 0xfe \|\| uv == 0xff) &&
271	!(flags & UTF8_ALLOW_FE_FF)) {
272	warning = UTF8_WARN_FE_FF;
273	goto malformed;
274	}
275
276	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
277	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
278	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
279	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
280	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
281	else if (!(uv & 0x01)) { len = 7; uv = 0; }
282	else { len = 13; uv = 0; } /* whoa! */
283
284	if (retlen)
285	*retlen = len;
286
287	expectlen = len;
288
289	if ((curlen < expectlen) &&
290	!(flags & UTF8_ALLOW_SHORT)) {
291	warning = UTF8_WARN_SHORT;
292	goto malformed;
293	}
294
295	len--;
296	s++;
297	ouv = uv;
298
299	while (len--) {
300	if (!UTF8_IS_CONTINUATION(*s) &&
301	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
302	s--;
303	warning = UTF8_WARN_NON_CONTINUATION;
304	goto malformed;
305	}
306	else
307	uv = UTF8_ACCUMULATE(uv, *s);
308	if (!(uv > ouv)) {
309	/* These cannot be allowed. */
310	if (uv == ouv) {
311	if (!(flags & UTF8_ALLOW_LONG)) {
312	warning = UTF8_WARN_LONG;
313	goto malformed;
314	}
315	}
316	else { /* uv < ouv */
317	/* This cannot be allowed. */
318	warning = UTF8_WARN_OVERFLOW;
319	goto malformed;
320	}
321	}
322	s++;
323	ouv = uv;
324	}
325
326	if (UNICODE_IS_SURROGATE(uv) &&
327	!(flags & UTF8_ALLOW_SURROGATE)) {
328	warning = UTF8_WARN_SURROGATE;
329	goto malformed;
330	} else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
331	!(flags & UTF8_ALLOW_BOM)) {
332	warning = UTF8_WARN_BOM;
333	goto malformed;
334	} else if ((expectlen > UNISKIP(uv)) &&
335	!(flags & UTF8_ALLOW_LONG)) {
336	warning = UTF8_WARN_LONG;
337	goto malformed;
338	} else if (UNICODE_IS_ILLEGAL(uv) &&
339	!(flags & UTF8_ALLOW_FFFF)) {
340	warning = UTF8_WARN_FFFF;
341	goto malformed;
342	}
343
344	return uv;
345
346	malformed:
347
348	if (flags & UTF8_CHECK_ONLY) {
349	if (retlen)
350	*retlen = -1;
351	return 0;
352	}
353
354	if (dowarn) {
355	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
356
357	switch (warning) {
358	case 0: /* Intentionally empty. */ break;
359	case UTF8_WARN_EMPTY:
360	Perl_sv_catpvf(aTHX_ sv, "(empty string)");
361	break;
362	case UTF8_WARN_CONTINUATION:
363	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
364	break;
365	case UTF8_WARN_NON_CONTINUATION:
366	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
367	(UV)s[1], uv);
368	break;
369	case UTF8_WARN_FE_FF:
370	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
371	break;
372	case UTF8_WARN_SHORT:
373	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
374	curlen, curlen == 1 ? "" : "s", expectlen);
375	break;
376	case UTF8_WARN_OVERFLOW:
377	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
378	ouv, *s);
379	break;
380	case UTF8_WARN_SURROGATE:
381	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
382	break;
383	case UTF8_WARN_BOM:
384	Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
385	break;
386	case UTF8_WARN_LONG:
387	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
388	expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
389	break;
390	case UTF8_WARN_FFFF:
391	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
392	break;
393	default:
394	Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
395	break;
396	}
397
398	if (warning) {
399	char *s = SvPVX(sv);
400
401	if (PL_op)
402	Perl_warner(aTHX_ WARN_UTF8,
403	"%s in %s", s, PL_op_desc[PL_op->op_type]);
404	else
405	Perl_warner(aTHX_ WARN_UTF8, "%s", s);
406	}
407	}
408
409	if (retlen)
410	*retlen = expectlen ? expectlen : len;
411
412	return 0;
413	}
414
415	/*
416	=for apidoc A\|U8* s\|utf8_to_uv_simple\|STRLEN *retlen
417
418	Returns the character value of the first character in the string C<s>
419	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
420	length, in bytes, of that character.
421
422	If C<s> does not point to a well-formed UTF8 character, zero is
423	returned and retlen is set, if possible, to -1.
424
425	=cut
426	*/
427
428	UV
429	Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen)
430	{
431	return Perl_utf8_to_uv(aTHX_ s, UTF8_MAXLEN, retlen, 0);
432	}
433
434	/*
435	=for apidoc A\|STRLEN\|utf8_length\|U8* s\|U8 *e
436
437	Return the length of the UTF-8 char encoded string C<s> in characters.
438	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
439	up past C<e>, croaks.
440
441	=cut
442	*/
443
444	STRLEN
445	Perl_utf8_length(pTHX_ U8* s, U8* e)
446	{
447	STRLEN len = 0;
448
449	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
450	* the bitops (especially ~) can create illegal UTF-8.
451	* In other words: in Perl UTF-8 is not just for Unicode. */
452
453	if (e < s)
454	Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
455	while (s < e) {
456	U8 t = UTF8SKIP(s);
457
458	if (e - s < t)
459	Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
460	s += t;
461	len++;
462	}
463
464	return len;
465	}
466
467	/*
468	=for apidoc A\|IV\|utf8_distance\|U8 a\|U8 b
469
470	Returns the number of UTF8 characters between the UTF-8 pointers C<a>
471	and C<b>.
472
473	WARNING: use only if you know that the pointers point inside the
474	same UTF-8 buffer.
475
476	=cut */
477
478	IV
479	Perl_utf8_distance(pTHX_ U8 a, U8 b)
480	{
481	IV off = 0;
482
483	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
484	* the bitops (especially ~) can create illegal UTF-8.
485	* In other words: in Perl UTF-8 is not just for Unicode. */
486
487	if (a < b) {
488	while (a < b) {
489	U8 c = UTF8SKIP(a);
490
491	if (b - a < c)
492	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
493	a += c;
494	off--;
495	}
496	}
497	else {
498	while (b < a) {
499	U8 c = UTF8SKIP(b);
500
501	if (a - b < c)
502	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
503	b += c;
504	off++;
505	}
506	}
507
508	return off;
509	}
510
511	/*
512	=for apidoc A\|U8\|utf8_hop\|U8 s\|I32 off
513
514	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
515	forward or backward.
516
517	WARNING: do not use the following unless you know C<off> is within
518	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
519	on the first byte of character or just after the last byte of a character.
520
521	=cut */
522
523	U8 *
524	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
525	{
526	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
527	* the bitops (especially ~) can create illegal UTF-8.
528	* In other words: in Perl UTF-8 is not just for Unicode. */
529
530	if (off >= 0) {
531	while (off--)
532	s += UTF8SKIP(s);
533	}
534	else {
535	while (off++) {
536	s--;
537	while (UTF8_IS_CONTINUATION(*s))
538	s--;
539	}
540	}
541	return s;
542	}
543
544	/*
545	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
546
547	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
548	Unlike C<bytes_to_utf8>, this over-writes the original string, and
549	updates len to contain the new length.
550	Returns zero on failure, setting C<len> to -1.
551
552	=cut
553	*/
554
555	U8 *
556	Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
557	{
558	U8 *send;
559	U8 *d;
560	U8 *save = s;
561
562	/* ensure valid UTF8 and chars < 256 before updating string */
563	for (send = s + *len; s < send; ) {
564	U8 c = *s++;
565
566	if (c >= 0x80 &&
567	((s >= send) \|\|
568	((*s++ & 0xc0) != 0x80) \|\| ((c & 0xfe) != 0xc2))) {
569	*len = -1;
570	return 0;
571	}
572	}
573
574	d = s = save;
575	while (s < send) {
576	if (UTF8_IS_ASCII(*s)) {
577	d++ = s++;
578	}
579	else {
580	STRLEN ulen;
581	*d++ = (U8)utf8_to_uv_simple(s, &ulen);
582	s += ulen;
583	}
584	}
585	*d = '\0';
586	*len = d - save;
587	return save;
588	}
589
590	/*
591	=for apidoc A\|U8 \|bytes_from_utf8\|U8 s\|STRLEN len\|bool is_utf8
592
593	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
594	Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
595	the newly-created string, and updates C<len> to contain the new
596	length. Returns the original string if no conversion occurs, C<len>
597	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
598	0 if C<s> is converted or contains all 7bit characters.
599
600	=cut */
601
602	U8 *
603	Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN len, bool is_utf8)
604	{
605	U8 *send;
606	U8 *d;
607	U8 *start = s;
608	I32 count = 0;
609
610	if (!*is_utf8)
611	return start;
612
613	/* ensure valid UTF8 and chars < 256 before converting string */
614	for (send = s + *len; s < send;) {
615	U8 c = *s++;
616	if (!UTF8_IS_ASCII(c)) {
617	if (UTF8_IS_CONTINUATION(c) \|\| s >= send \|\|
618	!UTF8_IS_CONTINUATION(*s) \|\| UTF8_IS_DOWNGRADEABLE_START(c))
619	return start;
620	s++, count++;
621	}
622	}
623
624	*is_utf8 = 0;
625
626	if (!count)
627	return start;
628
629	Newz(801, d, (*len) - count + 1, U8);
630	s = start; start = d;
631	while (s < send) {
632	U8 c = *s++;
633
634	if (UTF8_IS_ASCII(c))
635	*d++ = c;
636	else
637	d++ = UTF8_ACCUMULATE(c, s++);
638	}
639	*d = '\0';
640	*len = d - start;
641	return start;
642	}
643
644	/*
645	=for apidoc A\|U8 \|bytes_to_utf8\|U8 s\|STRLEN *len
646
647	Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
648	Returns a pointer to the newly-created string, and sets C<len> to
649	reflect the new length.
650
651	=cut
652	*/
653
654	U8*
655	Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
656	{
657	U8 *send;
658	U8 *d;
659	U8 *dst;
660	send = s + (*len);
661
662	Newz(801, d, (len) 2 + 1, U8);
663	dst = d;
664
665	while (s < send) {
666	if (UTF8_IS_ASCII(*s))
667	d++ = s++;
668	else {
669	UV uv = *s++;
670
671	*d++ = UTF8_EIGHT_BIT_HI(uv);
672	*d++ = UTF8_EIGHT_BIT_LO(uv);
673	}
674	}
675	*d = '\0';
676	*len = d-dst;
677	return dst;
678	}
679
680	/*
681	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
682	*
683	* Destination must be pre-extended to 3/2 source. Do not use in-place.
684	* We optimize for native, for obvious reasons. */
685
686	U8*
687	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
688	{
689	U8* pend;
690	U8* dstart = d;
691
692	if (bytelen & 1)
693	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
694
695	pend = p + bytelen;
696
697	while (p < pend) {
698	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
699	p += 2;
700	if (uv < 0x80) {
701	*d++ = uv;
702	continue;
703	}
704	if (uv < 0x800) {
705	*d++ = (( uv >> 6) \| 0xc0);
706	*d++ = (( uv & 0x3f) \| 0x80);
707	continue;
708	}
709	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
710	UV low = *p++;
711	if (low < 0xdc00 \|\| low >= 0xdfff)
712	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
713	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
714	}
715	if (uv < 0x10000) {
716	*d++ = (( uv >> 12) \| 0xe0);
717	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
718	*d++ = (( uv & 0x3f) \| 0x80);
719	continue;
720	}
721	else {
722	*d++ = (( uv >> 18) \| 0xf0);
723	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
724	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
725	*d++ = (( uv & 0x3f) \| 0x80);
726	continue;
727	}
728	}
729	*newlen = d - dstart;
730	return d;
731	}
732
733	/* Note: this one is slightly destructive of the source. */
734
735	U8*
736	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
737	{
738	U8* s = (U8*)p;
739	U8* send = s + bytelen;
740	while (s < send) {
741	U8 tmp = s[0];
742	s[0] = s[1];
743	s[1] = tmp;
744	s += 2;
745	}
746	return utf16_to_utf8(p, d, bytelen, newlen);
747	}
748
749	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
750
751	bool
752	Perl_is_uni_alnum(pTHX_ U32 c)
753	{
754	U8 tmpbuf[UTF8_MAXLEN+1];
755	uv_to_utf8(tmpbuf, (UV)c);
756	return is_utf8_alnum(tmpbuf);
757	}
758
759	bool
760	Perl_is_uni_alnumc(pTHX_ U32 c)
761	{
762	U8 tmpbuf[UTF8_MAXLEN+1];
763	uv_to_utf8(tmpbuf, (UV)c);
764	return is_utf8_alnumc(tmpbuf);
765	}
766
767	bool
768	Perl_is_uni_idfirst(pTHX_ U32 c)
769	{
770	U8 tmpbuf[UTF8_MAXLEN+1];
771	uv_to_utf8(tmpbuf, (UV)c);
772	return is_utf8_idfirst(tmpbuf);
773	}
774
775	bool
776	Perl_is_uni_alpha(pTHX_ U32 c)
777	{
778	U8 tmpbuf[UTF8_MAXLEN+1];
779	uv_to_utf8(tmpbuf, (UV)c);
780	return is_utf8_alpha(tmpbuf);
781	}
782
783	bool
784	Perl_is_uni_ascii(pTHX_ U32 c)
785	{
786	U8 tmpbuf[UTF8_MAXLEN+1];
787	uv_to_utf8(tmpbuf, (UV)c);
788	return is_utf8_ascii(tmpbuf);
789	}
790
791	bool
792	Perl_is_uni_space(pTHX_ U32 c)
793	{
794	U8 tmpbuf[UTF8_MAXLEN+1];
795	uv_to_utf8(tmpbuf, (UV)c);
796	return is_utf8_space(tmpbuf);
797	}
798
799	bool
800	Perl_is_uni_digit(pTHX_ U32 c)
801	{
802	U8 tmpbuf[UTF8_MAXLEN+1];
803	uv_to_utf8(tmpbuf, (UV)c);
804	return is_utf8_digit(tmpbuf);
805	}
806
807	bool
808	Perl_is_uni_upper(pTHX_ U32 c)
809	{
810	U8 tmpbuf[UTF8_MAXLEN+1];
811	uv_to_utf8(tmpbuf, (UV)c);
812	return is_utf8_upper(tmpbuf);
813	}
814
815	bool
816	Perl_is_uni_lower(pTHX_ U32 c)
817	{
818	U8 tmpbuf[UTF8_MAXLEN+1];
819	uv_to_utf8(tmpbuf, (UV)c);
820	return is_utf8_lower(tmpbuf);
821	}
822
823	bool
824	Perl_is_uni_cntrl(pTHX_ U32 c)
825	{
826	U8 tmpbuf[UTF8_MAXLEN+1];
827	uv_to_utf8(tmpbuf, (UV)c);
828	return is_utf8_cntrl(tmpbuf);
829	}
830
831	bool
832	Perl_is_uni_graph(pTHX_ U32 c)
833	{
834	U8 tmpbuf[UTF8_MAXLEN+1];
835	uv_to_utf8(tmpbuf, (UV)c);
836	return is_utf8_graph(tmpbuf);
837	}
838
839	bool
840	Perl_is_uni_print(pTHX_ U32 c)
841	{
842	U8 tmpbuf[UTF8_MAXLEN+1];
843	uv_to_utf8(tmpbuf, (UV)c);
844	return is_utf8_print(tmpbuf);
845	}
846
847	bool
848	Perl_is_uni_punct(pTHX_ U32 c)
849	{
850	U8 tmpbuf[UTF8_MAXLEN+1];
851	uv_to_utf8(tmpbuf, (UV)c);
852	return is_utf8_punct(tmpbuf);
853	}
854
855	bool
856	Perl_is_uni_xdigit(pTHX_ U32 c)
857	{
858	U8 tmpbuf[UTF8_MAXLEN+1];
859	uv_to_utf8(tmpbuf, (UV)c);
860	return is_utf8_xdigit(tmpbuf);
861	}
862
863	U32
864	Perl_to_uni_upper(pTHX_ U32 c)
865	{
866	U8 tmpbuf[UTF8_MAXLEN+1];
867	uv_to_utf8(tmpbuf, (UV)c);
868	return to_utf8_upper(tmpbuf);
869	}
870
871	U32
872	Perl_to_uni_title(pTHX_ U32 c)
873	{
874	U8 tmpbuf[UTF8_MAXLEN+1];
875	uv_to_utf8(tmpbuf, (UV)c);
876	return to_utf8_title(tmpbuf);
877	}
878
879	U32
880	Perl_to_uni_lower(pTHX_ U32 c)
881	{
882	U8 tmpbuf[UTF8_MAXLEN+1];
883	uv_to_utf8(tmpbuf, (UV)c);
884	return to_utf8_lower(tmpbuf);
885	}
886
887	/* for now these all assume no locale info available for Unicode > 255 */
888
889	bool
890	Perl_is_uni_alnum_lc(pTHX_ U32 c)
891	{
892	return is_uni_alnum(c); /* XXX no locale support yet */
893	}
894
895	bool
896	Perl_is_uni_alnumc_lc(pTHX_ U32 c)
897	{
898	return is_uni_alnumc(c); /* XXX no locale support yet */
899	}
900
901	bool
902	Perl_is_uni_idfirst_lc(pTHX_ U32 c)
903	{
904	return is_uni_idfirst(c); /* XXX no locale support yet */
905	}
906
907	bool
908	Perl_is_uni_alpha_lc(pTHX_ U32 c)
909	{
910	return is_uni_alpha(c); /* XXX no locale support yet */
911	}
912
913	bool
914	Perl_is_uni_ascii_lc(pTHX_ U32 c)
915	{
916	return is_uni_ascii(c); /* XXX no locale support yet */
917	}
918
919	bool
920	Perl_is_uni_space_lc(pTHX_ U32 c)
921	{
922	return is_uni_space(c); /* XXX no locale support yet */
923	}
924
925	bool
926	Perl_is_uni_digit_lc(pTHX_ U32 c)
927	{
928	return is_uni_digit(c); /* XXX no locale support yet */
929	}
930
931	bool
932	Perl_is_uni_upper_lc(pTHX_ U32 c)
933	{
934	return is_uni_upper(c); /* XXX no locale support yet */
935	}
936
937	bool
938	Perl_is_uni_lower_lc(pTHX_ U32 c)
939	{
940	return is_uni_lower(c); /* XXX no locale support yet */
941	}
942
943	bool
944	Perl_is_uni_cntrl_lc(pTHX_ U32 c)
945	{
946	return is_uni_cntrl(c); /* XXX no locale support yet */
947	}
948
949	bool
950	Perl_is_uni_graph_lc(pTHX_ U32 c)
951	{
952	return is_uni_graph(c); /* XXX no locale support yet */
953	}
954
955	bool
956	Perl_is_uni_print_lc(pTHX_ U32 c)
957	{
958	return is_uni_print(c); /* XXX no locale support yet */
959	}
960
961	bool
962	Perl_is_uni_punct_lc(pTHX_ U32 c)
963	{
964	return is_uni_punct(c); /* XXX no locale support yet */
965	}
966
967	bool
968	Perl_is_uni_xdigit_lc(pTHX_ U32 c)
969	{
970	return is_uni_xdigit(c); /* XXX no locale support yet */
971	}
972
973	U32
974	Perl_to_uni_upper_lc(pTHX_ U32 c)
975	{
976	return to_uni_upper(c); /* XXX no locale support yet */
977	}
978
979	U32
980	Perl_to_uni_title_lc(pTHX_ U32 c)
981	{
982	return to_uni_title(c); /* XXX no locale support yet */
983	}
984
985	U32
986	Perl_to_uni_lower_lc(pTHX_ U32 c)
987	{
988	return to_uni_lower(c); /* XXX no locale support yet */
989	}
990
991	bool
992	Perl_is_utf8_alnum(pTHX_ U8 *p)
993	{
994	if (!is_utf8_char(p))
995	return FALSE;
996	if (!PL_utf8_alnum)
997	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
998	* descendant of isalnum(3), in other words, it doesn't
999	* contain the '_'. --jhi */
1000	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1001	return swash_fetch(PL_utf8_alnum, p);
1002	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
1003	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1004	if (!PL_utf8_alnum)
1005	PL_utf8_alnum = swash_init("utf8", "",
1006	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1007	return swash_fetch(PL_utf8_alnum, p);
1008	#endif
1009	}
1010
1011	bool
1012	Perl_is_utf8_alnumc(pTHX_ U8 *p)
1013	{
1014	if (!is_utf8_char(p))
1015	return FALSE;
1016	if (!PL_utf8_alnum)
1017	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1018	return swash_fetch(PL_utf8_alnum, p);
1019	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
1020	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1021	if (!PL_utf8_alnum)
1022	PL_utf8_alnum = swash_init("utf8", "",
1023	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1024	return swash_fetch(PL_utf8_alnum, p);
1025	#endif
1026	}
1027
1028	bool
1029	Perl_is_utf8_idfirst(pTHX_ U8 *p)
1030	{
1031	return *p == '_' \|\| is_utf8_alpha(p);
1032	}
1033
1034	bool
1035	Perl_is_utf8_alpha(pTHX_ U8 *p)
1036	{
1037	if (!is_utf8_char(p))
1038	return FALSE;
1039	if (!PL_utf8_alpha)
1040	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1041	return swash_fetch(PL_utf8_alpha, p);
1042	}
1043
1044	bool
1045	Perl_is_utf8_ascii(pTHX_ U8 *p)
1046	{
1047	if (!is_utf8_char(p))
1048	return FALSE;
1049	if (!PL_utf8_ascii)
1050	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1051	return swash_fetch(PL_utf8_ascii, p);
1052	}
1053
1054	bool
1055	Perl_is_utf8_space(pTHX_ U8 *p)
1056	{
1057	if (!is_utf8_char(p))
1058	return FALSE;
1059	if (!PL_utf8_space)
1060	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1061	return swash_fetch(PL_utf8_space, p);
1062	}
1063
1064	bool
1065	Perl_is_utf8_digit(pTHX_ U8 *p)
1066	{
1067	if (!is_utf8_char(p))
1068	return FALSE;
1069	if (!PL_utf8_digit)
1070	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1071	return swash_fetch(PL_utf8_digit, p);
1072	}
1073
1074	bool
1075	Perl_is_utf8_upper(pTHX_ U8 *p)
1076	{
1077	if (!is_utf8_char(p))
1078	return FALSE;
1079	if (!PL_utf8_upper)
1080	PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1081	return swash_fetch(PL_utf8_upper, p);
1082	}
1083
1084	bool
1085	Perl_is_utf8_lower(pTHX_ U8 *p)
1086	{
1087	if (!is_utf8_char(p))
1088	return FALSE;
1089	if (!PL_utf8_lower)
1090	PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1091	return swash_fetch(PL_utf8_lower, p);
1092	}
1093
1094	bool
1095	Perl_is_utf8_cntrl(pTHX_ U8 *p)
1096	{
1097	if (!is_utf8_char(p))
1098	return FALSE;
1099	if (!PL_utf8_cntrl)
1100	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1101	return swash_fetch(PL_utf8_cntrl, p);
1102	}
1103
1104	bool
1105	Perl_is_utf8_graph(pTHX_ U8 *p)
1106	{
1107	if (!is_utf8_char(p))
1108	return FALSE;
1109	if (!PL_utf8_graph)
1110	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1111	return swash_fetch(PL_utf8_graph, p);
1112	}
1113
1114	bool
1115	Perl_is_utf8_print(pTHX_ U8 *p)
1116	{
1117	if (!is_utf8_char(p))
1118	return FALSE;
1119	if (!PL_utf8_print)
1120	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1121	return swash_fetch(PL_utf8_print, p);
1122	}
1123
1124	bool
1125	Perl_is_utf8_punct(pTHX_ U8 *p)
1126	{
1127	if (!is_utf8_char(p))
1128	return FALSE;
1129	if (!PL_utf8_punct)
1130	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1131	return swash_fetch(PL_utf8_punct, p);
1132	}
1133
1134	bool
1135	Perl_is_utf8_xdigit(pTHX_ U8 *p)
1136	{
1137	if (!is_utf8_char(p))
1138	return FALSE;
1139	if (!PL_utf8_xdigit)
1140	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1141	return swash_fetch(PL_utf8_xdigit, p);
1142	}
1143
1144	bool
1145	Perl_is_utf8_mark(pTHX_ U8 *p)
1146	{
1147	if (!is_utf8_char(p))
1148	return FALSE;
1149	if (!PL_utf8_mark)
1150	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1151	return swash_fetch(PL_utf8_mark, p);
1152	}
1153
1154	UV
1155	Perl_to_utf8_upper(pTHX_ U8 *p)
1156	{
1157	UV uv;
1158
1159	if (!PL_utf8_toupper)
1160	PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1161	uv = swash_fetch(PL_utf8_toupper, p);
1162	return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
1163	}
1164
1165	UV
1166	Perl_to_utf8_title(pTHX_ U8 *p)
1167	{
1168	UV uv;
1169
1170	if (!PL_utf8_totitle)
1171	PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1172	uv = swash_fetch(PL_utf8_totitle, p);
1173	return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
1174	}
1175
1176	UV
1177	Perl_to_utf8_lower(pTHX_ U8 *p)
1178	{
1179	UV uv;
1180
1181	if (!PL_utf8_tolower)
1182	PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1183	uv = swash_fetch(PL_utf8_tolower, p);
1184	return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
1185	}
1186
1187	/* a "swash" is a swatch hash */
1188
1189	SV*
1190	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1191	{
1192	SV* retval;
1193	SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1194	dSP;
1195
1196	if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */
1197	ENTER;
1198	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1199	LEAVE;
1200	}
1201	SPAGAIN;
1202	PUSHSTACKi(PERLSI_MAGIC);
1203	PUSHMARK(SP);
1204	EXTEND(SP,5);
1205	PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1206	PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1207	PUSHs(listsv);
1208	PUSHs(sv_2mortal(newSViv(minbits)));
1209	PUSHs(sv_2mortal(newSViv(none)));
1210	PUTBACK;
1211	ENTER;
1212	SAVEI32(PL_hints);
1213	PL_hints = 0;
1214	save_re_context();
1215	if (PL_curcop == &PL_compiling)
1216	/* XXX ought to be handled by lex_start */
1217	sv_setpv(tokenbufsv, PL_tokenbuf);
1218	if (call_method("SWASHNEW", G_SCALAR))
1219	retval = newSVsv(*PL_stack_sp--);
1220	else
1221	retval = &PL_sv_undef;
1222	LEAVE;
1223	POPSTACK;
1224	if (PL_curcop == &PL_compiling) {
1225	STRLEN len;
1226	char* pv = SvPV(tokenbufsv, len);
1227
1228	Copy(pv, PL_tokenbuf, len+1, char);
1229	PL_curcop->op_private = PL_hints;
1230	}
1231	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV)
1232	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1233	return retval;
1234	}
1235
1236	UV
1237	Perl_swash_fetch(pTHX_ SV sv, U8 ptr)
1238	{
1239	HV* hv = (HV*)SvRV(sv);
1240	U32 klen = UTF8SKIP(ptr) - 1;
1241	U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
1242	STRLEN slen;
1243	STRLEN needents = (klen ? 64 : 128);
1244	U8 *tmps;
1245	U32 bit;
1246	SV *retval;
1247
1248	/*
1249	* This single-entry cache saves about 1/3 of the utf8 overhead in test
1250	* suite. (That is, only 7-8% overall over just a hash cache. Still,
1251	* it's nothing to sniff at.) Pity we usually come through at least
1252	* two function calls to get here...
1253	*
1254	* NB: this code assumes that swatches are never modified, once generated!
1255	*/
1256
1257	if (hv == PL_last_swash_hv &&
1258	klen == PL_last_swash_klen &&
1259	(!klen \|\| memEQ(ptr,PL_last_swash_key,klen)) )
1260	{
1261	tmps = PL_last_swash_tmps;
1262	slen = PL_last_swash_slen;
1263	}
1264	else {
1265	/* Try our second-level swatch cache, kept in a hash. */
1266	SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1267
1268	/* If not cached, generate it via utf8::SWASHGET */
1269	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (U8)SvPV(*svp, slen))) {
1270	dSP;
1271	ENTER;
1272	SAVETMPS;
1273	save_re_context();
1274	PUSHSTACKi(PERLSI_MAGIC);
1275	PUSHMARK(SP);
1276	EXTEND(SP,3);
1277	PUSHs((SV*)sv);
1278	PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, UTF8_MAXLEN, 0, 0) & ~(needents - 1))));
1279	PUSHs(sv_2mortal(newSViv(needents)));
1280	PUTBACK;
1281	if (call_method("SWASHGET", G_SCALAR))
1282	retval = newSVsv(*PL_stack_sp--);
1283	else
1284	retval = &PL_sv_undef;
1285	POPSTACK;
1286	FREETMPS;
1287	LEAVE;
1288	if (PL_curcop == &PL_compiling)
1289	PL_curcop->op_private = PL_hints;
1290
1291	svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1292
1293	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| slen < 8)
1294	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1295	}
1296
1297	PL_last_swash_hv = hv;
1298	PL_last_swash_klen = klen;
1299	PL_last_swash_tmps = tmps;
1300	PL_last_swash_slen = slen;
1301	if (klen)
1302	Copy(ptr, PL_last_swash_key, klen, U8);
1303	}
1304
1305	switch ((int)((slen << 3) / needents)) {
1306	case 1:
1307	bit = 1 << (off & 7);
1308	off >>= 3;
1309	return (tmps[off] & bit) != 0;
1310	case 8:
1311	return tmps[off];
1312	case 16:
1313	off <<= 1;
1314	return (tmps[off] << 8) + tmps[off + 1] ;
1315	case 32:
1316	off <<= 2;
1317	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1318	}
1319	Perl_croak(aTHX_ "panic: swash_fetch");
1320	return 0;
1321	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: