Context Navigation

source: trunk/third/perl/utf8.c @ 18450

Visit:

Revision 18450, 48.8 KB checked in by zacheiss, 21 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18449, which included commits to RCS files with non-trunk default branches.

Line
1	/* utf8.c
2	*
3	* Copyright (c) 1998-2002, Larry Wall
4	*
5	* You may distribute under the terms of either the GNU General Public
6	* License or the Artistic License, as specified in the README file.
7	*
8	*/
9
10	/*
11	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12	* heard of that we don't want to see any closer; and that's the one place
13	* we're trying to get to! And that's just where we can't get, nohow.'
14	*
15	* 'Well do I understand your speech,' he answered in the same language;
16	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17	* as is the custom in the West, if you wish to be answered?'
18	*
19	* ...the travellers perceived that the floor was paved with stones of many
20	* hues; branching runes and strange devices intertwined beneath their feet.
21	*/
22
23	#include "EXTERN.h"
24	#define PERL_IN_UTF8_C
25	#include "perl.h"
26
27	static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
29	/*
30	=head1 Unicode Support
31
32	=for apidoc A\|U8 \|uvuni_to_utf8_flags\|U8 d\|UV uv\|UV flags
33
34	Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
35	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36	bytes available. The return value is the pointer to the byte after the
37	end of the new character. In other words,
38
39	d = uvuni_to_utf8_flags(d, uv, flags);
40
41	or, in most cases,
42
43	d = uvuni_to_utf8(d, uv);
44
45	(which is equivalent to)
46
47	d = uvuni_to_utf8_flags(d, uv, 0);
48
49	is the recommended Unicode-aware way of saying
50
51	*(d++) = uv;
52
53	=cut
54	*/
55
56	U8 *
57	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
58	{
59	if (ckWARN(WARN_UTF8)) {
60	if (UNICODE_IS_SURROGATE(uv) &&
61	!(flags & UNICODE_ALLOW_SURROGATE))
62	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
63	else if (
64	((uv >= 0xFDD0 && uv <= 0xFDEF &&
65	!(flags & UNICODE_ALLOW_FDD0))
66	\|\|
67	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
68	!(flags & UNICODE_ALLOW_FFFF))) &&
69	/* UNICODE_ALLOW_SUPER includes
70	* FFFEs and FFFFs beyond 0x10FFFF. */
71	((uv <= PERL_UNICODE_MAX) \|\|
72	!(flags & UNICODE_ALLOW_SUPER))
73	)
74	Perl_warner(aTHX_ packWARN(WARN_UTF8),
75	"Unicode character 0x%04"UVxf" is illegal", uv);
76	}
77	if (UNI_IS_INVARIANT(uv)) {
78	*d++ = (U8)UTF_TO_NATIVE(uv);
79	return d;
80	}
81	#if defined(EBCDIC)
82	else {
83	STRLEN len = UNISKIP(uv);
84	U8 *p = d+len-1;
85	while (p > d) {
86	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
87	uv >>= UTF_ACCUMULATION_SHIFT;
88	}
89	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
90	return d+len;
91	}
92	#else /* Non loop style */
93	if (uv < 0x800) {
94	*d++ = (U8)(( uv >> 6) \| 0xc0);
95	*d++ = (U8)(( uv & 0x3f) \| 0x80);
96	return d;
97	}
98	if (uv < 0x10000) {
99	*d++ = (U8)(( uv >> 12) \| 0xe0);
100	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
101	*d++ = (U8)(( uv & 0x3f) \| 0x80);
102	return d;
103	}
104	if (uv < 0x200000) {
105	*d++ = (U8)(( uv >> 18) \| 0xf0);
106	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
107	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
108	*d++ = (U8)(( uv & 0x3f) \| 0x80);
109	return d;
110	}
111	if (uv < 0x4000000) {
112	*d++ = (U8)(( uv >> 24) \| 0xf8);
113	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
114	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
115	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
116	*d++ = (U8)(( uv & 0x3f) \| 0x80);
117	return d;
118	}
119	if (uv < 0x80000000) {
120	*d++ = (U8)(( uv >> 30) \| 0xfc);
121	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
122	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
123	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
124	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
125	*d++ = (U8)(( uv & 0x3f) \| 0x80);
126	return d;
127	}
128	#ifdef HAS_QUAD
129	if (uv < UTF8_QUAD_MAX)
130	#endif
131	{
132	d++ = 0xfe; / Can't match U+FEFF! */
133	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
134	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
135	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
136	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
137	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
138	*d++ = (U8)(( uv & 0x3f) \| 0x80);
139	return d;
140	}
141	#ifdef HAS_QUAD
142	{
143	d++ = 0xff; / Can't match U+FFFE! */
144	d++ = 0x80; / 6 Reserved bits */
145	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
146	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
147	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
148	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
149	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
150	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
151	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
152	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
153	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
154	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
155	*d++ = (U8)(( uv & 0x3f) \| 0x80);
156	return d;
157	}
158	#endif
159	#endif /* Loop style */
160	}
161
162	U8 *
163	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
164	{
165	return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
166	}
167
168
169	/*
170	=for apidoc A\|STRLEN\|is_utf8_char\|U8 *s
171
172	Tests if some arbitrary number of bytes begins in a valid UTF-8
173	character. Note that an INVARIANT (i.e. ASCII) character is a valid
174	UTF-8 character. The actual number of bytes in the UTF-8 character
175	will be returned if it is valid, otherwise 0.
176
177	=cut */
178	STRLEN
179	Perl_is_utf8_char(pTHX_ U8 *s)
180	{
181	U8 u = *s;
182	STRLEN slen, len;
183	UV uv, ouv;
184
185	if (UTF8_IS_INVARIANT(u))
186	return 1;
187
188	if (!UTF8_IS_START(u))
189	return 0;
190
191	len = UTF8SKIP(s);
192
193	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
194	return 0;
195
196	slen = len - 1;
197	s++;
198	u &= UTF_START_MASK(len);
199	uv = u;
200	ouv = uv;
201	while (slen--) {
202	if (!UTF8_IS_CONTINUATION(*s))
203	return 0;
204	uv = UTF8_ACCUMULATE(uv, *s);
205	if (uv < ouv)
206	return 0;
207	ouv = uv;
208	s++;
209	}
210
211	if ((STRLEN)UNISKIP(uv) < len)
212	return 0;
213
214	return len;
215	}
216
217	/*
218	=for apidoc A\|bool\|is_utf8_string\|U8 *s\|STRLEN len
219
220	Returns true if first C<len> bytes of the given string form a valid UTF8
221	string, false otherwise. Note that 'a valid UTF8 string' does not mean
222	'a string that contains UTF8' because a valid ASCII string is a valid
223	UTF8 string.
224
225	=cut
226	*/
227
228	bool
229	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
230	{
231	U8* x = s;
232	U8* send;
233	STRLEN c;
234
235	if (!len)
236	len = strlen((char *)s);
237	send = s + len;
238
239	while (x < send) {
240	/* Inline the easy bits of is_utf8_char() here for speed... */
241	if (UTF8_IS_INVARIANT(*x))
242	c = 1;
243	else if (!UTF8_IS_START(*x))
244	return FALSE;
245	else {
246	/* ... and call is_utf8_char() only if really needed. */
247	c = is_utf8_char(x);
248	if (!c)
249	return FALSE;
250	}
251	x += c;
252	}
253	if (x != send)
254	return FALSE;
255
256	return TRUE;
257	}
258
259	/*
260	=for apidoc A\|UV\|utf8n_to_uvuni\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
261
262	Bottom level UTF-8 decode routine.
263	Returns the unicode code point value of the first character in the string C<s>
264	which is assumed to be in UTF8 encoding and no longer than C<curlen>;
265	C<retlen> will be set to the length, in bytes, of that character.
266
267	If C<s> does not point to a well-formed UTF8 character, the behaviour
268	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
269	it is assumed that the caller will raise a warning, and this function
270	will silently just set C<retlen> to C<-1> and return zero. If the
271	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
272	malformations will be given, C<retlen> will be set to the expected
273	length of the UTF-8 character in bytes, and zero will be returned.
274
275	The C<flags> can also contain various flags to allow deviations from
276	the strict UTF-8 encoding (see F<utf8.h>).
277
278	Most code should use utf8_to_uvchr() rather than call this directly.
279
280	=cut
281	*/
282
283	UV
284	Perl_utf8n_to_uvuni(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
285	{
286	U8 *s0 = s;
287	UV uv = *s, ouv = 0;
288	STRLEN len = 1;
289	bool dowarn = ckWARN_d(WARN_UTF8);
290	UV startbyte = *s;
291	STRLEN expectlen = 0;
292	U32 warning = 0;
293
294	/* This list is a superset of the UTF8_ALLOW_XXX. */
295
296	#define UTF8_WARN_EMPTY 1
297	#define UTF8_WARN_CONTINUATION 2
298	#define UTF8_WARN_NON_CONTINUATION 3
299	#define UTF8_WARN_FE_FF 4
300	#define UTF8_WARN_SHORT 5
301	#define UTF8_WARN_OVERFLOW 6
302	#define UTF8_WARN_SURROGATE 7
303	#define UTF8_WARN_LONG 8
304	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
305
306	if (curlen == 0 &&
307	!(flags & UTF8_ALLOW_EMPTY)) {
308	warning = UTF8_WARN_EMPTY;
309	goto malformed;
310	}
311
312	if (UTF8_IS_INVARIANT(uv)) {
313	if (retlen)
314	*retlen = 1;
315	return (UV) (NATIVE_TO_UTF(*s));
316	}
317
318	if (UTF8_IS_CONTINUATION(uv) &&
319	!(flags & UTF8_ALLOW_CONTINUATION)) {
320	warning = UTF8_WARN_CONTINUATION;
321	goto malformed;
322	}
323
324	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
325	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
326	warning = UTF8_WARN_NON_CONTINUATION;
327	goto malformed;
328	}
329
330	#ifdef EBCDIC
331	uv = NATIVE_TO_UTF(uv);
332	#else
333	if ((uv == 0xfe \|\| uv == 0xff) &&
334	!(flags & UTF8_ALLOW_FE_FF)) {
335	warning = UTF8_WARN_FE_FF;
336	goto malformed;
337	}
338	#endif
339
340	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
341	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
342	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
343	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
344	#ifdef EBCDIC
345	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
346	else { len = 7; uv &= 0x01; }
347	#else
348	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
349	else if (!(uv & 0x01)) { len = 7; uv = 0; }
350	else { len = 13; uv = 0; } /* whoa! */
351	#endif
352
353	if (retlen)
354	*retlen = len;
355
356	expectlen = len;
357
358	if ((curlen < expectlen) &&
359	!(flags & UTF8_ALLOW_SHORT)) {
360	warning = UTF8_WARN_SHORT;
361	goto malformed;
362	}
363
364	len--;
365	s++;
366	ouv = uv;
367
368	while (len--) {
369	if (!UTF8_IS_CONTINUATION(*s) &&
370	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
371	s--;
372	warning = UTF8_WARN_NON_CONTINUATION;
373	goto malformed;
374	}
375	else
376	uv = UTF8_ACCUMULATE(uv, *s);
377	if (!(uv > ouv)) {
378	/* These cannot be allowed. */
379	if (uv == ouv) {
380	if (!(flags & UTF8_ALLOW_LONG)) {
381	warning = UTF8_WARN_LONG;
382	goto malformed;
383	}
384	}
385	else { /* uv < ouv */
386	/* This cannot be allowed. */
387	warning = UTF8_WARN_OVERFLOW;
388	goto malformed;
389	}
390	}
391	s++;
392	ouv = uv;
393	}
394
395	if (UNICODE_IS_SURROGATE(uv) &&
396	!(flags & UTF8_ALLOW_SURROGATE)) {
397	warning = UTF8_WARN_SURROGATE;
398	goto malformed;
399	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
400	!(flags & UTF8_ALLOW_LONG)) {
401	warning = UTF8_WARN_LONG;
402	goto malformed;
403	} else if (UNICODE_IS_ILLEGAL(uv) &&
404	!(flags & UTF8_ALLOW_FFFF)) {
405	warning = UTF8_WARN_FFFF;
406	goto malformed;
407	}
408
409	return uv;
410
411	malformed:
412
413	if (flags & UTF8_CHECK_ONLY) {
414	if (retlen)
415	*retlen = -1;
416	return 0;
417	}
418
419	if (dowarn) {
420	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
421
422	switch (warning) {
423	case 0: /* Intentionally empty. */ break;
424	case UTF8_WARN_EMPTY:
425	Perl_sv_catpvf(aTHX_ sv, "(empty string)");
426	break;
427	case UTF8_WARN_CONTINUATION:
428	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
429	break;
430	case UTF8_WARN_NON_CONTINUATION:
431	if (s == s0)
432	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
433	(UV)s[1], startbyte);
434	else
435	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
436	(UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
437
438	break;
439	case UTF8_WARN_FE_FF:
440	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
441	break;
442	case UTF8_WARN_SHORT:
443	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
444	curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
445	expectlen = curlen; /* distance for caller to skip */
446	break;
447	case UTF8_WARN_OVERFLOW:
448	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
449	ouv, *s, startbyte);
450	break;
451	case UTF8_WARN_SURROGATE:
452	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
453	break;
454	case UTF8_WARN_LONG:
455	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
456	expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
457	break;
458	case UTF8_WARN_FFFF:
459	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
460	break;
461	default:
462	Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
463	break;
464	}
465
466	if (warning) {
467	char *s = SvPVX(sv);
468
469	if (PL_op)
470	Perl_warner(aTHX_ packWARN(WARN_UTF8),
471	"%s in %s", s, OP_DESC(PL_op));
472	else
473	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
474	}
475	}
476
477	if (retlen)
478	*retlen = expectlen ? expectlen : len;
479
480	return 0;
481	}
482
483	/*
484	=for apidoc A\|UV\|utf8_to_uvchr\|U8 s\|STRLEN retlen
485
486	Returns the native character value of the first character in the string C<s>
487	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
488	length, in bytes, of that character.
489
490	If C<s> does not point to a well-formed UTF8 character, zero is
491	returned and retlen is set, if possible, to -1.
492
493	=cut
494	*/
495
496	UV
497	Perl_utf8_to_uvchr(pTHX_ U8 s, STRLEN retlen)
498	{
499	return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
500	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
501	}
502
503	/*
504	=for apidoc A\|UV\|utf8_to_uvuni\|U8 s\|STRLEN retlen
505
506	Returns the Unicode code point of the first character in the string C<s>
507	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
508	length, in bytes, of that character.
509
510	This function should only be used when returned UV is considered
511	an index into the Unicode semantic tables (e.g. swashes).
512
513	If C<s> does not point to a well-formed UTF8 character, zero is
514	returned and retlen is set, if possible, to -1.
515
516	=cut
517	*/
518
519	UV
520	Perl_utf8_to_uvuni(pTHX_ U8 s, STRLEN retlen)
521	{
522	/* Call the low level routine asking for checks */
523	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
524	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
525	}
526
527	/*
528	=for apidoc A\|STRLEN\|utf8_length\|U8 s\|U8 e
529
530	Return the length of the UTF-8 char encoded string C<s> in characters.
531	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
532	up past C<e>, croaks.
533
534	=cut
535	*/
536
537	STRLEN
538	Perl_utf8_length(pTHX_ U8 s, U8 e)
539	{
540	STRLEN len = 0;
541
542	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
543	* the bitops (especially ~) can create illegal UTF-8.
544	* In other words: in Perl UTF-8 is not just for Unicode. */
545
546	if (e < s) {
547	if (ckWARN_d(WARN_UTF8)) {
548	if (PL_op)
549	Perl_warner(aTHX_ packWARN(WARN_UTF8),
550	"%s in %s", unees, OP_DESC(PL_op));
551	else
552	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
553	}
554	return 0;
555	}
556	while (s < e) {
557	U8 t = UTF8SKIP(s);
558
559	if (e - s < t) {
560	if (ckWARN_d(WARN_UTF8)) {
561	if (PL_op)
562	Perl_warner(aTHX_ packWARN(WARN_UTF8),
563	unees, OP_DESC(PL_op));
564	else
565	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
566	}
567	return len;
568	}
569	s += t;
570	len++;
571	}
572
573	return len;
574	}
575
576	/*
577	=for apidoc A\|IV\|utf8_distance\|U8 a\|U8 b
578
579	Returns the number of UTF8 characters between the UTF-8 pointers C<a>
580	and C<b>.
581
582	WARNING: use only if you know that the pointers point inside the
583	same UTF-8 buffer.
584
585	=cut
586	*/
587
588	IV
589	Perl_utf8_distance(pTHX_ U8 a, U8 b)
590	{
591	IV off = 0;
592
593	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
594	* the bitops (especially ~) can create illegal UTF-8.
595	* In other words: in Perl UTF-8 is not just for Unicode. */
596
597	if (a < b) {
598	while (a < b) {
599	U8 c = UTF8SKIP(a);
600
601	if (b - a < c) {
602	if (ckWARN_d(WARN_UTF8)) {
603	if (PL_op)
604	Perl_warner(aTHX_ packWARN(WARN_UTF8),
605	"%s in %s", unees, OP_DESC(PL_op));
606	else
607	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
608	}
609	return off;
610	}
611	a += c;
612	off--;
613	}
614	}
615	else {
616	while (b < a) {
617	U8 c = UTF8SKIP(b);
618
619	if (a - b < c) {
620	if (ckWARN_d(WARN_UTF8)) {
621	if (PL_op)
622	Perl_warner(aTHX_ packWARN(WARN_UTF8),
623	"%s in %s", unees, OP_DESC(PL_op));
624	else
625	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
626	}
627	return off;
628	}
629	b += c;
630	off++;
631	}
632	}
633
634	return off;
635	}
636
637	/*
638	=for apidoc A\|U8 \|utf8_hop\|U8 s\|I32 off
639
640	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
641	forward or backward.
642
643	WARNING: do not use the following unless you know C<off> is within
644	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
645	on the first byte of character or just after the last byte of a character.
646
647	=cut
648	*/
649
650	U8 *
651	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
652	{
653	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
654	* the bitops (especially ~) can create illegal UTF-8.
655	* In other words: in Perl UTF-8 is not just for Unicode. */
656
657	if (off >= 0) {
658	while (off--)
659	s += UTF8SKIP(s);
660	}
661	else {
662	while (off++) {
663	s--;
664	while (UTF8_IS_CONTINUATION(*s))
665	s--;
666	}
667	}
668	return s;
669	}
670
671	/*
672	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
673
674	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
675	Unlike C<bytes_to_utf8>, this over-writes the original string, and
676	updates len to contain the new length.
677	Returns zero on failure, setting C<len> to -1.
678
679	=cut
680	*/
681
682	U8 *
683	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
684	{
685	U8 *send;
686	U8 *d;
687	U8 *save = s;
688
689	/* ensure valid UTF8 and chars < 256 before updating string */
690	for (send = s + *len; s < send; ) {
691	U8 c = *s++;
692
693	if (!UTF8_IS_INVARIANT(c) &&
694	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
695	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
696	*len = -1;
697	return 0;
698	}
699	}
700
701	d = s = save;
702	while (s < send) {
703	STRLEN ulen;
704	*d++ = (U8)utf8_to_uvchr(s, &ulen);
705	s += ulen;
706	}
707	*d = '\0';
708	*len = d - save;
709	return save;
710	}
711
712	/*
713	=for apidoc A\|U8 \|bytes_from_utf8\|U8 s\|STRLEN len\|bool is_utf8
714
715	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
716	Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
717	the newly-created string, and updates C<len> to contain the new
718	length. Returns the original string if no conversion occurs, C<len>
719	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
720	0 if C<s> is converted or contains all 7bit characters.
721
722	=cut
723	*/
724
725	U8 *
726	Perl_bytes_from_utf8(pTHX_ U8 s, STRLEN len, bool *is_utf8)
727	{
728	U8 *d;
729	U8 *start = s;
730	U8 *send;
731	I32 count = 0;
732
733	if (!*is_utf8)
734	return start;
735
736	/* ensure valid UTF8 and chars < 256 before converting string */
737	for (send = s + *len; s < send;) {
738	U8 c = *s++;
739	if (!UTF8_IS_INVARIANT(c)) {
740	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
741	(c = *s++) && UTF8_IS_CONTINUATION(c))
742	count++;
743	else
744	return start;
745	}
746	}
747
748	*is_utf8 = 0;
749
750	Newz(801, d, (*len) - count + 1, U8);
751	s = start; start = d;
752	while (s < send) {
753	U8 c = *s++;
754	if (!UTF8_IS_INVARIANT(c)) {
755	/* Then it is two-byte encoded */
756	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
757	c = ASCII_TO_NATIVE(c);
758	}
759	*d++ = c;
760	}
761	*d = '\0';
762	*len = d - start;
763	return start;
764	}
765
766	/*
767	=for apidoc A\|U8 \|bytes_to_utf8\|U8 s\|STRLEN *len
768
769	Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
770	Returns a pointer to the newly-created string, and sets C<len> to
771	reflect the new length.
772
773	=cut
774	*/
775
776	U8*
777	Perl_bytes_to_utf8(pTHX_ U8 s, STRLEN len)
778	{
779	U8 *send;
780	U8 *d;
781	U8 *dst;
782	send = s + (*len);
783
784	Newz(801, d, (len) 2 + 1, U8);
785	dst = d;
786
787	while (s < send) {
788	UV uv = NATIVE_TO_ASCII(*s++);
789	if (UNI_IS_INVARIANT(uv))
790	*d++ = (U8)UTF_TO_NATIVE(uv);
791	else {
792	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
793	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
794	}
795	}
796	*d = '\0';
797	*len = d-dst;
798	return dst;
799	}
800
801	/*
802	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
803	*
804	* Destination must be pre-extended to 3/2 source. Do not use in-place.
805	* We optimize for native, for obvious reasons. */
806
807	U8*
808	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
809	{
810	U8* pend;
811	U8* dstart = d;
812
813	if (bytelen & 1)
814	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
815
816	pend = p + bytelen;
817
818	while (p < pend) {
819	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
820	p += 2;
821	if (uv < 0x80) {
822	*d++ = (U8)uv;
823	continue;
824	}
825	if (uv < 0x800) {
826	*d++ = (U8)(( uv >> 6) \| 0xc0);
827	*d++ = (U8)(( uv & 0x3f) \| 0x80);
828	continue;
829	}
830	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
831	UV low = *p++;
832	if (low < 0xdc00 \|\| low >= 0xdfff)
833	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
834	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
835	}
836	if (uv < 0x10000) {
837	*d++ = (U8)(( uv >> 12) \| 0xe0);
838	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
839	*d++ = (U8)(( uv & 0x3f) \| 0x80);
840	continue;
841	}
842	else {
843	*d++ = (U8)(( uv >> 18) \| 0xf0);
844	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
845	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
846	*d++ = (U8)(( uv & 0x3f) \| 0x80);
847	continue;
848	}
849	}
850	*newlen = d - dstart;
851	return d;
852	}
853
854	/* Note: this one is slightly destructive of the source. */
855
856	U8*
857	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
858	{
859	U8* s = (U8*)p;
860	U8* send = s + bytelen;
861	while (s < send) {
862	U8 tmp = s[0];
863	s[0] = s[1];
864	s[1] = tmp;
865	s += 2;
866	}
867	return utf16_to_utf8(p, d, bytelen, newlen);
868	}
869
870	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
871
872	bool
873	Perl_is_uni_alnum(pTHX_ UV c)
874	{
875	U8 tmpbuf[UTF8_MAXLEN+1];
876	uvchr_to_utf8(tmpbuf, c);
877	return is_utf8_alnum(tmpbuf);
878	}
879
880	bool
881	Perl_is_uni_alnumc(pTHX_ UV c)
882	{
883	U8 tmpbuf[UTF8_MAXLEN+1];
884	uvchr_to_utf8(tmpbuf, c);
885	return is_utf8_alnumc(tmpbuf);
886	}
887
888	bool
889	Perl_is_uni_idfirst(pTHX_ UV c)
890	{
891	U8 tmpbuf[UTF8_MAXLEN+1];
892	uvchr_to_utf8(tmpbuf, c);
893	return is_utf8_idfirst(tmpbuf);
894	}
895
896	bool
897	Perl_is_uni_alpha(pTHX_ UV c)
898	{
899	U8 tmpbuf[UTF8_MAXLEN+1];
900	uvchr_to_utf8(tmpbuf, c);
901	return is_utf8_alpha(tmpbuf);
902	}
903
904	bool
905	Perl_is_uni_ascii(pTHX_ UV c)
906	{
907	U8 tmpbuf[UTF8_MAXLEN+1];
908	uvchr_to_utf8(tmpbuf, c);
909	return is_utf8_ascii(tmpbuf);
910	}
911
912	bool
913	Perl_is_uni_space(pTHX_ UV c)
914	{
915	U8 tmpbuf[UTF8_MAXLEN+1];
916	uvchr_to_utf8(tmpbuf, c);
917	return is_utf8_space(tmpbuf);
918	}
919
920	bool
921	Perl_is_uni_digit(pTHX_ UV c)
922	{
923	U8 tmpbuf[UTF8_MAXLEN+1];
924	uvchr_to_utf8(tmpbuf, c);
925	return is_utf8_digit(tmpbuf);
926	}
927
928	bool
929	Perl_is_uni_upper(pTHX_ UV c)
930	{
931	U8 tmpbuf[UTF8_MAXLEN+1];
932	uvchr_to_utf8(tmpbuf, c);
933	return is_utf8_upper(tmpbuf);
934	}
935
936	bool
937	Perl_is_uni_lower(pTHX_ UV c)
938	{
939	U8 tmpbuf[UTF8_MAXLEN+1];
940	uvchr_to_utf8(tmpbuf, c);
941	return is_utf8_lower(tmpbuf);
942	}
943
944	bool
945	Perl_is_uni_cntrl(pTHX_ UV c)
946	{
947	U8 tmpbuf[UTF8_MAXLEN+1];
948	uvchr_to_utf8(tmpbuf, c);
949	return is_utf8_cntrl(tmpbuf);
950	}
951
952	bool
953	Perl_is_uni_graph(pTHX_ UV c)
954	{
955	U8 tmpbuf[UTF8_MAXLEN+1];
956	uvchr_to_utf8(tmpbuf, c);
957	return is_utf8_graph(tmpbuf);
958	}
959
960	bool
961	Perl_is_uni_print(pTHX_ UV c)
962	{
963	U8 tmpbuf[UTF8_MAXLEN+1];
964	uvchr_to_utf8(tmpbuf, c);
965	return is_utf8_print(tmpbuf);
966	}
967
968	bool
969	Perl_is_uni_punct(pTHX_ UV c)
970	{
971	U8 tmpbuf[UTF8_MAXLEN+1];
972	uvchr_to_utf8(tmpbuf, c);
973	return is_utf8_punct(tmpbuf);
974	}
975
976	bool
977	Perl_is_uni_xdigit(pTHX_ UV c)
978	{
979	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
980	uvchr_to_utf8(tmpbuf, c);
981	return is_utf8_xdigit(tmpbuf);
982	}
983
984	UV
985	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
986	{
987	uvchr_to_utf8(p, c);
988	return to_utf8_upper(p, p, lenp);
989	}
990
991	UV
992	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
993	{
994	uvchr_to_utf8(p, c);
995	return to_utf8_title(p, p, lenp);
996	}
997
998	UV
999	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1000	{
1001	uvchr_to_utf8(p, c);
1002	return to_utf8_lower(p, p, lenp);
1003	}
1004
1005	UV
1006	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1007	{
1008	uvchr_to_utf8(p, c);
1009	return to_utf8_fold(p, p, lenp);
1010	}
1011
1012	/* for now these all assume no locale info available for Unicode > 255 */
1013
1014	bool
1015	Perl_is_uni_alnum_lc(pTHX_ UV c)
1016	{
1017	return is_uni_alnum(c); /* XXX no locale support yet */
1018	}
1019
1020	bool
1021	Perl_is_uni_alnumc_lc(pTHX_ UV c)
1022	{
1023	return is_uni_alnumc(c); /* XXX no locale support yet */
1024	}
1025
1026	bool
1027	Perl_is_uni_idfirst_lc(pTHX_ UV c)
1028	{
1029	return is_uni_idfirst(c); /* XXX no locale support yet */
1030	}
1031
1032	bool
1033	Perl_is_uni_alpha_lc(pTHX_ UV c)
1034	{
1035	return is_uni_alpha(c); /* XXX no locale support yet */
1036	}
1037
1038	bool
1039	Perl_is_uni_ascii_lc(pTHX_ UV c)
1040	{
1041	return is_uni_ascii(c); /* XXX no locale support yet */
1042	}
1043
1044	bool
1045	Perl_is_uni_space_lc(pTHX_ UV c)
1046	{
1047	return is_uni_space(c); /* XXX no locale support yet */
1048	}
1049
1050	bool
1051	Perl_is_uni_digit_lc(pTHX_ UV c)
1052	{
1053	return is_uni_digit(c); /* XXX no locale support yet */
1054	}
1055
1056	bool
1057	Perl_is_uni_upper_lc(pTHX_ UV c)
1058	{
1059	return is_uni_upper(c); /* XXX no locale support yet */
1060	}
1061
1062	bool
1063	Perl_is_uni_lower_lc(pTHX_ UV c)
1064	{
1065	return is_uni_lower(c); /* XXX no locale support yet */
1066	}
1067
1068	bool
1069	Perl_is_uni_cntrl_lc(pTHX_ UV c)
1070	{
1071	return is_uni_cntrl(c); /* XXX no locale support yet */
1072	}
1073
1074	bool
1075	Perl_is_uni_graph_lc(pTHX_ UV c)
1076	{
1077	return is_uni_graph(c); /* XXX no locale support yet */
1078	}
1079
1080	bool
1081	Perl_is_uni_print_lc(pTHX_ UV c)
1082	{
1083	return is_uni_print(c); /* XXX no locale support yet */
1084	}
1085
1086	bool
1087	Perl_is_uni_punct_lc(pTHX_ UV c)
1088	{
1089	return is_uni_punct(c); /* XXX no locale support yet */
1090	}
1091
1092	bool
1093	Perl_is_uni_xdigit_lc(pTHX_ UV c)
1094	{
1095	return is_uni_xdigit(c); /* XXX no locale support yet */
1096	}
1097
1098	U32
1099	Perl_to_uni_upper_lc(pTHX_ U32 c)
1100	{
1101	/* XXX returns only the first character -- do not use XXX */
1102	/* XXX no locale support yet */
1103	STRLEN len;
1104	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1105	return (U32)to_uni_upper(c, tmpbuf, &len);
1106	}
1107
1108	U32
1109	Perl_to_uni_title_lc(pTHX_ U32 c)
1110	{
1111	/* XXX returns only the first character XXX -- do not use XXX */
1112	/* XXX no locale support yet */
1113	STRLEN len;
1114	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1115	return (U32)to_uni_title(c, tmpbuf, &len);
1116	}
1117
1118	U32
1119	Perl_to_uni_lower_lc(pTHX_ U32 c)
1120	{
1121	/* XXX returns only the first character -- do not use XXX */
1122	/* XXX no locale support yet */
1123	STRLEN len;
1124	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1125	return (U32)to_uni_lower(c, tmpbuf, &len);
1126	}
1127
1128	bool
1129	Perl_is_utf8_alnum(pTHX_ U8 *p)
1130	{
1131	if (!is_utf8_char(p))
1132	return FALSE;
1133	if (!PL_utf8_alnum)
1134	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1135	* descendant of isalnum(3), in other words, it doesn't
1136	* contain the '_'. --jhi */
1137	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1138	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1139	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
1140	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1141	if (!PL_utf8_alnum)
1142	PL_utf8_alnum = swash_init("utf8", "",
1143	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1144	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1145	#endif
1146	}
1147
1148	bool
1149	Perl_is_utf8_alnumc(pTHX_ U8 *p)
1150	{
1151	if (!is_utf8_char(p))
1152	return FALSE;
1153	if (!PL_utf8_alnum)
1154	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1155	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1156	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
1157	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1158	if (!PL_utf8_alnum)
1159	PL_utf8_alnum = swash_init("utf8", "",
1160	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1161	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1162	#endif
1163	}
1164
1165	bool
1166	Perl_is_utf8_idfirst(pTHX_ U8 p) / The naming is historical. */
1167	{
1168	if (*p == '_')
1169	return TRUE;
1170	if (!is_utf8_char(p))
1171	return FALSE;
1172	if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1173	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1174	return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1175	}
1176
1177	bool
1178	Perl_is_utf8_idcont(pTHX_ U8 *p)
1179	{
1180	if (*p == '_')
1181	return TRUE;
1182	if (!is_utf8_char(p))
1183	return FALSE;
1184	if (!PL_utf8_idcont)
1185	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1186	return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1187	}
1188
1189	bool
1190	Perl_is_utf8_alpha(pTHX_ U8 *p)
1191	{
1192	if (!is_utf8_char(p))
1193	return FALSE;
1194	if (!PL_utf8_alpha)
1195	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1196	return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1197	}
1198
1199	bool
1200	Perl_is_utf8_ascii(pTHX_ U8 *p)
1201	{
1202	if (!is_utf8_char(p))
1203	return FALSE;
1204	if (!PL_utf8_ascii)
1205	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1206	return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1207	}
1208
1209	bool
1210	Perl_is_utf8_space(pTHX_ U8 *p)
1211	{
1212	if (!is_utf8_char(p))
1213	return FALSE;
1214	if (!PL_utf8_space)
1215	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1216	return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1217	}
1218
1219	bool
1220	Perl_is_utf8_digit(pTHX_ U8 *p)
1221	{
1222	if (!is_utf8_char(p))
1223	return FALSE;
1224	if (!PL_utf8_digit)
1225	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1226	return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1227	}
1228
1229	bool
1230	Perl_is_utf8_upper(pTHX_ U8 *p)
1231	{
1232	if (!is_utf8_char(p))
1233	return FALSE;
1234	if (!PL_utf8_upper)
1235	PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1236	return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1237	}
1238
1239	bool
1240	Perl_is_utf8_lower(pTHX_ U8 *p)
1241	{
1242	if (!is_utf8_char(p))
1243	return FALSE;
1244	if (!PL_utf8_lower)
1245	PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1246	return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1247	}
1248
1249	bool
1250	Perl_is_utf8_cntrl(pTHX_ U8 *p)
1251	{
1252	if (!is_utf8_char(p))
1253	return FALSE;
1254	if (!PL_utf8_cntrl)
1255	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1256	return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1257	}
1258
1259	bool
1260	Perl_is_utf8_graph(pTHX_ U8 *p)
1261	{
1262	if (!is_utf8_char(p))
1263	return FALSE;
1264	if (!PL_utf8_graph)
1265	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1266	return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1267	}
1268
1269	bool
1270	Perl_is_utf8_print(pTHX_ U8 *p)
1271	{
1272	if (!is_utf8_char(p))
1273	return FALSE;
1274	if (!PL_utf8_print)
1275	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1276	return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1277	}
1278
1279	bool
1280	Perl_is_utf8_punct(pTHX_ U8 *p)
1281	{
1282	if (!is_utf8_char(p))
1283	return FALSE;
1284	if (!PL_utf8_punct)
1285	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1286	return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1287	}
1288
1289	bool
1290	Perl_is_utf8_xdigit(pTHX_ U8 *p)
1291	{
1292	if (!is_utf8_char(p))
1293	return FALSE;
1294	if (!PL_utf8_xdigit)
1295	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1296	return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1297	}
1298
1299	bool
1300	Perl_is_utf8_mark(pTHX_ U8 *p)
1301	{
1302	if (!is_utf8_char(p))
1303	return FALSE;
1304	if (!PL_utf8_mark)
1305	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1306	return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1307	}
1308
1309	/*
1310	=for apidoc A\|UV\|to_utf8_case\|U8 p\|U8 ustrp\|STRLEN lenp\|SV swash\|char normal\|char *special
1311
1312	The "p" contains the pointer to the UTF-8 string encoding
1313	the character that is being converted.
1314
1315	The "ustrp" is a pointer to the character buffer to put the
1316	conversion result to. The "lenp" is a pointer to the length
1317	of the result.
1318
1319	The "swashp" is a pointer to the swash to use.
1320
1321	Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1322	and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1323	but not always, a multicharacter mapping), is tried first.
1324
1325	The "special" is a string like "utf8::ToSpecLower", which means the
1326	hash %utf8::ToSpecLower. The access to the hash is through
1327	Perl_to_utf8_case().
1328
1329	The "normal" is a string like "ToLower" which means the swash
1330	%utf8::ToLower.
1331
1332	=cut */
1333
1334	UV
1335	Perl_to_utf8_case(pTHX_ U8 p, U8 ustrp, STRLEN lenp, SV swashp, char normal, char *special)
1336	{
1337	UV uv0, uv1;
1338	U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1339	STRLEN len = 0;
1340
1341	uv0 = utf8_to_uvchr(p, 0);
1342	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1343	* are necessary in EBCDIC, they are redundant no-ops
1344	* in ASCII-ish platforms, and hopefully optimized away. */
1345	uv1 = NATIVE_TO_UNI(uv0);
1346	uvuni_to_utf8(tmpbuf, uv1);
1347
1348	if (!swashp) / load on-demand */
1349	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1350
1351	if (special) {
1352	/* It might be "special" (sometimes, but not always,
1353	* a multicharacter mapping) */
1354	HV *hv;
1355	SV *keysv;
1356	HE *he;
1357	SV *val;
1358
1359	if ((hv = get_hv(special, FALSE)) &&
1360	(keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1361	(he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1362	(val = HeVAL(he))) {
1363	char *s;
1364
1365	s = SvPV(val, len);
1366	if (len == 1)
1367	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
1368	else {
1369	#ifdef EBCDIC
1370	/* If we have EBCDIC we need to remap the characters
1371	* since any characters in the low 256 are Unicode
1372	* code points, not EBCDIC. */
1373	U8 t = (U8)s, tend = t + len, d;
1374
1375	d = tmpbuf;
1376	if (SvUTF8(val)) {
1377	STRLEN tlen = 0;
1378
1379	while (t < tend) {
1380	UV c = utf8_to_uvchr(t, &tlen);
1381	if (tlen > 0) {
1382	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1383	t += tlen;
1384	}
1385	else
1386	break;
1387	}
1388	}
1389	else {
1390	while (t < tend) {
1391	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1392	t++;
1393	}
1394	}
1395	len = d - tmpbuf;
1396	Copy(tmpbuf, ustrp, len, U8);
1397	#else
1398	Copy(s, ustrp, len, U8);
1399	#endif
1400	}
1401	}
1402	}
1403
1404	if (!len && *swashp) {
1405	UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1406
1407	if (uv2) {
1408	/* It was "normal" (a single character mapping). */
1409	UV uv3 = UNI_TO_NATIVE(uv2);
1410
1411	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1412	}
1413	}
1414
1415	if (!len) /* Neither: just copy. */
1416	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1417
1418	if (lenp)
1419	*lenp = len;
1420
1421	return len ? utf8_to_uvchr(ustrp, 0) : 0;
1422	}
1423
1424	/*
1425	=for apidoc A\|UV\|to_utf8_upper\|U8 p\|U8 ustrp\|STRLEN *lenp
1426
1427	Convert the UTF-8 encoded character at p to its uppercase version and
1428	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1429	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1430	uppercase version may be longer than the original character (up to two
1431	characters).
1432
1433	The first character of the uppercased version is returned
1434	(but note, as explained above, that there may be more.)
1435
1436	=cut */
1437
1438	UV
1439	Perl_to_utf8_upper(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
1440	{
1441	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1442	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1443	}
1444
1445	/*
1446	=for apidoc A\|UV\|to_utf8_title\|U8 p\|U8 ustrp\|STRLEN *lenp
1447
1448	Convert the UTF-8 encoded character at p to its titlecase version and
1449	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1450	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1451	titlecase version may be longer than the original character (up to two
1452	characters).
1453
1454	The first character of the titlecased version is returned
1455	(but note, as explained above, that there may be more.)
1456
1457	=cut */
1458
1459	UV
1460	Perl_to_utf8_title(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
1461	{
1462	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1463	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1464	}
1465
1466	/*
1467	=for apidoc A\|UV\|to_utf8_lower\|U8 p\|U8 ustrp\|STRLEN *lenp
1468
1469	Convert the UTF-8 encoded character at p to its lowercase version and
1470	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1471	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1472	lowercase version may be longer than the original character (up to two
1473	characters).
1474
1475	The first character of the lowercased version is returned
1476	(but note, as explained above, that there may be more.)
1477
1478	=cut */
1479
1480	UV
1481	Perl_to_utf8_lower(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
1482	{
1483	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1484	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1485	}
1486
1487	/*
1488	=for apidoc A\|UV\|to_utf8_fold\|U8 p\|U8 ustrp\|STRLEN *lenp
1489
1490	Convert the UTF-8 encoded character at p to its foldcase version and
1491	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1492	that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1493	foldcase version may be longer than the original character (up to
1494	three characters).
1495
1496	The first character of the foldcased version is returned
1497	(but note, as explained above, that there may be more.)
1498
1499	=cut */
1500
1501	UV
1502	Perl_to_utf8_fold(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
1503	{
1504	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1505	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1506	}
1507
1508	/* a "swash" is a swatch hash */
1509
1510	SV*
1511	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1512	{
1513	SV* retval;
1514	SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1515	dSP;
1516	HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1517	SV* errsv_save;
1518
1519	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1520	ENTER;
1521	errsv_save = newSVsv(ERRSV);
1522	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1523	if (!SvTRUE(ERRSV))
1524	sv_setsv(ERRSV, errsv_save);
1525	SvREFCNT_dec(errsv_save);
1526	LEAVE;
1527	}
1528	SPAGAIN;
1529	PUSHSTACKi(PERLSI_MAGIC);
1530	PUSHMARK(SP);
1531	EXTEND(SP,5);
1532	PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1533	PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1534	PUSHs(listsv);
1535	PUSHs(sv_2mortal(newSViv(minbits)));
1536	PUSHs(sv_2mortal(newSViv(none)));
1537	PUTBACK;
1538	ENTER;
1539	SAVEI32(PL_hints);
1540	PL_hints = 0;
1541	save_re_context();
1542	if (PL_curcop == &PL_compiling) {
1543	/* XXX ought to be handled by lex_start */
1544	SAVEI32(PL_in_my);
1545	sv_setpv(tokenbufsv, PL_tokenbuf);
1546	}
1547	errsv_save = newSVsv(ERRSV);
1548	if (call_method("SWASHNEW", G_SCALAR))
1549	retval = newSVsv(*PL_stack_sp--);
1550	else
1551	retval = &PL_sv_undef;
1552	if (!SvTRUE(ERRSV))
1553	sv_setsv(ERRSV, errsv_save);
1554	SvREFCNT_dec(errsv_save);
1555	LEAVE;
1556	POPSTACK;
1557	if (PL_curcop == &PL_compiling) {
1558	STRLEN len;
1559	char* pv = SvPV(tokenbufsv, len);
1560
1561	Copy(pv, PL_tokenbuf, len+1, char);
1562	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1563	}
1564	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
1565	if (SvPOK(retval))
1566	Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1567	SvPV_nolen(retval));
1568	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1569	}
1570	return retval;
1571	}
1572
1573
1574	/* This API is wrong for special case conversions since we may need to
1575	* return several Unicode characters for a single Unicode character
1576	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1577	* the lower-level routine, and it is similarly broken for returning
1578	* multiple values. --jhi */
1579	UV
1580	Perl_swash_fetch(pTHX_ SV sv, U8 ptr, bool do_utf8)
1581	{
1582	HV* hv = (HV*)SvRV(sv);
1583	U32 klen;
1584	U32 off;
1585	STRLEN slen;
1586	STRLEN needents;
1587	U8 *tmps = NULL;
1588	U32 bit;
1589	SV *retval;
1590	U8 tmputf8[2];
1591	UV c = NATIVE_TO_ASCII(*ptr);
1592
1593	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1594	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1595	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1596	ptr = tmputf8;
1597	}
1598	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1599	* then the "swatch" is a vec() for al the chars which start
1600	* with 0xAA..0xYY
1601	* So the key in the hash (klen) is length of encoded char -1
1602	*/
1603	klen = UTF8SKIP(ptr) - 1;
1604	off = ptr[klen];
1605
1606	if (klen == 0)
1607	{
1608	/* If char in invariant then swatch is for all the invariant chars
1609	* In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1610	*/
1611	needents = UTF_CONTINUATION_MARK;
1612	off = NATIVE_TO_UTF(ptr[klen]);
1613	}
1614	else
1615	{
1616	/* If char is encoded then swatch is for the prefix */
1617	needents = (1 << UTF_ACCUMULATION_SHIFT);
1618	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1619	}
1620
1621	/*
1622	* This single-entry cache saves about 1/3 of the utf8 overhead in test
1623	* suite. (That is, only 7-8% overall over just a hash cache. Still,
1624	* it's nothing to sniff at.) Pity we usually come through at least
1625	* two function calls to get here...
1626	*
1627	* NB: this code assumes that swatches are never modified, once generated!
1628	*/
1629
1630	if (hv == PL_last_swash_hv &&
1631	klen == PL_last_swash_klen &&
1632	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
1633	{
1634	tmps = PL_last_swash_tmps;
1635	slen = PL_last_swash_slen;
1636	}
1637	else {
1638	/* Try our second-level swatch cache, kept in a hash. */
1639	SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1640
1641	/* If not cached, generate it via utf8::SWASHGET */
1642	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (U8)SvPV(*svp, slen))) {
1643	dSP;
1644	/* We use utf8n_to_uvuni() as we want an index into
1645	Unicode tables, not a native character number.
1646	*/
1647	UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1648	ckWARN(WARN_UTF8) ?
1649	0 : UTF8_ALLOW_ANY);
1650	SV *errsv_save;
1651	ENTER;
1652	SAVETMPS;
1653	save_re_context();
1654	PUSHSTACKi(PERLSI_MAGIC);
1655	PUSHMARK(SP);
1656	EXTEND(SP,3);
1657	PUSHs((SV*)sv);
1658	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1659	PUSHs(sv_2mortal(newSViv((klen) ?
1660	(code_point & ~(needents - 1)) : 0)));
1661	PUSHs(sv_2mortal(newSViv(needents)));
1662	PUTBACK;
1663	errsv_save = newSVsv(ERRSV);
1664	if (call_method("SWASHGET", G_SCALAR))
1665	retval = newSVsv(*PL_stack_sp--);
1666	else
1667	retval = &PL_sv_undef;
1668	if (!SvTRUE(ERRSV))
1669	sv_setsv(ERRSV, errsv_save);
1670	SvREFCNT_dec(errsv_save);
1671	POPSTACK;
1672	FREETMPS;
1673	LEAVE;
1674	if (PL_curcop == &PL_compiling)
1675	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1676
1677	svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1678
1679	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| (slen << 3) < needents)
1680	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1681	}
1682
1683	PL_last_swash_hv = hv;
1684	PL_last_swash_klen = klen;
1685	PL_last_swash_tmps = tmps;
1686	PL_last_swash_slen = slen;
1687	if (klen)
1688	Copy(ptr, PL_last_swash_key, klen, U8);
1689	}
1690
1691	switch ((int)((slen << 3) / needents)) {
1692	case 1:
1693	bit = 1 << (off & 7);
1694	off >>= 3;
1695	return (tmps[off] & bit) != 0;
1696	case 8:
1697	return tmps[off];
1698	case 16:
1699	off <<= 1;
1700	return (tmps[off] << 8) + tmps[off + 1] ;
1701	case 32:
1702	off <<= 2;
1703	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1704	}
1705	Perl_croak(aTHX_ "panic: swash_fetch");
1706	return 0;
1707	}
1708
1709
1710	/*
1711	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
1712
1713	Adds the UTF8 representation of the Native codepoint C<uv> to the end
1714	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1715	bytes available. The return value is the pointer to the byte after the
1716	end of the new character. In other words,
1717
1718	d = uvchr_to_utf8(d, uv);
1719
1720	is the recommended wide native character-aware way of saying
1721
1722	*(d++) = uv;
1723
1724	=cut
1725	*/
1726
1727	/* On ASCII machines this is normally a macro but we want a
1728	real function in case XS code wants it
1729	*/
1730	#undef Perl_uvchr_to_utf8
1731	U8 *
1732	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1733	{
1734	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1735	}
1736
1737	U8 *
1738	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1739	{
1740	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1741	}
1742
1743	/*
1744	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
1745
1746	Returns the native character value of the first character in the string C<s>
1747	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1748	length, in bytes, of that character.
1749
1750	Allows length and flags to be passed to low level routine.
1751
1752	=cut
1753	*/
1754	/* On ASCII machines this is normally a macro but we want
1755	a real function in case XS code wants it
1756	*/
1757	#undef Perl_utf8n_to_uvchr
1758	UV
1759	Perl_utf8n_to_uvchr(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
1760	{
1761	UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1762	return UNI_TO_NATIVE(uv);
1763	}
1764
1765	/*
1766	=for apidoc A\|char \|pv_uni_display\|SV dsv\|U8 *spv\|STRLEN len\|STRLEN pvlim\|UV flags
1767
1768	Build to the scalar dsv a displayable version of the string spv,
1769	length len, the displayable version being at most pvlim bytes long
1770	(if longer, the rest is truncated and "..." will be appended).
1771
1772	The flags argument can have UNI_DISPLAY_ISPRINT set to display
1773	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1774	to display the \\[nrfta\\] as the backslashed versions (like '\n')
1775	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1776	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1777	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1778
1779	The pointer to the PV of the dsv is returned.
1780
1781	=cut */
1782	char *
1783	Perl_pv_uni_display(pTHX_ SV dsv, U8 spv, STRLEN len, STRLEN pvlim, UV flags)
1784	{
1785	int truncated = 0;
1786	char s, e;
1787
1788	sv_setpvn(dsv, "", 0);
1789	for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1790	UV u;
1791	bool ok = FALSE;
1792
1793	if (pvlim && SvCUR(dsv) >= pvlim) {
1794	truncated++;
1795	break;
1796	}
1797	u = utf8_to_uvchr((U8*)s, 0);
1798	if (u < 256) {
1799	if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1800	switch (u & 0xFF) {
1801	case '\n':
1802	Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1803	case '\r':
1804	Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1805	case '\t':
1806	Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1807	case '\f':
1808	Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1809	case '\a':
1810	Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1811	case '\\':
1812	Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1813	default: break;
1814	}
1815	}
1816	/* isPRINT() is the locale-blind version. */
1817	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1818	Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1819	ok = TRUE;
1820	}
1821	}
1822	if (!ok)
1823	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1824	}
1825	if (truncated)
1826	sv_catpvn(dsv, "...", 3);
1827
1828	return SvPVX(dsv);
1829	}
1830
1831	/*
1832	=for apidoc A\|char \|sv_uni_display\|SV dsv\|SV *ssv\|STRLEN pvlim\|UV flags
1833
1834	Build to the scalar dsv a displayable version of the scalar sv,
1835	the displayable version being at most pvlim bytes long
1836	(if longer, the rest is truncated and "..." will be appended).
1837
1838	The flags argument is as in pv_uni_display().
1839
1840	The pointer to the PV of the dsv is returned.
1841
1842	=cut */
1843	char *
1844	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
1845	{
1846	return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1847	pvlim, flags);
1848	}
1849
1850	/*
1851	=for apidoc A\|I32\|ibcmp_utf8\|const char s1\|char pe1\|register UV l1\|bool u1\|const char s2\|char **pe2\|register UV l2\|bool u2
1852
1853	Return true if the strings s1 and s2 differ case-insensitively, false
1854	if not (if they are equal case-insensitively). If u1 is true, the
1855	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1856	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1857	are false, the respective string is assumed to be in native 8-bit
1858	encoding.
1859
1860	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1861	in there (they will point at the beginning of the I<next> character).
1862	If the pointers behind pe1 or pe2 are non-NULL, they are the end
1863	pointers beyond which scanning will not continue under any
1864	circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1865	s2+l2 will be used as goal end pointers that will also stop the scan,
1866	and which qualify towards defining a successful match: all the scans
1867	that define an explicit length must reach their goal pointers for
1868	a match to succeed).
1869
1870	For case-insensitiveness, the "casefolding" of Unicode is used
1871	instead of upper/lowercasing both the characters, see
1872	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1873
1874	=cut */
1875	I32
1876	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
1877	{
1878	register U8 p1 = (U8)s1;
1879	register U8 p2 = (U8)s2;
1880	register U8 e1 = 0, f1 = 0, *q1 = 0;
1881	register U8 e2 = 0, f2 = 0, *q2 = 0;
1882	STRLEN n1 = 0, n2 = 0;
1883	U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1884	U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1885	U8 natbuf[1+1];
1886	STRLEN foldlen1, foldlen2;
1887	bool match;
1888
1889	if (pe1)
1890	e1 = (U8*)pe1;
1891	if (e1 == 0 \|\| (l1 && l1 < (UV)(e1 - (U8*)s1)))
1892	f1 = (U8*)s1 + l1;
1893	if (pe2)
1894	e2 = (U8*)pe2;
1895	if (e2 == 0 \|\| (l2 && l2 < (UV)(e2 - (U8*)s2)))
1896	f2 = (U8*)s2 + l2;
1897
1898	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
1899	return 1; /* mismatch; possible infinite loop or false positive */
1900
1901	if (!u1 \|\| !u2)
1902	natbuf[1] = 0; /* Need to terminate the buffer. */
1903
1904	while ((e1 == 0 \|\| p1 < e1) &&
1905	(f1 == 0 \|\| p1 < f1) &&
1906	(e2 == 0 \|\| p2 < e2) &&
1907	(f2 == 0 \|\| p2 < f2)) {
1908	if (n1 == 0) {
1909	if (u1)
1910	to_utf8_fold(p1, foldbuf1, &foldlen1);
1911	else {
1912	natbuf[0] = *p1;
1913	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1914	}
1915	q1 = foldbuf1;
1916	n1 = foldlen1;
1917	}
1918	if (n2 == 0) {
1919	if (u2)
1920	to_utf8_fold(p2, foldbuf2, &foldlen2);
1921	else {
1922	natbuf[0] = *p2;
1923	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1924	}
1925	q2 = foldbuf2;
1926	n2 = foldlen2;
1927	}
1928	while (n1 && n2) {
1929	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
1930	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
1931	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
1932	return 1; /* mismatch */
1933	n1 -= UTF8SKIP(q1);
1934	q1 += UTF8SKIP(q1);
1935	n2 -= UTF8SKIP(q2);
1936	q2 += UTF8SKIP(q2);
1937	}
1938	if (n1 == 0)
1939	p1 += u1 ? UTF8SKIP(p1) : 1;
1940	if (n2 == 0)
1941	p2 += u2 ? UTF8SKIP(p2) : 1;
1942
1943	}
1944
1945	/* A match is defined by all the scans that specified
1946	* an explicit length reaching their final goals. */
1947	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
1948
1949	if (match) {
1950	if (pe1)
1951	pe1 = (char)p1;
1952	if (pe2)
1953	pe2 = (char)p2;
1954	}
1955
1956	return match ? 0 : 1; /* 0 match, 1 mismatch */
1957	}
1958

Note: See TracBrowser for help on using the repository browser.

Download in other formats: