xref: /unit/src/nxt_utf8.c (revision 611:323e11065f83)
1 
2 /*
3  * Copyright (C) Igor Sysoev
4  * Copyright (C) NGINX, Inc.
5  */
6 
7 #include <nxt_main.h>
8 
9 /*
10  * The nxt_unicode_lowcase.h file is the auto-generated file from
11  * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.:
12  *
13  *   ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt
14  *
15  * This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h
16  * file and utf8_file_name_test should be built with this file.
17  * Then a correct system specific file should be generated:
18  *
19  *   ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl
20  *
 * Only common and simple case foldings are supported.  Full case folding
 * is not supported.  Combined characters are also not supported.
23  */
24 
25 #if (NXT_MACOSX)
26 #include <nxt_unicode_macosx_lowcase.h>
27 
28 #else
29 #include <nxt_unicode_lowcase.h>
30 #endif
31 
32 
/*
 * nxt_utf8_encode() stores the UTF-8 encoding of the code point "u"
 * at "p" and returns the pointer just past the last byte written.
 * One to four bytes are written; the function takes no buffer length,
 * so the space available at "p" is not checked here.
 *
 * NULL is returned, and nothing is written, if "u" is 0x110000 or
 * greater.  Surrogate values 0xD800 - 0xDFFF are not rejected.
 */

u_char *
nxt_utf8_encode(u_char *p, uint32_t u)
{
    u_char    lead;
    unsigned  shift;

    if (u < 0x80) {
        /* ASCII is stored as is. */
        *p = (u_char) u;
        return p + 1;
    }

    /*
     * Select the lead byte marker and the position of the bits
     * which are stored in the lead byte.
     */

    if (u < 0x0800) {
        lead = 0xC0;
        shift = 6;

    } else if (u < 0x10000) {
        lead = 0xE0;
        shift = 12;

    } else if (u < 0x110000) {
        lead = 0xF0;
        shift = 18;

    } else {
        return NULL;
    }

    *p++ = (u_char) ((u >> shift) | lead);

    /* Each continuation byte carries the next lower six bits. */

    while (shift != 0) {
        shift -= 6;
        *p++ = (u_char) (((u >> shift) & 0x3F) | 0x80);
    }

    return p;
}
64 
65 
66 /*
67  * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid
68  * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong
69  * UTF-8 sequence.
70  */
71 
72 uint32_t
nxt_utf8_decode(const u_char ** start,const u_char * end)73 nxt_utf8_decode(const u_char **start, const u_char *end)
74 {
75     uint32_t  u;
76 
77     u = (uint32_t) **start;
78 
79     if (u < 0x80) {
80         (*start)++;
81         return u;
82     }
83 
84     return nxt_utf8_decode2(start, end);
85 }
86 
87 
88 /*
89  * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only
90  * and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for
91  * invalid or overlong UTF-8 sequence.
92  */
93 
94 uint32_t
nxt_utf8_decode2(const u_char ** start,const u_char * end)95 nxt_utf8_decode2(const u_char **start, const u_char *end)
96 {
97     u_char        c;
98     size_t        n;
99     uint32_t      u, overlong;
100     const u_char  *p;
101 
102     p = *start;
103     u = (uint32_t) *p;
104 
105     if (u >= 0xE0) {
106 
107         if (u >= 0xF0) {
108 
109             if (nxt_slow_path(u > 0xF4)) {
110                 /*
111                  * The maximum valid Unicode character is 0x10FFFF
112                  * which is encoded as 0xF4 0x8F 0xBF 0xBF.
113                  */
114                 return 0xFFFFFFFF;
115             }
116 
117             u &= 0x07;
118             overlong = 0x00FFFF;
119             n = 3;
120 
121         } else {
122             u &= 0x0F;
123             overlong = 0x07FF;
124             n = 2;
125         }
126 
127     } else if (u >= 0xC2) {
128 
129         /* 0x80 is encoded as 0xC2 0x80. */
130 
131         u &= 0x1F;
132         overlong = 0x007F;
133         n = 1;
134 
135     } else {
136         /* u <= 0xC2 */
137         return 0xFFFFFFFF;
138     }
139 
140     p++;
141 
142     if (nxt_fast_path(p + n <= end)) {
143 
144         do {
145             c = *p++;
146             /*
147              * The byte must in the 0x80 - 0xBF range.
148              * Values below 0x80 become >= 0x80.
149              */
150             c = c - 0x80;
151 
152             if (nxt_slow_path(c > 0x3F)) {
153                 return 0xFFFFFFFF;
154             }
155 
156             u = (u << 6) | c;
157             n--;
158 
159         } while (n != 0);
160 
161         if (overlong < u && u < 0x110000) {
162             *start = p;
163             return u;
164         }
165     }
166 
167     return 0xFFFFFFFF;
168 }
169 
170 
171 /*
172  * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but
173  * requires lengths of both strings because otherwise nxt_utf8_decode2()
174  * may fail due to incomplete sequence.
175  */
176 
177 nxt_int_t
nxt_utf8_casecmp(const u_char * start1,const u_char * start2,size_t len1,size_t len2)178 nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1,
179     size_t len2)
180 {
181     int32_t       n;
182     uint32_t      u1, u2;
183     const u_char  *end1, *end2;
184 
185     end1 = start1 + len1;
186     end2 = start2 + len2;
187 
188     while (start1 < end1 && start2 < end2) {
189 
190         u1 = nxt_utf8_lowcase(&start1, end1);
191 
192         u2 = nxt_utf8_lowcase(&start2, end2);
193 
194         if (nxt_slow_path((u1 | u2) == 0xFFFFFFFF)) {
195             return NXT_UTF8_SORT_INVALID;
196         }
197 
198         n = u1 - u2;
199 
200         if (n != 0) {
201             return (nxt_int_t) n;
202         }
203     }
204 
205     return 0;
206 }
207 
208 
209 uint32_t
nxt_utf8_lowcase(const u_char ** start,const u_char * end)210 nxt_utf8_lowcase(const u_char **start, const u_char *end)
211 {
212     uint32_t        u;
213     const uint32_t  *block;
214 
215     u = (uint32_t) **start;
216 
217     if (nxt_fast_path(u < 0x80)) {
218         (*start)++;
219 
220         return nxt_unicode_block_000[u];
221     }
222 
223     u = nxt_utf8_decode2(start, end);
224 
225     if (u <= NXT_UNICODE_MAX_LOWCASE) {
226         block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE];
227 
228         if (block != NULL) {
229             return block[u % NXT_UNICODE_BLOCK_SIZE];
230         }
231     }
232 
233     return u;
234 }
235 
236 
237 ssize_t
nxt_utf8_length(const u_char * p,size_t len)238 nxt_utf8_length(const u_char *p, size_t len)
239 {
240     ssize_t       length;
241     const u_char  *end;
242 
243     length = 0;
244 
245     end = p + len;
246 
247     while (p < end) {
248         if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
249             return -1;
250         }
251 
252         length++;
253     }
254 
255     return length;
256 }
257 
258 
259 nxt_bool_t
nxt_utf8_is_valid(const u_char * p,size_t len)260 nxt_utf8_is_valid(const u_char *p, size_t len)
261 {
262     const u_char  *end;
263 
264     end = p + len;
265 
266     while (p < end) {
267         if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
268             return 0;
269         }
270     }
271 
272     return 1;
273 }
274