1
2 /*
3 * Copyright (C) Igor Sysoev
4 * Copyright (C) NGINX, Inc.
5 */
6
7 #include <nxt_main.h>
8
/*
 * The nxt_unicode_lowcase.h file is auto-generated from
 * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.:
 *
 *   ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt
 *
 * This file should be copied to a system specific
 * nxt_unicode_SYSTEM_lowcase.h file and utf8_file_name_test should be
 * built with this file.  Then a correct system specific file should be
 * generated:
 *
 *   ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl
 *
 * Only common and simple case foldings are supported.  Full case foldings
 * are not supported.  Combined characters are also not supported.
 */
24
25 #if (NXT_MACOSX)
26 #include <nxt_unicode_macosx_lowcase.h>
27
28 #else
29 #include <nxt_unicode_lowcase.h>
30 #endif
31
32
33 u_char *
nxt_utf8_encode(u_char * p,uint32_t u)34 nxt_utf8_encode(u_char *p, uint32_t u)
35 {
36 if (u < 0x80) {
37 *p++ = (u_char) (u & 0xFF);
38 return p;
39 }
40
41 if (u < 0x0800) {
42 *p++ = (u_char) (( u >> 6) | 0xC0);
43 *p++ = (u_char) (( u & 0x3F) | 0x80);
44 return p;
45 }
46
47 if (u < 0x10000) {
48 *p++ = (u_char) ( (u >> 12) | 0xE0);
49 *p++ = (u_char) (((u >> 6) & 0x3F) | 0x80);
50 *p++ = (u_char) (( u & 0x3F) | 0x80);
51 return p;
52 }
53
54 if (u < 0x110000) {
55 *p++ = (u_char) ( (u >> 18) | 0xF0);
56 *p++ = (u_char) (((u >> 12) & 0x3F) | 0x80);
57 *p++ = (u_char) (((u >> 6) & 0x3F) | 0x80);
58 *p++ = (u_char) (( u & 0x3F) | 0x80);
59 return p;
60 }
61
62 return NULL;
63 }
64
65
66 /*
67 * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid
68 * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong
69 * UTF-8 sequence.
70 */
71
72 uint32_t
nxt_utf8_decode(const u_char ** start,const u_char * end)73 nxt_utf8_decode(const u_char **start, const u_char *end)
74 {
75 uint32_t u;
76
77 u = (uint32_t) **start;
78
79 if (u < 0x80) {
80 (*start)++;
81 return u;
82 }
83
84 return nxt_utf8_decode2(start, end);
85 }
86
87
88 /*
89 * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only
90 * and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for
91 * invalid or overlong UTF-8 sequence.
92 */
93
94 uint32_t
nxt_utf8_decode2(const u_char ** start,const u_char * end)95 nxt_utf8_decode2(const u_char **start, const u_char *end)
96 {
97 u_char c;
98 size_t n;
99 uint32_t u, overlong;
100 const u_char *p;
101
102 p = *start;
103 u = (uint32_t) *p;
104
105 if (u >= 0xE0) {
106
107 if (u >= 0xF0) {
108
109 if (nxt_slow_path(u > 0xF4)) {
110 /*
111 * The maximum valid Unicode character is 0x10FFFF
112 * which is encoded as 0xF4 0x8F 0xBF 0xBF.
113 */
114 return 0xFFFFFFFF;
115 }
116
117 u &= 0x07;
118 overlong = 0x00FFFF;
119 n = 3;
120
121 } else {
122 u &= 0x0F;
123 overlong = 0x07FF;
124 n = 2;
125 }
126
127 } else if (u >= 0xC2) {
128
129 /* 0x80 is encoded as 0xC2 0x80. */
130
131 u &= 0x1F;
132 overlong = 0x007F;
133 n = 1;
134
135 } else {
136 /* u <= 0xC2 */
137 return 0xFFFFFFFF;
138 }
139
140 p++;
141
142 if (nxt_fast_path(p + n <= end)) {
143
144 do {
145 c = *p++;
146 /*
147 * The byte must in the 0x80 - 0xBF range.
148 * Values below 0x80 become >= 0x80.
149 */
150 c = c - 0x80;
151
152 if (nxt_slow_path(c > 0x3F)) {
153 return 0xFFFFFFFF;
154 }
155
156 u = (u << 6) | c;
157 n--;
158
159 } while (n != 0);
160
161 if (overlong < u && u < 0x110000) {
162 *start = p;
163 return u;
164 }
165 }
166
167 return 0xFFFFFFFF;
168 }
169
170
171 /*
172 * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but
173 * requires lengths of both strings because otherwise nxt_utf8_decode2()
174 * may fail due to incomplete sequence.
175 */
176
177 nxt_int_t
nxt_utf8_casecmp(const u_char * start1,const u_char * start2,size_t len1,size_t len2)178 nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1,
179 size_t len2)
180 {
181 int32_t n;
182 uint32_t u1, u2;
183 const u_char *end1, *end2;
184
185 end1 = start1 + len1;
186 end2 = start2 + len2;
187
188 while (start1 < end1 && start2 < end2) {
189
190 u1 = nxt_utf8_lowcase(&start1, end1);
191
192 u2 = nxt_utf8_lowcase(&start2, end2);
193
194 if (nxt_slow_path((u1 | u2) == 0xFFFFFFFF)) {
195 return NXT_UTF8_SORT_INVALID;
196 }
197
198 n = u1 - u2;
199
200 if (n != 0) {
201 return (nxt_int_t) n;
202 }
203 }
204
205 return 0;
206 }
207
208
209 uint32_t
nxt_utf8_lowcase(const u_char ** start,const u_char * end)210 nxt_utf8_lowcase(const u_char **start, const u_char *end)
211 {
212 uint32_t u;
213 const uint32_t *block;
214
215 u = (uint32_t) **start;
216
217 if (nxt_fast_path(u < 0x80)) {
218 (*start)++;
219
220 return nxt_unicode_block_000[u];
221 }
222
223 u = nxt_utf8_decode2(start, end);
224
225 if (u <= NXT_UNICODE_MAX_LOWCASE) {
226 block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE];
227
228 if (block != NULL) {
229 return block[u % NXT_UNICODE_BLOCK_SIZE];
230 }
231 }
232
233 return u;
234 }
235
236
237 ssize_t
nxt_utf8_length(const u_char * p,size_t len)238 nxt_utf8_length(const u_char *p, size_t len)
239 {
240 ssize_t length;
241 const u_char *end;
242
243 length = 0;
244
245 end = p + len;
246
247 while (p < end) {
248 if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
249 return -1;
250 }
251
252 length++;
253 }
254
255 return length;
256 }
257
258
259 nxt_bool_t
nxt_utf8_is_valid(const u_char * p,size_t len)260 nxt_utf8_is_valid(const u_char *p, size_t len)
261 {
262 const u_char *end;
263
264 end = p + len;
265
266 while (p < end) {
267 if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
268 return 0;
269 }
270 }
271
272 return 1;
273 }
274