1 2 /* 3 * Copyright (C) Igor Sysoev 4 * Copyright (C) NGINX, Inc. 5 */ 6 7 #include <nxt_main.h> 8 9 /* 10 * The nxt_unicode_lowcase.h file is the auto-generated file from 11 * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.: 12 * 13 * ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt 14 * 15 * This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h 16 * file and utf8_file_name_test should be built with this file. 17 * Then a correct system specific file should be generated: 18 * 19 * ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl 20 * 21 * Only common and simple case foldings are supported. Full case foldings 22 * is not supported. Combined characters are also not supported. 23 */ 24 25 #if (NXT_MACOSX) 26 #include <nxt_unicode_macosx_lowcase.h> 27 28 #else 29 #include <nxt_unicode_lowcase.h> 30 #endif 31 32 33 u_char * 34 nxt_utf8_encode(u_char *p, uint32_t u) 35 { 36 if (u < 0x80) { 37 *p++ = (u_char) (u & 0xff); 38 return p; 39 } 40 41 if (u < 0x0800) { 42 *p++ = (u_char) (( u >> 6) | 0xc0); 43 *p++ = (u_char) (( u & 0x3f) | 0x80); 44 return p; 45 } 46 47 if (u < 0x10000) { 48 *p++ = (u_char) ( (u >> 12) | 0xe0); 49 *p++ = (u_char) (((u >> 6) & 0x3f) | 0x80); 50 *p++ = (u_char) (( u & 0x3f) | 0x80); 51 return p; 52 } 53 54 if (u < 0x110000) { 55 *p++ = (u_char) ( (u >> 18) | 0xf0); 56 *p++ = (u_char) (((u >> 12) & 0x3f) | 0x80); 57 *p++ = (u_char) (((u >> 6) & 0x3f) | 0x80); 58 *p++ = (u_char) (( u & 0x3f) | 0x80); 59 return p; 60 } 61 62 return NULL; 63 } 64 65 66 /* 67 * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid 68 * character 0x00 - 0x10ffff, or 0xffffffff for invalid or overlong 69 * UTF-8 sequence. 70 */ 71 72 uint32_t 73 nxt_utf8_decode(const u_char **start, const u_char *end) 74 { 75 uint32_t u; 76 77 u = (uint32_t) **start; 78 79 if (u < 0x80) { 80 (*start)++; 81 return u; 82 } 83 84 return nxt_utf8_decode2(start, end); 85 } 86 87 88 /* 89 * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only 90 * and returns a valid character 0x80 - 0x10ffff, or 0xffffffff for 91 * invalid or overlong UTF-8 sequence. 92 */ 93 94 uint32_t 95 nxt_utf8_decode2(const u_char **start, const u_char *end) 96 { 97 u_char c; 98 size_t n; 99 uint32_t u, overlong; 100 const u_char *p; 101 102 p = *start; 103 u = (uint32_t) *p; 104 105 if (u >= 0xe0) { 106 107 if (u >= 0xf0) { 108 109 if (nxt_slow_path(u > 0xf4)) { 110 /* 111 * The maximum valid Unicode character is 0x10ffff 112 * which is encoded as 0xf4 0x8f 0xbf 0xbf. 113 */ 114 return 0xffffffff; 115 } 116 117 u &= 0x07; 118 overlong = 0x00ffff; 119 n = 3; 120 121 } else { 122 u &= 0x0f; 123 overlong = 0x07ff; 124 n = 2; 125 } 126 127 } else if (u >= 0xc2) { 128 129 /* 0x80 is encoded as 0xc2 0x80. */ 130 131 u &= 0x1f; 132 overlong = 0x007f; 133 n = 1; 134 135 } else { 136 /* u <= 0xc2 */ 137 return 0xffffffff; 138 } 139 140 p++; 141 142 if (nxt_fast_path(p + n <= end)) { 143 144 do { 145 c = *p++; 146 /* 147 * The byte must in the 0x80 - 0xbf range. 148 * Values below 0x80 become >= 0x80. 149 */ 150 c = c - 0x80; 151 152 if (nxt_slow_path(c > 0x3f)) { 153 return 0xffffffff; 154 } 155 156 u = (u << 6) | c; 157 n--; 158 159 } while (n != 0); 160 161 if (overlong < u && u < 0x110000) { 162 *start = p; 163 return u; 164 } 165 } 166 167 return 0xffffffff; 168 } 169 170 171 /* 172 * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but 173 * requires lengths of both strings because otherwise nxt_utf8_decode2() 174 * may fail due to incomplete sequence. 175 */ 176 177 nxt_int_t 178 nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1, 179 size_t len2) 180 { 181 int32_t n; 182 uint32_t u1, u2; 183 const u_char *end1, *end2; 184 185 end1 = start1 + len1; 186 end2 = start2 + len2; 187 188 while (start1 < end1 && start2 < end2) { 189 190 u1 = nxt_utf8_lowcase(&start1, end1); 191 192 u2 = nxt_utf8_lowcase(&start2, end2); 193 194 if (nxt_slow_path((u1 | u2) == 0xffffffff)) { 195 return NXT_UTF8_SORT_INVALID; 196 } 197 198 n = u1 - u2; 199 200 if (n != 0) { 201 return (nxt_int_t) n; 202 } 203 } 204 205 return 0; 206 } 207 208 209 uint32_t 210 nxt_utf8_lowcase(const u_char **start, const u_char *end) 211 { 212 uint32_t u; 213 const uint32_t *block; 214 215 u = (uint32_t) **start; 216 217 if (nxt_fast_path(u < 0x80)) { 218 (*start)++; 219 220 return nxt_unicode_block_000[u]; 221 } 222 223 u = nxt_utf8_decode2(start, end); 224 225 if (u <= NXT_UNICODE_MAX_LOWCASE) { 226 block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE]; 227 228 if (block != NULL) { 229 return block[u % NXT_UNICODE_BLOCK_SIZE]; 230 } 231 } 232 233 return u; 234 } 235 236 237 ssize_t 238 nxt_utf8_length(const u_char *p, size_t len) 239 { 240 ssize_t length; 241 const u_char *end; 242 243 length = 0; 244 245 end = p + len; 246 247 while (p < end) { 248 if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xffffffff)) { 249 return -1; 250 } 251 252 length++; 253 } 254 255 return length; 256 } 257 258 259 nxt_bool_t 260 nxt_utf8_is_valid(const u_char *p, size_t len) 261 { 262 const u_char *end; 263 264 end = p + len; 265 266 while (p < end) { 267 if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xffffffff)) { 268 return 0; 269 } 270 } 271 272 return 1; 273 } 274