Deleted
Added
nxt_utf8.c (0:a63ceefd6ab0) | nxt_utf8.c (611:323e11065f83) |
---|---|
1 2/* 3 * Copyright (C) Igor Sysoev 4 * Copyright (C) NGINX, Inc. 5 */ 6 7#include <nxt_main.h> 8 --- 20 unchanged lines hidden (view full) --- 29#include <nxt_unicode_lowcase.h> 30#endif 31 32 33u_char * 34nxt_utf8_encode(u_char *p, uint32_t u) 35{ 36 if (u < 0x80) { | 1 2/* 3 * Copyright (C) Igor Sysoev 4 * Copyright (C) NGINX, Inc. 5 */ 6 7#include <nxt_main.h> 8 --- 20 unchanged lines hidden (view full) --- 29#include <nxt_unicode_lowcase.h> 30#endif 31 32 33u_char * 34nxt_utf8_encode(u_char *p, uint32_t u) 35{ 36 if (u < 0x80) { |
37 *p++ = (u_char) (u & 0xff); | 37 *p++ = (u_char) (u & 0xFF); |
38 return p; 39 } 40 41 if (u < 0x0800) { | 38 return p; 39 } 40 41 if (u < 0x0800) { |
42 *p++ = (u_char) (( u >> 6) | 0xc0); 43 *p++ = (u_char) (( u & 0x3f) | 0x80); | 42 *p++ = (u_char) (( u >> 6) | 0xC0); 43 *p++ = (u_char) (( u & 0x3F) | 0x80); |
44 return p; 45 } 46 47 if (u < 0x10000) { | 44 return p; 45 } 46 47 if (u < 0x10000) { |
48 *p++ = (u_char) ( (u >> 12) | 0xe0); 49 *p++ = (u_char) (((u >> 6) & 0x3f) | 0x80); 50 *p++ = (u_char) (( u & 0x3f) | 0x80); | 48 *p++ = (u_char) ( (u >> 12) | 0xE0); 49 *p++ = (u_char) (((u >> 6) & 0x3F) | 0x80); 50 *p++ = (u_char) (( u & 0x3F) | 0x80); |
51 return p; 52 } 53 54 if (u < 0x110000) { | 51 return p; 52 } 53 54 if (u < 0x110000) { |
55 *p++ = (u_char) ( (u >> 18) | 0xf0); 56 *p++ = (u_char) (((u >> 12) & 0x3f) | 0x80); 57 *p++ = (u_char) (((u >> 6) & 0x3f) | 0x80); 58 *p++ = (u_char) (( u & 0x3f) | 0x80); | 55 *p++ = (u_char) ( (u >> 18) | 0xF0); 56 *p++ = (u_char) (((u >> 12) & 0x3F) | 0x80); 57 *p++ = (u_char) (((u >> 6) & 0x3F) | 0x80); 58 *p++ = (u_char) (( u & 0x3F) | 0x80); |
59 return p; 60 } 61 62 return NULL; 63} 64 65 66/* 67 * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid | 59 return p; 60 } 61 62 return NULL; 63} 64 65 66/* 67 * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid |
68 * character 0x00 - 0x10ffff, or 0xffffffff for invalid or overlong | 68 * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong |
69 * UTF-8 sequence. 70 */ 71 72uint32_t 73nxt_utf8_decode(const u_char **start, const u_char *end) 74{ 75 uint32_t u; 76 --- 5 unchanged lines hidden (view full) --- 82 } 83 84 return nxt_utf8_decode2(start, end); 85} 86 87 88/* 89 * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only | 69 * UTF-8 sequence. 70 */ 71 72uint32_t 73nxt_utf8_decode(const u_char **start, const u_char *end) 74{ 75 uint32_t u; 76 --- 5 unchanged lines hidden (view full) --- 82 } 83 84 return nxt_utf8_decode2(start, end); 85} 86 87 88/* 89 * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only |
90 * and returns a valid character 0x80 - 0x10ffff, or 0xffffffff for | 90 * and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for |
91 * invalid or overlong UTF-8 sequence. 92 */ 93 94uint32_t 95nxt_utf8_decode2(const u_char **start, const u_char *end) 96{ 97 u_char c; 98 size_t n; 99 uint32_t u, overlong; 100 const u_char *p; 101 102 p = *start; 103 u = (uint32_t) *p; 104 | 91 * invalid or overlong UTF-8 sequence. 92 */ 93 94uint32_t 95nxt_utf8_decode2(const u_char **start, const u_char *end) 96{ 97 u_char c; 98 size_t n; 99 uint32_t u, overlong; 100 const u_char *p; 101 102 p = *start; 103 u = (uint32_t) *p; 104 |
105 if (u >= 0xe0) { | 105 if (u >= 0xE0) { |
106 | 106 |
107 if (u >= 0xf0) { | 107 if (u >= 0xF0) { |
108 | 108 |
109 if (nxt_slow_path(u > 0xf4)) { | 109 if (nxt_slow_path(u > 0xF4)) { |
110 /* | 110 /* |
111 * The maximum valid Unicode character is 0x10ffff 112 * which is encoded as 0xf4 0x8f 0xbf 0xbf. | 111 * The maximum valid Unicode character is 0x10FFFF 112 * which is encoded as 0xF4 0x8F 0xBF 0xBF. |
113 */ | 113 */ |
114 return 0xffffffff; | 114 return 0xFFFFFFFF; |
115 } 116 117 u &= 0x07; | 115 } 116 117 u &= 0x07; |
118 overlong = 0x00ffff; | 118 overlong = 0x00FFFF; |
119 n = 3; 120 121 } else { | 119 n = 3; 120 121 } else { |
122 u &= 0x0f; 123 overlong = 0x07ff; | 122 u &= 0x0F; 123 overlong = 0x07FF; |
124 n = 2; 125 } 126 | 124 n = 2; 125 } 126 |
127 } else if (u >= 0xc2) { | 127 } else if (u >= 0xC2) { |
128 | 128 |
129 /* 0x80 is encoded as 0xc2 0x80. */ | 129 /* 0x80 is encoded as 0xC2 0x80. */ |
130 | 130 |
131 u &= 0x1f; 132 overlong = 0x007f; | 131 u &= 0x1F; 132 overlong = 0x007F; |
133 n = 1; 134 135 } else { | 133 n = 1; 134 135 } else { |
136 /* u < 0xc2 */ 137 return 0xffffffff; | 136 /* u < 0xC2 */ 137 return 0xFFFFFFFF; |
138 } 139 140 p++; 141 142 if (nxt_fast_path(p + n <= end)) { 143 144 do { 145 c = *p++; 146 /* | 138 } 139 140 p++; 141 142 if (nxt_fast_path(p + n <= end)) { 143 144 do { 145 c = *p++; 146 /* |
147 * The byte must be in the 0x80 - 0xbf range. | 147 * The byte must be in the 0x80 - 0xBF range. |
148 * Values below 0x80 become >= 0x80. 149 */ 150 c = c - 0x80; 151 | 148 * Values below 0x80 become >= 0x80. 149 */ 150 c = c - 0x80; 151 |
152 if (nxt_slow_path(c > 0x3f)) { 153 return 0xffffffff; | 152 if (nxt_slow_path(c > 0x3F)) { 153 return 0xFFFFFFFF; |
154 } 155 156 u = (u << 6) | c; 157 n--; 158 159 } while (n != 0); 160 161 if (overlong < u && u < 0x110000) { 162 *start = p; 163 return u; 164 } 165 } 166 | 154 } 155 156 u = (u << 6) | c; 157 n--; 158 159 } while (n != 0); 160 161 if (overlong < u && u < 0x110000) { 162 *start = p; 163 return u; 164 } 165 } 166 |
167 return 0xffffffff; | 167 return 0xFFFFFFFF; |
168} 169 170 171/* 172 * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but 173 * requires lengths of both strings because otherwise nxt_utf8_decode2() 174 * may fail due to incomplete sequence. 175 */ --- 10 unchanged lines hidden (view full) --- 186 end2 = start2 + len2; 187 188 while (start1 < end1 && start2 < end2) { 189 190 u1 = nxt_utf8_lowcase(&start1, end1); 191 192 u2 = nxt_utf8_lowcase(&start2, end2); 193 | 168} 169 170 171/* 172 * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but 173 * requires lengths of both strings because otherwise nxt_utf8_decode2() 174 * may fail due to incomplete sequence. 175 */ --- 10 unchanged lines hidden (view full) --- 186 end2 = start2 + len2; 187 188 while (start1 < end1 && start2 < end2) { 189 190 u1 = nxt_utf8_lowcase(&start1, end1); 191 192 u2 = nxt_utf8_lowcase(&start2, end2); 193 |
194 if (nxt_slow_path((u1 | u2) == 0xffffffff)) { | 194 if (nxt_slow_path((u1 | u2) == 0xFFFFFFFF)) { |
195 return NXT_UTF8_SORT_INVALID; 196 } 197 198 n = u1 - u2; 199 200 if (n != 0) { 201 return (nxt_int_t) n; 202 } --- 37 unchanged lines hidden (view full) --- 240 ssize_t length; 241 const u_char *end; 242 243 length = 0; 244 245 end = p + len; 246 247 while (p < end) { | 195 return NXT_UTF8_SORT_INVALID; 196 } 197 198 n = u1 - u2; 199 200 if (n != 0) { 201 return (nxt_int_t) n; 202 } --- 37 unchanged lines hidden (view full) --- 240 ssize_t length; 241 const u_char *end; 242 243 length = 0; 244 245 end = p + len; 246 247 while (p < end) { |
248 if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xffffffff)) { | 248 if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) { |
249 return -1; 250 } 251 252 length++; 253 } 254 255 return length; 256} 257 258 259nxt_bool_t 260nxt_utf8_is_valid(const u_char *p, size_t len) 261{ 262 const u_char *end; 263 264 end = p + len; 265 266 while (p < end) { | 249 return -1; 250 } 251 252 length++; 253 } 254 255 return length; 256} 257 258 259nxt_bool_t 260nxt_utf8_is_valid(const u_char *p, size_t len) 261{ 262 const u_char *end; 263 264 end = p + len; 265 266 while (p < end) { |
267 if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xffffffff)) { | 267 if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) { |
268 return 0; 269 } 270 } 271 272 return 1; 273} | 268 return 0; 269 } 270 } 271 272 return 1; 273} |