1
2
3
4
5
6
7
8 package utf8
9
10
11
12
13
14
15 const (
16 RuneError = '\uFFFD'
17 RuneSelf = 0x80
18 MaxRune = '\U0010FFFF'
19 UTFMax = 4
20 )
21
22
23 const (
24 surrogateMin = 0xD800
25 surrogateMax = 0xDFFF
26 )
27
28 const (
29 t1 = 0b00000000
30 tx = 0b10000000
31 t2 = 0b11000000
32 t3 = 0b11100000
33 t4 = 0b11110000
34 t5 = 0b11111000
35
36 maskx = 0b00111111
37 mask2 = 0b00011111
38 mask3 = 0b00001111
39 mask4 = 0b00000111
40
41 rune1Max = 1<<7 - 1
42 rune2Max = 1<<11 - 1
43 rune3Max = 1<<16 - 1
44
45
46 locb = 0b10000000
47 hicb = 0b10111111
48
49
50
51
52
53 xx = 0xF1
54 as = 0xF0
55 s1 = 0x02
56 s2 = 0x13
57 s3 = 0x03
58 s4 = 0x23
59 s5 = 0x34
60 s6 = 0x04
61 s7 = 0x44
62 )
63
64
65 var first = [256]uint8{
66
67 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
68 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
69 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
70 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
71 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
72 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
73 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
74 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
75
76 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
77 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
78 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
79 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
80 xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
81 s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
82 s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
83 s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
84 }
85
86
87
88 type acceptRange struct {
89 lo uint8
90 hi uint8
91 }
92
93
94 var acceptRanges = [16]acceptRange{
95 0: {locb, hicb},
96 1: {0xA0, hicb},
97 2: {locb, 0x9F},
98 3: {0x90, hicb},
99 4: {locb, 0x8F},
100 }
101
102
103
104 func FullRune(p []byte) bool {
105 n := len(p)
106 if n == 0 {
107 return false
108 }
109 x := first[p[0]]
110 if n >= int(x&7) {
111 return true
112 }
113
114 accept := acceptRanges[x>>4]
115 if n > 1 && (p[1] < accept.lo || accept.hi < p[1]) {
116 return true
117 } else if n > 2 && (p[2] < locb || hicb < p[2]) {
118 return true
119 }
120 return false
121 }
122
123
124 func FullRuneInString(s string) bool {
125 n := len(s)
126 if n == 0 {
127 return false
128 }
129 x := first[s[0]]
130 if n >= int(x&7) {
131 return true
132 }
133
134 accept := acceptRanges[x>>4]
135 if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
136 return true
137 } else if n > 2 && (s[2] < locb || hicb < s[2]) {
138 return true
139 }
140 return false
141 }
142
143
144
145
146
147
148
149
150
151 func DecodeRune(p []byte) (r rune, size int) {
152 n := len(p)
153 if n < 1 {
154 return RuneError, 0
155 }
156 p0 := p[0]
157 x := first[p0]
158 if x >= as {
159
160
161
162 mask := rune(x) << 31 >> 31
163 return rune(p[0])&^mask | RuneError&mask, 1
164 }
165 sz := int(x & 7)
166 accept := acceptRanges[x>>4]
167 if n < sz {
168 return RuneError, 1
169 }
170 b1 := p[1]
171 if b1 < accept.lo || accept.hi < b1 {
172 return RuneError, 1
173 }
174 if sz <= 2 {
175 return rune(p0&mask2)<<6 | rune(b1&maskx), 2
176 }
177 b2 := p[2]
178 if b2 < locb || hicb < b2 {
179 return RuneError, 1
180 }
181 if sz <= 3 {
182 return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
183 }
184 b3 := p[3]
185 if b3 < locb || hicb < b3 {
186 return RuneError, 1
187 }
188 return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
189 }
190
191
192
193
194
195
196
197
198
199 func DecodeRuneInString(s string) (r rune, size int) {
200 n := len(s)
201 if n < 1 {
202 return RuneError, 0
203 }
204 s0 := s[0]
205 x := first[s0]
206 if x >= as {
207
208
209
210 mask := rune(x) << 31 >> 31
211 return rune(s[0])&^mask | RuneError&mask, 1
212 }
213 sz := int(x & 7)
214 accept := acceptRanges[x>>4]
215 if n < sz {
216 return RuneError, 1
217 }
218 s1 := s[1]
219 if s1 < accept.lo || accept.hi < s1 {
220 return RuneError, 1
221 }
222 if sz <= 2 {
223 return rune(s0&mask2)<<6 | rune(s1&maskx), 2
224 }
225 s2 := s[2]
226 if s2 < locb || hicb < s2 {
227 return RuneError, 1
228 }
229 if sz <= 3 {
230 return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
231 }
232 s3 := s[3]
233 if s3 < locb || hicb < s3 {
234 return RuneError, 1
235 }
236 return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
237 }
238
239
240
241
242
243
244
245
246
247 func DecodeLastRune(p []byte) (r rune, size int) {
248 end := len(p)
249 if end == 0 {
250 return RuneError, 0
251 }
252 start := end - 1
253 r = rune(p[start])
254 if r < RuneSelf {
255 return r, 1
256 }
257
258
259
260 lim := end - UTFMax
261 if lim < 0 {
262 lim = 0
263 }
264 for start--; start >= lim; start-- {
265 if RuneStart(p[start]) {
266 break
267 }
268 }
269 if start < 0 {
270 start = 0
271 }
272 r, size = DecodeRune(p[start:end])
273 if start+size != end {
274 return RuneError, 1
275 }
276 return r, size
277 }
278
279
280
281
282
283
284
285
286
287 func DecodeLastRuneInString(s string) (r rune, size int) {
288 end := len(s)
289 if end == 0 {
290 return RuneError, 0
291 }
292 start := end - 1
293 r = rune(s[start])
294 if r < RuneSelf {
295 return r, 1
296 }
297
298
299
300 lim := end - UTFMax
301 if lim < 0 {
302 lim = 0
303 }
304 for start--; start >= lim; start-- {
305 if RuneStart(s[start]) {
306 break
307 }
308 }
309 if start < 0 {
310 start = 0
311 }
312 r, size = DecodeRuneInString(s[start:end])
313 if start+size != end {
314 return RuneError, 1
315 }
316 return r, size
317 }
318
319
320
321 func RuneLen(r rune) int {
322 switch {
323 case r < 0:
324 return -1
325 case r <= rune1Max:
326 return 1
327 case r <= rune2Max:
328 return 2
329 case surrogateMin <= r && r <= surrogateMax:
330 return -1
331 case r <= rune3Max:
332 return 3
333 case r <= MaxRune:
334 return 4
335 }
336 return -1
337 }
338
339
340
341
342 func EncodeRune(p []byte, r rune) int {
343
344 switch i := uint32(r); {
345 case i <= rune1Max:
346 p[0] = byte(r)
347 return 1
348 case i <= rune2Max:
349 _ = p[1]
350 p[0] = t2 | byte(r>>6)
351 p[1] = tx | byte(r)&maskx
352 return 2
353 case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
354 r = RuneError
355 fallthrough
356 case i <= rune3Max:
357 _ = p[2]
358 p[0] = t3 | byte(r>>12)
359 p[1] = tx | byte(r>>6)&maskx
360 p[2] = tx | byte(r)&maskx
361 return 3
362 default:
363 _ = p[3]
364 p[0] = t4 | byte(r>>18)
365 p[1] = tx | byte(r>>12)&maskx
366 p[2] = tx | byte(r>>6)&maskx
367 p[3] = tx | byte(r)&maskx
368 return 4
369 }
370 }
371
372
373
374 func RuneCount(p []byte) int {
375 np := len(p)
376 var n int
377 for i := 0; i < np; {
378 n++
379 c := p[i]
380 if c < RuneSelf {
381
382 i++
383 continue
384 }
385 x := first[c]
386 if x == xx {
387 i++
388 continue
389 }
390 size := int(x & 7)
391 if i+size > np {
392 i++
393 continue
394 }
395 accept := acceptRanges[x>>4]
396 if c := p[i+1]; c < accept.lo || accept.hi < c {
397 size = 1
398 } else if size == 2 {
399 } else if c := p[i+2]; c < locb || hicb < c {
400 size = 1
401 } else if size == 3 {
402 } else if c := p[i+3]; c < locb || hicb < c {
403 size = 1
404 }
405 i += size
406 }
407 return n
408 }
409
410
411 func RuneCountInString(s string) (n int) {
412 ns := len(s)
413 for i := 0; i < ns; n++ {
414 c := s[i]
415 if c < RuneSelf {
416
417 i++
418 continue
419 }
420 x := first[c]
421 if x == xx {
422 i++
423 continue
424 }
425 size := int(x & 7)
426 if i+size > ns {
427 i++
428 continue
429 }
430 accept := acceptRanges[x>>4]
431 if c := s[i+1]; c < accept.lo || accept.hi < c {
432 size = 1
433 } else if size == 2 {
434 } else if c := s[i+2]; c < locb || hicb < c {
435 size = 1
436 } else if size == 3 {
437 } else if c := s[i+3]; c < locb || hicb < c {
438 size = 1
439 }
440 i += size
441 }
442 return n
443 }
444
445
446
447
448 func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
449
450
451 func Valid(p []byte) bool {
452
453 for len(p) >= 8 {
454
455
456
457
458 first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
459 second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
460 if (first32|second32)&0x80808080 != 0 {
461
462 break
463 }
464 p = p[8:]
465 }
466 n := len(p)
467 for i := 0; i < n; {
468 pi := p[i]
469 if pi < RuneSelf {
470 i++
471 continue
472 }
473 x := first[pi]
474 if x == xx {
475 return false
476 }
477 size := int(x & 7)
478 if i+size > n {
479 return false
480 }
481 accept := acceptRanges[x>>4]
482 if c := p[i+1]; c < accept.lo || accept.hi < c {
483 return false
484 } else if size == 2 {
485 } else if c := p[i+2]; c < locb || hicb < c {
486 return false
487 } else if size == 3 {
488 } else if c := p[i+3]; c < locb || hicb < c {
489 return false
490 }
491 i += size
492 }
493 return true
494 }
495
496
497 func ValidString(s string) bool {
498
499 for len(s) >= 8 {
500
501
502
503
504 first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
505 second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
506 if (first32|second32)&0x80808080 != 0 {
507
508 break
509 }
510 s = s[8:]
511 }
512 n := len(s)
513 for i := 0; i < n; {
514 si := s[i]
515 if si < RuneSelf {
516 i++
517 continue
518 }
519 x := first[si]
520 if x == xx {
521 return false
522 }
523 size := int(x & 7)
524 if i+size > n {
525 return false
526 }
527 accept := acceptRanges[x>>4]
528 if c := s[i+1]; c < accept.lo || accept.hi < c {
529 return false
530 } else if size == 2 {
531 } else if c := s[i+2]; c < locb || hicb < c {
532 return false
533 } else if size == 3 {
534 } else if c := s[i+3]; c < locb || hicb < c {
535 return false
536 }
537 i += size
538 }
539 return true
540 }
541
542
543
544 func ValidRune(r rune) bool {
545 switch {
546 case 0 <= r && r < surrogateMin:
547 return true
548 case surrogateMax < r && r <= MaxRune:
549 return true
550 }
551 return false
552 }
553
View as plain text