utf8.go

Documentation: unicode/utf8

		 1  // Copyright 2009 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Package utf8 implements functions and constants to support text encoded in
		 6  // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
		 7  // See https://en.wikipedia.org/wiki/UTF-8
		 8  package utf8
		 9  
		10  // The conditions RuneError==unicode.ReplacementChar and
		11  // MaxRune==unicode.MaxRune are verified in the tests.
		12  // Defining them locally avoids this package depending on package unicode.
		13  
		14  // Numbers fundamental to the encoding.
		15  const (
		16  	RuneError = '\uFFFD'		 // the "error" Rune or "Unicode replacement character"
		17  	RuneSelf	= 0x80				 // characters below RuneSelf are represented as themselves in a single byte.
		18  	MaxRune	 = '\U0010FFFF' // Maximum valid Unicode code point.
		19  	UTFMax		= 4						// maximum number of bytes of a UTF-8 encoded Unicode character.
		20  )
		21  
		22  // Code points in the surrogate range are not valid for UTF-8.
		23  const (
		24  	surrogateMin = 0xD800
		25  	surrogateMax = 0xDFFF
		26  )
		27  
		28  const (
		29  	t1 = 0b00000000
		30  	tx = 0b10000000
		31  	t2 = 0b11000000
		32  	t3 = 0b11100000
		33  	t4 = 0b11110000
		34  	t5 = 0b11111000
		35  
		36  	maskx = 0b00111111
		37  	mask2 = 0b00011111
		38  	mask3 = 0b00001111
		39  	mask4 = 0b00000111
		40  
		41  	rune1Max = 1<<7 - 1
		42  	rune2Max = 1<<11 - 1
		43  	rune3Max = 1<<16 - 1
		44  
		45  	// The default lowest and highest continuation byte.
		46  	locb = 0b10000000
		47  	hicb = 0b10111111
		48  
		49  	// These names of these constants are chosen to give nice alignment in the
		50  	// table below. The first nibble is an index into acceptRanges or F for
		51  	// special one-byte cases. The second nibble is the Rune length or the
		52  	// Status for the special one-byte case.
		53  	xx = 0xF1 // invalid: size 1
		54  	as = 0xF0 // ASCII: size 1
		55  	s1 = 0x02 // accept 0, size 2
		56  	s2 = 0x13 // accept 1, size 3
		57  	s3 = 0x03 // accept 0, size 3
		58  	s4 = 0x23 // accept 2, size 3
		59  	s5 = 0x34 // accept 3, size 4
		60  	s6 = 0x04 // accept 0, size 4
		61  	s7 = 0x44 // accept 4, size 4
		62  )
		63  
		64  // first is information about the first byte in a UTF-8 sequence.
		65  var first = [256]uint8{
		66  	//	 1	 2	 3	 4	 5	 6	 7	 8	 9	 A	 B	 C	 D	 E	 F
		67  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
		68  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
		69  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
		70  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
		71  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
		72  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
		73  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
		74  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
		75  	//	 1	 2	 3	 4	 5	 6	 7	 8	 9	 A	 B	 C	 D	 E	 F
		76  	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
		77  	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
		78  	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
		79  	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
		80  	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
		81  	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
		82  	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
		83  	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
		84  }
		85  
		86  // acceptRange gives the range of valid values for the second byte in a UTF-8
		87  // sequence.
		88  type acceptRange struct {
		89  	lo uint8 // lowest value for second byte.
		90  	hi uint8 // highest value for second byte.
		91  }
		92  
		93  // acceptRanges has size 16 to avoid bounds checks in the code that uses it.
		94  var acceptRanges = [16]acceptRange{
		95  	0: {locb, hicb},
		96  	1: {0xA0, hicb},
		97  	2: {locb, 0x9F},
		98  	3: {0x90, hicb},
		99  	4: {locb, 0x8F},
	 100  }
	 101  
	 102  // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
	 103  // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
	 104  func FullRune(p []byte) bool {
	 105  	n := len(p)
	 106  	if n == 0 {
	 107  		return false
	 108  	}
	 109  	x := first[p[0]]
	 110  	if n >= int(x&7) {
	 111  		return true // ASCII, invalid or valid.
	 112  	}
	 113  	// Must be short or invalid.
	 114  	accept := acceptRanges[x>>4]
	 115  	if n > 1 && (p[1] < accept.lo || accept.hi < p[1]) {
	 116  		return true
	 117  	} else if n > 2 && (p[2] < locb || hicb < p[2]) {
	 118  		return true
	 119  	}
	 120  	return false
	 121  }
	 122  
	 123  // FullRuneInString is like FullRune but its input is a string.
	 124  func FullRuneInString(s string) bool {
	 125  	n := len(s)
	 126  	if n == 0 {
	 127  		return false
	 128  	}
	 129  	x := first[s[0]]
	 130  	if n >= int(x&7) {
	 131  		return true // ASCII, invalid, or valid.
	 132  	}
	 133  	// Must be short or invalid.
	 134  	accept := acceptRanges[x>>4]
	 135  	if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
	 136  		return true
	 137  	} else if n > 2 && (s[2] < locb || hicb < s[2]) {
	 138  		return true
	 139  	}
	 140  	return false
	 141  }
	 142  
	 143  // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
	 144  // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
	 145  // the encoding is invalid, it returns (RuneError, 1). Both are impossible
	 146  // results for correct, non-empty UTF-8.
	 147  //
	 148  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
	 149  // out of range, or is not the shortest possible UTF-8 encoding for the
	 150  // value. No other validation is performed.
	 151  func DecodeRune(p []byte) (r rune, size int) {
	 152  	n := len(p)
	 153  	if n < 1 {
	 154  		return RuneError, 0
	 155  	}
	 156  	p0 := p[0]
	 157  	x := first[p0]
	 158  	if x >= as {
	 159  		// The following code simulates an additional check for x == xx and
	 160  		// handling the ASCII and invalid cases accordingly. This mask-and-or
	 161  		// approach prevents an additional branch.
	 162  		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
	 163  		return rune(p[0])&^mask | RuneError&mask, 1
	 164  	}
	 165  	sz := int(x & 7)
	 166  	accept := acceptRanges[x>>4]
	 167  	if n < sz {
	 168  		return RuneError, 1
	 169  	}
	 170  	b1 := p[1]
	 171  	if b1 < accept.lo || accept.hi < b1 {
	 172  		return RuneError, 1
	 173  	}
	 174  	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
	 175  		return rune(p0&mask2)<<6 | rune(b1&maskx), 2
	 176  	}
	 177  	b2 := p[2]
	 178  	if b2 < locb || hicb < b2 {
	 179  		return RuneError, 1
	 180  	}
	 181  	if sz <= 3 {
	 182  		return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
	 183  	}
	 184  	b3 := p[3]
	 185  	if b3 < locb || hicb < b3 {
	 186  		return RuneError, 1
	 187  	}
	 188  	return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
	 189  }
	 190  
	 191  // DecodeRuneInString is like DecodeRune but its input is a string. If s is
	 192  // empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
	 193  // returns (RuneError, 1). Both are impossible results for correct, non-empty
	 194  // UTF-8.
	 195  //
	 196  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
	 197  // out of range, or is not the shortest possible UTF-8 encoding for the
	 198  // value. No other validation is performed.
	 199  func DecodeRuneInString(s string) (r rune, size int) {
	 200  	n := len(s)
	 201  	if n < 1 {
	 202  		return RuneError, 0
	 203  	}
	 204  	s0 := s[0]
	 205  	x := first[s0]
	 206  	if x >= as {
	 207  		// The following code simulates an additional check for x == xx and
	 208  		// handling the ASCII and invalid cases accordingly. This mask-and-or
	 209  		// approach prevents an additional branch.
	 210  		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
	 211  		return rune(s[0])&^mask | RuneError&mask, 1
	 212  	}
	 213  	sz := int(x & 7)
	 214  	accept := acceptRanges[x>>4]
	 215  	if n < sz {
	 216  		return RuneError, 1
	 217  	}
	 218  	s1 := s[1]
	 219  	if s1 < accept.lo || accept.hi < s1 {
	 220  		return RuneError, 1
	 221  	}
	 222  	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
	 223  		return rune(s0&mask2)<<6 | rune(s1&maskx), 2
	 224  	}
	 225  	s2 := s[2]
	 226  	if s2 < locb || hicb < s2 {
	 227  		return RuneError, 1
	 228  	}
	 229  	if sz <= 3 {
	 230  		return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
	 231  	}
	 232  	s3 := s[3]
	 233  	if s3 < locb || hicb < s3 {
	 234  		return RuneError, 1
	 235  	}
	 236  	return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
	 237  }
	 238  
	 239  // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
	 240  // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
	 241  // the encoding is invalid, it returns (RuneError, 1). Both are impossible
	 242  // results for correct, non-empty UTF-8.
	 243  //
	 244  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
	 245  // out of range, or is not the shortest possible UTF-8 encoding for the
	 246  // value. No other validation is performed.
	 247  func DecodeLastRune(p []byte) (r rune, size int) {
	 248  	end := len(p)
	 249  	if end == 0 {
	 250  		return RuneError, 0
	 251  	}
	 252  	start := end - 1
	 253  	r = rune(p[start])
	 254  	if r < RuneSelf {
	 255  		return r, 1
	 256  	}
	 257  	// guard against O(n^2) behavior when traversing
	 258  	// backwards through strings with long sequences of
	 259  	// invalid UTF-8.
	 260  	lim := end - UTFMax
	 261  	if lim < 0 {
	 262  		lim = 0
	 263  	}
	 264  	for start--; start >= lim; start-- {
	 265  		if RuneStart(p[start]) {
	 266  			break
	 267  		}
	 268  	}
	 269  	if start < 0 {
	 270  		start = 0
	 271  	}
	 272  	r, size = DecodeRune(p[start:end])
	 273  	if start+size != end {
	 274  		return RuneError, 1
	 275  	}
	 276  	return r, size
	 277  }
	 278  
	 279  // DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
	 280  // s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid,
	 281  // it returns (RuneError, 1). Both are impossible results for correct,
	 282  // non-empty UTF-8.
	 283  //
	 284  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
	 285  // out of range, or is not the shortest possible UTF-8 encoding for the
	 286  // value. No other validation is performed.
	 287  func DecodeLastRuneInString(s string) (r rune, size int) {
	 288  	end := len(s)
	 289  	if end == 0 {
	 290  		return RuneError, 0
	 291  	}
	 292  	start := end - 1
	 293  	r = rune(s[start])
	 294  	if r < RuneSelf {
	 295  		return r, 1
	 296  	}
	 297  	// guard against O(n^2) behavior when traversing
	 298  	// backwards through strings with long sequences of
	 299  	// invalid UTF-8.
	 300  	lim := end - UTFMax
	 301  	if lim < 0 {
	 302  		lim = 0
	 303  	}
	 304  	for start--; start >= lim; start-- {
	 305  		if RuneStart(s[start]) {
	 306  			break
	 307  		}
	 308  	}
	 309  	if start < 0 {
	 310  		start = 0
	 311  	}
	 312  	r, size = DecodeRuneInString(s[start:end])
	 313  	if start+size != end {
	 314  		return RuneError, 1
	 315  	}
	 316  	return r, size
	 317  }
	 318  
	 319  // RuneLen returns the number of bytes required to encode the rune.
	 320  // It returns -1 if the rune is not a valid value to encode in UTF-8.
	 321  func RuneLen(r rune) int {
	 322  	switch {
	 323  	case r < 0:
	 324  		return -1
	 325  	case r <= rune1Max:
	 326  		return 1
	 327  	case r <= rune2Max:
	 328  		return 2
	 329  	case surrogateMin <= r && r <= surrogateMax:
	 330  		return -1
	 331  	case r <= rune3Max:
	 332  		return 3
	 333  	case r <= MaxRune:
	 334  		return 4
	 335  	}
	 336  	return -1
	 337  }
	 338  
	 339  // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
	 340  // If the rune is out of range, it writes the encoding of RuneError.
	 341  // It returns the number of bytes written.
	 342  func EncodeRune(p []byte, r rune) int {
	 343  	// Negative values are erroneous. Making it unsigned addresses the problem.
	 344  	switch i := uint32(r); {
	 345  	case i <= rune1Max:
	 346  		p[0] = byte(r)
	 347  		return 1
	 348  	case i <= rune2Max:
	 349  		_ = p[1] // eliminate bounds checks
	 350  		p[0] = t2 | byte(r>>6)
	 351  		p[1] = tx | byte(r)&maskx
	 352  		return 2
	 353  	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
	 354  		r = RuneError
	 355  		fallthrough
	 356  	case i <= rune3Max:
	 357  		_ = p[2] // eliminate bounds checks
	 358  		p[0] = t3 | byte(r>>12)
	 359  		p[1] = tx | byte(r>>6)&maskx
	 360  		p[2] = tx | byte(r)&maskx
	 361  		return 3
	 362  	default:
	 363  		_ = p[3] // eliminate bounds checks
	 364  		p[0] = t4 | byte(r>>18)
	 365  		p[1] = tx | byte(r>>12)&maskx
	 366  		p[2] = tx | byte(r>>6)&maskx
	 367  		p[3] = tx | byte(r)&maskx
	 368  		return 4
	 369  	}
	 370  }
	 371  
	 372  // RuneCount returns the number of runes in p. Erroneous and short
	 373  // encodings are treated as single runes of width 1 byte.
	 374  func RuneCount(p []byte) int {
	 375  	np := len(p)
	 376  	var n int
	 377  	for i := 0; i < np; {
	 378  		n++
	 379  		c := p[i]
	 380  		if c < RuneSelf {
	 381  			// ASCII fast path
	 382  			i++
	 383  			continue
	 384  		}
	 385  		x := first[c]
	 386  		if x == xx {
	 387  			i++ // invalid.
	 388  			continue
	 389  		}
	 390  		size := int(x & 7)
	 391  		if i+size > np {
	 392  			i++ // Short or invalid.
	 393  			continue
	 394  		}
	 395  		accept := acceptRanges[x>>4]
	 396  		if c := p[i+1]; c < accept.lo || accept.hi < c {
	 397  			size = 1
	 398  		} else if size == 2 {
	 399  		} else if c := p[i+2]; c < locb || hicb < c {
	 400  			size = 1
	 401  		} else if size == 3 {
	 402  		} else if c := p[i+3]; c < locb || hicb < c {
	 403  			size = 1
	 404  		}
	 405  		i += size
	 406  	}
	 407  	return n
	 408  }
	 409  
	 410  // RuneCountInString is like RuneCount but its input is a string.
	 411  func RuneCountInString(s string) (n int) {
	 412  	ns := len(s)
	 413  	for i := 0; i < ns; n++ {
	 414  		c := s[i]
	 415  		if c < RuneSelf {
	 416  			// ASCII fast path
	 417  			i++
	 418  			continue
	 419  		}
	 420  		x := first[c]
	 421  		if x == xx {
	 422  			i++ // invalid.
	 423  			continue
	 424  		}
	 425  		size := int(x & 7)
	 426  		if i+size > ns {
	 427  			i++ // Short or invalid.
	 428  			continue
	 429  		}
	 430  		accept := acceptRanges[x>>4]
	 431  		if c := s[i+1]; c < accept.lo || accept.hi < c {
	 432  			size = 1
	 433  		} else if size == 2 {
	 434  		} else if c := s[i+2]; c < locb || hicb < c {
	 435  			size = 1
	 436  		} else if size == 3 {
	 437  		} else if c := s[i+3]; c < locb || hicb < c {
	 438  			size = 1
	 439  		}
	 440  		i += size
	 441  	}
	 442  	return n
	 443  }
	 444  
	 445  // RuneStart reports whether the byte could be the first byte of an encoded,
	 446  // possibly invalid rune. Second and subsequent bytes always have the top two
	 447  // bits set to 10.
	 448  func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
	 449  
	 450  // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
	 451  func Valid(p []byte) bool {
	 452  	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
	 453  	for len(p) >= 8 {
	 454  		// Combining two 32 bit loads allows the same code to be used
	 455  		// for 32 and 64 bit platforms.
	 456  		// The compiler can generate a 32bit load for first32 and second32
	 457  		// on many platforms. See test/codegen/memcombine.go.
	 458  		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
	 459  		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
	 460  		if (first32|second32)&0x80808080 != 0 {
	 461  			// Found a non ASCII byte (>= RuneSelf).
	 462  			break
	 463  		}
	 464  		p = p[8:]
	 465  	}
	 466  	n := len(p)
	 467  	for i := 0; i < n; {
	 468  		pi := p[i]
	 469  		if pi < RuneSelf {
	 470  			i++
	 471  			continue
	 472  		}
	 473  		x := first[pi]
	 474  		if x == xx {
	 475  			return false // Illegal starter byte.
	 476  		}
	 477  		size := int(x & 7)
	 478  		if i+size > n {
	 479  			return false // Short or invalid.
	 480  		}
	 481  		accept := acceptRanges[x>>4]
	 482  		if c := p[i+1]; c < accept.lo || accept.hi < c {
	 483  			return false
	 484  		} else if size == 2 {
	 485  		} else if c := p[i+2]; c < locb || hicb < c {
	 486  			return false
	 487  		} else if size == 3 {
	 488  		} else if c := p[i+3]; c < locb || hicb < c {
	 489  			return false
	 490  		}
	 491  		i += size
	 492  	}
	 493  	return true
	 494  }
	 495  
	 496  // ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
	 497  func ValidString(s string) bool {
	 498  	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
	 499  	for len(s) >= 8 {
	 500  		// Combining two 32 bit loads allows the same code to be used
	 501  		// for 32 and 64 bit platforms.
	 502  		// The compiler can generate a 32bit load for first32 and second32
	 503  		// on many platforms. See test/codegen/memcombine.go.
	 504  		first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
	 505  		second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
	 506  		if (first32|second32)&0x80808080 != 0 {
	 507  			// Found a non ASCII byte (>= RuneSelf).
	 508  			break
	 509  		}
	 510  		s = s[8:]
	 511  	}
	 512  	n := len(s)
	 513  	for i := 0; i < n; {
	 514  		si := s[i]
	 515  		if si < RuneSelf {
	 516  			i++
	 517  			continue
	 518  		}
	 519  		x := first[si]
	 520  		if x == xx {
	 521  			return false // Illegal starter byte.
	 522  		}
	 523  		size := int(x & 7)
	 524  		if i+size > n {
	 525  			return false // Short or invalid.
	 526  		}
	 527  		accept := acceptRanges[x>>4]
	 528  		if c := s[i+1]; c < accept.lo || accept.hi < c {
	 529  			return false
	 530  		} else if size == 2 {
	 531  		} else if c := s[i+2]; c < locb || hicb < c {
	 532  			return false
	 533  		} else if size == 3 {
	 534  		} else if c := s[i+3]; c < locb || hicb < c {
	 535  			return false
	 536  		}
	 537  		i += size
	 538  	}
	 539  	return true
	 540  }
	 541  
	 542  // ValidRune reports whether r can be legally encoded as UTF-8.
	 543  // Code points that are out of range or a surrogate half are illegal.
	 544  func ValidRune(r rune) bool {
	 545  	switch {
	 546  	case 0 <= r && r < surrogateMin:
	 547  		return true
	 548  	case surrogateMax < r && r <= MaxRune:
	 549  		return true
	 550  	}
	 551  	return false
	 552  }
	 553
View as plain text