...

Source file src/unicode/utf16/utf16.go

Documentation: unicode/utf16

		 1  // Copyright 2010 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Package utf16 implements encoding and decoding of UTF-16 sequences.
		 6  package utf16
		 7  
		 8  // The conditions replacementChar==unicode.ReplacementChar and
		 9  // maxRune==unicode.MaxRune are verified in the tests.
		10  // Defining them locally avoids this package depending on package unicode.
		11  
		12  const (
		13  	replacementChar = '\uFFFD'		 // Unicode replacement character
		14  	maxRune				 = '\U0010FFFF' // Maximum valid Unicode code point.
		15  )
		16  
		17  const (
		18  	// 0xd800-0xdc00 encodes the high 10 bits of a pair.
		19  	// 0xdc00-0xe000 encodes the low 10 bits of a pair.
		20  	// the value is those 20 bits plus 0x10000.
		21  	surr1 = 0xd800
		22  	surr2 = 0xdc00
		23  	surr3 = 0xe000
		24  
		25  	surrSelf = 0x10000
		26  )
		27  
		28  // IsSurrogate reports whether the specified Unicode code point
		29  // can appear in a surrogate pair.
		30  func IsSurrogate(r rune) bool {
		31  	return surr1 <= r && r < surr3
		32  }
		33  
		34  // DecodeRune returns the UTF-16 decoding of a surrogate pair.
		35  // If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns
		36  // the Unicode replacement code point U+FFFD.
		37  func DecodeRune(r1, r2 rune) rune {
		38  	if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 {
		39  		return (r1-surr1)<<10 | (r2 - surr2) + surrSelf
		40  	}
		41  	return replacementChar
		42  }
		43  
		44  // EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune.
		45  // If the rune is not a valid Unicode code point or does not need encoding,
		46  // EncodeRune returns U+FFFD, U+FFFD.
		47  func EncodeRune(r rune) (r1, r2 rune) {
		48  	if r < surrSelf || r > maxRune {
		49  		return replacementChar, replacementChar
		50  	}
		51  	r -= surrSelf
		52  	return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff
		53  }
		54  
		55  // Encode returns the UTF-16 encoding of the Unicode code point sequence s.
		56  func Encode(s []rune) []uint16 {
		57  	n := len(s)
		58  	for _, v := range s {
		59  		if v >= surrSelf {
		60  			n++
		61  		}
		62  	}
		63  
		64  	a := make([]uint16, n)
		65  	n = 0
		66  	for _, v := range s {
		67  		switch {
		68  		case 0 <= v && v < surr1, surr3 <= v && v < surrSelf:
		69  			// normal rune
		70  			a[n] = uint16(v)
		71  			n++
		72  		case surrSelf <= v && v <= maxRune:
		73  			// needs surrogate sequence
		74  			r1, r2 := EncodeRune(v)
		75  			a[n] = uint16(r1)
		76  			a[n+1] = uint16(r2)
		77  			n += 2
		78  		default:
		79  			a[n] = uint16(replacementChar)
		80  			n++
		81  		}
		82  	}
		83  	return a[:n]
		84  }
		85  
		86  // Decode returns the Unicode code point sequence represented
		87  // by the UTF-16 encoding s.
		88  func Decode(s []uint16) []rune {
		89  	a := make([]rune, len(s))
		90  	n := 0
		91  	for i := 0; i < len(s); i++ {
		92  		switch r := s[i]; {
		93  		case r < surr1, surr3 <= r:
		94  			// normal rune
		95  			a[n] = rune(r)
		96  		case surr1 <= r && r < surr2 && i+1 < len(s) &&
		97  			surr2 <= s[i+1] && s[i+1] < surr3:
		98  			// valid surrogate sequence
		99  			a[n] = DecodeRune(rune(r), rune(s[i+1]))
	 100  			i++
	 101  		default:
	 102  			// invalid surrogate sequence
	 103  			a[n] = replacementChar
	 104  		}
	 105  		n++
	 106  	}
	 107  	return a[:n]
	 108  }
	 109  

View as plain text