...

Source file src/strconv/quote.go

Documentation: strconv

		 1  // Copyright 2009 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  //go:generate go run makeisprint.go -output isprint.go
		 6  
		 7  package strconv
		 8  
		 9  import (
		10  	"unicode/utf8"
		11  )
		12  
		13  const (
		14  	lowerhex = "0123456789abcdef"
		15  	upperhex = "0123456789ABCDEF"
		16  )
		17  
		18  // contains reports whether the string contains the byte c.
		19  func contains(s string, c byte) bool {
		20  	return index(s, c) != -1
		21  }
		22  
		23  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
		24  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
		25  }
		26  
		27  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
		28  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
		29  }
		30  
		31  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
		32  	// Often called with big strings, so preallocate. If there's quoting,
		33  	// this is conservative but still helps a lot.
		34  	if cap(buf)-len(buf) < len(s) {
		35  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
		36  		copy(nBuf, buf)
		37  		buf = nBuf
		38  	}
		39  	buf = append(buf, quote)
		40  	for width := 0; len(s) > 0; s = s[width:] {
		41  		r := rune(s[0])
		42  		width = 1
		43  		if r >= utf8.RuneSelf {
		44  			r, width = utf8.DecodeRuneInString(s)
		45  		}
		46  		if width == 1 && r == utf8.RuneError {
		47  			buf = append(buf, `\x`...)
		48  			buf = append(buf, lowerhex[s[0]>>4])
		49  			buf = append(buf, lowerhex[s[0]&0xF])
		50  			continue
		51  		}
		52  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
		53  	}
		54  	buf = append(buf, quote)
		55  	return buf
		56  }
		57  
		58  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
		59  	buf = append(buf, quote)
		60  	if !utf8.ValidRune(r) {
		61  		r = utf8.RuneError
		62  	}
		63  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
		64  	buf = append(buf, quote)
		65  	return buf
		66  }
		67  
		68  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
		69  	var runeTmp [utf8.UTFMax]byte
		70  	if r == rune(quote) || r == '\\' { // always backslashed
		71  		buf = append(buf, '\\')
		72  		buf = append(buf, byte(r))
		73  		return buf
		74  	}
		75  	if ASCIIonly {
		76  		if r < utf8.RuneSelf && IsPrint(r) {
		77  			buf = append(buf, byte(r))
		78  			return buf
		79  		}
		80  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
		81  		n := utf8.EncodeRune(runeTmp[:], r)
		82  		buf = append(buf, runeTmp[:n]...)
		83  		return buf
		84  	}
		85  	switch r {
		86  	case '\a':
		87  		buf = append(buf, `\a`...)
		88  	case '\b':
		89  		buf = append(buf, `\b`...)
		90  	case '\f':
		91  		buf = append(buf, `\f`...)
		92  	case '\n':
		93  		buf = append(buf, `\n`...)
		94  	case '\r':
		95  		buf = append(buf, `\r`...)
		96  	case '\t':
		97  		buf = append(buf, `\t`...)
		98  	case '\v':
		99  		buf = append(buf, `\v`...)
	 100  	default:
	 101  		switch {
	 102  		case r < ' ':
	 103  			buf = append(buf, `\x`...)
	 104  			buf = append(buf, lowerhex[byte(r)>>4])
	 105  			buf = append(buf, lowerhex[byte(r)&0xF])
	 106  		case r > utf8.MaxRune:
	 107  			r = 0xFFFD
	 108  			fallthrough
	 109  		case r < 0x10000:
	 110  			buf = append(buf, `\u`...)
	 111  			for s := 12; s >= 0; s -= 4 {
	 112  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
	 113  			}
	 114  		default:
	 115  			buf = append(buf, `\U`...)
	 116  			for s := 28; s >= 0; s -= 4 {
	 117  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
	 118  			}
	 119  		}
	 120  	}
	 121  	return buf
	 122  }
	 123  
	 124  // Quote returns a double-quoted Go string literal representing s. The
	 125  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
	 126  // control characters and non-printable characters as defined by
	 127  // IsPrint.
	 128  func Quote(s string) string {
	 129  	return quoteWith(s, '"', false, false)
	 130  }
	 131  
	 132  // AppendQuote appends a double-quoted Go string literal representing s,
	 133  // as generated by Quote, to dst and returns the extended buffer.
	 134  func AppendQuote(dst []byte, s string) []byte {
	 135  	return appendQuotedWith(dst, s, '"', false, false)
	 136  }
	 137  
	 138  // QuoteToASCII returns a double-quoted Go string literal representing s.
	 139  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
	 140  // non-ASCII characters and non-printable characters as defined by IsPrint.
	 141  func QuoteToASCII(s string) string {
	 142  	return quoteWith(s, '"', true, false)
	 143  }
	 144  
	 145  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
	 146  // as generated by QuoteToASCII, to dst and returns the extended buffer.
	 147  func AppendQuoteToASCII(dst []byte, s string) []byte {
	 148  	return appendQuotedWith(dst, s, '"', true, false)
	 149  }
	 150  
	 151  // QuoteToGraphic returns a double-quoted Go string literal representing s.
	 152  // The returned string leaves Unicode graphic characters, as defined by
	 153  // IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
	 154  // for non-graphic characters.
	 155  func QuoteToGraphic(s string) string {
	 156  	return quoteWith(s, '"', false, true)
	 157  }
	 158  
	 159  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
	 160  // as generated by QuoteToGraphic, to dst and returns the extended buffer.
	 161  func AppendQuoteToGraphic(dst []byte, s string) []byte {
	 162  	return appendQuotedWith(dst, s, '"', false, true)
	 163  }
	 164  
	 165  // QuoteRune returns a single-quoted Go character literal representing the
	 166  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
	 167  // for control characters and non-printable characters as defined by IsPrint.
	 168  func QuoteRune(r rune) string {
	 169  	return quoteRuneWith(r, '\'', false, false)
	 170  }
	 171  
	 172  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
	 173  // as generated by QuoteRune, to dst and returns the extended buffer.
	 174  func AppendQuoteRune(dst []byte, r rune) []byte {
	 175  	return appendQuotedRuneWith(dst, r, '\'', false, false)
	 176  }
	 177  
	 178  // QuoteRuneToASCII returns a single-quoted Go character literal representing
	 179  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
	 180  // \u0100) for non-ASCII characters and non-printable characters as defined
	 181  // by IsPrint.
	 182  func QuoteRuneToASCII(r rune) string {
	 183  	return quoteRuneWith(r, '\'', true, false)
	 184  }
	 185  
	 186  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
	 187  // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
	 188  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
	 189  	return appendQuotedRuneWith(dst, r, '\'', true, false)
	 190  }
	 191  
	 192  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
	 193  // the rune. If the rune is not a Unicode graphic character,
	 194  // as defined by IsGraphic, the returned string will use a Go escape sequence
	 195  // (\t, \n, \xFF, \u0100).
	 196  func QuoteRuneToGraphic(r rune) string {
	 197  	return quoteRuneWith(r, '\'', false, true)
	 198  }
	 199  
	 200  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
	 201  // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
	 202  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
	 203  	return appendQuotedRuneWith(dst, r, '\'', false, true)
	 204  }
	 205  
	 206  // CanBackquote reports whether the string s can be represented
	 207  // unchanged as a single-line backquoted string without control
	 208  // characters other than tab.
	 209  func CanBackquote(s string) bool {
	 210  	for len(s) > 0 {
	 211  		r, wid := utf8.DecodeRuneInString(s)
	 212  		s = s[wid:]
	 213  		if wid > 1 {
	 214  			if r == '\ufeff' {
	 215  				return false // BOMs are invisible and should not be quoted.
	 216  			}
	 217  			continue // All other multibyte runes are correctly encoded and assumed printable.
	 218  		}
	 219  		if r == utf8.RuneError {
	 220  			return false
	 221  		}
	 222  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
	 223  			return false
	 224  		}
	 225  	}
	 226  	return true
	 227  }
	 228  
	 229  func unhex(b byte) (v rune, ok bool) {
	 230  	c := rune(b)
	 231  	switch {
	 232  	case '0' <= c && c <= '9':
	 233  		return c - '0', true
	 234  	case 'a' <= c && c <= 'f':
	 235  		return c - 'a' + 10, true
	 236  	case 'A' <= c && c <= 'F':
	 237  		return c - 'A' + 10, true
	 238  	}
	 239  	return
	 240  }
	 241  
	 242  // UnquoteChar decodes the first character or byte in the escaped string
	 243  // or character literal represented by the string s.
	 244  // It returns four values:
	 245  //
	 246  //	1) value, the decoded Unicode code point or byte value;
	 247  //	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
	 248  //	3) tail, the remainder of the string after the character; and
	 249  //	4) an error that will be nil if the character is syntactically valid.
	 250  //
	 251  // The second argument, quote, specifies the type of literal being parsed
	 252  // and therefore which escaped quote character is permitted.
	 253  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
	 254  // If set to a double quote, it permits \" and disallows unescaped ".
	 255  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
	 256  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
	 257  	// easy cases
	 258  	if len(s) == 0 {
	 259  		err = ErrSyntax
	 260  		return
	 261  	}
	 262  	switch c := s[0]; {
	 263  	case c == quote && (quote == '\'' || quote == '"'):
	 264  		err = ErrSyntax
	 265  		return
	 266  	case c >= utf8.RuneSelf:
	 267  		r, size := utf8.DecodeRuneInString(s)
	 268  		return r, true, s[size:], nil
	 269  	case c != '\\':
	 270  		return rune(s[0]), false, s[1:], nil
	 271  	}
	 272  
	 273  	// hard case: c is backslash
	 274  	if len(s) <= 1 {
	 275  		err = ErrSyntax
	 276  		return
	 277  	}
	 278  	c := s[1]
	 279  	s = s[2:]
	 280  
	 281  	switch c {
	 282  	case 'a':
	 283  		value = '\a'
	 284  	case 'b':
	 285  		value = '\b'
	 286  	case 'f':
	 287  		value = '\f'
	 288  	case 'n':
	 289  		value = '\n'
	 290  	case 'r':
	 291  		value = '\r'
	 292  	case 't':
	 293  		value = '\t'
	 294  	case 'v':
	 295  		value = '\v'
	 296  	case 'x', 'u', 'U':
	 297  		n := 0
	 298  		switch c {
	 299  		case 'x':
	 300  			n = 2
	 301  		case 'u':
	 302  			n = 4
	 303  		case 'U':
	 304  			n = 8
	 305  		}
	 306  		var v rune
	 307  		if len(s) < n {
	 308  			err = ErrSyntax
	 309  			return
	 310  		}
	 311  		for j := 0; j < n; j++ {
	 312  			x, ok := unhex(s[j])
	 313  			if !ok {
	 314  				err = ErrSyntax
	 315  				return
	 316  			}
	 317  			v = v<<4 | x
	 318  		}
	 319  		s = s[n:]
	 320  		if c == 'x' {
	 321  			// single-byte string, possibly not UTF-8
	 322  			value = v
	 323  			break
	 324  		}
	 325  		if v > utf8.MaxRune {
	 326  			err = ErrSyntax
	 327  			return
	 328  		}
	 329  		value = v
	 330  		multibyte = true
	 331  	case '0', '1', '2', '3', '4', '5', '6', '7':
	 332  		v := rune(c) - '0'
	 333  		if len(s) < 2 {
	 334  			err = ErrSyntax
	 335  			return
	 336  		}
	 337  		for j := 0; j < 2; j++ { // one digit already; two more
	 338  			x := rune(s[j]) - '0'
	 339  			if x < 0 || x > 7 {
	 340  				err = ErrSyntax
	 341  				return
	 342  			}
	 343  			v = (v << 3) | x
	 344  		}
	 345  		s = s[2:]
	 346  		if v > 255 {
	 347  			err = ErrSyntax
	 348  			return
	 349  		}
	 350  		value = v
	 351  	case '\\':
	 352  		value = '\\'
	 353  	case '\'', '"':
	 354  		if c != quote {
	 355  			err = ErrSyntax
	 356  			return
	 357  		}
	 358  		value = rune(c)
	 359  	default:
	 360  		err = ErrSyntax
	 361  		return
	 362  	}
	 363  	tail = s
	 364  	return
	 365  }
	 366  
	 367  // QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s.
	 368  // If s does not start with a valid quoted string, QuotedPrefix returns an error.
	 369  func QuotedPrefix(s string) (string, error) {
	 370  	out, _, err := unquote(s, false)
	 371  	return out, err
	 372  }
	 373  
	 374  // Unquote interprets s as a single-quoted, double-quoted,
	 375  // or backquoted Go string literal, returning the string value
	 376  // that s quotes.	(If s is single-quoted, it would be a Go
	 377  // character literal; Unquote returns the corresponding
	 378  // one-character string.)
	 379  func Unquote(s string) (string, error) {
	 380  	out, rem, err := unquote(s, true)
	 381  	if len(rem) > 0 {
	 382  		return "", ErrSyntax
	 383  	}
	 384  	return out, err
	 385  }
	 386  
	 387  // unquote parses a quoted string at the start of the input,
	 388  // returning the parsed prefix, the remaining suffix, and any parse errors.
	 389  // If unescape is true, the parsed prefix is unescaped,
	 390  // otherwise the input prefix is provided verbatim.
	 391  func unquote(in string, unescape bool) (out, rem string, err error) {
	 392  	// Determine the quote form and optimistically find the terminating quote.
	 393  	if len(in) < 2 {
	 394  		return "", in, ErrSyntax
	 395  	}
	 396  	quote := in[0]
	 397  	end := index(in[1:], quote)
	 398  	if end < 0 {
	 399  		return "", in, ErrSyntax
	 400  	}
	 401  	end += 2 // position after terminating quote; may be wrong if escape sequences are present
	 402  
	 403  	switch quote {
	 404  	case '`':
	 405  		switch {
	 406  		case !unescape:
	 407  			out = in[:end] // include quotes
	 408  		case !contains(in[:end], '\r'):
	 409  			out = in[len("`") : end-len("`")] // exclude quotes
	 410  		default:
	 411  			// Carriage return characters ('\r') inside raw string literals
	 412  			// are discarded from the raw string value.
	 413  			buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
	 414  			for i := len("`"); i < end-len("`"); i++ {
	 415  				if in[i] != '\r' {
	 416  					buf = append(buf, in[i])
	 417  				}
	 418  			}
	 419  			out = string(buf)
	 420  		}
	 421  		// NOTE: Prior implementations did not verify that raw strings consist
	 422  		// of valid UTF-8 characters and we continue to not verify it as such.
	 423  		// The Go specification does not explicitly require valid UTF-8,
	 424  		// but only mention that it is implicitly valid for Go source code
	 425  		// (which must be valid UTF-8).
	 426  		return out, in[end:], nil
	 427  	case '"', '\'':
	 428  		// Handle quoted strings without any escape sequences.
	 429  		if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
	 430  			var valid bool
	 431  			switch quote {
	 432  			case '"':
	 433  				valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
	 434  			case '\'':
	 435  				r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
	 436  				valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
	 437  			}
	 438  			if valid {
	 439  				out = in[:end]
	 440  				if unescape {
	 441  					out = out[1 : end-1] // exclude quotes
	 442  				}
	 443  				return out, in[end:], nil
	 444  			}
	 445  		}
	 446  
	 447  		// Handle quoted strings with escape sequences.
	 448  		var buf []byte
	 449  		in0 := in
	 450  		in = in[1:] // skip starting quote
	 451  		if unescape {
	 452  			buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
	 453  		}
	 454  		for len(in) > 0 && in[0] != quote {
	 455  			// Process the next character,
	 456  			// rejecting any unescaped newline characters which are invalid.
	 457  			r, multibyte, rem, err := UnquoteChar(in, quote)
	 458  			if in[0] == '\n' || err != nil {
	 459  				return "", in0, ErrSyntax
	 460  			}
	 461  			in = rem
	 462  
	 463  			// Append the character if unescaping the input.
	 464  			if unescape {
	 465  				if r < utf8.RuneSelf || !multibyte {
	 466  					buf = append(buf, byte(r))
	 467  				} else {
	 468  					var arr [utf8.UTFMax]byte
	 469  					n := utf8.EncodeRune(arr[:], r)
	 470  					buf = append(buf, arr[:n]...)
	 471  				}
	 472  			}
	 473  
	 474  			// Single quoted strings must be a single character.
	 475  			if quote == '\'' {
	 476  				break
	 477  			}
	 478  		}
	 479  
	 480  		// Verify that the string ends with a terminating quote.
	 481  		if !(len(in) > 0 && in[0] == quote) {
	 482  			return "", in0, ErrSyntax
	 483  		}
	 484  		in = in[1:] // skip terminating quote
	 485  
	 486  		if unescape {
	 487  			return string(buf), in, nil
	 488  		}
	 489  		return in0[:len(in0)-len(in)], in, nil
	 490  	default:
	 491  		return "", in, ErrSyntax
	 492  	}
	 493  }
	 494  
	 495  // bsearch16 returns the smallest i such that a[i] >= x.
	 496  // If there is no such i, bsearch16 returns len(a).
	 497  func bsearch16(a []uint16, x uint16) int {
	 498  	i, j := 0, len(a)
	 499  	for i < j {
	 500  		h := i + (j-i)>>1
	 501  		if a[h] < x {
	 502  			i = h + 1
	 503  		} else {
	 504  			j = h
	 505  		}
	 506  	}
	 507  	return i
	 508  }
	 509  
	 510  // bsearch32 returns the smallest i such that a[i] >= x.
	 511  // If there is no such i, bsearch32 returns len(a).
	 512  func bsearch32(a []uint32, x uint32) int {
	 513  	i, j := 0, len(a)
	 514  	for i < j {
	 515  		h := i + (j-i)>>1
	 516  		if a[h] < x {
	 517  			i = h + 1
	 518  		} else {
	 519  			j = h
	 520  		}
	 521  	}
	 522  	return i
	 523  }
	 524  
	 525  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
	 526  // to give the same answer. It allows this package not to depend on unicode,
	 527  // and therefore not pull in all the Unicode tables. If the linker were better
	 528  // at tossing unused tables, we could get rid of this implementation.
	 529  // That would be nice.
	 530  
	 531  // IsPrint reports whether the rune is defined as printable by Go, with
	 532  // the same definition as unicode.IsPrint: letters, numbers, punctuation,
	 533  // symbols and ASCII space.
	 534  func IsPrint(r rune) bool {
	 535  	// Fast check for Latin-1
	 536  	if r <= 0xFF {
	 537  		if 0x20 <= r && r <= 0x7E {
	 538  			// All the ASCII is printable from space through DEL-1.
	 539  			return true
	 540  		}
	 541  		if 0xA1 <= r && r <= 0xFF {
	 542  			// Similarly for ¡ through ÿ...
	 543  			return r != 0xAD // ...except for the bizarre soft hyphen.
	 544  		}
	 545  		return false
	 546  	}
	 547  
	 548  	// Same algorithm, either on uint16 or uint32 value.
	 549  	// First, find first i such that isPrint[i] >= x.
	 550  	// This is the index of either the start or end of a pair that might span x.
	 551  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
	 552  	// If we find x in a range, make sure x is not in isNotPrint list.
	 553  
	 554  	if 0 <= r && r < 1<<16 {
	 555  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
	 556  		i := bsearch16(isPrint, rr)
	 557  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
	 558  			return false
	 559  		}
	 560  		j := bsearch16(isNotPrint, rr)
	 561  		return j >= len(isNotPrint) || isNotPrint[j] != rr
	 562  	}
	 563  
	 564  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
	 565  	i := bsearch32(isPrint, rr)
	 566  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
	 567  		return false
	 568  	}
	 569  	if r >= 0x20000 {
	 570  		return true
	 571  	}
	 572  	r -= 0x10000
	 573  	j := bsearch16(isNotPrint, uint16(r))
	 574  	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
	 575  }
	 576  
	 577  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
	 578  // characters include letters, marks, numbers, punctuation, symbols, and
	 579  // spaces, from categories L, M, N, P, S, and Zs.
	 580  func IsGraphic(r rune) bool {
	 581  	if IsPrint(r) {
	 582  		return true
	 583  	}
	 584  	return isInGraphicList(r)
	 585  }
	 586  
	 587  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
	 588  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
	 589  // Should be called only if IsPrint fails.
	 590  func isInGraphicList(r rune) bool {
	 591  	// We know r must fit in 16 bits - see makeisprint.go.
	 592  	if r > 0xFFFF {
	 593  		return false
	 594  	}
	 595  	rr := uint16(r)
	 596  	i := bsearch16(isGraphic, rr)
	 597  	return i < len(isGraphic) && rr == isGraphic[i]
	 598  }
	 599  

View as plain text