...

Source file src/go/scanner/scanner.go

Documentation: go/scanner

		 1  // Copyright 2009 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Package scanner implements a scanner for Go source text.
		 6  // It takes a []byte as source which can then be tokenized
		 7  // through repeated calls to the Scan method.
		 8  //
		 9  package scanner
		10  
		11  import (
		12  	"bytes"
		13  	"fmt"
		14  	"go/token"
		15  	"path/filepath"
		16  	"strconv"
		17  	"unicode"
		18  	"unicode/utf8"
		19  )
		20  
// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
// The handler is invoked once per reported error, before the scanner
// increments its ErrorCount for that error.
type ErrorHandler func(pos token.Position, msg string)
		27  
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; eof (< 0) at end of file
	offset     int  // byte offset of ch within src
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // byte offset at which the current line starts
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
		50  
// Special rune values used by the scanner.
const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file; stored in Scanner.ch once the source is exhausted
)
		55  
		56  // Read the next Unicode char into s.ch.
		57  // s.ch < 0 means end-of-file.
		58  //
		59  // For optimization, there is some overlap between this method and
		60  // s.scanIdentifier.
		61  func (s *Scanner) next() {
		62  	if s.rdOffset < len(s.src) {
		63  		s.offset = s.rdOffset
		64  		if s.ch == '\n' {
		65  			s.lineOffset = s.offset
		66  			s.file.AddLine(s.offset)
		67  		}
		68  		r, w := rune(s.src[s.rdOffset]), 1
		69  		switch {
		70  		case r == 0:
		71  			s.error(s.offset, "illegal character NUL")
		72  		case r >= utf8.RuneSelf:
		73  			// not ASCII
		74  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
		75  			if r == utf8.RuneError && w == 1 {
		76  				s.error(s.offset, "illegal UTF-8 encoding")
		77  			} else if r == bom && s.offset > 0 {
		78  				s.error(s.offset, "illegal byte order mark")
		79  			}
		80  		}
		81  		s.rdOffset += w
		82  		s.ch = r
		83  	} else {
		84  		s.offset = len(s.src)
		85  		if s.ch == '\n' {
		86  			s.lineOffset = s.offset
		87  			s.file.AddLine(s.offset)
		88  		}
		89  		s.ch = eof
		90  	}
		91  }
		92  
		93  // peek returns the byte following the most recently read character without
		94  // advancing the scanner. If the scanner is at EOF, peek returns 0.
		95  func (s *Scanner) peek() byte {
		96  	if s.rdOffset < len(s.src) {
		97  		return s.src[s.rdOffset]
		98  	}
		99  	return 0
	 100  }
	 101  
// A Mode value is a set of flags (or 0).
// They control scanner behavior.
type Mode uint
	 106  
// Mode flags; each occupies its own bit so that flags can be combined.
const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
	 111  
	 112  // Init prepares the scanner s to tokenize the text src by setting the
	 113  // scanner at the beginning of src. The scanner uses the file set file
	 114  // for position information and it adds line information for each line.
	 115  // It is ok to re-use the same file when re-scanning the same file as
	 116  // line information which is already present is ignored. Init causes a
	 117  // panic if the file size does not match the src size.
	 118  //
	 119  // Calls to Scan will invoke the error handler err if they encounter a
	 120  // syntax error and err is not nil. Also, for each error encountered,
	 121  // the Scanner field ErrorCount is incremented by one. The mode parameter
	 122  // determines how comments are handled.
	 123  //
	 124  // Note that Init may call err if there is an error in the first character
	 125  // of the file.
	 126  //
	 127  func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
	 128  	// Explicitly initialize all fields since a scanner may be reused.
	 129  	if file.Size() != len(src) {
	 130  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
	 131  	}
	 132  	s.file = file
	 133  	s.dir, _ = filepath.Split(file.Name())
	 134  	s.src = src
	 135  	s.err = err
	 136  	s.mode = mode
	 137  
	 138  	s.ch = ' '
	 139  	s.offset = 0
	 140  	s.rdOffset = 0
	 141  	s.lineOffset = 0
	 142  	s.insertSemi = false
	 143  	s.ErrorCount = 0
	 144  
	 145  	s.next()
	 146  	if s.ch == bom {
	 147  		s.next() // ignore BOM at file beginning
	 148  	}
	 149  }
	 150  
	 151  func (s *Scanner) error(offs int, msg string) {
	 152  	if s.err != nil {
	 153  		s.err(s.file.Position(s.file.Pos(offs)), msg)
	 154  	}
	 155  	s.ErrorCount++
	 156  }
	 157  
	 158  func (s *Scanner) errorf(offs int, format string, args ...interface{}) {
	 159  	s.error(offs, fmt.Sprintf(format, args...))
	 160  }
	 161  
	 162  func (s *Scanner) scanComment() string {
	 163  	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	 164  	offs := s.offset - 1 // position of initial '/'
	 165  	next := -1					 // position immediately following the comment; < 0 means invalid comment
	 166  	numCR := 0
	 167  
	 168  	if s.ch == '/' {
	 169  		//-style comment
	 170  		// (the final '\n' is not considered part of the comment)
	 171  		s.next()
	 172  		for s.ch != '\n' && s.ch >= 0 {
	 173  			if s.ch == '\r' {
	 174  				numCR++
	 175  			}
	 176  			s.next()
	 177  		}
	 178  		// if we are at '\n', the position following the comment is afterwards
	 179  		next = s.offset
	 180  		if s.ch == '\n' {
	 181  			next++
	 182  		}
	 183  		goto exit
	 184  	}
	 185  
	 186  	/*-style comment */
	 187  	s.next()
	 188  	for s.ch >= 0 {
	 189  		ch := s.ch
	 190  		if ch == '\r' {
	 191  			numCR++
	 192  		}
	 193  		s.next()
	 194  		if ch == '*' && s.ch == '/' {
	 195  			s.next()
	 196  			next = s.offset
	 197  			goto exit
	 198  		}
	 199  	}
	 200  
	 201  	s.error(offs, "comment not terminated")
	 202  
	 203  exit:
	 204  	lit := s.src[offs:s.offset]
	 205  
	 206  	// On Windows, a (//-comment) line may end in "\r\n".
	 207  	// Remove the final '\r' before analyzing the text for
	 208  	// line directives (matching the compiler). Remove any
	 209  	// other '\r' afterwards (matching the pre-existing be-
	 210  	// havior of the scanner).
	 211  	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
	 212  		lit = lit[:len(lit)-1]
	 213  		numCR--
	 214  	}
	 215  
	 216  	// interpret line directives
	 217  	// (//line directives must start at the beginning of the current line)
	 218  	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
	 219  		s.updateLineInfo(next, offs, lit)
	 220  	}
	 221  
	 222  	if numCR > 0 {
	 223  		lit = stripCR(lit, lit[1] == '*')
	 224  	}
	 225  
	 226  	return string(lit)
	 227  }
	 228  
// prefix is the text that must directly follow the "//" or "/*" opener of a
// comment for scanComment to treat the comment as a line directive.
var prefix = []byte("line ")
	 230  
	 231  // updateLineInfo parses the incoming comment text at offset offs
	 232  // as a line directive. If successful, it updates the line info table
	 233  // for the position next per the line directive.
	 234  func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
	 235  	// extract comment text
	 236  	if text[1] == '*' {
	 237  		text = text[:len(text)-2] // lop off trailing "*/"
	 238  	}
	 239  	text = text[7:] // lop off leading "//line " or "/*line "
	 240  	offs += 7
	 241  
	 242  	i, n, ok := trailingDigits(text)
	 243  	if i == 0 {
	 244  		return // ignore (not a line directive)
	 245  	}
	 246  	// i > 0
	 247  
	 248  	if !ok {
	 249  		// text has a suffix :xxx but xxx is not a number
	 250  		s.error(offs+i, "invalid line number: "+string(text[i:]))
	 251  		return
	 252  	}
	 253  
	 254  	var line, col int
	 255  	i2, n2, ok2 := trailingDigits(text[:i-1])
	 256  	if ok2 {
	 257  		//line filename:line:col
	 258  		i, i2 = i2, i
	 259  		line, col = n2, n
	 260  		if col == 0 {
	 261  			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
	 262  			return
	 263  		}
	 264  		text = text[:i2-1] // lop off ":col"
	 265  	} else {
	 266  		//line filename:line
	 267  		line = n
	 268  	}
	 269  
	 270  	if line == 0 {
	 271  		s.error(offs+i, "invalid line number: "+string(text[i:]))
	 272  		return
	 273  	}
	 274  
	 275  	// If we have a column (//line filename:line:col form),
	 276  	// an empty filename means to use the previous filename.
	 277  	filename := string(text[:i-1]) // lop off ":line", and trim white space
	 278  	if filename == "" && ok2 {
	 279  		filename = s.file.Position(s.file.Pos(offs)).Filename
	 280  	} else if filename != "" {
	 281  		// Put a relative filename in the current directory.
	 282  		// This is for compatibility with earlier releases.
	 283  		// See issue 26671.
	 284  		filename = filepath.Clean(filename)
	 285  		if !filepath.IsAbs(filename) {
	 286  			filename = filepath.Join(s.dir, filename)
	 287  		}
	 288  	}
	 289  
	 290  	s.file.AddLineColumnInfo(next, filename, line, col)
	 291  }
	 292  
	 293  func trailingDigits(text []byte) (int, int, bool) {
	 294  	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
	 295  	if i < 0 {
	 296  		return 0, 0, false // no ":"
	 297  	}
	 298  	// i >= 0
	 299  	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	 300  	return i + 1, int(n), err == nil
	 301  }
	 302  
// findLineEnd looks ahead over the comment that starts at the current
// position (and over any comments directly following it on the same line)
// and reports whether a newline or EOF is reached before a non-comment
// token. It must be called after the initial '/' has been consumed.
//
// The scanner position (s.ch, s.offset, s.rdOffset) is restored before
// returning, so the comment can be scanned again by the caller.
//
// NOTE(review): the look-ahead uses s.next, so any errors that s.next reports
// while reading ahead are not rolled back by the restore below.
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		// (offs was captured when the defer statement ran: the offset of
		// the initial '/')
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				// end of this /*-style comment
				s.next()
				break
			}
		}
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	return false
}
	 346  
	 347  func isLetter(ch rune) bool {
	 348  	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
	 349  }
	 350  
	 351  func isDigit(ch rune) bool {
	 352  	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
	 353  }
	 354  
// scanIdentifier reads the string of valid identifier characters at s.offset.
// It must only be called when s.ch is known to be a valid letter.
//
// It returns the identifier text and leaves s.ch at the first character
// following the identifier (or eof).
//
// Be careful when making changes to this function: it is optimized and affects
// scanning performance significantly.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset // start of the identifier

	// Optimize for the common case of an ASCII identifier.
	//
	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
	// avoids conversions to runes.
	//
	// In case we encounter a non-ASCII character, fall back on the slower path
	// of calling into s.next().
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// Avoid assigning a rune for the common case of an ascii character.
			continue
		}
		// b is the first byte that is not an ASCII identifier character;
		// rdOffset is its index relative to the old s.rdOffset.
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// Optimization: we've encountered an ASCII character that's not a letter
			// or number. Avoid the call into s.next() and corresponding set up.
			//
			// Note that s.next() does some line accounting if s.ch is '\n', so this
			// shortcut is only possible because we know that the preceding character
			// is not '\n'.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// b is NUL or the start of a non-ASCII character.
		//
		// We know that the preceding character is valid for an identifier because
		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
		// at s.rdOffset resets the scanner state.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	// The identifier extends to the end of the source.
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}
	 404  
	 405  func digitVal(ch rune) int {
	 406  	switch {
	 407  	case '0' <= ch && ch <= '9':
	 408  		return int(ch - '0')
	 409  	case 'a' <= lower(ch) && lower(ch) <= 'f':
	 410  		return int(lower(ch) - 'a' + 10)
	 411  	}
	 412  	return 16 // larger than any legal digit val
	 413  }
	 414  
	 415  func lower(ch rune) rune		 { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
	 416  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
	 417  func isHex(ch rune) bool		 { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
	 418  
	 419  // digits accepts the sequence { digit | '_' }.
	 420  // If base <= 10, digits accepts any decimal digit but records
	 421  // the offset (relative to the source start) of a digit >= base
	 422  // in *invalid, if *invalid < 0.
	 423  // digits returns a bitset describing whether the sequence contained
	 424  // digits (bit 0 is set), or separators '_' (bit 1 is set).
	 425  func (s *Scanner) digits(base int, invalid *int) (digsep int) {
	 426  	if base <= 10 {
	 427  		max := rune('0' + base)
	 428  		for isDecimal(s.ch) || s.ch == '_' {
	 429  			ds := 1
	 430  			if s.ch == '_' {
	 431  				ds = 2
	 432  			} else if s.ch >= max && *invalid < 0 {
	 433  				*invalid = s.offset // record invalid rune offset
	 434  			}
	 435  			digsep |= ds
	 436  			s.next()
	 437  		}
	 438  	} else {
	 439  		for isHex(s.ch) || s.ch == '_' {
	 440  			ds := 1
	 441  			if s.ch == '_' {
	 442  				ds = 2
	 443  			}
	 444  			digsep |= ds
	 445  			s.next()
	 446  		}
	 447  	}
	 448  	return
	 449  }
	 450  
// scanNumber scans a numeric literal starting at the current character and
// returns its token kind (token.INT, token.FLOAT, or token.IMAG) together
// with the literal text. s.ch must be a decimal digit, or a '.' that starts
// a fraction.
//
// Malformed literals are reported through s.error/s.errorf, but the literal
// scanned so far is still returned.
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.ILLEGAL

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // index of invalid digit in literal, or < 0

	// integer part
	if s.ch != '.' {
		tok = token.INT
		if s.ch == '0' {
			s.next()
			switch lower(s.ch) {
			case 'x':
				s.next()
				base, prefix = 16, 'x'
			case 'o':
				s.next()
				base, prefix = 8, 'o'
			case 'b':
				s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		digsep |= s.digits(base, &invalid)
	}

	// fractional part
	if s.ch == '.' {
		tok = token.FLOAT
		if prefix == 'o' || prefix == 'b' {
			s.error(s.offset, "invalid radix point in "+litname(prefix))
		}
		s.next()
		digsep |= s.digits(base, &invalid)
	}

	if digsep&1 == 0 {
		s.error(s.offset, litname(prefix)+" has no digits")
	}

	// exponent
	if e := lower(s.ch); e == 'e' || e == 'p' {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
		case e == 'p' && prefix != 'x':
			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
		}
		s.next()
		tok = token.FLOAT
		if s.ch == '+' || s.ch == '-' {
			s.next()
		}
		// With base 10 no decimal digit is >= base, so digits never
		// dereferences the nil invalid pointer here.
		ds := s.digits(10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error(s.offset, "exponent has no digits")
		}
	} else if prefix == 'x' && tok == token.FLOAT {
		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

	lit := string(s.src[offs:s.offset])
	// An out-of-range digit is only reported for integer literals.
	if tok == token.INT && invalid >= 0 {
		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
	}
	if digsep&2 != 0 {
		if i := invalidSep(lit); i >= 0 {
			s.error(offs+i, "'_' must separate successive digits")
		}
	}

	return tok, lit
}
	 537  
	 538  func litname(prefix rune) string {
	 539  	switch prefix {
	 540  	case 'x':
	 541  		return "hexadecimal literal"
	 542  	case 'o', '0':
	 543  		return "octal literal"
	 544  	case 'b':
	 545  		return "binary literal"
	 546  	}
	 547  	return "decimal literal"
	 548  }
	 549  
	 550  // invalidSep returns the index of the first invalid separator in x, or -1.
	 551  func invalidSep(x string) int {
	 552  	x1 := ' ' // prefix char, we only care if it's 'x'
	 553  	d := '.'	// digit, one of '_', '0' (a digit), or '.' (anything else)
	 554  	i := 0
	 555  
	 556  	// a prefix counts as a digit
	 557  	if len(x) >= 2 && x[0] == '0' {
	 558  		x1 = lower(rune(x[1]))
	 559  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
	 560  			d = '0'
	 561  			i = 2
	 562  		}
	 563  	}
	 564  
	 565  	// mantissa and exponent
	 566  	for ; i < len(x); i++ {
	 567  		p := d // previous digit
	 568  		d = rune(x[i])
	 569  		switch {
	 570  		case d == '_':
	 571  			if p != '0' {
	 572  				return i
	 573  			}
	 574  		case isDecimal(d) || x1 == 'x' && isHex(d):
	 575  			d = '0'
	 576  		default:
	 577  			if p == '_' {
	 578  				return i - 1
	 579  			}
	 580  			d = '.'
	 581  		}
	 582  	}
	 583  	if d == '_' {
	 584  		return len(x) - 1
	 585  	}
	 586  
	 587  	return -1
	 588  }
	 589  
	 590  // scanEscape parses an escape sequence where rune is the accepted
	 591  // escaped quote. In case of a syntax error, it stops at the offending
	 592  // character (without consuming it) and returns false. Otherwise
	 593  // it returns true.
	 594  func (s *Scanner) scanEscape(quote rune) bool {
	 595  	offs := s.offset
	 596  
	 597  	var n int
	 598  	var base, max uint32
	 599  	switch s.ch {
	 600  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
	 601  		s.next()
	 602  		return true
	 603  	case '0', '1', '2', '3', '4', '5', '6', '7':
	 604  		n, base, max = 3, 8, 255
	 605  	case 'x':
	 606  		s.next()
	 607  		n, base, max = 2, 16, 255
	 608  	case 'u':
	 609  		s.next()
	 610  		n, base, max = 4, 16, unicode.MaxRune
	 611  	case 'U':
	 612  		s.next()
	 613  		n, base, max = 8, 16, unicode.MaxRune
	 614  	default:
	 615  		msg := "unknown escape sequence"
	 616  		if s.ch < 0 {
	 617  			msg = "escape sequence not terminated"
	 618  		}
	 619  		s.error(offs, msg)
	 620  		return false
	 621  	}
	 622  
	 623  	var x uint32
	 624  	for n > 0 {
	 625  		d := uint32(digitVal(s.ch))
	 626  		if d >= base {
	 627  			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
	 628  			if s.ch < 0 {
	 629  				msg = "escape sequence not terminated"
	 630  			}
	 631  			s.error(s.offset, msg)
	 632  			return false
	 633  		}
	 634  		x = x*base + d
	 635  		s.next()
	 636  		n--
	 637  	}
	 638  
	 639  	if x > max || 0xD800 <= x && x < 0xE000 {
	 640  		s.error(offs, "escape sequence is invalid Unicode code point")
	 641  		return false
	 642  	}
	 643  
	 644  	return true
	 645  }
	 646  
	 647  func (s *Scanner) scanRune() string {
	 648  	// '\'' opening already consumed
	 649  	offs := s.offset - 1
	 650  
	 651  	valid := true
	 652  	n := 0
	 653  	for {
	 654  		ch := s.ch
	 655  		if ch == '\n' || ch < 0 {
	 656  			// only report error if we don't have one already
	 657  			if valid {
	 658  				s.error(offs, "rune literal not terminated")
	 659  				valid = false
	 660  			}
	 661  			break
	 662  		}
	 663  		s.next()
	 664  		if ch == '\'' {
	 665  			break
	 666  		}
	 667  		n++
	 668  		if ch == '\\' {
	 669  			if !s.scanEscape('\'') {
	 670  				valid = false
	 671  			}
	 672  			// continue to read to closing quote
	 673  		}
	 674  	}
	 675  
	 676  	if valid && n != 1 {
	 677  		s.error(offs, "illegal rune literal")
	 678  	}
	 679  
	 680  	return string(s.src[offs:s.offset])
	 681  }
	 682  
	 683  func (s *Scanner) scanString() string {
	 684  	// '"' opening already consumed
	 685  	offs := s.offset - 1
	 686  
	 687  	for {
	 688  		ch := s.ch
	 689  		if ch == '\n' || ch < 0 {
	 690  			s.error(offs, "string literal not terminated")
	 691  			break
	 692  		}
	 693  		s.next()
	 694  		if ch == '"' {
	 695  			break
	 696  		}
	 697  		if ch == '\\' {
	 698  			s.scanEscape('"')
	 699  		}
	 700  	}
	 701  
	 702  	return string(s.src[offs:s.offset])
	 703  }
	 704  
	 705  func stripCR(b []byte, comment bool) []byte {
	 706  	c := make([]byte, len(b))
	 707  	i := 0
	 708  	for j, ch := range b {
	 709  		// In a /*-style comment, don't strip \r from *\r/ (incl.
	 710  		// sequences of \r from *\r\r...\r/) since the resulting
	 711  		// */ would terminate the comment too early unless the \r
	 712  		// is immediately following the opening /* in which case
	 713  		// it's ok because /*/ is not closed yet (issue #11151).
	 714  		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
	 715  			c[i] = ch
	 716  			i++
	 717  		}
	 718  	}
	 719  	return c[:i]
	 720  }
	 721  
	 722  func (s *Scanner) scanRawString() string {
	 723  	// '`' opening already consumed
	 724  	offs := s.offset - 1
	 725  
	 726  	hasCR := false
	 727  	for {
	 728  		ch := s.ch
	 729  		if ch < 0 {
	 730  			s.error(offs, "raw string literal not terminated")
	 731  			break
	 732  		}
	 733  		s.next()
	 734  		if ch == '`' {
	 735  			break
	 736  		}
	 737  		if ch == '\r' {
	 738  			hasCR = true
	 739  		}
	 740  	}
	 741  
	 742  	lit := s.src[offs:s.offset]
	 743  	if hasCR {
	 744  		lit = stripCR(lit, false)
	 745  	}
	 746  
	 747  	return string(lit)
	 748  }
	 749  
	 750  func (s *Scanner) skipWhitespace() {
	 751  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
	 752  		s.next()
	 753  	}
	 754  }
	 755  
	 756  // Helper functions for scanning multi-byte tokens such as >> += >>= .
	 757  // Different routines recognize different length tok_i based on matches
	 758  // of ch_i. If a token ends in '=', the result is tok1 or tok3
	 759  // respectively. Otherwise, the result is tok0 if there was no other
	 760  // matching character, or tok2 if the matching character was ch2.
	 761  
	 762  func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
	 763  	if s.ch == '=' {
	 764  		s.next()
	 765  		return tok1
	 766  	}
	 767  	return tok0
	 768  }
	 769  
	 770  func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
	 771  	if s.ch == '=' {
	 772  		s.next()
	 773  		return tok1
	 774  	}
	 775  	if s.ch == ch2 {
	 776  		s.next()
	 777  		return tok2
	 778  	}
	 779  	return tok0
	 780  }
	 781  
	 782  func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
	 783  	if s.ch == '=' {
	 784  		s.next()
	 785  		return tok1
	 786  	}
	 787  	if s.ch == ch2 {
	 788  		s.next()
	 789  		if s.ch == '=' {
	 790  			s.next()
	 791  			return tok3
	 792  		}
	 793  		return tok2
	 794  	}
	 795  	return tok0
	 796  }
	 797  
// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// token.EOF.
//
// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	//
	// insertSemi records whether a newline directly after this token
	// should be turned into a semicolon; it is copied to s.insertSemi
	// at the end unless semicolon insertion is disabled.
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			// end of file
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// A semicolon is due and the comment extends to (or
					// contains) the end of the line: return the semicolon
					// first and scan the comment on the next call.
					//
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
	 988  

View as plain text