scanner.go

Documentation: text/scanner

		 1  // Copyright 2009 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
		 6  // It takes an io.Reader providing the source, which then can be tokenized
		 7  // through repeated calls to the Scan function. For compatibility with
		 8  // existing tools, the NUL character is not allowed. If the first character
		 9  // in the source is a UTF-8 encoded byte order mark (BOM), it is discarded.
		10  //
		11  // By default, a Scanner skips white space and Go comments and recognizes all
		12  // literals as defined by the Go language specification. It may be
		13  // customized to recognize only a subset of those literals and to recognize
		14  // different identifier and white space characters.
		15  package scanner
		16  
		17  import (
		18  	"bytes"
		19  	"fmt"
		20  	"io"
		21  	"os"
		22  	"unicode"
		23  	"unicode/utf8"
		24  )
		25  
		26  // Position is a value that represents a source position.
		27  // A position is valid if Line > 0.
		28  type Position struct {
		29  	Filename string // filename, if any
		30  	Offset	 int		// byte offset, starting at 0
		31  	Line		 int		// line number, starting at 1
		32  	Column	 int		// column number, starting at 1 (character count per line)
		33  }
		34  
		35  // IsValid reports whether the position is valid.
		36  func (pos *Position) IsValid() bool { return pos.Line > 0 }
		37  
		38  func (pos Position) String() string {
		39  	s := pos.Filename
		40  	if s == "" {
		41  		s = "<input>"
		42  	}
		43  	if pos.IsValid() {
		44  		s += fmt.Sprintf(":%d:%d", pos.Line, pos.Column)
		45  	}
		46  	return s
		47  }
		48  
// Predefined mode bits to control recognition of tokens. For instance,
// to configure a Scanner such that it only recognizes (Go) identifiers,
// integers, and skips comments, set the Scanner's Mode field to:
//
//	ScanIdents | ScanInts | ScanComments | SkipComments
//
// Comments are only recognized if ScanComments is set; SkipComments has
// an effect only in combination with ScanComments, in which case comments
// are treated like white space. With the exception of skipped comments,
// unrecognized tokens are not ignored. Instead, the scanner simply
// returns the respective individual characters (or possibly sub-tokens).
// For instance, if the mode is ScanIdents (not ScanStrings), the string
// "foo" is scanned as the token sequence '"' Ident '"'.
//
// Use GoTokens to configure the Scanner such that it accepts all Go
// literal tokens including Go identifiers. Comments will be skipped.
//
// The bit for a token t is 1<<-t (token constants are negative).
const (
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // includes Ints and hexadecimal floats
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
	GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
)
		75  
		76  // The result of Scan is one of these tokens or a Unicode character.
		77  const (
		78  	EOF = -(iota + 1)
		79  	Ident
		80  	Int
		81  	Float
		82  	Char
		83  	String
		84  	RawString
		85  	Comment
		86  
		87  	// internal use only
		88  	skipComment
		89  )
		90  
		91  var tokenString = map[rune]string{
		92  	EOF:			 "EOF",
		93  	Ident:		 "Ident",
		94  	Int:			 "Int",
		95  	Float:		 "Float",
		96  	Char:			"Char",
		97  	String:		"String",
		98  	RawString: "RawString",
		99  	Comment:	 "Comment",
	 100  }
	 101  
	 102  // TokenString returns a printable string for a token or Unicode character.
	 103  func TokenString(tok rune) string {
	 104  	if s, found := tokenString[tok]; found {
	 105  		return s
	 106  	}
	 107  	return fmt.Sprintf("%q", string(tok))
	 108  }
	 109  
// GoWhitespace is the default value for the Scanner's Whitespace field.
// Its value selects Go's white space characters: tab, newline, carriage
// return, and space. Each character ch is represented by bit 1<<ch.
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '

// bufLen is the number of source bytes the Scanner's srcBuf can hold
// (the array itself has one additional byte for a sentinel).
const bufLen = 1024 // at least utf8.UTFMax
	 115  
// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
// Call Init before use; it sets up the internal buffers and resets the
// exported configuration fields to their defaults.
type Scanner struct {
	// Input
	src io.Reader

	// Source buffer
	srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next()
	srcPos int              // reading position (srcBuf index)
	srcEnd int              // source end (srcBuf index)

	// Source position
	srcBufOffset int // byte offset of srcBuf[0] in source
	line         int // line count
	column       int // character count
	lastLineLen  int // length of last line in characters (for correct column reporting)
	lastCharLen  int // length of last character in bytes

	// Token text buffer
	// Typically, token text is stored completely in srcBuf, but in general
	// the token text's head may be buffered in tokBuf while the token text's
	// tail is stored in srcBuf.
	tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
	tokPos int          // token text tail position (srcBuf index); valid if >= 0
	tokEnd int          // token text tail end (srcBuf index)

	// One character look-ahead
	// (-2 means that no character has been read yet; see Init and Peek)
	ch rune // character before current srcPos

	// Error is called for each error encountered. If no Error
	// function is set, the error is reported to os.Stderr.
	Error func(s *Scanner, msg string)

	// ErrorCount is incremented by one for each error encountered.
	ErrorCount int

	// The Mode field controls which tokens are recognized. For instance,
	// to recognize Ints, set the ScanInts bit in Mode. The field may be
	// changed at any time.
	Mode uint

	// The Whitespace field controls which characters are recognized
	// as white space. To recognize a character ch <= ' ' as white space,
	// set the ch'th bit in Whitespace (the Scanner's behavior is undefined
	// for values ch > ' '). The field may be changed at any time.
	Whitespace uint64

	// IsIdentRune is a predicate controlling the characters accepted
	// as the ith rune in an identifier. The set of valid characters
	// must not intersect with the set of white space characters.
	// If no IsIdentRune function is set, regular Go identifiers are
	// accepted instead. The field may be changed at any time.
	IsIdentRune func(ch rune, i int) bool

	// Start position of most recently scanned token; set by Scan.
	// Calling Init or Next invalidates the position (Line == 0).
	// The Filename field is always left untouched by the Scanner.
	// If an error is reported (via Error) and Position is invalid,
	// the scanner is not inside a token. Call Pos to obtain an error
	// position in that case, or to obtain the position immediately
	// after the most recently scanned token.
	Position
}
	 178  
	 179  // Init initializes a Scanner with a new source and returns s.
	 180  // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
	 181  // and Whitespace is set to GoWhitespace.
	 182  func (s *Scanner) Init(src io.Reader) *Scanner {
	 183  	s.src = src
	 184  
	 185  	// initialize source buffer
	 186  	// (the first call to next() will fill it by calling src.Read)
	 187  	s.srcBuf[0] = utf8.RuneSelf // sentinel
	 188  	s.srcPos = 0
	 189  	s.srcEnd = 0
	 190  
	 191  	// initialize source position
	 192  	s.srcBufOffset = 0
	 193  	s.line = 1
	 194  	s.column = 0
	 195  	s.lastLineLen = 0
	 196  	s.lastCharLen = 0
	 197  
	 198  	// initialize token text buffer
	 199  	// (required for first call to next()).
	 200  	s.tokPos = -1
	 201  
	 202  	// initialize one character look-ahead
	 203  	s.ch = -2 // no char read yet, not EOF
	 204  
	 205  	// initialize public fields
	 206  	s.Error = nil
	 207  	s.ErrorCount = 0
	 208  	s.Mode = GoTokens
	 209  	s.Whitespace = GoWhitespace
	 210  	s.Line = 0 // invalidate token position
	 211  
	 212  	return s
	 213  }
	 214  
// next reads and returns the next Unicode character. It is designed such
// that only a minimal amount of work needs to be done in the common ASCII
// case (one test to check for both ASCII and end-of-buffer, and one test
// to check for newlines).
//
// The byte at srcBuf[srcEnd] holds the sentinel utf8.RuneSelf, so the single
// test ch >= utf8.RuneSelf sends both non-ASCII input and an exhausted
// buffer down the slow path, which refills the buffer from src as needed.
//
// next returns EOF once a read fails and no unread bytes remain. Read
// errors other than io.EOF, invalid UTF-8 encodings (for which
// utf8.RuneError is returned), and NUL characters are reported via s.error.
// For every character returned, srcPos, lastCharLen, and column are
// updated; a newline additionally advances line and resets column.
func (s *Scanner) next() rune {
	ch, width := rune(s.srcBuf[s.srcPos]), 1

	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set by Scan()
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// (an io.Reader must return io.EOF when it reaches
			// the end of what it is reading - simply returning
			// n == 0 will make this loop retry forever; but the
			// error is in the reader implementation in that case)
			i := s.srcEnd - s.srcPos // number of unread bytes kept
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if err != io.EOF {
					s.error(err.Error())
				}
				if s.srcEnd == 0 {
					// nothing left to return: report EOF
					if s.lastCharLen > 0 {
						// previous character was not EOF
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				// If err == EOF, we won't be getting more
				// bytes; break to avoid infinite loop. If
				// err is something else, we don't know if
				// we can get more bytes; thus also break.
				break
			}
		}
		// at least one byte
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// advance for correct error position
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("invalid UTF-8 encoding")
				return ch
			}
		}
	}

	// advance
	s.srcPos += width
	s.lastCharLen = width
	s.column++

	// special situations
	switch ch {
	case 0:
		// for compatibility with other tools
		s.error("invalid character NUL")
	case '\n':
		// remember the length of the line just ended so that positions
		// reported right after the newline can still refer to it
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}

	return ch
}
	 298  
	 299  // Next reads and returns the next Unicode character.
	 300  // It returns EOF at the end of the source. It reports
	 301  // a read error by calling s.Error, if not nil; otherwise
	 302  // it prints an error message to os.Stderr. Next does not
	 303  // update the Scanner's Position field; use Pos() to
	 304  // get the current position.
	 305  func (s *Scanner) Next() rune {
	 306  	s.tokPos = -1 // don't collect token text
	 307  	s.Line = 0		// invalidate token position
	 308  	ch := s.Peek()
	 309  	if ch != EOF {
	 310  		s.ch = s.next()
	 311  	}
	 312  	return ch
	 313  }
	 314  
	 315  // Peek returns the next Unicode character in the source without advancing
	 316  // the scanner. It returns EOF if the scanner's position is at the last
	 317  // character of the source.
	 318  func (s *Scanner) Peek() rune {
	 319  	if s.ch == -2 {
	 320  		// this code is only run for the very first character
	 321  		s.ch = s.next()
	 322  		if s.ch == '\uFEFF' {
	 323  			s.ch = s.next() // ignore BOM
	 324  		}
	 325  	}
	 326  	return s.ch
	 327  }
	 328  
	 329  func (s *Scanner) error(msg string) {
	 330  	s.tokEnd = s.srcPos - s.lastCharLen // make sure token text is terminated
	 331  	s.ErrorCount++
	 332  	if s.Error != nil {
	 333  		s.Error(s, msg)
	 334  		return
	 335  	}
	 336  	pos := s.Position
	 337  	if !pos.IsValid() {
	 338  		pos = s.Pos()
	 339  	}
	 340  	fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
	 341  }
	 342  
	 343  func (s *Scanner) errorf(format string, args ...interface{}) {
	 344  	s.error(fmt.Sprintf(format, args...))
	 345  }
	 346  
	 347  func (s *Scanner) isIdentRune(ch rune, i int) bool {
	 348  	if s.IsIdentRune != nil {
	 349  		return s.IsIdentRune(ch, i)
	 350  	}
	 351  	return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0
	 352  }
	 353  
	 354  func (s *Scanner) scanIdentifier() rune {
	 355  	// we know the zero'th rune is OK; start scanning at the next one
	 356  	ch := s.next()
	 357  	for i := 1; s.isIdentRune(ch, i); i++ {
	 358  		ch = s.next()
	 359  	}
	 360  	return ch
	 361  }
	 362  
	 363  func lower(ch rune) rune		 { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
	 364  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
	 365  func isHex(ch rune) bool		 { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
	 366  
	 367  // digits accepts the sequence { digit | '_' } starting with ch0.
	 368  // If base <= 10, digits accepts any decimal digit but records
	 369  // the first invalid digit >= base in *invalid if *invalid == 0.
	 370  // digits returns the first rune that is not part of the sequence
	 371  // anymore, and a bitset describing whether the sequence contained
	 372  // digits (bit 0 is set), or separators '_' (bit 1 is set).
	 373  func (s *Scanner) digits(ch0 rune, base int, invalid *rune) (ch rune, digsep int) {
	 374  	ch = ch0
	 375  	if base <= 10 {
	 376  		max := rune('0' + base)
	 377  		for isDecimal(ch) || ch == '_' {
	 378  			ds := 1
	 379  			if ch == '_' {
	 380  				ds = 2
	 381  			} else if ch >= max && *invalid == 0 {
	 382  				*invalid = ch
	 383  			}
	 384  			digsep |= ds
	 385  			ch = s.next()
	 386  		}
	 387  	} else {
	 388  		for isHex(ch) || ch == '_' {
	 389  			ds := 1
	 390  			if ch == '_' {
	 391  				ds = 2
	 392  			}
	 393  			digsep |= ds
	 394  			ch = s.next()
	 395  		}
	 396  	}
	 397  	return
	 398  }
	 399  
// scanNumber scans a number literal and returns its token kind (Int or
// Float) together with the first character following the literal.
//
// ch is the first character of the literal that still needs processing.
// If seenDot is set, a leading '.' has already been consumed and ch starts
// the fractional part. The radix point and the exponent are only
// recognized if ScanFloats is set in s.Mode.
//
// Malformed literals (missing digits, digits invalid for the base,
// misplaced '_' separators, mismatched exponent kinds, radix points in
// octal or binary literals) are reported via s.error; scanning continues
// and a token is returned regardless.
func (s *Scanner) scanNumber(ch rune, seenDot bool) (rune, rune) {
	base := 10         // number base
	prefix := rune(0)  // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0        // bit 0: digit present, bit 1: '_' present
	invalid := rune(0) // invalid digit in literal, or 0

	// integer part
	var tok rune
	var ds int
	if !seenDot {
		tok = Int
		if ch == '0' {
			ch = s.next()
			switch lower(ch) {
			case 'x':
				ch = s.next()
				base, prefix = 16, 'x'
			case 'o':
				ch = s.next()
				base, prefix = 8, 'o'
			case 'b':
				ch = s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		ch, ds = s.digits(ch, base, &invalid)
		digsep |= ds
		if ch == '.' && s.Mode&ScanFloats != 0 {
			ch = s.next()
			seenDot = true
		}
	}

	// fractional part
	if seenDot {
		tok = Float
		if prefix == 'o' || prefix == 'b' {
			s.error("invalid radix point in " + litname(prefix))
		}
		ch, ds = s.digits(ch, base, &invalid)
		digsep |= ds
	}

	if digsep&1 == 0 {
		s.error(litname(prefix) + " has no digits")
	}

	// exponent
	if e := lower(ch); (e == 'e' || e == 'p') && s.Mode&ScanFloats != 0 {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf("%q exponent requires decimal mantissa", ch)
		case e == 'p' && prefix != 'x':
			s.errorf("%q exponent requires hexadecimal mantissa", ch)
		}
		ch = s.next()
		tok = Float
		if ch == '+' || ch == '-' {
			ch = s.next()
		}
		// with base 10, digits never records an invalid digit,
		// so no invalid pointer is needed here
		ch, ds = s.digits(ch, 10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error("exponent has no digits")
		}
	} else if prefix == 'x' && tok == Float {
		s.error("hexadecimal mantissa requires a 'p' exponent")
	}

	// an invalid digit is only reported for integers
	if tok == Int && invalid != 0 {
		s.errorf("invalid digit %q in %s", invalid, litname(prefix))
	}

	// separators were seen: validate their placement using the token text
	if digsep&2 != 0 {
		s.tokEnd = s.srcPos - s.lastCharLen // make sure token text is terminated
		if i := invalidSep(s.TokenText()); i >= 0 {
			s.error("'_' must separate successive digits")
		}
	}

	return tok, ch
}
	 485  
	 486  func litname(prefix rune) string {
	 487  	switch prefix {
	 488  	default:
	 489  		return "decimal literal"
	 490  	case 'x':
	 491  		return "hexadecimal literal"
	 492  	case 'o', '0':
	 493  		return "octal literal"
	 494  	case 'b':
	 495  		return "binary literal"
	 496  	}
	 497  }
	 498  
	 499  // invalidSep returns the index of the first invalid separator in x, or -1.
	 500  func invalidSep(x string) int {
	 501  	x1 := ' ' // prefix char, we only care if it's 'x'
	 502  	d := '.'	// digit, one of '_', '0' (a digit), or '.' (anything else)
	 503  	i := 0
	 504  
	 505  	// a prefix counts as a digit
	 506  	if len(x) >= 2 && x[0] == '0' {
	 507  		x1 = lower(rune(x[1]))
	 508  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
	 509  			d = '0'
	 510  			i = 2
	 511  		}
	 512  	}
	 513  
	 514  	// mantissa and exponent
	 515  	for ; i < len(x); i++ {
	 516  		p := d // previous digit
	 517  		d = rune(x[i])
	 518  		switch {
	 519  		case d == '_':
	 520  			if p != '0' {
	 521  				return i
	 522  			}
	 523  		case isDecimal(d) || x1 == 'x' && isHex(d):
	 524  			d = '0'
	 525  		default:
	 526  			if p == '_' {
	 527  				return i - 1
	 528  			}
	 529  			d = '.'
	 530  		}
	 531  	}
	 532  	if d == '_' {
	 533  		return len(x) - 1
	 534  	}
	 535  
	 536  	return -1
	 537  }
	 538  
	 539  func digitVal(ch rune) int {
	 540  	switch {
	 541  	case '0' <= ch && ch <= '9':
	 542  		return int(ch - '0')
	 543  	case 'a' <= lower(ch) && lower(ch) <= 'f':
	 544  		return int(lower(ch) - 'a' + 10)
	 545  	}
	 546  	return 16 // larger than any legal digit val
	 547  }
	 548  
	 549  func (s *Scanner) scanDigits(ch rune, base, n int) rune {
	 550  	for n > 0 && digitVal(ch) < base {
	 551  		ch = s.next()
	 552  		n--
	 553  	}
	 554  	if n > 0 {
	 555  		s.error("invalid char escape")
	 556  	}
	 557  	return ch
	 558  }
	 559  
	 560  func (s *Scanner) scanEscape(quote rune) rune {
	 561  	ch := s.next() // read character after '/'
	 562  	switch ch {
	 563  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
	 564  		// nothing to do
	 565  		ch = s.next()
	 566  	case '0', '1', '2', '3', '4', '5', '6', '7':
	 567  		ch = s.scanDigits(ch, 8, 3)
	 568  	case 'x':
	 569  		ch = s.scanDigits(s.next(), 16, 2)
	 570  	case 'u':
	 571  		ch = s.scanDigits(s.next(), 16, 4)
	 572  	case 'U':
	 573  		ch = s.scanDigits(s.next(), 16, 8)
	 574  	default:
	 575  		s.error("invalid char escape")
	 576  	}
	 577  	return ch
	 578  }
	 579  
	 580  func (s *Scanner) scanString(quote rune) (n int) {
	 581  	ch := s.next() // read character after quote
	 582  	for ch != quote {
	 583  		if ch == '\n' || ch < 0 {
	 584  			s.error("literal not terminated")
	 585  			return
	 586  		}
	 587  		if ch == '\\' {
	 588  			ch = s.scanEscape(quote)
	 589  		} else {
	 590  			ch = s.next()
	 591  		}
	 592  		n++
	 593  	}
	 594  	return
	 595  }
	 596  
	 597  func (s *Scanner) scanRawString() {
	 598  	ch := s.next() // read character after '`'
	 599  	for ch != '`' {
	 600  		if ch < 0 {
	 601  			s.error("literal not terminated")
	 602  			return
	 603  		}
	 604  		ch = s.next()
	 605  	}
	 606  }
	 607  
	 608  func (s *Scanner) scanChar() {
	 609  	if s.scanString('\'') != 1 {
	 610  		s.error("invalid char literal")
	 611  	}
	 612  }
	 613  
	 614  func (s *Scanner) scanComment(ch rune) rune {
	 615  	// ch == '/' || ch == '*'
	 616  	if ch == '/' {
	 617  		// line comment
	 618  		ch = s.next() // read character after "//"
	 619  		for ch != '\n' && ch >= 0 {
	 620  			ch = s.next()
	 621  		}
	 622  		return ch
	 623  	}
	 624  
	 625  	// general comment
	 626  	ch = s.next() // read character after "/*"
	 627  	for {
	 628  		if ch < 0 {
	 629  			s.error("comment not terminated")
	 630  			break
	 631  		}
	 632  		ch0 := ch
	 633  		ch = s.next()
	 634  		if ch0 == '*' && ch == '/' {
	 635  			ch = s.next()
	 636  			break
	 637  		}
	 638  	}
	 639  	return ch
	 640  }
	 641  
// Scan reads the next token or Unicode character from source and returns it.
// It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
// It returns EOF at the end of the source. It reports scanner errors (read and
// token errors) by calling s.Error, if not nil; otherwise it prints an error
// message to os.Stderr.
//
// Scan skips white space as selected by s.Whitespace and, if both
// ScanComments and SkipComments are set, comments. It sets the Scanner's
// Position field to the start of the returned token and collects the
// token text, which is available through TokenText.
func (s *Scanner) Scan() rune {
	ch := s.Peek()

	// reset token text position
	s.tokPos = -1
	s.Line = 0

redo:
	// skip white space
	// (the shifted bit is 0 for characters outside the range of the
	// 64-bit set, including EOF, so those never count as white space)
	for s.Whitespace&(1<<uint(ch)) != 0 {
		ch = s.next()
	}

	// start collecting token text
	s.tokBuf.Reset()
	s.tokPos = s.srcPos - s.lastCharLen

	// set token position
	// (this is a slightly optimized version of the code in Pos())
	s.Offset = s.srcBufOffset + s.tokPos
	if s.column > 0 {
		// common case: last character was not a '\n'
		s.Line = s.line
		s.Column = s.column
	} else {
		// last character was a '\n'
		// (we cannot be at the beginning of the source
		// since we have called next() at least once)
		s.Line = s.line - 1
		s.Column = s.lastLineLen
	}

	// determine token value
	// (if the respective mode bit is not set, the token's first
	// character is returned as is and only that character is consumed)
	tok := ch
	switch {
	case s.isIdentRune(ch, 0):
		if s.Mode&ScanIdents != 0 {
			tok = Ident
			ch = s.scanIdentifier()
		} else {
			ch = s.next()
		}
	case isDecimal(ch):
		if s.Mode&(ScanInts|ScanFloats) != 0 {
			tok, ch = s.scanNumber(ch, false)
		} else {
			ch = s.next()
		}
	default:
		switch ch {
		case EOF:
			break
		case '"':
			if s.Mode&ScanStrings != 0 {
				s.scanString('"')
				tok = String
			}
			ch = s.next()
		case '\'':
			if s.Mode&ScanChars != 0 {
				s.scanChar()
				tok = Char
			}
			ch = s.next()
		case '.':
			// a '.' followed by a digit starts a floating-point number
			ch = s.next()
			if isDecimal(ch) && s.Mode&ScanFloats != 0 {
				tok, ch = s.scanNumber(ch, true)
			}
		case '/':
			ch = s.next()
			if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
				if s.Mode&SkipComments != 0 {
					// treat the comment like white space and start over
					s.tokPos = -1 // don't collect token text
					ch = s.scanComment(ch)
					goto redo
				}
				ch = s.scanComment(ch)
				tok = Comment
			}
		case '`':
			if s.Mode&ScanRawStrings != 0 {
				s.scanRawString()
				tok = RawString
			}
			ch = s.next()
		default:
			ch = s.next()
		}
	}

	// end of token text
	s.tokEnd = s.srcPos - s.lastCharLen

	// ch is the character following the token; keep it as look-ahead
	s.ch = ch
	return tok
}
	 744  
	 745  // Pos returns the position of the character immediately after
	 746  // the character or token returned by the last call to Next or Scan.
	 747  // Use the Scanner's Position field for the start position of the most
	 748  // recently scanned token.
	 749  func (s *Scanner) Pos() (pos Position) {
	 750  	pos.Filename = s.Filename
	 751  	pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
	 752  	switch {
	 753  	case s.column > 0:
	 754  		// common case: last character was not a '\n'
	 755  		pos.Line = s.line
	 756  		pos.Column = s.column
	 757  	case s.lastLineLen > 0:
	 758  		// last character was a '\n'
	 759  		pos.Line = s.line - 1
	 760  		pos.Column = s.lastLineLen
	 761  	default:
	 762  		// at the beginning of the source
	 763  		pos.Line = 1
	 764  		pos.Column = 1
	 765  	}
	 766  	return
	 767  }
	 768  
	 769  // TokenText returns the string corresponding to the most recently scanned token.
	 770  // Valid after calling Scan and in calls of Scanner.Error.
	 771  func (s *Scanner) TokenText() string {
	 772  	if s.tokPos < 0 {
	 773  		// no token text
	 774  		return ""
	 775  	}
	 776  
	 777  	if s.tokEnd < s.tokPos {
	 778  		// if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
	 779  		s.tokEnd = s.tokPos
	 780  	}
	 781  	// s.tokEnd >= s.tokPos
	 782  
	 783  	if s.tokBuf.Len() == 0 {
	 784  		// common case: the entire token text is still in srcBuf
	 785  		return string(s.srcBuf[s.tokPos:s.tokEnd])
	 786  	}
	 787  
	 788  	// part of the token text was saved in tokBuf: save the rest in
	 789  	// tokBuf as well and return its content
	 790  	s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
	 791  	s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
	 792  	return s.tokBuf.String()
	 793  }
	 794
View as plain text