parse_test.go

Documentation: regexp/syntax

		 1  // Copyright 2011 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  package syntax
		 6  
		 7  import (
		 8  	"fmt"
		 9  	"strings"
		10  	"testing"
		11  	"unicode"
		12  )
		13  
		14  type parseTest struct {
		15  	Regexp string
		16  	Dump	 string
		17  }
		18  
// parseTests lists regular expressions together with the dump expected
// after parsing them. TestParseSimple parses them with testFlags, and
// TestToStringEquivalentParse additionally round-trips them through
// String. An entry whose Dump is empty only has to parse successfully;
// its structure is not compared.
var parseTests = []parseTest{
	// Base cases
	{`a`, `lit{a}`},
	{`a.`, `cat{lit{a}dot{}}`},
	{`a.b`, `cat{lit{a}dot{}lit{b}}`},
	{`ab`, `str{ab}`},
	{`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
	{`abc`, `str{abc}`},
	{`a|^`, `alt{lit{a}bol{}}`},
	{`a|b`, `cc{0x61-0x62}`},
	{`(a)`, `cap{lit{a}}`},
	{`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
	{`a*`, `star{lit{a}}`},
	{`a+`, `plus{lit{a}}`},
	{`a?`, `que{lit{a}}`},
	{`a{2}`, `rep{2,2 lit{a}}`},
	{`a{2,3}`, `rep{2,3 lit{a}}`},
	{`a{2,}`, `rep{2,-1 lit{a}}`},
	{`a*?`, `nstar{lit{a}}`},
	{`a+?`, `nplus{lit{a}}`},
	{`a??`, `nque{lit{a}}`},
	{`a{2}?`, `nrep{2,2 lit{a}}`},
	{`a{2,3}?`, `nrep{2,3 lit{a}}`},
	{`a{2,}?`, `nrep{2,-1 lit{a}}`},
	// Malformed { } are treated as literals.
	{`x{1001`, `str{x{1001}`},
	{`x{9876543210`, `str{x{9876543210}`},
	{`x{9876543210,`, `str{x{9876543210,}`},
	{`x{2,1`, `str{x{2,1}`},
	{`x{1,9876543210`, `str{x{1,9876543210}`},
	{``, `emp{}`},
	{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
	{`|x|`, `alt{emp{}lit{x}emp{}}`},
	{`.`, `dot{}`},
	{`^`, `bol{}`},
	{`$`, `eol{}`},
	{`\|`, `lit{|}`},
	{`\(`, `lit{(}`},
	{`\)`, `lit{)}`},
	{`\*`, `lit{*}`},
	{`\+`, `lit{+}`},
	{`\?`, `lit{?}`},
	{`{`, `lit{{}`},
	{`}`, `lit{}}`},
	{`\.`, `lit{.}`},
	{`\^`, `lit{^}`},
	{`\$`, `lit{$}`},
	{`\\`, `lit{\}`},
	{`[ace]`, `cc{0x61 0x63 0x65}`},
	{`[abc]`, `cc{0x61-0x63}`},
	{`[a-z]`, `cc{0x61-0x7a}`},
	{`[a]`, `lit{a}`},
	{`\-`, `lit{-}`},
	{`-`, `lit{-}`},
	{`\_`, `lit{_}`},
	{`abc`, `str{abc}`},
	{`abc|def`, `alt{str{abc}str{def}}`},
	{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},

	// Posix and Perl extensions
	{`[[:lower:]]`, `cc{0x61-0x7a}`},
	{`[a-z]`, `cc{0x61-0x7a}`},
	{`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
	{`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
	{`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`\d`, `cc{0x30-0x39}`},
	{`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
	{`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
	{`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
	{`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
	{`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
	{`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
	//	{ `\C`, `byte{}` },	// probably never

	// Unicode, negatives, and a double negative.
	{`\p{Braille}`, `cc{0x2800-0x28ff}`},
	{`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`\P{^Braille}`, `cc{0x2800-0x28ff}`},
	{`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
	{`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
	{`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
	{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
	{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
	{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
	{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
	{`\p{Any}`, `dot{}`},
	{`\p{^Any}`, `cc{}`},

	// Hex, octal.
	{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
	{`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},

	// More interesting regular expressions.
	{`a{,2}`, `str{a{,2}}`},
	{`\.\^\$\\`, `str{.^$\}`},
	{`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
	{`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
	{`a*{`, `cat{star{lit{a}}lit{{}}`},

	// Test precedences
	{`(?:ab)*`, `star{str{ab}}`},
	{`(ab)*`, `star{cap{str{ab}}}`},
	{`ab|cd`, `alt{str{ab}str{cd}}`},
	{`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},

	// Test flattening.
	{`(?:a)`, `lit{a}`},
	{`(?:ab)(?:cd)`, `str{abcd}`},
	{`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
	{`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
	{`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
	{`a|.`, `dot{}`},
	{`.|a`, `dot{}`},
	{`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
	{`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},

	// Test Perl quoted literals
	{`\Q+|*?{[\E`, `str{+|*?{[}`},
	{`\Q+\E+`, `plus{lit{+}}`},
	{`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`},
	{`\Q\\E`, `lit{\}`},
	{`\Q\\\E`, `str{\\}`},

	// Test Perl \A and \z
	{`(?m)^`, `bol{}`},
	{`(?m)$`, `eol{}`},
	{`(?-m)^`, `bot{}`},
	{`(?-m)$`, `eot{}`},
	{`(?m)\A`, `bot{}`},
	{`(?m)\z`, `eot{\z}`},
	{`(?-m)\A`, `bot{}`},
	{`(?-m)\z`, `eot{\z}`},

	// Test named captures
	{`(?P<name>a)`, `cap{name:lit{a}}`},

	// Case-folded literals
	{`[Aa]`, `litfold{A}`},
	{`[\x{100}\x{101}]`, `litfold{Ā}`},
	{`[Δδ]`, `litfold{Δ}`},

	// Strings
	{`abcde`, `str{abcde}`},
	{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},

	// Factoring.
	{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
	{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},

	// Bug fixes.
	{`(?:.)`, `dot{}`},
	{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
	{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
	{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
	{`(?:A|a)`, `litfold{A}`},
	{`A|(?:A|a)`, `litfold{A}`},
	{`(?s).`, `dot{}`},
	{`(?-s).`, `dnl{}`},
	{`(?:(?:^).)`, `cat{bol{}dot{}}`},
	{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
	{`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`},

	// RE2 prefix_tests
	{`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
	{`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
	{`abc|abd|aef|bcx|bcy`,
		`alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
			`cat{str{bc}cc{0x78-0x79}}}`},
	{`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
	{`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
	{`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
	{`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
	{`x{2}|x{2}[0-9]`,
		`cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
	{`x{2}y|x{2}[0-9]y`,
		`cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
	{`a.*?c|a.*?b`,
		`cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},

	// Valid repetitions.
	{`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
	{`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``},

	// Valid nesting.
	{strings.Repeat("(", 999) + strings.Repeat(")", 999), ``},
	{strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``},
	{"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all
}
	 216  
// testFlags is the flag set used when parsing the parseTests table.
const testFlags = MatchNL | PerlX | UnicodeGroups
	 218  
// TestParseSimple runs the parseTests table through testParseDump
// using testFlags.
func TestParseSimple(t *testing.T) {
	testParseDump(t, parseTests, testFlags)
}
	 222  
// foldcaseTests holds the expected dumps for expressions parsed with
// only the FoldCase flag set (see TestParseFoldCase).
var foldcaseTests = []parseTest{
	{`AbCdE`, `strfold{ABCDE}`},
	{`[Aa]`, `litfold{A}`},
	{`a`, `litfold{A}`},

	// 0x17F is an old English long s (looks like an f) and folds to s.
	// 0x212A is the Kelvin symbol and folds to k.
	{`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
	{`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
}
	 234  
// TestParseFoldCase runs the foldcaseTests table through testParseDump
// with the FoldCase flag.
func TestParseFoldCase(t *testing.T) {
	testParseDump(t, foldcaseTests, FoldCase)
}
	 238  
// literalTests holds the expected dumps for expressions parsed with the
// Literal flag (see TestParseLiteral): the single entry, although full
// of regexp metacharacters, is expected to dump as one literal string.
var literalTests = []parseTest{
	{"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
}
	 242  
// TestParseLiteral runs the literalTests table through testParseDump
// with the Literal flag.
func TestParseLiteral(t *testing.T) {
	testParseDump(t, literalTests, Literal)
}
	 246  
// matchnlTests holds the expected dumps for expressions parsed with the
// MatchNL flag (see TestParseMatchNL). Compare with nomatchnlTests,
// which lists the same expressions parsed without it.
var matchnlTests = []parseTest{
	{`.`, `dot{}`},
	{"\n", "lit{\n}"},
	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
	{`[a\n]`, `cc{0xa 0x61}`},
}
	 253  
// TestParseMatchNL runs the matchnlTests table through testParseDump
// with the MatchNL flag.
func TestParseMatchNL(t *testing.T) {
	testParseDump(t, matchnlTests, MatchNL)
}
	 257  
// nomatchnlTests holds the expected dumps for expressions parsed with
// no flags at all (see TestParseNoMatchNL). Here `.` dumps as dnl and
// the negated class [^a] leaves out 0xa, unlike in matchnlTests.
var nomatchnlTests = []parseTest{
	{`.`, `dnl{}`},
	{"\n", "lit{\n}"},
	{`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
	{`[a\n]`, `cc{0xa 0x61}`},
}
	 264  
// TestParseNoMatchNL runs the nomatchnlTests table through
// testParseDump with a zero flag value.
func TestParseNoMatchNL(t *testing.T) {
	testParseDump(t, nomatchnlTests, 0)
}
	 268  
	 269  // Test Parse -> Dump.
	 270  func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
	 271  	for _, tt := range tests {
	 272  		re, err := Parse(tt.Regexp, flags)
	 273  		if err != nil {
	 274  			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
	 275  			continue
	 276  		}
	 277  		if tt.Dump == "" {
	 278  			// It parsed. That's all we care about.
	 279  			continue
	 280  		}
	 281  		d := dump(re)
	 282  		if d != tt.Dump {
	 283  			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
	 284  		}
	 285  	}
	 286  }
	 287  
	 288  // dump prints a string representation of the regexp showing
	 289  // the structure explicitly.
	 290  func dump(re *Regexp) string {
	 291  	var b strings.Builder
	 292  	dumpRegexp(&b, re)
	 293  	return b.String()
	 294  }
	 295  
// opNames maps an Op to the short name dumpRegexp prints for it.
// dumpRegexp falls back to "op%d" for any Op that is out of range or
// has an empty entry here, and it derives the variants ("n" prefix for
// non-greedy repetitions, "str" and "fold" for literals) itself.
var opNames = []string{
	OpNoMatch:        "no",
	OpEmptyMatch:     "emp",
	OpLiteral:        "lit",
	OpCharClass:      "cc",
	OpAnyCharNotNL:   "dnl",
	OpAnyChar:        "dot",
	OpBeginLine:      "bol",
	OpEndLine:        "eol",
	OpBeginText:      "bot",
	OpEndText:        "eot",
	OpWordBoundary:   "wb",
	OpNoWordBoundary: "nwb",
	OpCapture:        "cap",
	OpStar:           "star",
	OpPlus:           "plus",
	OpQuest:          "que",
	OpRepeat:         "rep",
	OpConcat:         "cat",
	OpAlternate:      "alt",
}
	 317  
	 318  // dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
	 319  // It is used during testing to distinguish between parses that might print
	 320  // the same using re's String method.
	 321  func dumpRegexp(b *strings.Builder, re *Regexp) {
	 322  	if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
	 323  		fmt.Fprintf(b, "op%d", re.Op)
	 324  	} else {
	 325  		switch re.Op {
	 326  		default:
	 327  			b.WriteString(opNames[re.Op])
	 328  		case OpStar, OpPlus, OpQuest, OpRepeat:
	 329  			if re.Flags&NonGreedy != 0 {
	 330  				b.WriteByte('n')
	 331  			}
	 332  			b.WriteString(opNames[re.Op])
	 333  		case OpLiteral:
	 334  			if len(re.Rune) > 1 {
	 335  				b.WriteString("str")
	 336  			} else {
	 337  				b.WriteString("lit")
	 338  			}
	 339  			if re.Flags&FoldCase != 0 {
	 340  				for _, r := range re.Rune {
	 341  					if unicode.SimpleFold(r) != r {
	 342  						b.WriteString("fold")
	 343  						break
	 344  					}
	 345  				}
	 346  			}
	 347  		}
	 348  	}
	 349  	b.WriteByte('{')
	 350  	switch re.Op {
	 351  	case OpEndText:
	 352  		if re.Flags&WasDollar == 0 {
	 353  			b.WriteString(`\z`)
	 354  		}
	 355  	case OpLiteral:
	 356  		for _, r := range re.Rune {
	 357  			b.WriteRune(r)
	 358  		}
	 359  	case OpConcat, OpAlternate:
	 360  		for _, sub := range re.Sub {
	 361  			dumpRegexp(b, sub)
	 362  		}
	 363  	case OpStar, OpPlus, OpQuest:
	 364  		dumpRegexp(b, re.Sub[0])
	 365  	case OpRepeat:
	 366  		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
	 367  		dumpRegexp(b, re.Sub[0])
	 368  	case OpCapture:
	 369  		if re.Name != "" {
	 370  			b.WriteString(re.Name)
	 371  			b.WriteByte(':')
	 372  		}
	 373  		dumpRegexp(b, re.Sub[0])
	 374  	case OpCharClass:
	 375  		sep := ""
	 376  		for i := 0; i < len(re.Rune); i += 2 {
	 377  			b.WriteString(sep)
	 378  			sep = " "
	 379  			lo, hi := re.Rune[i], re.Rune[i+1]
	 380  			if lo == hi {
	 381  				fmt.Fprintf(b, "%#x", lo)
	 382  			} else {
	 383  				fmt.Fprintf(b, "%#x-%#x", lo, hi)
	 384  			}
	 385  		}
	 386  	}
	 387  	b.WriteByte('}')
	 388  }
	 389  
	 390  func mkCharClass(f func(rune) bool) string {
	 391  	re := &Regexp{Op: OpCharClass}
	 392  	lo := rune(-1)
	 393  	for i := rune(0); i <= unicode.MaxRune; i++ {
	 394  		if f(i) {
	 395  			if lo < 0 {
	 396  				lo = i
	 397  			}
	 398  		} else {
	 399  			if lo >= 0 {
	 400  				re.Rune = append(re.Rune, lo, i-1)
	 401  				lo = -1
	 402  			}
	 403  		}
	 404  	}
	 405  	if lo >= 0 {
	 406  		re.Rune = append(re.Rune, lo, unicode.MaxRune)
	 407  	}
	 408  	return dump(re)
	 409  }
	 410  
	 411  func isUpperFold(r rune) bool {
	 412  	if unicode.IsUpper(r) {
	 413  		return true
	 414  	}
	 415  	c := unicode.SimpleFold(r)
	 416  	for c != r {
	 417  		if unicode.IsUpper(c) {
	 418  			return true
	 419  		}
	 420  		c = unicode.SimpleFold(c)
	 421  	}
	 422  	return false
	 423  }
	 424  
	 425  func TestFoldConstants(t *testing.T) {
	 426  	last := rune(-1)
	 427  	for i := rune(0); i <= unicode.MaxRune; i++ {
	 428  		if unicode.SimpleFold(i) == i {
	 429  			continue
	 430  		}
	 431  		if last == -1 && minFold != i {
	 432  			t.Errorf("minFold=%#U should be %#U", minFold, i)
	 433  		}
	 434  		last = i
	 435  	}
	 436  	if maxFold != last {
	 437  		t.Errorf("maxFold=%#U should be %#U", maxFold, last)
	 438  	}
	 439  }
	 440  
	 441  func TestAppendRangeCollapse(t *testing.T) {
	 442  	// AppendRange should collapse each of the new ranges
	 443  	// into the earlier ones (it looks back two ranges), so that
	 444  	// the slice never grows very large.
	 445  	// Note that we are not calling cleanClass.
	 446  	var r []rune
	 447  	for i := rune('A'); i <= 'Z'; i++ {
	 448  		r = appendRange(r, i, i)
	 449  		r = appendRange(r, i+'a'-'A', i+'a'-'A')
	 450  	}
	 451  	if string(r) != "AZaz" {
	 452  		t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
	 453  	}
	 454  }
	 455  
// invalidRegexps lists expressions that TestParseInvalidRegexps expects
// Parse to reject under both the Perl and the POSIX flags.
var invalidRegexps = []string{
	`(`,
	`)`,
	`(a`,
	`a)`,
	`(a))`,
	`(a|b|`,
	`a|b|)`,
	`(a|b|))`,
	`(a|b`,
	`a|b)`,
	`(a|b))`,
	`[a-z`,
	`([a-z)`,
	`[a-z)`,
	`([a-z]))`,
	`x{1001}`,
	`x{9876543210}`,
	`x{2,1}`,
	`x{1,9876543210}`,
	"\xff", // Invalid UTF-8
	"[\xff]",
	"[\\\xff]",
	"\\\xff",
	`(?P<name>a`,
	`(?P<name>`,
	`(?P<name`,
	`(?P<x y>a)`,
	`(?P<>a)`,
	`[a-Z]`,
	`(?i)[a-Z]`,
	`a{100000}`,
	`a{100000,}`,
	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
	strings.Repeat("(", 1000) + strings.Repeat(")", 1000),
	strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000),
	`\Q\E*`,
}
	 494  
// onlyPerl lists expressions that TestParseInvalidRegexps expects to
// parse under the Perl flags and to be rejected under the POSIX flags.
var onlyPerl = []string{
	`[a-b-c]`,
	`\Qabc\E`,
	`\Q*+?{[\E`,
	`\Q\\E`,
	`\Q\\\E`,
	`\Q\\\\E`,
	`\Q\\\\\E`,
	`(?:a)`,
	`(?P<name>a)`,
}
	 506  
// onlyPOSIX lists expressions that TestParseInvalidRegexps expects to
// parse under the POSIX flags and to be rejected under the Perl flags.
var onlyPOSIX = []string{
	"a++",
	"a**",
	"a?*",
	"a+*",
	"a{1}*",
	".{1}{2}.{3}",
}
	 515  
	 516  func TestParseInvalidRegexps(t *testing.T) {
	 517  	for _, regexp := range invalidRegexps {
	 518  		if re, err := Parse(regexp, Perl); err == nil {
	 519  			t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
	 520  		}
	 521  		if re, err := Parse(regexp, POSIX); err == nil {
	 522  			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
	 523  		}
	 524  	}
	 525  	for _, regexp := range onlyPerl {
	 526  		if _, err := Parse(regexp, Perl); err != nil {
	 527  			t.Errorf("Parse(%#q, Perl): %v", regexp, err)
	 528  		}
	 529  		if re, err := Parse(regexp, POSIX); err == nil {
	 530  			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
	 531  		}
	 532  	}
	 533  	for _, regexp := range onlyPOSIX {
	 534  		if re, err := Parse(regexp, Perl); err == nil {
	 535  			t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
	 536  		}
	 537  		if _, err := Parse(regexp, POSIX); err != nil {
	 538  			t.Errorf("Parse(%#q, POSIX): %v", regexp, err)
	 539  		}
	 540  	}
	 541  }
	 542  
	 543  func TestToStringEquivalentParse(t *testing.T) {
	 544  	for _, tt := range parseTests {
	 545  		re, err := Parse(tt.Regexp, testFlags)
	 546  		if err != nil {
	 547  			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
	 548  			continue
	 549  		}
	 550  		if tt.Dump == "" {
	 551  			// It parsed. That's all we care about.
	 552  			continue
	 553  		}
	 554  		d := dump(re)
	 555  		if d != tt.Dump {
	 556  			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
	 557  			continue
	 558  		}
	 559  
	 560  		s := re.String()
	 561  		if s != tt.Regexp {
	 562  			// If ToString didn't return the original regexp,
	 563  			// it must have found one with fewer parens.
	 564  			// Unfortunately we can't check the length here, because
	 565  			// ToString produces "\\{" for a literal brace,
	 566  			// but "{" is a shorter equivalent in some contexts.
	 567  			nre, err := Parse(s, testFlags)
	 568  			if err != nil {
	 569  				t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err)
	 570  				continue
	 571  			}
	 572  			nd := dump(nre)
	 573  			if d != nd {
	 574  				t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
	 575  			}
	 576  
	 577  			ns := nre.String()
	 578  			if s != ns {
	 579  				t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
	 580  			}
	 581  		}
	 582  	}
	 583  }
	 584
View as plain text