...

Source file src/regexp/syntax/simplify_test.go

Documentation: regexp/syntax

		 1  // Copyright 2011 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  package syntax
		 6  
		 7  import "testing"
		 8  
		 9  var simplifyTests = []struct {
		10  	Regexp string
		11  	Simple string
		12  }{
		13  	// Already-simple constructs
		14  	{`a`, `a`},
		15  	{`ab`, `ab`},
		16  	{`a|b`, `[a-b]`},
		17  	{`ab|cd`, `ab|cd`},
		18  	{`(ab)*`, `(ab)*`},
		19  	{`(ab)+`, `(ab)+`},
		20  	{`(ab)?`, `(ab)?`},
		21  	{`.`, `(?s:.)`},
		22  	{`^`, `(?m:^)`},
		23  	{`$`, `(?m:$)`},
		24  	{`[ac]`, `[ac]`},
		25  	{`[^ac]`, `[^ac]`},
		26  
		27  	// Posix character classes
		28  	{`[[:alnum:]]`, `[0-9A-Za-z]`},
		29  	{`[[:alpha:]]`, `[A-Za-z]`},
		30  	{`[[:blank:]]`, `[\t ]`},
		31  	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
		32  	{`[[:digit:]]`, `[0-9]`},
		33  	{`[[:graph:]]`, `[!-~]`},
		34  	{`[[:lower:]]`, `[a-z]`},
		35  	{`[[:print:]]`, `[ -~]`},
		36  	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
		37  	{`[[:space:]]`, `[\t-\r ]`},
		38  	{`[[:upper:]]`, `[A-Z]`},
		39  	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
		40  
		41  	// Perl character classes
		42  	{`\d`, `[0-9]`},
		43  	{`\s`, `[\t-\n\f-\r ]`},
		44  	{`\w`, `[0-9A-Z_a-z]`},
		45  	{`\D`, `[^0-9]`},
		46  	{`\S`, `[^\t-\n\f-\r ]`},
		47  	{`\W`, `[^0-9A-Z_a-z]`},
		48  	{`[\d]`, `[0-9]`},
		49  	{`[\s]`, `[\t-\n\f-\r ]`},
		50  	{`[\w]`, `[0-9A-Z_a-z]`},
		51  	{`[\D]`, `[^0-9]`},
		52  	{`[\S]`, `[^\t-\n\f-\r ]`},
		53  	{`[\W]`, `[^0-9A-Z_a-z]`},
		54  
		55  	// Posix repetitions
		56  	{`a{1}`, `a`},
		57  	{`a{2}`, `aa`},
		58  	{`a{5}`, `aaaaa`},
		59  	{`a{0,1}`, `a?`},
		60  	// The next three are illegible because Simplify inserts (?:)
		61  	// parens instead of () parens to avoid creating extra
		62  	// captured subexpressions. The comments show a version with fewer parens.
		63  	{`(a){0,2}`, `(?:(a)(a)?)?`},											 //			 (aa?)?
		64  	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},			 //	 (a(a(aa?)?)?)?
		65  	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
		66  	{`a{0,2}`, `(?:aa?)?`},														 //			 (aa?)?
		67  	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},								 //	 (a(a(aa?)?)?)?
		68  	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},							 // aa(a(a(aa?)?)?)?
		69  	{`a{0,}`, `a*`},
		70  	{`a{1,}`, `a+`},
		71  	{`a{2,}`, `aa+`},
		72  	{`a{5,}`, `aaaaa+`},
		73  
		74  	// Test that operators simplify their arguments.
		75  	{`(?:a{1,}){1,}`, `a+`},
		76  	{`(a{1,}b{1,})`, `(a+b+)`},
		77  	{`a{1,}|b{1,}`, `a+|b+`},
		78  	{`(?:a{1,})*`, `(?:a+)*`},
		79  	{`(?:a{1,})+`, `a+`},
		80  	{`(?:a{1,})?`, `(?:a+)?`},
		81  	{``, `(?:)`},
		82  	{`a{0}`, `(?:)`},
		83  
		84  	// Character class simplification
		85  	{`[ab]`, `[a-b]`},
		86  	{`[a-za-za-z]`, `[a-z]`},
		87  	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
		88  	{`[ABCDEFGH]`, `[A-H]`},
		89  	{`[AB-CD-EF-GH]`, `[A-H]`},
		90  	{`[W-ZP-XE-R]`, `[E-Z]`},
		91  	{`[a-ee-gg-m]`, `[a-m]`},
		92  	{`[a-ea-ha-m]`, `[a-m]`},
		93  	{`[a-ma-ha-e]`, `[a-m]`},
		94  	{`[a-zA-Z0-9 -~]`, `[ -~]`},
		95  
		96  	// Empty character classes
		97  	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
		98  
		99  	// Full character classes
	 100  	{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
	 101  
	 102  	// Unicode case folding.
	 103  	{`(?i)A`, `(?i:A)`},
	 104  	{`(?i)a`, `(?i:A)`},
	 105  	{`(?i)[A]`, `(?i:A)`},
	 106  	{`(?i)[a]`, `(?i:A)`},
	 107  	{`(?i)K`, `(?i:K)`},
	 108  	{`(?i)k`, `(?i:K)`},
	 109  	{`(?i)\x{212a}`, "(?i:K)"},
	 110  	{`(?i)[K]`, "[Kk\u212A]"},
	 111  	{`(?i)[k]`, "[Kk\u212A]"},
	 112  	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
	 113  	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
	 114  	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
	 115  	{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
	 116  
	 117  	// Empty string as a regular expression.
	 118  	// The empty string must be preserved inside parens in order
	 119  	// to make submatches work right, so these tests are less
	 120  	// interesting than they might otherwise be. String inserts
	 121  	// explicit (?:) in place of non-parenthesized empty strings,
	 122  	// to make them easier to spot for other parsers.
	 123  	{`(a|b|)`, `([a-b]|(?:))`},
	 124  	{`(|)`, `()`},
	 125  	{`a()`, `a()`},
	 126  	{`(()|())`, `(()|())`},
	 127  	{`(a|)`, `(a|(?:))`},
	 128  	{`ab()cd()`, `ab()cd()`},
	 129  	{`()`, `()`},
	 130  	{`()*`, `()*`},
	 131  	{`()+`, `()+`},
	 132  	{`()?`, `()?`},
	 133  	{`(){0}`, `(?:)`},
	 134  	{`(){1}`, `()`},
	 135  	{`(){1,}`, `()+`},
	 136  	{`(){0,2}`, `(?:()()?)?`},
	 137  }
	 138  
	 139  func TestSimplify(t *testing.T) {
	 140  	for _, tt := range simplifyTests {
	 141  		re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
	 142  		if err != nil {
	 143  			t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
	 144  			continue
	 145  		}
	 146  		s := re.Simplify().String()
	 147  		if s != tt.Simple {
	 148  			t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
	 149  		}
	 150  	}
	 151  }
	 152  

View as plain text