...

Source file src/regexp/syntax/regexp.go

Documentation: regexp/syntax

		 1  // Copyright 2011 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  package syntax
		 6  
		 7  // Note to implementers:
		 8  // In this package, re is always a *Regexp and r is always a rune.
		 9  
		10  import (
		11  	"strconv"
		12  	"strings"
		13  	"unicode"
		14  )
		15  
		16  // A Regexp is a node in a regular expression syntax tree.
		17  type Regexp struct {
		18  	Op			 Op // operator
		19  	Flags		Flags
		20  	Sub			[]*Regexp	// subexpressions, if any
		21  	Sub0		 [1]*Regexp // storage for short Sub
		22  	Rune		 []rune		 // matched runes, for OpLiteral, OpCharClass
		23  	Rune0		[2]rune		// storage for short Rune
		24  	Min, Max int				// min, max for OpRepeat
		25  	Cap			int				// capturing index, for OpCapture
		26  	Name		 string		 // capturing name, for OpCapture
		27  }
		28  
		29  //go:generate stringer -type Op -trimprefix Op
		30  
		31  // An Op is a single regular expression operator.
		32  type Op uint8
		33  
		34  // Operators are listed in precedence order, tightest binding to weakest.
		35  // Character class operators are listed simplest to most complex
		36  // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
		37  
		38  const (
		39  	OpNoMatch				Op = 1 + iota // matches no strings
		40  	OpEmptyMatch									 // matches empty string
		41  	OpLiteral											// matches Runes sequence
		42  	OpCharClass										// matches Runes interpreted as range pair list
		43  	OpAnyCharNotNL								 // matches any character except newline
		44  	OpAnyChar											// matches any character
		45  	OpBeginLine										// matches empty string at beginning of line
		46  	OpEndLine											// matches empty string at end of line
		47  	OpBeginText										// matches empty string at beginning of text
		48  	OpEndText											// matches empty string at end of text
		49  	OpWordBoundary								 // matches word boundary `\b`
		50  	OpNoWordBoundary							 // matches word non-boundary `\B`
		51  	OpCapture											// capturing subexpression with index Cap, optional name Name
		52  	OpStar												 // matches Sub[0] zero or more times
		53  	OpPlus												 // matches Sub[0] one or more times
		54  	OpQuest												// matches Sub[0] zero or one times
		55  	OpRepeat											 // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
		56  	OpConcat											 // matches concatenation of Subs
		57  	OpAlternate										// matches alternation of Subs
		58  )
		59  
		60  const opPseudo Op = 128 // where pseudo-ops start
		61  
		62  // Equal reports whether x and y have identical structure.
		63  func (x *Regexp) Equal(y *Regexp) bool {
		64  	if x == nil || y == nil {
		65  		return x == y
		66  	}
		67  	if x.Op != y.Op {
		68  		return false
		69  	}
		70  	switch x.Op {
		71  	case OpEndText:
		72  		// The parse flags remember whether this is \z or \Z.
		73  		if x.Flags&WasDollar != y.Flags&WasDollar {
		74  			return false
		75  		}
		76  
		77  	case OpLiteral, OpCharClass:
		78  		if len(x.Rune) != len(y.Rune) {
		79  			return false
		80  		}
		81  		for i, r := range x.Rune {
		82  			if r != y.Rune[i] {
		83  				return false
		84  			}
		85  		}
		86  
		87  	case OpAlternate, OpConcat:
		88  		if len(x.Sub) != len(y.Sub) {
		89  			return false
		90  		}
		91  		for i, sub := range x.Sub {
		92  			if !sub.Equal(y.Sub[i]) {
		93  				return false
		94  			}
		95  		}
		96  
		97  	case OpStar, OpPlus, OpQuest:
		98  		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
		99  			return false
	 100  		}
	 101  
	 102  	case OpRepeat:
	 103  		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
	 104  			return false
	 105  		}
	 106  
	 107  	case OpCapture:
	 108  		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
	 109  			return false
	 110  		}
	 111  	}
	 112  	return true
	 113  }
	 114  
	 115  // writeRegexp writes the Perl syntax for the regular expression re to b.
	 116  func writeRegexp(b *strings.Builder, re *Regexp) {
	 117  	switch re.Op {
	 118  	default:
	 119  		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
	 120  	case OpNoMatch:
	 121  		b.WriteString(`[^\x00-\x{10FFFF}]`)
	 122  	case OpEmptyMatch:
	 123  		b.WriteString(`(?:)`)
	 124  	case OpLiteral:
	 125  		if re.Flags&FoldCase != 0 {
	 126  			b.WriteString(`(?i:`)
	 127  		}
	 128  		for _, r := range re.Rune {
	 129  			escape(b, r, false)
	 130  		}
	 131  		if re.Flags&FoldCase != 0 {
	 132  			b.WriteString(`)`)
	 133  		}
	 134  	case OpCharClass:
	 135  		if len(re.Rune)%2 != 0 {
	 136  			b.WriteString(`[invalid char class]`)
	 137  			break
	 138  		}
	 139  		b.WriteRune('[')
	 140  		if len(re.Rune) == 0 {
	 141  			b.WriteString(`^\x00-\x{10FFFF}`)
	 142  		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
	 143  			// Contains 0 and MaxRune. Probably a negated class.
	 144  			// Print the gaps.
	 145  			b.WriteRune('^')
	 146  			for i := 1; i < len(re.Rune)-1; i += 2 {
	 147  				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
	 148  				escape(b, lo, lo == '-')
	 149  				if lo != hi {
	 150  					b.WriteRune('-')
	 151  					escape(b, hi, hi == '-')
	 152  				}
	 153  			}
	 154  		} else {
	 155  			for i := 0; i < len(re.Rune); i += 2 {
	 156  				lo, hi := re.Rune[i], re.Rune[i+1]
	 157  				escape(b, lo, lo == '-')
	 158  				if lo != hi {
	 159  					b.WriteRune('-')
	 160  					escape(b, hi, hi == '-')
	 161  				}
	 162  			}
	 163  		}
	 164  		b.WriteRune(']')
	 165  	case OpAnyCharNotNL:
	 166  		b.WriteString(`(?-s:.)`)
	 167  	case OpAnyChar:
	 168  		b.WriteString(`(?s:.)`)
	 169  	case OpBeginLine:
	 170  		b.WriteString(`(?m:^)`)
	 171  	case OpEndLine:
	 172  		b.WriteString(`(?m:$)`)
	 173  	case OpBeginText:
	 174  		b.WriteString(`\A`)
	 175  	case OpEndText:
	 176  		if re.Flags&WasDollar != 0 {
	 177  			b.WriteString(`(?-m:$)`)
	 178  		} else {
	 179  			b.WriteString(`\z`)
	 180  		}
	 181  	case OpWordBoundary:
	 182  		b.WriteString(`\b`)
	 183  	case OpNoWordBoundary:
	 184  		b.WriteString(`\B`)
	 185  	case OpCapture:
	 186  		if re.Name != "" {
	 187  			b.WriteString(`(?P<`)
	 188  			b.WriteString(re.Name)
	 189  			b.WriteRune('>')
	 190  		} else {
	 191  			b.WriteRune('(')
	 192  		}
	 193  		if re.Sub[0].Op != OpEmptyMatch {
	 194  			writeRegexp(b, re.Sub[0])
	 195  		}
	 196  		b.WriteRune(')')
	 197  	case OpStar, OpPlus, OpQuest, OpRepeat:
	 198  		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
	 199  			b.WriteString(`(?:`)
	 200  			writeRegexp(b, sub)
	 201  			b.WriteString(`)`)
	 202  		} else {
	 203  			writeRegexp(b, sub)
	 204  		}
	 205  		switch re.Op {
	 206  		case OpStar:
	 207  			b.WriteRune('*')
	 208  		case OpPlus:
	 209  			b.WriteRune('+')
	 210  		case OpQuest:
	 211  			b.WriteRune('?')
	 212  		case OpRepeat:
	 213  			b.WriteRune('{')
	 214  			b.WriteString(strconv.Itoa(re.Min))
	 215  			if re.Max != re.Min {
	 216  				b.WriteRune(',')
	 217  				if re.Max >= 0 {
	 218  					b.WriteString(strconv.Itoa(re.Max))
	 219  				}
	 220  			}
	 221  			b.WriteRune('}')
	 222  		}
	 223  		if re.Flags&NonGreedy != 0 {
	 224  			b.WriteRune('?')
	 225  		}
	 226  	case OpConcat:
	 227  		for _, sub := range re.Sub {
	 228  			if sub.Op == OpAlternate {
	 229  				b.WriteString(`(?:`)
	 230  				writeRegexp(b, sub)
	 231  				b.WriteString(`)`)
	 232  			} else {
	 233  				writeRegexp(b, sub)
	 234  			}
	 235  		}
	 236  	case OpAlternate:
	 237  		for i, sub := range re.Sub {
	 238  			if i > 0 {
	 239  				b.WriteRune('|')
	 240  			}
	 241  			writeRegexp(b, sub)
	 242  		}
	 243  	}
	 244  }
	 245  
	 246  func (re *Regexp) String() string {
	 247  	var b strings.Builder
	 248  	writeRegexp(&b, re)
	 249  	return b.String()
	 250  }
	 251  
	 252  const meta = `\.+*?()|[]{}^$`
	 253  
	 254  func escape(b *strings.Builder, r rune, force bool) {
	 255  	if unicode.IsPrint(r) {
	 256  		if strings.ContainsRune(meta, r) || force {
	 257  			b.WriteRune('\\')
	 258  		}
	 259  		b.WriteRune(r)
	 260  		return
	 261  	}
	 262  
	 263  	switch r {
	 264  	case '\a':
	 265  		b.WriteString(`\a`)
	 266  	case '\f':
	 267  		b.WriteString(`\f`)
	 268  	case '\n':
	 269  		b.WriteString(`\n`)
	 270  	case '\r':
	 271  		b.WriteString(`\r`)
	 272  	case '\t':
	 273  		b.WriteString(`\t`)
	 274  	case '\v':
	 275  		b.WriteString(`\v`)
	 276  	default:
	 277  		if r < 0x100 {
	 278  			b.WriteString(`\x`)
	 279  			s := strconv.FormatInt(int64(r), 16)
	 280  			if len(s) == 1 {
	 281  				b.WriteRune('0')
	 282  			}
	 283  			b.WriteString(s)
	 284  			break
	 285  		}
	 286  		b.WriteString(`\x{`)
	 287  		b.WriteString(strconv.FormatInt(int64(r), 16))
	 288  		b.WriteString(`}`)
	 289  	}
	 290  }
	 291  
	 292  // MaxCap walks the regexp to find the maximum capture index.
	 293  func (re *Regexp) MaxCap() int {
	 294  	m := 0
	 295  	if re.Op == OpCapture {
	 296  		m = re.Cap
	 297  	}
	 298  	for _, sub := range re.Sub {
	 299  		if n := sub.MaxCap(); m < n {
	 300  			m = n
	 301  		}
	 302  	}
	 303  	return m
	 304  }
	 305  
	 306  // CapNames walks the regexp to find the names of capturing groups.
	 307  func (re *Regexp) CapNames() []string {
	 308  	names := make([]string, re.MaxCap()+1)
	 309  	re.capNames(names)
	 310  	return names
	 311  }
	 312  
	 313  func (re *Regexp) capNames(names []string) {
	 314  	if re.Op == OpCapture {
	 315  		names[re.Cap] = re.Name
	 316  	}
	 317  	for _, sub := range re.Sub {
	 318  		sub.capNames(names)
	 319  	}
	 320  }
	 321  

View as plain text