...

Source file src/regexp/syntax/compile.go

Documentation: regexp/syntax

		 1  // Copyright 2011 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  package syntax
		 6  
		 7  import "unicode"
		 8  
		 9  // A patchList is a list of instruction pointers that need to be filled in (patched).
		10  // Because the pointers haven't been filled in yet, we can reuse their storage
		11  // to hold the list. It's kind of sleazy, but works well in practice.
		12  // See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
		13  //
		14  // These aren't really pointers: they're integers, so we can reinterpret them
		15  // this way without using package unsafe. A value l.head denotes
		16  // p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1).
		17  // head == 0 denotes the empty list, okay because we start every program
		18  // with a fail instruction, so we'll never want to point at its output link.
		19  type patchList struct {
		20  	head, tail uint32
		21  }
		22  
		23  func makePatchList(n uint32) patchList {
		24  	return patchList{n, n}
		25  }
		26  
		27  func (l patchList) patch(p *Prog, val uint32) {
		28  	head := l.head
		29  	for head != 0 {
		30  		i := &p.Inst[head>>1]
		31  		if head&1 == 0 {
		32  			head = i.Out
		33  			i.Out = val
		34  		} else {
		35  			head = i.Arg
		36  			i.Arg = val
		37  		}
		38  	}
		39  }
		40  
		41  func (l1 patchList) append(p *Prog, l2 patchList) patchList {
		42  	if l1.head == 0 {
		43  		return l2
		44  	}
		45  	if l2.head == 0 {
		46  		return l1
		47  	}
		48  
		49  	i := &p.Inst[l1.tail>>1]
		50  	if l1.tail&1 == 0 {
		51  		i.Out = l2.head
		52  	} else {
		53  		i.Arg = l2.head
		54  	}
		55  	return patchList{l1.head, l2.tail}
		56  }
		57  
		58  // A frag represents a compiled program fragment.
		59  type frag struct {
		60  	i				uint32		// index of first instruction
		61  	out			patchList // where to record end instruction
		62  	nullable bool			// whether fragment can match empty string
		63  }
		64  
		65  type compiler struct {
		66  	p *Prog
		67  }
		68  
		69  // Compile compiles the regexp into a program to be executed.
		70  // The regexp should have been simplified already (returned from re.Simplify).
		71  func Compile(re *Regexp) (*Prog, error) {
		72  	var c compiler
		73  	c.init()
		74  	f := c.compile(re)
		75  	f.out.patch(c.p, c.inst(InstMatch).i)
		76  	c.p.Start = int(f.i)
		77  	return c.p, nil
		78  }
		79  
		80  func (c *compiler) init() {
		81  	c.p = new(Prog)
		82  	c.p.NumCap = 2 // implicit ( and ) for whole match $0
		83  	c.inst(InstFail)
		84  }
		85  
		86  var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
		87  var anyRune = []rune{0, unicode.MaxRune}
		88  
		89  func (c *compiler) compile(re *Regexp) frag {
		90  	switch re.Op {
		91  	case OpNoMatch:
		92  		return c.fail()
		93  	case OpEmptyMatch:
		94  		return c.nop()
		95  	case OpLiteral:
		96  		if len(re.Rune) == 0 {
		97  			return c.nop()
		98  		}
		99  		var f frag
	 100  		for j := range re.Rune {
	 101  			f1 := c.rune(re.Rune[j:j+1], re.Flags)
	 102  			if j == 0 {
	 103  				f = f1
	 104  			} else {
	 105  				f = c.cat(f, f1)
	 106  			}
	 107  		}
	 108  		return f
	 109  	case OpCharClass:
	 110  		return c.rune(re.Rune, re.Flags)
	 111  	case OpAnyCharNotNL:
	 112  		return c.rune(anyRuneNotNL, 0)
	 113  	case OpAnyChar:
	 114  		return c.rune(anyRune, 0)
	 115  	case OpBeginLine:
	 116  		return c.empty(EmptyBeginLine)
	 117  	case OpEndLine:
	 118  		return c.empty(EmptyEndLine)
	 119  	case OpBeginText:
	 120  		return c.empty(EmptyBeginText)
	 121  	case OpEndText:
	 122  		return c.empty(EmptyEndText)
	 123  	case OpWordBoundary:
	 124  		return c.empty(EmptyWordBoundary)
	 125  	case OpNoWordBoundary:
	 126  		return c.empty(EmptyNoWordBoundary)
	 127  	case OpCapture:
	 128  		bra := c.cap(uint32(re.Cap << 1))
	 129  		sub := c.compile(re.Sub[0])
	 130  		ket := c.cap(uint32(re.Cap<<1 | 1))
	 131  		return c.cat(c.cat(bra, sub), ket)
	 132  	case OpStar:
	 133  		return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
	 134  	case OpPlus:
	 135  		return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
	 136  	case OpQuest:
	 137  		return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
	 138  	case OpConcat:
	 139  		if len(re.Sub) == 0 {
	 140  			return c.nop()
	 141  		}
	 142  		var f frag
	 143  		for i, sub := range re.Sub {
	 144  			if i == 0 {
	 145  				f = c.compile(sub)
	 146  			} else {
	 147  				f = c.cat(f, c.compile(sub))
	 148  			}
	 149  		}
	 150  		return f
	 151  	case OpAlternate:
	 152  		var f frag
	 153  		for _, sub := range re.Sub {
	 154  			f = c.alt(f, c.compile(sub))
	 155  		}
	 156  		return f
	 157  	}
	 158  	panic("regexp: unhandled case in compile")
	 159  }
	 160  
	 161  func (c *compiler) inst(op InstOp) frag {
	 162  	// TODO: impose length limit
	 163  	f := frag{i: uint32(len(c.p.Inst)), nullable: true}
	 164  	c.p.Inst = append(c.p.Inst, Inst{Op: op})
	 165  	return f
	 166  }
	 167  
	 168  func (c *compiler) nop() frag {
	 169  	f := c.inst(InstNop)
	 170  	f.out = makePatchList(f.i << 1)
	 171  	return f
	 172  }
	 173  
	 174  func (c *compiler) fail() frag {
	 175  	return frag{}
	 176  }
	 177  
	 178  func (c *compiler) cap(arg uint32) frag {
	 179  	f := c.inst(InstCapture)
	 180  	f.out = makePatchList(f.i << 1)
	 181  	c.p.Inst[f.i].Arg = arg
	 182  
	 183  	if c.p.NumCap < int(arg)+1 {
	 184  		c.p.NumCap = int(arg) + 1
	 185  	}
	 186  	return f
	 187  }
	 188  
	 189  func (c *compiler) cat(f1, f2 frag) frag {
	 190  	// concat of failure is failure
	 191  	if f1.i == 0 || f2.i == 0 {
	 192  		return frag{}
	 193  	}
	 194  
	 195  	// TODO: elide nop
	 196  
	 197  	f1.out.patch(c.p, f2.i)
	 198  	return frag{f1.i, f2.out, f1.nullable && f2.nullable}
	 199  }
	 200  
	 201  func (c *compiler) alt(f1, f2 frag) frag {
	 202  	// alt of failure is other
	 203  	if f1.i == 0 {
	 204  		return f2
	 205  	}
	 206  	if f2.i == 0 {
	 207  		return f1
	 208  	}
	 209  
	 210  	f := c.inst(InstAlt)
	 211  	i := &c.p.Inst[f.i]
	 212  	i.Out = f1.i
	 213  	i.Arg = f2.i
	 214  	f.out = f1.out.append(c.p, f2.out)
	 215  	f.nullable = f1.nullable || f2.nullable
	 216  	return f
	 217  }
	 218  
	 219  func (c *compiler) quest(f1 frag, nongreedy bool) frag {
	 220  	f := c.inst(InstAlt)
	 221  	i := &c.p.Inst[f.i]
	 222  	if nongreedy {
	 223  		i.Arg = f1.i
	 224  		f.out = makePatchList(f.i << 1)
	 225  	} else {
	 226  		i.Out = f1.i
	 227  		f.out = makePatchList(f.i<<1 | 1)
	 228  	}
	 229  	f.out = f.out.append(c.p, f1.out)
	 230  	return f
	 231  }
	 232  
	 233  // loop returns the fragment for the main loop of a plus or star.
	 234  // For plus, it can be used after changing the entry to f1.i.
	 235  // For star, it can be used directly when f1 can't match an empty string.
	 236  // (When f1 can match an empty string, f1* must be implemented as (f1+)?
	 237  // to get the priority match order correct.)
	 238  func (c *compiler) loop(f1 frag, nongreedy bool) frag {
	 239  	f := c.inst(InstAlt)
	 240  	i := &c.p.Inst[f.i]
	 241  	if nongreedy {
	 242  		i.Arg = f1.i
	 243  		f.out = makePatchList(f.i << 1)
	 244  	} else {
	 245  		i.Out = f1.i
	 246  		f.out = makePatchList(f.i<<1 | 1)
	 247  	}
	 248  	f1.out.patch(c.p, f.i)
	 249  	return f
	 250  }
	 251  
	 252  func (c *compiler) star(f1 frag, nongreedy bool) frag {
	 253  	if f1.nullable {
	 254  		// Use (f1+)? to get priority match order correct.
	 255  		// See golang.org/issue/46123.
	 256  		return c.quest(c.plus(f1, nongreedy), nongreedy)
	 257  	}
	 258  	return c.loop(f1, nongreedy)
	 259  }
	 260  
	 261  func (c *compiler) plus(f1 frag, nongreedy bool) frag {
	 262  	return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable}
	 263  }
	 264  
	 265  func (c *compiler) empty(op EmptyOp) frag {
	 266  	f := c.inst(InstEmptyWidth)
	 267  	c.p.Inst[f.i].Arg = uint32(op)
	 268  	f.out = makePatchList(f.i << 1)
	 269  	return f
	 270  }
	 271  
	 272  func (c *compiler) rune(r []rune, flags Flags) frag {
	 273  	f := c.inst(InstRune)
	 274  	f.nullable = false
	 275  	i := &c.p.Inst[f.i]
	 276  	i.Rune = r
	 277  	flags &= FoldCase // only relevant flag is FoldCase
	 278  	if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
	 279  		// and sometimes not even that
	 280  		flags &^= FoldCase
	 281  	}
	 282  	i.Arg = uint32(flags)
	 283  	f.out = makePatchList(f.i << 1)
	 284  
	 285  	// Special cases for exec machine.
	 286  	switch {
	 287  	case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
	 288  		i.Op = InstRune1
	 289  	case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
	 290  		i.Op = InstRuneAny
	 291  	case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
	 292  		i.Op = InstRuneAnyNotNL
	 293  	}
	 294  
	 295  	return f
	 296  }
	 297  

View as plain text