...

Source file src/regexp/backtrack.go

Documentation: regexp

		 1  // Copyright 2015 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // backtrack is a regular expression search with submatch
		 6  // tracking for small regular expressions and texts. It allocates
		 7  // a bit vector with (length of input) * (length of prog) bits,
		 8  // to make sure it never explores the same (character position, instruction)
		 9  // state multiple times. This limits the search to run in time linear in
		10  // the length of the test.
		11  //
		12  // backtrack is a fast replacement for the NFA code on small
		13  // regexps when onepass cannot be used.
		14  
		15  package regexp
		16  
		17  import (
		18  	"regexp/syntax"
		19  	"sync"
		20  )
		21  
		22  // A job is an entry on the backtracker's job stack. It holds
		23  // the instruction pc and the position in the input.
		24  type job struct {
		25  	pc	uint32
		26  	arg bool
		27  	pos int
		28  }
		29  
		30  const (
		31  	visitedBits				= 32
		32  	maxBacktrackProg	 = 500				// len(prog.Inst) <= max
		33  	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
		34  )
		35  
		36  // bitState holds state for the backtracker.
		37  type bitState struct {
		38  	end			int
		39  	cap			[]int
		40  	matchcap []int
		41  	jobs		 []job
		42  	visited	[]uint32
		43  
		44  	inputs inputs
		45  }
		46  
		47  var bitStatePool sync.Pool
		48  
		49  func newBitState() *bitState {
		50  	b, ok := bitStatePool.Get().(*bitState)
		51  	if !ok {
		52  		b = new(bitState)
		53  	}
		54  	return b
		55  }
		56  
		57  func freeBitState(b *bitState) {
		58  	b.inputs.clear()
		59  	bitStatePool.Put(b)
		60  }
		61  
		62  // maxBitStateLen returns the maximum length of a string to search with
		63  // the backtracker using prog.
		64  func maxBitStateLen(prog *syntax.Prog) int {
		65  	if !shouldBacktrack(prog) {
		66  		return 0
		67  	}
		68  	return maxBacktrackVector / len(prog.Inst)
		69  }
		70  
		71  // shouldBacktrack reports whether the program is too
		72  // long for the backtracker to run.
		73  func shouldBacktrack(prog *syntax.Prog) bool {
		74  	return len(prog.Inst) <= maxBacktrackProg
		75  }
		76  
		77  // reset resets the state of the backtracker.
		78  // end is the end position in the input.
		79  // ncap is the number of captures.
		80  func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) {
		81  	b.end = end
		82  
		83  	if cap(b.jobs) == 0 {
		84  		b.jobs = make([]job, 0, 256)
		85  	} else {
		86  		b.jobs = b.jobs[:0]
		87  	}
		88  
		89  	visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
		90  	if cap(b.visited) < visitedSize {
		91  		b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
		92  	} else {
		93  		b.visited = b.visited[:visitedSize]
		94  		for i := range b.visited {
		95  			b.visited[i] = 0
		96  		}
		97  	}
		98  
		99  	if cap(b.cap) < ncap {
	 100  		b.cap = make([]int, ncap)
	 101  	} else {
	 102  		b.cap = b.cap[:ncap]
	 103  	}
	 104  	for i := range b.cap {
	 105  		b.cap[i] = -1
	 106  	}
	 107  
	 108  	if cap(b.matchcap) < ncap {
	 109  		b.matchcap = make([]int, ncap)
	 110  	} else {
	 111  		b.matchcap = b.matchcap[:ncap]
	 112  	}
	 113  	for i := range b.matchcap {
	 114  		b.matchcap[i] = -1
	 115  	}
	 116  }
	 117  
	 118  // shouldVisit reports whether the combination of (pc, pos) has not
	 119  // been visited yet.
	 120  func (b *bitState) shouldVisit(pc uint32, pos int) bool {
	 121  	n := uint(int(pc)*(b.end+1) + pos)
	 122  	if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
	 123  		return false
	 124  	}
	 125  	b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
	 126  	return true
	 127  }
	 128  
	 129  // push pushes (pc, pos, arg) onto the job stack if it should be
	 130  // visited.
	 131  func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) {
	 132  	// Only check shouldVisit when arg is false.
	 133  	// When arg is true, we are continuing a previous visit.
	 134  	if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) {
	 135  		b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
	 136  	}
	 137  }
	 138  
	 139  // tryBacktrack runs a backtracking search starting at pos.
	 140  func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
	 141  	longest := re.longest
	 142  
	 143  	b.push(re, pc, pos, false)
	 144  	for len(b.jobs) > 0 {
	 145  		l := len(b.jobs) - 1
	 146  		// Pop job off the stack.
	 147  		pc := b.jobs[l].pc
	 148  		pos := b.jobs[l].pos
	 149  		arg := b.jobs[l].arg
	 150  		b.jobs = b.jobs[:l]
	 151  
	 152  		// Optimization: rather than push and pop,
	 153  		// code that is going to Push and continue
	 154  		// the loop simply updates ip, p, and arg
	 155  		// and jumps to CheckAndLoop. We have to
	 156  		// do the ShouldVisit check that Push
	 157  		// would have, but we avoid the stack
	 158  		// manipulation.
	 159  		goto Skip
	 160  	CheckAndLoop:
	 161  		if !b.shouldVisit(pc, pos) {
	 162  			continue
	 163  		}
	 164  	Skip:
	 165  
	 166  		inst := re.prog.Inst[pc]
	 167  
	 168  		switch inst.Op {
	 169  		default:
	 170  			panic("bad inst")
	 171  		case syntax.InstFail:
	 172  			panic("unexpected InstFail")
	 173  		case syntax.InstAlt:
	 174  			// Cannot just
	 175  			//	 b.push(inst.Out, pos, false)
	 176  			//	 b.push(inst.Arg, pos, false)
	 177  			// If during the processing of inst.Out, we encounter
	 178  			// inst.Arg via another path, we want to process it then.
	 179  			// Pushing it here will inhibit that. Instead, re-push
	 180  			// inst with arg==true as a reminder to push inst.Arg out
	 181  			// later.
	 182  			if arg {
	 183  				// Finished inst.Out; try inst.Arg.
	 184  				arg = false
	 185  				pc = inst.Arg
	 186  				goto CheckAndLoop
	 187  			} else {
	 188  				b.push(re, pc, pos, true)
	 189  				pc = inst.Out
	 190  				goto CheckAndLoop
	 191  			}
	 192  
	 193  		case syntax.InstAltMatch:
	 194  			// One opcode consumes runes; the other leads to match.
	 195  			switch re.prog.Inst[inst.Out].Op {
	 196  			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
	 197  				// inst.Arg is the match.
	 198  				b.push(re, inst.Arg, pos, false)
	 199  				pc = inst.Arg
	 200  				pos = b.end
	 201  				goto CheckAndLoop
	 202  			}
	 203  			// inst.Out is the match - non-greedy
	 204  			b.push(re, inst.Out, b.end, false)
	 205  			pc = inst.Out
	 206  			goto CheckAndLoop
	 207  
	 208  		case syntax.InstRune:
	 209  			r, width := i.step(pos)
	 210  			if !inst.MatchRune(r) {
	 211  				continue
	 212  			}
	 213  			pos += width
	 214  			pc = inst.Out
	 215  			goto CheckAndLoop
	 216  
	 217  		case syntax.InstRune1:
	 218  			r, width := i.step(pos)
	 219  			if r != inst.Rune[0] {
	 220  				continue
	 221  			}
	 222  			pos += width
	 223  			pc = inst.Out
	 224  			goto CheckAndLoop
	 225  
	 226  		case syntax.InstRuneAnyNotNL:
	 227  			r, width := i.step(pos)
	 228  			if r == '\n' || r == endOfText {
	 229  				continue
	 230  			}
	 231  			pos += width
	 232  			pc = inst.Out
	 233  			goto CheckAndLoop
	 234  
	 235  		case syntax.InstRuneAny:
	 236  			r, width := i.step(pos)
	 237  			if r == endOfText {
	 238  				continue
	 239  			}
	 240  			pos += width
	 241  			pc = inst.Out
	 242  			goto CheckAndLoop
	 243  
	 244  		case syntax.InstCapture:
	 245  			if arg {
	 246  				// Finished inst.Out; restore the old value.
	 247  				b.cap[inst.Arg] = pos
	 248  				continue
	 249  			} else {
	 250  				if inst.Arg < uint32(len(b.cap)) {
	 251  					// Capture pos to register, but save old value.
	 252  					b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done.
	 253  					b.cap[inst.Arg] = pos
	 254  				}
	 255  				pc = inst.Out
	 256  				goto CheckAndLoop
	 257  			}
	 258  
	 259  		case syntax.InstEmptyWidth:
	 260  			flag := i.context(pos)
	 261  			if !flag.match(syntax.EmptyOp(inst.Arg)) {
	 262  				continue
	 263  			}
	 264  			pc = inst.Out
	 265  			goto CheckAndLoop
	 266  
	 267  		case syntax.InstNop:
	 268  			pc = inst.Out
	 269  			goto CheckAndLoop
	 270  
	 271  		case syntax.InstMatch:
	 272  			// We found a match. If the caller doesn't care
	 273  			// where the match is, no point going further.
	 274  			if len(b.cap) == 0 {
	 275  				return true
	 276  			}
	 277  
	 278  			// Record best match so far.
	 279  			// Only need to check end point, because this entire
	 280  			// call is only considering one start position.
	 281  			if len(b.cap) > 1 {
	 282  				b.cap[1] = pos
	 283  			}
	 284  			if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) {
	 285  				copy(b.matchcap, b.cap)
	 286  			}
	 287  
	 288  			// If going for first match, we're done.
	 289  			if !longest {
	 290  				return true
	 291  			}
	 292  
	 293  			// If we used the entire text, no longer match is possible.
	 294  			if pos == b.end {
	 295  				return true
	 296  			}
	 297  
	 298  			// Otherwise, continue on in hope of a longer match.
	 299  			continue
	 300  		}
	 301  	}
	 302  
	 303  	return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0
	 304  }
	 305  
	 306  // backtrack runs a backtracking search of prog on the input starting at pos.
	 307  func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int {
	 308  	startCond := re.cond
	 309  	if startCond == ^syntax.EmptyOp(0) { // impossible
	 310  		return nil
	 311  	}
	 312  	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
	 313  		// Anchored match, past beginning of text.
	 314  		return nil
	 315  	}
	 316  
	 317  	b := newBitState()
	 318  	i, end := b.inputs.init(nil, ib, is)
	 319  	b.reset(re.prog, end, ncap)
	 320  
	 321  	// Anchored search must start at the beginning of the input
	 322  	if startCond&syntax.EmptyBeginText != 0 {
	 323  		if len(b.cap) > 0 {
	 324  			b.cap[0] = pos
	 325  		}
	 326  		if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
	 327  			freeBitState(b)
	 328  			return nil
	 329  		}
	 330  	} else {
	 331  
	 332  		// Unanchored search, starting from each possible text position.
	 333  		// Notice that we have to try the empty string at the end of
	 334  		// the text, so the loop condition is pos <= end, not pos < end.
	 335  		// This looks like it's quadratic in the size of the text,
	 336  		// but we are not clearing visited between calls to TrySearch,
	 337  		// so no work is duplicated and it ends up still being linear.
	 338  		width := -1
	 339  		for ; pos <= end && width != 0; pos += width {
	 340  			if len(re.prefix) > 0 {
	 341  				// Match requires literal prefix; fast search for it.
	 342  				advance := i.index(re, pos)
	 343  				if advance < 0 {
	 344  					freeBitState(b)
	 345  					return nil
	 346  				}
	 347  				pos += advance
	 348  			}
	 349  
	 350  			if len(b.cap) > 0 {
	 351  				b.cap[0] = pos
	 352  			}
	 353  			if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
	 354  				// Match must be leftmost; done.
	 355  				goto Match
	 356  			}
	 357  			_, width = i.step(pos)
	 358  		}
	 359  		freeBitState(b)
	 360  		return nil
	 361  	}
	 362  
	 363  Match:
	 364  	dstCap = append(dstCap, b.matchcap...)
	 365  	freeBitState(b)
	 366  	return dstCap
	 367  }
	 368  

View as plain text