...

Source file src/regexp/onepass.go

Documentation: regexp

		 1  // Copyright 2014 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  package regexp
		 6  
		 7  import (
		 8  	"regexp/syntax"
		 9  	"sort"
		10  	"strings"
		11  	"unicode"
		12  )
		13  
		14  // "One-pass" regexp execution.
		15  // Some regexps can be analyzed to determine that they never need
		16  // backtracking: they are guaranteed to run in one pass over the string
		17  // without bothering to save all the usual NFA state.
		18  // Detect those and execute them more quickly.
		19  
		20  // A onePassProg is a compiled one-pass regular expression program.
		21  // It is the same as syntax.Prog except for the use of onePassInst.
		22  type onePassProg struct {
		23  	Inst	 []onePassInst
		24  	Start	int // index of start instruction
		25  	NumCap int // number of InstCapture insts in re
		26  }
		27  
		28  // A onePassInst is a single instruction in a one-pass regular expression program.
		29  // It is the same as syntax.Inst except for the new 'Next' field.
		30  type onePassInst struct {
		31  	syntax.Inst
		32  	Next []uint32
		33  }
		34  
		35  // OnePassPrefix returns a literal string that all matches for the
		36  // regexp must start with. Complete is true if the prefix
		37  // is the entire match. Pc is the index of the last rune instruction
		38  // in the string. The OnePassPrefix skips over the mandatory
		39  // EmptyBeginText
		40  func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
		41  	i := &p.Inst[p.Start]
		42  	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
		43  		return "", i.Op == syntax.InstMatch, uint32(p.Start)
		44  	}
		45  	pc = i.Out
		46  	i = &p.Inst[pc]
		47  	for i.Op == syntax.InstNop {
		48  		pc = i.Out
		49  		i = &p.Inst[pc]
		50  	}
		51  	// Avoid allocation of buffer if prefix is empty.
		52  	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
		53  		return "", i.Op == syntax.InstMatch, uint32(p.Start)
		54  	}
		55  
		56  	// Have prefix; gather characters.
		57  	var buf strings.Builder
		58  	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
		59  		buf.WriteRune(i.Rune[0])
		60  		pc, i = i.Out, &p.Inst[i.Out]
		61  	}
		62  	if i.Op == syntax.InstEmptyWidth &&
		63  		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
		64  		p.Inst[i.Out].Op == syntax.InstMatch {
		65  		complete = true
		66  	}
		67  	return buf.String(), complete, pc
		68  }
		69  
		70  // OnePassNext selects the next actionable state of the prog, based on the input character.
		71  // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
		72  // One of the alternates may ultimately lead without input to end of line. If the instruction
		73  // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
		74  func onePassNext(i *onePassInst, r rune) uint32 {
		75  	next := i.MatchRunePos(r)
		76  	if next >= 0 {
		77  		return i.Next[next]
		78  	}
		79  	if i.Op == syntax.InstAltMatch {
		80  		return i.Out
		81  	}
		82  	return 0
		83  }
		84  
		85  func iop(i *syntax.Inst) syntax.InstOp {
		86  	op := i.Op
		87  	switch op {
		88  	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		89  		op = syntax.InstRune
		90  	}
		91  	return op
		92  }
		93  
		94  // Sparse Array implementation is used as a queueOnePass.
		95  type queueOnePass struct {
		96  	sparse					[]uint32
		97  	dense					 []uint32
		98  	size, nextIndex uint32
		99  }
	 100  
	 101  func (q *queueOnePass) empty() bool {
	 102  	return q.nextIndex >= q.size
	 103  }
	 104  
	 105  func (q *queueOnePass) next() (n uint32) {
	 106  	n = q.dense[q.nextIndex]
	 107  	q.nextIndex++
	 108  	return
	 109  }
	 110  
	 111  func (q *queueOnePass) clear() {
	 112  	q.size = 0
	 113  	q.nextIndex = 0
	 114  }
	 115  
	 116  func (q *queueOnePass) contains(u uint32) bool {
	 117  	if u >= uint32(len(q.sparse)) {
	 118  		return false
	 119  	}
	 120  	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
	 121  }
	 122  
	 123  func (q *queueOnePass) insert(u uint32) {
	 124  	if !q.contains(u) {
	 125  		q.insertNew(u)
	 126  	}
	 127  }
	 128  
	 129  func (q *queueOnePass) insertNew(u uint32) {
	 130  	if u >= uint32(len(q.sparse)) {
	 131  		return
	 132  	}
	 133  	q.sparse[u] = q.size
	 134  	q.dense[q.size] = u
	 135  	q.size++
	 136  }
	 137  
	 138  func newQueue(size int) (q *queueOnePass) {
	 139  	return &queueOnePass{
	 140  		sparse: make([]uint32, size),
	 141  		dense:	make([]uint32, size),
	 142  	}
	 143  }
	 144  
	 145  // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
	 146  // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
	 147  // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
	 148  // NextIp array with the single element mergeFailed is returned.
	 149  // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
	 150  const mergeFailed = uint32(0xffffffff)
	 151  
	 152  var (
	 153  	noRune = []rune{}
	 154  	noNext = []uint32{mergeFailed}
	 155  )
	 156  
	 157  func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
	 158  	leftLen := len(*leftRunes)
	 159  	rightLen := len(*rightRunes)
	 160  	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
	 161  		panic("mergeRuneSets odd length []rune")
	 162  	}
	 163  	var (
	 164  		lx, rx int
	 165  	)
	 166  	merged := make([]rune, 0)
	 167  	next := make([]uint32, 0)
	 168  	ok := true
	 169  	defer func() {
	 170  		if !ok {
	 171  			merged = nil
	 172  			next = nil
	 173  		}
	 174  	}()
	 175  
	 176  	ix := -1
	 177  	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
	 178  		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
	 179  			return false
	 180  		}
	 181  		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
	 182  		*newLow += 2
	 183  		ix += 2
	 184  		next = append(next, pc)
	 185  		return true
	 186  	}
	 187  
	 188  	for lx < leftLen || rx < rightLen {
	 189  		switch {
	 190  		case rx >= rightLen:
	 191  			ok = extend(&lx, leftRunes, leftPC)
	 192  		case lx >= leftLen:
	 193  			ok = extend(&rx, rightRunes, rightPC)
	 194  		case (*rightRunes)[rx] < (*leftRunes)[lx]:
	 195  			ok = extend(&rx, rightRunes, rightPC)
	 196  		default:
	 197  			ok = extend(&lx, leftRunes, leftPC)
	 198  		}
	 199  		if !ok {
	 200  			return noRune, noNext
	 201  		}
	 202  	}
	 203  	return merged, next
	 204  }
	 205  
	 206  // cleanupOnePass drops working memory, and restores certain shortcut instructions.
	 207  func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
	 208  	for ix, instOriginal := range original.Inst {
	 209  		switch instOriginal.Op {
	 210  		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
	 211  		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
	 212  			prog.Inst[ix].Next = nil
	 213  		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
	 214  			prog.Inst[ix].Next = nil
	 215  			prog.Inst[ix] = onePassInst{Inst: instOriginal}
	 216  		}
	 217  	}
	 218  }
	 219  
	 220  // onePassCopy creates a copy of the original Prog, as we'll be modifying it
	 221  func onePassCopy(prog *syntax.Prog) *onePassProg {
	 222  	p := &onePassProg{
	 223  		Start:	prog.Start,
	 224  		NumCap: prog.NumCap,
	 225  		Inst:	 make([]onePassInst, len(prog.Inst)),
	 226  	}
	 227  	for i, inst := range prog.Inst {
	 228  		p.Inst[i] = onePassInst{Inst: inst}
	 229  	}
	 230  
	 231  	// rewrites one or more common Prog constructs that enable some otherwise
	 232  	// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
	 233  	// ip A, that points to ips B & C.
	 234  	// A:BC + B:DA => A:BC + B:CD
	 235  	// A:BC + B:DC => A:DC + B:DC
	 236  	for pc := range p.Inst {
	 237  		switch p.Inst[pc].Op {
	 238  		default:
	 239  			continue
	 240  		case syntax.InstAlt, syntax.InstAltMatch:
	 241  			// A:Bx + B:Ay
	 242  			p_A_Other := &p.Inst[pc].Out
	 243  			p_A_Alt := &p.Inst[pc].Arg
	 244  			// make sure a target is another Alt
	 245  			instAlt := p.Inst[*p_A_Alt]
	 246  			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
	 247  				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
	 248  				instAlt = p.Inst[*p_A_Alt]
	 249  				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
	 250  					continue
	 251  				}
	 252  			}
	 253  			instOther := p.Inst[*p_A_Other]
	 254  			// Analyzing both legs pointing to Alts is for another day
	 255  			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
	 256  				// too complicated
	 257  				continue
	 258  			}
	 259  			// simple empty transition loop
	 260  			// A:BC + B:DA => A:BC + B:DC
	 261  			p_B_Alt := &p.Inst[*p_A_Alt].Out
	 262  			p_B_Other := &p.Inst[*p_A_Alt].Arg
	 263  			patch := false
	 264  			if instAlt.Out == uint32(pc) {
	 265  				patch = true
	 266  			} else if instAlt.Arg == uint32(pc) {
	 267  				patch = true
	 268  				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
	 269  			}
	 270  			if patch {
	 271  				*p_B_Alt = *p_A_Other
	 272  			}
	 273  
	 274  			// empty transition to common target
	 275  			// A:BC + B:DC => A:DC + B:DC
	 276  			if *p_A_Other == *p_B_Alt {
	 277  				*p_A_Alt = *p_B_Other
	 278  			}
	 279  		}
	 280  	}
	 281  	return p
	 282  }
	 283  
	 284  // runeSlice exists to permit sorting the case-folded rune sets.
	 285  type runeSlice []rune
	 286  
	 287  func (p runeSlice) Len() int					 { return len(p) }
	 288  func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
	 289  func (p runeSlice) Swap(i, j int)			{ p[i], p[j] = p[j], p[i] }
	 290  
	 291  var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
	 292  var anyRune = []rune{0, unicode.MaxRune}
	 293  
	 294  // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
	 295  // the match engine can always tell which branch to take. The routine may modify
	 296  // p if it is turned into a onepass Prog. If it isn't possible for this to be a
	 297  // onepass Prog, the Prog nil is returned. makeOnePass is recursive
	 298  // to the size of the Prog.
	 299  func makeOnePass(p *onePassProg) *onePassProg {
	 300  	// If the machine is very long, it's not worth the time to check if we can use one pass.
	 301  	if len(p.Inst) >= 1000 {
	 302  		return nil
	 303  	}
	 304  
	 305  	var (
	 306  		instQueue		= newQueue(len(p.Inst))
	 307  		visitQueue	 = newQueue(len(p.Inst))
	 308  		check				func(uint32, []bool) bool
	 309  		onePassRunes = make([][]rune, len(p.Inst))
	 310  	)
	 311  
	 312  	// check that paths from Alt instructions are unambiguous, and rebuild the new
	 313  	// program as a onepass program
	 314  	check = func(pc uint32, m []bool) (ok bool) {
	 315  		ok = true
	 316  		inst := &p.Inst[pc]
	 317  		if visitQueue.contains(pc) {
	 318  			return
	 319  		}
	 320  		visitQueue.insert(pc)
	 321  		switch inst.Op {
	 322  		case syntax.InstAlt, syntax.InstAltMatch:
	 323  			ok = check(inst.Out, m) && check(inst.Arg, m)
	 324  			// check no-input paths to InstMatch
	 325  			matchOut := m[inst.Out]
	 326  			matchArg := m[inst.Arg]
	 327  			if matchOut && matchArg {
	 328  				ok = false
	 329  				break
	 330  			}
	 331  			// Match on empty goes in inst.Out
	 332  			if matchArg {
	 333  				inst.Out, inst.Arg = inst.Arg, inst.Out
	 334  				matchOut, matchArg = matchArg, matchOut
	 335  			}
	 336  			if matchOut {
	 337  				m[pc] = true
	 338  				inst.Op = syntax.InstAltMatch
	 339  			}
	 340  
	 341  			// build a dispatch operator from the two legs of the alt.
	 342  			onePassRunes[pc], inst.Next = mergeRuneSets(
	 343  				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
	 344  			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
	 345  				ok = false
	 346  				break
	 347  			}
	 348  		case syntax.InstCapture, syntax.InstNop:
	 349  			ok = check(inst.Out, m)
	 350  			m[pc] = m[inst.Out]
	 351  			// pass matching runes back through these no-ops.
	 352  			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
	 353  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
	 354  			for i := range inst.Next {
	 355  				inst.Next[i] = inst.Out
	 356  			}
	 357  		case syntax.InstEmptyWidth:
	 358  			ok = check(inst.Out, m)
	 359  			m[pc] = m[inst.Out]
	 360  			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
	 361  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
	 362  			for i := range inst.Next {
	 363  				inst.Next[i] = inst.Out
	 364  			}
	 365  		case syntax.InstMatch, syntax.InstFail:
	 366  			m[pc] = inst.Op == syntax.InstMatch
	 367  		case syntax.InstRune:
	 368  			m[pc] = false
	 369  			if len(inst.Next) > 0 {
	 370  				break
	 371  			}
	 372  			instQueue.insert(inst.Out)
	 373  			if len(inst.Rune) == 0 {
	 374  				onePassRunes[pc] = []rune{}
	 375  				inst.Next = []uint32{inst.Out}
	 376  				break
	 377  			}
	 378  			runes := make([]rune, 0)
	 379  			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
	 380  				r0 := inst.Rune[0]
	 381  				runes = append(runes, r0, r0)
	 382  				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
	 383  					runes = append(runes, r1, r1)
	 384  				}
	 385  				sort.Sort(runeSlice(runes))
	 386  			} else {
	 387  				runes = append(runes, inst.Rune...)
	 388  			}
	 389  			onePassRunes[pc] = runes
	 390  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
	 391  			for i := range inst.Next {
	 392  				inst.Next[i] = inst.Out
	 393  			}
	 394  			inst.Op = syntax.InstRune
	 395  		case syntax.InstRune1:
	 396  			m[pc] = false
	 397  			if len(inst.Next) > 0 {
	 398  				break
	 399  			}
	 400  			instQueue.insert(inst.Out)
	 401  			runes := []rune{}
	 402  			// expand case-folded runes
	 403  			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
	 404  				r0 := inst.Rune[0]
	 405  				runes = append(runes, r0, r0)
	 406  				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
	 407  					runes = append(runes, r1, r1)
	 408  				}
	 409  				sort.Sort(runeSlice(runes))
	 410  			} else {
	 411  				runes = append(runes, inst.Rune[0], inst.Rune[0])
	 412  			}
	 413  			onePassRunes[pc] = runes
	 414  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
	 415  			for i := range inst.Next {
	 416  				inst.Next[i] = inst.Out
	 417  			}
	 418  			inst.Op = syntax.InstRune
	 419  		case syntax.InstRuneAny:
	 420  			m[pc] = false
	 421  			if len(inst.Next) > 0 {
	 422  				break
	 423  			}
	 424  			instQueue.insert(inst.Out)
	 425  			onePassRunes[pc] = append([]rune{}, anyRune...)
	 426  			inst.Next = []uint32{inst.Out}
	 427  		case syntax.InstRuneAnyNotNL:
	 428  			m[pc] = false
	 429  			if len(inst.Next) > 0 {
	 430  				break
	 431  			}
	 432  			instQueue.insert(inst.Out)
	 433  			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
	 434  			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
	 435  			for i := range inst.Next {
	 436  				inst.Next[i] = inst.Out
	 437  			}
	 438  		}
	 439  		return
	 440  	}
	 441  
	 442  	instQueue.clear()
	 443  	instQueue.insert(uint32(p.Start))
	 444  	m := make([]bool, len(p.Inst))
	 445  	for !instQueue.empty() {
	 446  		visitQueue.clear()
	 447  		pc := instQueue.next()
	 448  		if !check(pc, m) {
	 449  			p = nil
	 450  			break
	 451  		}
	 452  	}
	 453  	if p != nil {
	 454  		for i := range p.Inst {
	 455  			p.Inst[i].Rune = onePassRunes[i]
	 456  		}
	 457  	}
	 458  	return p
	 459  }
	 460  
	 461  // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
	 462  // can be recharacterized as a one-pass regexp program, or syntax.nil if the
	 463  // Prog cannot be converted. For a one pass prog, the fundamental condition that must
	 464  // be true is: at any InstAlt, there must be no ambiguity about what branch to	take.
	 465  func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
	 466  	if prog.Start == 0 {
	 467  		return nil
	 468  	}
	 469  	// onepass regexp is anchored
	 470  	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
	 471  		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
	 472  		return nil
	 473  	}
	 474  	// every instruction leading to InstMatch must be EmptyEndText
	 475  	for _, inst := range prog.Inst {
	 476  		opOut := prog.Inst[inst.Out].Op
	 477  		switch inst.Op {
	 478  		default:
	 479  			if opOut == syntax.InstMatch {
	 480  				return nil
	 481  			}
	 482  		case syntax.InstAlt, syntax.InstAltMatch:
	 483  			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
	 484  				return nil
	 485  			}
	 486  		case syntax.InstEmptyWidth:
	 487  			if opOut == syntax.InstMatch {
	 488  				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
	 489  					continue
	 490  				}
	 491  				return nil
	 492  			}
	 493  		}
	 494  	}
	 495  	// Creates a slightly optimized copy of the original Prog
	 496  	// that cleans up some Prog idioms that block valid onepass programs
	 497  	p = onePassCopy(prog)
	 498  
	 499  	// checkAmbiguity on InstAlts, build onepass Prog if possible
	 500  	p = makeOnePass(p)
	 501  
	 502  	if p != nil {
	 503  		cleanupOnePass(p, prog)
	 504  	}
	 505  	return p
	 506  }
	 507  

View as plain text