...

Source file src/strings/search.go

Documentation: strings

		 1  // Copyright 2012 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  package strings
		 6  
		 7  // stringFinder efficiently finds strings in a source text. It's implemented
		 8  // using the Boyer-Moore string search algorithm:
		 9  // https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
		10  // https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
		11  // document uses 1-based indexing)
		12  type stringFinder struct {
		13  	// pattern is the string that we are searching for in the text.
		14  	pattern string
		15  
		16  	// badCharSkip[b] contains the distance between the last byte of pattern
		17  	// and the rightmost occurrence of b in pattern. If b is not in pattern,
		18  	// badCharSkip[b] is len(pattern).
		19  	//
		20  	// Whenever a mismatch is found with byte b in the text, we can safely
		21  	// shift the matching frame at least badCharSkip[b] until the next time
		22  	// the matching char could be in alignment.
		23  	badCharSkip [256]int
		24  
		25  	// goodSuffixSkip[i] defines how far we can shift the matching frame given
		26  	// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
		27  	// not. There are two cases to consider:
		28  	//
		29  	// 1. The matched suffix occurs elsewhere in pattern (with a different
		30  	// byte preceding it that we might possibly match). In this case, we can
		31  	// shift the matching frame to align with the next suffix chunk. For
		32  	// example, the pattern "mississi" has the suffix "issi" next occurring
		33  	// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
		34  	// shift+len(suffix) == 3+4 == 7.
		35  	//
		36  	// 2. If the matched suffix does not occur elsewhere in pattern, then the
		37  	// matching frame may share part of its prefix with the end of the
		38  	// matching suffix. In this case, goodSuffixSkip[i] will contain how far
		39  	// to shift the frame to align this portion of the prefix to the
		40  	// suffix. For example, in the pattern "abcxxxabc", when the first
		41  	// mismatch from the back is found to be in position 3, the matching
		42  	// suffix "xxabc" is not found elsewhere in the pattern. However, its
		43  	// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
		44  	// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
		45  	goodSuffixSkip []int
		46  }
		47  
		48  func makeStringFinder(pattern string) *stringFinder {
		49  	f := &stringFinder{
		50  		pattern:				pattern,
		51  		goodSuffixSkip: make([]int, len(pattern)),
		52  	}
		53  	// last is the index of the last character in the pattern.
		54  	last := len(pattern) - 1
		55  
		56  	// Build bad character table.
		57  	// Bytes not in the pattern can skip one pattern's length.
		58  	for i := range f.badCharSkip {
		59  		f.badCharSkip[i] = len(pattern)
		60  	}
		61  	// The loop condition is < instead of <= so that the last byte does not
		62  	// have a zero distance to itself. Finding this byte out of place implies
		63  	// that it is not in the last position.
		64  	for i := 0; i < last; i++ {
		65  		f.badCharSkip[pattern[i]] = last - i
		66  	}
		67  
		68  	// Build good suffix table.
		69  	// First pass: set each value to the next index which starts a prefix of
		70  	// pattern.
		71  	lastPrefix := last
		72  	for i := last; i >= 0; i-- {
		73  		if HasPrefix(pattern, pattern[i+1:]) {
		74  			lastPrefix = i + 1
		75  		}
		76  		// lastPrefix is the shift, and (last-i) is len(suffix).
		77  		f.goodSuffixSkip[i] = lastPrefix + last - i
		78  	}
		79  	// Second pass: find repeats of pattern's suffix starting from the front.
		80  	for i := 0; i < last; i++ {
		81  		lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
		82  		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
		83  			// (last-i) is the shift, and lenSuffix is len(suffix).
		84  			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
		85  		}
		86  	}
		87  
		88  	return f
		89  }
		90  
		91  func longestCommonSuffix(a, b string) (i int) {
		92  	for ; i < len(a) && i < len(b); i++ {
		93  		if a[len(a)-1-i] != b[len(b)-1-i] {
		94  			break
		95  		}
		96  	}
		97  	return
		98  }
		99  
	 100  // next returns the index in text of the first occurrence of the pattern. If
	 101  // the pattern is not found, it returns -1.
	 102  func (f *stringFinder) next(text string) int {
	 103  	i := len(f.pattern) - 1
	 104  	for i < len(text) {
	 105  		// Compare backwards from the end until the first unmatching character.
	 106  		j := len(f.pattern) - 1
	 107  		for j >= 0 && text[i] == f.pattern[j] {
	 108  			i--
	 109  			j--
	 110  		}
	 111  		if j < 0 {
	 112  			return i + 1 // match
	 113  		}
	 114  		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
	 115  	}
	 116  	return -1
	 117  }
	 118  
	 119  func max(a, b int) int {
	 120  	if a > b {
	 121  		return a
	 122  	}
	 123  	return b
	 124  }
	 125  

View as plain text