suffixarray.go

Documentation: index/suffixarray

		 1  // Copyright 2010 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Package suffixarray implements substring search in logarithmic time using
		 6  // an in-memory suffix array.
		 7  //
		 8  // Example use:
		 9  //
		10  //	// create index for some data
		11  //	index := suffixarray.New(data)
		12  //
		13  //	// lookup byte slice s
		14  //	offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
		15  //	offsets2 := index.Lookup(s, 3)	// the list of at most 3 indices where s occurs in data
		16  //
		17  package suffixarray
		18  
		19  import (
		20  	"bytes"
		21  	"encoding/binary"
		22  	"errors"
		23  	"io"
		24  	"math"
		25  	"regexp"
		26  	"sort"
		27  )
		28  
		29  // Can change for testing
		30  var maxData32 int = realMaxData32
		31  
		32  const realMaxData32 = math.MaxInt32
		33  
		34  // Index implements a suffix array for fast substring search.
		35  type Index struct {
		36  	data []byte
		37  	sa	 ints // suffix array for data; sa.len() == len(data)
		38  }
		39  
		40  // An ints is either an []int32 or an []int64.
		41  // That is, one of them is empty, and one is the real data.
		42  // The int64 form is used when len(data) > maxData32
		43  type ints struct {
		44  	int32 []int32
		45  	int64 []int64
		46  }
		47  
		48  func (a *ints) len() int {
		49  	return len(a.int32) + len(a.int64)
		50  }
		51  
		52  func (a *ints) get(i int) int64 {
		53  	if a.int32 != nil {
		54  		return int64(a.int32[i])
		55  	}
		56  	return a.int64[i]
		57  }
		58  
		59  func (a *ints) set(i int, v int64) {
		60  	if a.int32 != nil {
		61  		a.int32[i] = int32(v)
		62  	} else {
		63  		a.int64[i] = v
		64  	}
		65  }
		66  
		67  func (a *ints) slice(i, j int) ints {
		68  	if a.int32 != nil {
		69  		return ints{a.int32[i:j], nil}
		70  	}
		71  	return ints{nil, a.int64[i:j]}
		72  }
		73  
		74  // New creates a new Index for data.
		75  // Index creation time is O(N) for N = len(data).
		76  func New(data []byte) *Index {
		77  	ix := &Index{data: data}
		78  	if len(data) <= maxData32 {
		79  		ix.sa.int32 = make([]int32, len(data))
		80  		text_32(data, ix.sa.int32)
		81  	} else {
		82  		ix.sa.int64 = make([]int64, len(data))
		83  		text_64(data, ix.sa.int64)
		84  	}
		85  	return ix
		86  }
		87  
		88  // writeInt writes an int x to w using buf to buffer the write.
		89  func writeInt(w io.Writer, buf []byte, x int) error {
		90  	binary.PutVarint(buf, int64(x))
		91  	_, err := w.Write(buf[0:binary.MaxVarintLen64])
		92  	return err
		93  }
		94  
		95  // readInt reads an int x from r using buf to buffer the read and returns x.
		96  func readInt(r io.Reader, buf []byte) (int64, error) {
		97  	_, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
		98  	x, _ := binary.Varint(buf)
		99  	return x, err
	 100  }
	 101  
	 102  // writeSlice writes data[:n] to w and returns n.
	 103  // It uses buf to buffer the write.
	 104  func writeSlice(w io.Writer, buf []byte, data ints) (n int, err error) {
	 105  	// encode as many elements as fit into buf
	 106  	p := binary.MaxVarintLen64
	 107  	m := data.len()
	 108  	for ; n < m && p+binary.MaxVarintLen64 <= len(buf); n++ {
	 109  		p += binary.PutUvarint(buf[p:], uint64(data.get(n)))
	 110  	}
	 111  
	 112  	// update buffer size
	 113  	binary.PutVarint(buf, int64(p))
	 114  
	 115  	// write buffer
	 116  	_, err = w.Write(buf[0:p])
	 117  	return
	 118  }
	 119  
	 120  var errTooBig = errors.New("suffixarray: data too large")
	 121  
	 122  // readSlice reads data[:n] from r and returns n.
	 123  // It uses buf to buffer the read.
	 124  func readSlice(r io.Reader, buf []byte, data ints) (n int, err error) {
	 125  	// read buffer size
	 126  	var size64 int64
	 127  	size64, err = readInt(r, buf)
	 128  	if err != nil {
	 129  		return
	 130  	}
	 131  	if int64(int(size64)) != size64 || int(size64) < 0 {
	 132  		// We never write chunks this big anyway.
	 133  		return 0, errTooBig
	 134  	}
	 135  	size := int(size64)
	 136  
	 137  	// read buffer w/o the size
	 138  	if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
	 139  		return
	 140  	}
	 141  
	 142  	// decode as many elements as present in buf
	 143  	for p := binary.MaxVarintLen64; p < size; n++ {
	 144  		x, w := binary.Uvarint(buf[p:])
	 145  		data.set(n, int64(x))
	 146  		p += w
	 147  	}
	 148  
	 149  	return
	 150  }
	 151  
	 152  const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
	 153  
	 154  // Read reads the index from r into x; x must not be nil.
	 155  func (x *Index) Read(r io.Reader) error {
	 156  	// buffer for all reads
	 157  	buf := make([]byte, bufSize)
	 158  
	 159  	// read length
	 160  	n64, err := readInt(r, buf)
	 161  	if err != nil {
	 162  		return err
	 163  	}
	 164  	if int64(int(n64)) != n64 || int(n64) < 0 {
	 165  		return errTooBig
	 166  	}
	 167  	n := int(n64)
	 168  
	 169  	// allocate space
	 170  	if 2*n < cap(x.data) || cap(x.data) < n || x.sa.int32 != nil && n > maxData32 || x.sa.int64 != nil && n <= maxData32 {
	 171  		// new data is significantly smaller or larger than
	 172  		// existing buffers - allocate new ones
	 173  		x.data = make([]byte, n)
	 174  		x.sa.int32 = nil
	 175  		x.sa.int64 = nil
	 176  		if n <= maxData32 {
	 177  			x.sa.int32 = make([]int32, n)
	 178  		} else {
	 179  			x.sa.int64 = make([]int64, n)
	 180  		}
	 181  	} else {
	 182  		// re-use existing buffers
	 183  		x.data = x.data[0:n]
	 184  		x.sa = x.sa.slice(0, n)
	 185  	}
	 186  
	 187  	// read data
	 188  	if _, err := io.ReadFull(r, x.data); err != nil {
	 189  		return err
	 190  	}
	 191  
	 192  	// read index
	 193  	sa := x.sa
	 194  	for sa.len() > 0 {
	 195  		n, err := readSlice(r, buf, sa)
	 196  		if err != nil {
	 197  			return err
	 198  		}
	 199  		sa = sa.slice(n, sa.len())
	 200  	}
	 201  	return nil
	 202  }
	 203  
	 204  // Write writes the index x to w.
	 205  func (x *Index) Write(w io.Writer) error {
	 206  	// buffer for all writes
	 207  	buf := make([]byte, bufSize)
	 208  
	 209  	// write length
	 210  	if err := writeInt(w, buf, len(x.data)); err != nil {
	 211  		return err
	 212  	}
	 213  
	 214  	// write data
	 215  	if _, err := w.Write(x.data); err != nil {
	 216  		return err
	 217  	}
	 218  
	 219  	// write index
	 220  	sa := x.sa
	 221  	for sa.len() > 0 {
	 222  		n, err := writeSlice(w, buf, sa)
	 223  		if err != nil {
	 224  			return err
	 225  		}
	 226  		sa = sa.slice(n, sa.len())
	 227  	}
	 228  	return nil
	 229  }
	 230  
	 231  // Bytes returns the data over which the index was created.
	 232  // It must not be modified.
	 233  //
	 234  func (x *Index) Bytes() []byte {
	 235  	return x.data
	 236  }
	 237  
	 238  func (x *Index) at(i int) []byte {
	 239  	return x.data[x.sa.get(i):]
	 240  }
	 241  
	 242  // lookupAll returns a slice into the matching region of the index.
	 243  // The runtime is O(log(N)*len(s)).
	 244  func (x *Index) lookupAll(s []byte) ints {
	 245  	// find matching suffix index range [i:j]
	 246  	// find the first index where s would be the prefix
	 247  	i := sort.Search(x.sa.len(), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
	 248  	// starting at i, find the first index at which s is not a prefix
	 249  	j := i + sort.Search(x.sa.len()-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
	 250  	return x.sa.slice(i, j)
	 251  }
	 252  
	 253  // Lookup returns an unsorted list of at most n indices where the byte string s
	 254  // occurs in the indexed data. If n < 0, all occurrences are returned.
	 255  // The result is nil if s is empty, s is not found, or n == 0.
	 256  // Lookup time is O(log(N)*len(s) + len(result)) where N is the
	 257  // size of the indexed data.
	 258  //
	 259  func (x *Index) Lookup(s []byte, n int) (result []int) {
	 260  	if len(s) > 0 && n != 0 {
	 261  		matches := x.lookupAll(s)
	 262  		count := matches.len()
	 263  		if n < 0 || count < n {
	 264  			n = count
	 265  		}
	 266  		// 0 <= n <= count
	 267  		if n > 0 {
	 268  			result = make([]int, n)
	 269  			if matches.int32 != nil {
	 270  				for i := range result {
	 271  					result[i] = int(matches.int32[i])
	 272  				}
	 273  			} else {
	 274  				for i := range result {
	 275  					result[i] = int(matches.int64[i])
	 276  				}
	 277  			}
	 278  		}
	 279  	}
	 280  	return
	 281  }
	 282  
	 283  // FindAllIndex returns a sorted list of non-overlapping matches of the
	 284  // regular expression r, where a match is a pair of indices specifying
	 285  // the matched slice of x.Bytes(). If n < 0, all matches are returned
	 286  // in successive order. Otherwise, at most n matches are returned and
	 287  // they may not be successive. The result is nil if there are no matches,
	 288  // or if n == 0.
	 289  //
	 290  func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
	 291  	// a non-empty literal prefix is used to determine possible
	 292  	// match start indices with Lookup
	 293  	prefix, complete := r.LiteralPrefix()
	 294  	lit := []byte(prefix)
	 295  
	 296  	// worst-case scenario: no literal prefix
	 297  	if prefix == "" {
	 298  		return r.FindAllIndex(x.data, n)
	 299  	}
	 300  
	 301  	// if regexp is a literal just use Lookup and convert its
	 302  	// result into match pairs
	 303  	if complete {
	 304  		// Lookup returns indices that may belong to overlapping matches.
	 305  		// After eliminating them, we may end up with fewer than n matches.
	 306  		// If we don't have enough at the end, redo the search with an
	 307  		// increased value n1, but only if Lookup returned all the requested
	 308  		// indices in the first place (if it returned fewer than that then
	 309  		// there cannot be more).
	 310  		for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
	 311  			indices := x.Lookup(lit, n1)
	 312  			if len(indices) == 0 {
	 313  				return
	 314  			}
	 315  			sort.Ints(indices)
	 316  			pairs := make([]int, 2*len(indices))
	 317  			result = make([][]int, len(indices))
	 318  			count := 0
	 319  			prev := 0
	 320  			for _, i := range indices {
	 321  				if count == n {
	 322  					break
	 323  				}
	 324  				// ignore indices leading to overlapping matches
	 325  				if prev <= i {
	 326  					j := 2 * count
	 327  					pairs[j+0] = i
	 328  					pairs[j+1] = i + len(lit)
	 329  					result[count] = pairs[j : j+2]
	 330  					count++
	 331  					prev = i + len(lit)
	 332  				}
	 333  			}
	 334  			result = result[0:count]
	 335  			if len(result) >= n || len(indices) != n1 {
	 336  				// found all matches or there's no chance to find more
	 337  				// (n and n1 can be negative)
	 338  				break
	 339  			}
	 340  		}
	 341  		if len(result) == 0 {
	 342  			result = nil
	 343  		}
	 344  		return
	 345  	}
	 346  
	 347  	// regexp has a non-empty literal prefix; Lookup(lit) computes
	 348  	// the indices of possible complete matches; use these as starting
	 349  	// points for anchored searches
	 350  	// (regexp "^" matches beginning of input, not beginning of line)
	 351  	r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
	 352  
	 353  	// same comment about Lookup applies here as in the loop above
	 354  	for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
	 355  		indices := x.Lookup(lit, n1)
	 356  		if len(indices) == 0 {
	 357  			return
	 358  		}
	 359  		sort.Ints(indices)
	 360  		result = result[0:0]
	 361  		prev := 0
	 362  		for _, i := range indices {
	 363  			if len(result) == n {
	 364  				break
	 365  			}
	 366  			m := r.FindIndex(x.data[i:]) // anchored search - will not run off
	 367  			// ignore indices leading to overlapping matches
	 368  			if m != nil && prev <= i {
	 369  				m[0] = i // correct m
	 370  				m[1] += i
	 371  				result = append(result, m)
	 372  				prev = m[1]
	 373  			}
	 374  		}
	 375  		if len(result) >= n || len(indices) != n1 {
	 376  			// found all matches or there's no chance to find more
	 377  			// (n and n1 can be negative)
	 378  			break
	 379  		}
	 380  	}
	 381  	if len(result) == 0 {
	 382  		result = nil
	 383  	}
	 384  	return
	 385  }
	 386
View as plain text