...

Source file src/index/suffixarray/sais.go

Documentation: index/suffixarray

		 1  // Copyright 2019 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Suffix array construction by induced sorting (SAIS).
		 6  // See Ge Nong, Sen Zhang, and Wai Hong Chen,
		 7  // "Two Efficient Algorithms for Linear Time Suffix Array Construction",
		 8  // especially section 3 (https://ieeexplore.ieee.org/document/5582081).
		 9  // See also http://zork.net/~st/jottings/sais.html.
		10  //
		11  // With optimizations inspired by Yuta Mori's sais-lite
		12  // (https://sites.google.com/site/yuta256/sais).
		13  //
		14  // And with other new optimizations.
		15  
		16  // Many of these functions are parameterized by the sizes of
		17  // the types they operate on. The generator gen.go makes
		18  // copies of these functions for use with other sizes.
		19  // Specifically:
		20  //
		21  // - A function with a name ending in _8_32 takes []byte and []int32 arguments
		22  //	 and is duplicated into _32_32, _8_64, and _64_64 forms.
		23  //	 The _32_32 and _64_64_ suffixes are shortened to plain _32 and _64.
		24  //	 Any lines in the function body that contain the text "byte-only" or "256"
		25  //	 are stripped when creating _32_32 and _64_64 forms.
		26  //	 (Those lines are typically 8-bit-specific optimizations.)
		27  //
		28  // - A function with a name ending only in _32 operates on []int32
		29  //	 and is duplicated into a _64 form. (Note that it may still take a []byte,
		30  //	 but there is no need for a version of the function in which the []byte
		31  //	 is widened to a full integer array.)
		32  
		33  // The overall runtime of this code is linear in the input size:
		34  // it runs a sequence of linear passes to reduce the problem to
		35  // a subproblem at most half as big, invokes itself recursively,
		36  // and then runs a sequence of linear passes to turn the answer
		37  // for the subproblem into the answer for the original problem.
		38  // This gives T(N) = O(N) + T(N/2) = O(N) + O(N/2) + O(N/4) + ... = O(N).
		39  //
		40  // The outline of the code, with the forward and backward scans
		41  // through O(N)-sized arrays called out, is:
		42  //
		43  // sais_I_N
		44  //	placeLMS_I_B
		45  //		bucketMax_I_B
		46  //			freq_I_B
		47  //				<scan +text> (1)
		48  //			<scan +freq> (2)
		49  //		<scan -text, random bucket> (3)
		50  //	induceSubL_I_B
		51  //		bucketMin_I_B
		52  //			freq_I_B
		53  //				<scan +text, often optimized away> (4)
		54  //			<scan +freq> (5)
		55  //		<scan +sa, random text, random bucket> (6)
		56  //	induceSubS_I_B
		57  //		bucketMax_I_B
		58  //			freq_I_B
		59  //				<scan +text, often optimized away> (7)
		60  //			<scan +freq> (8)
		61  //		<scan -sa, random text, random bucket> (9)
		62  //	assignID_I_B
		63  //		<scan +sa, random text substrings> (10)
		64  //	map_B
		65  //		<scan -sa> (11)
		66  //	recurse_B
		67  //		(recursive call to sais_B_B for a subproblem of size at most 1/2 input, often much smaller)
		68  //	unmap_I_B
		69  //		<scan -text> (12)
		70  //		<scan +sa> (13)
		71  //	expand_I_B
		72  //		bucketMax_I_B
		73  //			freq_I_B
		74  //				<scan +text, often optimized away> (14)
		75  //			<scan +freq> (15)
		76  //		<scan -sa, random text, random bucket> (16)
		77  //	induceL_I_B
		78  //		bucketMin_I_B
		79  //			freq_I_B
		80  //				<scan +text, often optimized away> (17)
		81  //			<scan +freq> (18)
		82  //		<scan +sa, random text, random bucket> (19)
		83  //	induceS_I_B
		84  //		bucketMax_I_B
		85  //			freq_I_B
		86  //				<scan +text, often optimized away> (20)
		87  //			<scan +freq> (21)
		88  //		<scan -sa, random text, random bucket> (22)
		89  //
		90  // Here, _B indicates the suffix array size (_32 or _64) and _I the input size (_8 or _B).
		91  //
		92  // The outline shows there are in general 22 scans through
		93  // O(N)-sized arrays for a given level of the recursion.
		94  // In the top level, operating on 8-bit input text,
		95  // the six freq scans are fixed size (256) instead of potentially
		96  // input-sized. Also, the frequency is counted once and cached
		97  // whenever there is room to do so (there is nearly always room in general,
		98  // and always room at the top level), which eliminates all but
		99  // the first freq_I_B text scans (that is, 5 of the 6).
	 100  // So the top level of the recursion only does 22 - 6 - 5 = 11
	 101  // input-sized scans and a typical level does 16 scans.
	 102  //
	 103  // The linear scans do not cost anywhere near as much as
	 104  // the random accesses to the text made during a few of
	 105  // the scans (specifically #6, #9, #16, #19, #22 marked above).
	 106  // In real texts, there is not much but some locality to
	 107  // the accesses, due to the repetitive structure of the text
	 108  // (the same reason Burrows-Wheeler compression is so effective).
	 109  // For random inputs, there is no locality, which makes those
	 110  // accesses even more expensive, especially once the text
	 111  // no longer fits in cache.
	 112  // For example, running on 50 MB of Go source code, induceSubL_8_32
	 113  // (which runs only once, at the top level of the recursion)
	 114  // takes 0.44s, while on 50 MB of random input, it takes 2.55s.
	 115  // Nearly all the relative slowdown is explained by the text access:
	 116  //
	 117  //		c0, c1 := text[k-1], text[k]
	 118  //
	 119  // That line runs for 0.23s on the Go text and 2.02s on random text.
	 120  
	 121  //go:generate go run gen.go
	 122  
	 123  package suffixarray
	 124  
	 125  // text_32 returns the suffix array for the input text.
	 126  // It requires that len(text) fit in an int32
	 127  // and that the caller zero sa.
	 128  func text_32(text []byte, sa []int32) {
	 129  	if int(int32(len(text))) != len(text) || len(text) != len(sa) {
	 130  		panic("suffixarray: misuse of text_32")
	 131  	}
	 132  	sais_8_32(text, 256, sa, make([]int32, 2*256))
	 133  }
	 134  
	 135  // sais_8_32 computes the suffix array of text.
	 136  // The text must contain only values in [0, textMax).
	 137  // The suffix array is stored in sa, which the caller
	 138  // must ensure is already zeroed.
	 139  // The caller must also provide temporary space tmp
	 140  // with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax
	 141  // then the algorithm runs a little faster.
	 142  // If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return.
	 143  func sais_8_32(text []byte, textMax int, sa, tmp []int32) {
	 144  	if len(sa) != len(text) || len(tmp) < int(textMax) {
	 145  		panic("suffixarray: misuse of sais_8_32")
	 146  	}
	 147  
	 148  	// Trivial base cases. Sorting 0 or 1 things is easy.
	 149  	if len(text) == 0 {
	 150  		return
	 151  	}
	 152  	if len(text) == 1 {
	 153  		sa[0] = 0
	 154  		return
	 155  	}
	 156  
	 157  	// Establish slices indexed by text character
	 158  	// holding character frequency and bucket-sort offsets.
	 159  	// If there's only enough tmp for one slice,
	 160  	// we make it the bucket offsets and recompute
	 161  	// the character frequency each time we need it.
	 162  	var freq, bucket []int32
	 163  	if len(tmp) >= 2*textMax {
	 164  		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
	 165  		freq[0] = -1 // mark as uninitialized
	 166  	} else {
	 167  		freq, bucket = nil, tmp[:textMax]
	 168  	}
	 169  
	 170  	// The SAIS algorithm.
	 171  	// Each of these calls makes one scan through sa.
	 172  	// See the individual functions for documentation
	 173  	// about each's role in the algorithm.
	 174  	numLMS := placeLMS_8_32(text, sa, freq, bucket)
	 175  	if numLMS <= 1 {
	 176  		// 0 or 1 items are already sorted. Do nothing.
	 177  	} else {
	 178  		induceSubL_8_32(text, sa, freq, bucket)
	 179  		induceSubS_8_32(text, sa, freq, bucket)
	 180  		length_8_32(text, sa, numLMS)
	 181  		maxID := assignID_8_32(text, sa, numLMS)
	 182  		if maxID < numLMS {
	 183  			map_32(sa, numLMS)
	 184  			recurse_32(sa, tmp, numLMS, maxID)
	 185  			unmap_8_32(text, sa, numLMS)
	 186  		} else {
	 187  			// If maxID == numLMS, then each LMS-substring
	 188  			// is unique, so the relative ordering of two LMS-suffixes
	 189  			// is determined by just the leading LMS-substring.
	 190  			// That is, the LMS-suffix sort order matches the
	 191  			// (simpler) LMS-substring sort order.
	 192  			// Copy the original LMS-substring order into the
	 193  			// suffix array destination.
	 194  			copy(sa, sa[len(sa)-numLMS:])
	 195  		}
	 196  		expand_8_32(text, freq, bucket, sa, numLMS)
	 197  	}
	 198  	induceL_8_32(text, sa, freq, bucket)
	 199  	induceS_8_32(text, sa, freq, bucket)
	 200  
	 201  	// Mark for caller that we overwrote tmp.
	 202  	tmp[0] = -1
	 203  }
	 204  
	 205  // freq_8_32 returns the character frequencies
	 206  // for text, as a slice indexed by character value.
	 207  // If freq is nil, freq_8_32 uses and returns bucket.
	 208  // If freq is non-nil, freq_8_32 assumes that freq[0] >= 0
	 209  // means the frequencies are already computed.
	 210  // If the frequency data is overwritten or uninitialized,
	 211  // the caller must set freq[0] = -1 to force recomputation
	 212  // the next time it is needed.
	 213  func freq_8_32(text []byte, freq, bucket []int32) []int32 {
	 214  	if freq != nil && freq[0] >= 0 {
	 215  		return freq // already computed
	 216  	}
	 217  	if freq == nil {
	 218  		freq = bucket
	 219  	}
	 220  
	 221  	freq = freq[:256] // eliminate bounds check for freq[c] below
	 222  	for i := range freq {
	 223  		freq[i] = 0
	 224  	}
	 225  	for _, c := range text {
	 226  		freq[c]++
	 227  	}
	 228  	return freq
	 229  }
	 230  
	 231  // bucketMin_8_32 stores into bucket[c] the minimum index
	 232  // in the bucket for character c in a bucket-sort of text.
	 233  func bucketMin_8_32(text []byte, freq, bucket []int32) {
	 234  	freq = freq_8_32(text, freq, bucket)
	 235  	freq = freq[:256]		 // establish len(freq) = 256, so 0 ≤ i < 256 below
	 236  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
	 237  	total := int32(0)
	 238  	for i, n := range freq {
	 239  		bucket[i] = total
	 240  		total += n
	 241  	}
	 242  }
	 243  
	 244  // bucketMax_8_32 stores into bucket[c] the maximum index
	 245  // in the bucket for character c in a bucket-sort of text.
	 246  // The bucket indexes for c are [min, max).
	 247  // That is, max is one past the final index in that bucket.
	 248  func bucketMax_8_32(text []byte, freq, bucket []int32) {
	 249  	freq = freq_8_32(text, freq, bucket)
	 250  	freq = freq[:256]		 // establish len(freq) = 256, so 0 ≤ i < 256 below
	 251  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
	 252  	total := int32(0)
	 253  	for i, n := range freq {
	 254  		total += n
	 255  		bucket[i] = total
	 256  	}
	 257  }
	 258  
	 259  // The SAIS algorithm proceeds in a sequence of scans through sa.
	 260  // Each of the following functions implements one scan,
	 261  // and the functions appear here in the order they execute in the algorithm.
	 262  
	 263  // placeLMS_8_32 places into sa the indexes of the
	 264  // final characters of the LMS substrings of text,
	 265  // sorted into the rightmost ends of their correct buckets
	 266  // in the suffix array.
	 267  //
	 268  // The imaginary sentinel character at the end of the text
	 269  // is the final character of the final LMS substring, but there
	 270  // is no bucket for the imaginary sentinel character,
	 271  // which has a smaller value than any real character.
	 272  // The caller must therefore pretend that sa[-1] == len(text).
	 273  //
	 274  // The text indexes of LMS-substring characters are always ≥ 1
	 275  // (the first LMS-substring must be preceded by one or more L-type
	 276  // characters that are not part of any LMS-substring),
	 277  // so using 0 as a “not present” suffix array entry is safe,
	 278  // both in this function and in most later functions
	 279  // (until induceL_8_32 below).
	 280  func placeLMS_8_32(text []byte, sa, freq, bucket []int32) int {
	 281  	bucketMax_8_32(text, freq, bucket)
	 282  
	 283  	numLMS := 0
	 284  	lastB := int32(-1)
	 285  	bucket = bucket[:256] // eliminate bounds check for bucket[c1] below
	 286  
	 287  	// The next stanza of code (until the blank line) loop backward
	 288  	// over text, stopping to execute a code body at each position i
	 289  	// such that text[i] is an L-character and text[i+1] is an S-character.
	 290  	// That is, i+1 is the position of the start of an LMS-substring.
	 291  	// These could be hoisted out into a function with a callback,
	 292  	// but at a significant speed cost. Instead, we just write these
	 293  	// seven lines a few times in this source file. The copies below
	 294  	// refer back to the pattern established by this original as the
	 295  	// "LMS-substring iterator".
	 296  	//
	 297  	// In every scan through the text, c0, c1 are successive characters of text.
	 298  	// In this backward scan, c0 == text[i] and c1 == text[i+1].
	 299  	// By scanning backward, we can keep track of whether the current
	 300  	// position is type-S or type-L according to the usual definition:
	 301  	//
	 302  	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
	 303  	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
	 304  	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
	 305  	//
	 306  	// The backward scan lets us maintain the current type,
	 307  	// update it when we see c0 != c1, and otherwise leave it alone.
	 308  	// We want to identify all S positions with a preceding L.
	 309  	// Position len(text) is one such position by definition, but we have
	 310  	// nowhere to write it down, so we eliminate it by untruthfully
	 311  	// setting isTypeS = false at the start of the loop.
	 312  	c0, c1, isTypeS := byte(0), byte(0), false
	 313  	for i := len(text) - 1; i >= 0; i-- {
	 314  		c0, c1 = text[i], c0
	 315  		if c0 < c1 {
	 316  			isTypeS = true
	 317  		} else if c0 > c1 && isTypeS {
	 318  			isTypeS = false
	 319  
	 320  			// Bucket the index i+1 for the start of an LMS-substring.
	 321  			b := bucket[c1] - 1
	 322  			bucket[c1] = b
	 323  			sa[b] = int32(i + 1)
	 324  			lastB = b
	 325  			numLMS++
	 326  		}
	 327  	}
	 328  
	 329  	// We recorded the LMS-substring starts but really want the ends.
	 330  	// Luckily, with two differences, the start indexes and the end indexes are the same.
	 331  	// The first difference is that the rightmost LMS-substring's end index is len(text),
	 332  	// so the caller must pretend that sa[-1] == len(text), as noted above.
	 333  	// The second difference is that the first leftmost LMS-substring start index
	 334  	// does not end an earlier LMS-substring, so as an optimization we can omit
	 335  	// that leftmost LMS-substring start index (the last one we wrote).
	 336  	//
	 337  	// Exception: if numLMS <= 1, the caller is not going to bother with
	 338  	// the recursion at all and will treat the result as containing LMS-substring starts.
	 339  	// In that case, we don't remove the final entry.
	 340  	if numLMS > 1 {
	 341  		sa[lastB] = 0
	 342  	}
	 343  	return numLMS
	 344  }
	 345  
	 346  // induceSubL_8_32 inserts the L-type text indexes of LMS-substrings
	 347  // into sa, assuming that the final characters of the LMS-substrings
	 348  // are already inserted into sa, sorted by final character, and at the
	 349  // right (not left) end of the corresponding character bucket.
	 350  // Each LMS-substring has the form (as a regexp) /S+L+S/:
	 351  // one or more S-type, one or more L-type, final S-type.
	 352  // induceSubL_8_32 leaves behind only the leftmost L-type text
	 353  // index for each LMS-substring. That is, it removes the final S-type
	 354  // indexes that are present on entry, and it inserts but then removes
	 355  // the interior L-type indexes too.
	 356  // (Only the leftmost L-type index is needed by induceSubS_8_32.)
	 357  func induceSubL_8_32(text []byte, sa, freq, bucket []int32) {
	 358  	// Initialize positions for left side of character buckets.
	 359  	bucketMin_8_32(text, freq, bucket)
	 360  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
	 361  
	 362  	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
	 363  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
	 364  	// Because j-1 is type L, inserting it into sa now will sort it correctly.
	 365  	// But we want to distinguish a j-1 with j-2 of type L from type S.
	 366  	// We can process the former but want to leave the latter for the caller.
	 367  	// We record the difference by negating j-1 if it is preceded by type S.
	 368  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
	 369  	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
	 370  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
	 371  	// and so on, in sorted but not necessarily adjacent order, until it finds
	 372  	// one preceded by an index of type S, at which point it must stop.
	 373  	//
	 374  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
	 375  	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
	 376  	// only the indexes of the leftmost L-type indexes for each LMS-substring.
	 377  	//
	 378  	// The suffix array sa therefore serves simultaneously as input, output,
	 379  	// and a miraculously well-tailored work queue.
	 380  
	 381  	// placeLMS_8_32 left out the implicit entry sa[-1] == len(text),
	 382  	// corresponding to the identified type-L index len(text)-1.
	 383  	// Process it before the left-to-right scan of sa proper.
	 384  	// See body in loop for commentary.
	 385  	k := len(text) - 1
	 386  	c0, c1 := text[k-1], text[k]
	 387  	if c0 < c1 {
	 388  		k = -k
	 389  	}
	 390  
	 391  	// Cache recently used bucket index:
	 392  	// we're processing suffixes in sorted order
	 393  	// and accessing buckets indexed by the
	 394  	// byte before the sorted order, which still
	 395  	// has very good locality.
	 396  	// Invariant: b is cached, possibly dirty copy of bucket[cB].
	 397  	cB := c1
	 398  	b := bucket[cB]
	 399  	sa[b] = int32(k)
	 400  	b++
	 401  
	 402  	for i := 0; i < len(sa); i++ {
	 403  		j := int(sa[i])
	 404  		if j == 0 {
	 405  			// Skip empty entry.
	 406  			continue
	 407  		}
	 408  		if j < 0 {
	 409  			// Leave discovered type-S index for caller.
	 410  			sa[i] = int32(-j)
	 411  			continue
	 412  		}
	 413  		sa[i] = 0
	 414  
	 415  		// Index j was on work queue, meaning k := j-1 is L-type,
	 416  		// so we can now place k correctly into sa.
	 417  		// If k-1 is L-type, queue k for processing later in this loop.
	 418  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
	 419  		k := j - 1
	 420  		c0, c1 := text[k-1], text[k]
	 421  		if c0 < c1 {
	 422  			k = -k
	 423  		}
	 424  
	 425  		if cB != c1 {
	 426  			bucket[cB] = b
	 427  			cB = c1
	 428  			b = bucket[cB]
	 429  		}
	 430  		sa[b] = int32(k)
	 431  		b++
	 432  	}
	 433  }
	 434  
	 435  // induceSubS_8_32 inserts the S-type text indexes of LMS-substrings
	 436  // into sa, assuming that the leftmost L-type text indexes are already
	 437  // inserted into sa, sorted by LMS-substring suffix, and at the
	 438  // left end of the corresponding character bucket.
	 439  // Each LMS-substring has the form (as a regexp) /S+L+S/:
	 440  // one or more S-type, one or more L-type, final S-type.
	 441  // induceSubS_8_32 leaves behind only the leftmost S-type text
	 442  // index for each LMS-substring, in sorted order, at the right end of sa.
	 443  // That is, it removes the L-type indexes that are present on entry,
	 444  // and it inserts but then removes the interior S-type indexes too,
	 445  // leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:].
	 446  // (Only the LMS-substring start indexes are processed by the recursion.)
	 447  func induceSubS_8_32(text []byte, sa, freq, bucket []int32) {
	 448  	// Initialize positions for right side of character buckets.
	 449  	bucketMax_8_32(text, freq, bucket)
	 450  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
	 451  
	 452  	// Analogous to induceSubL_8_32 above,
	 453  	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
	 454  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
	 455  	// Because j-1 is type S, inserting it into sa now will sort it correctly.
	 456  	// But we want to distinguish a j-1 with j-2 of type S from type L.
	 457  	// We can process the former but want to leave the latter for the caller.
	 458  	// We record the difference by negating j-1 if it is preceded by type L.
	 459  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
	 460  	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
	 461  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
	 462  	// and so on, in sorted but not necessarily adjacent order, until it finds
	 463  	// one preceded by an index of type L, at which point it must stop.
	 464  	// That index (preceded by one of type L) is an LMS-substring start.
	 465  	//
	 466  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
	 467  	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
	 468  	// so that the loop finishes with the top of sa containing exactly
	 469  	// the LMS-substring start indexes, sorted by LMS-substring.
	 470  
	 471  	// Cache recently used bucket index:
	 472  	cB := byte(0)
	 473  	b := bucket[cB]
	 474  
	 475  	top := len(sa)
	 476  	for i := len(sa) - 1; i >= 0; i-- {
	 477  		j := int(sa[i])
	 478  		if j == 0 {
	 479  			// Skip empty entry.
	 480  			continue
	 481  		}
	 482  		sa[i] = 0
	 483  		if j < 0 {
	 484  			// Leave discovered LMS-substring start index for caller.
	 485  			top--
	 486  			sa[top] = int32(-j)
	 487  			continue
	 488  		}
	 489  
	 490  		// Index j was on work queue, meaning k := j-1 is S-type,
	 491  		// so we can now place k correctly into sa.
	 492  		// If k-1 is S-type, queue k for processing later in this loop.
	 493  		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
	 494  		k := j - 1
	 495  		c1 := text[k]
	 496  		c0 := text[k-1]
	 497  		if c0 > c1 {
	 498  			k = -k
	 499  		}
	 500  
	 501  		if cB != c1 {
	 502  			bucket[cB] = b
	 503  			cB = c1
	 504  			b = bucket[cB]
	 505  		}
	 506  		b--
	 507  		sa[b] = int32(k)
	 508  	}
	 509  }
	 510  
	 511  // length_8_32 computes and records the length of each LMS-substring in text.
	 512  // The length of the LMS-substring at index j is stored at sa[j/2],
	 513  // avoiding the LMS-substring indexes already stored in the top half of sa.
	 514  // (If index j is an LMS-substring start, then index j-1 is type L and cannot be.)
	 515  // There are two exceptions, made for optimizations in name_8_32 below.
	 516  //
	 517  // First, the final LMS-substring is recorded as having length 0, which is otherwise
	 518  // impossible, instead of giving it a length that includes the implicit sentinel.
	 519  // This ensures the final LMS-substring has length unequal to all others
	 520  // and therefore can be detected as different without text comparison
	 521  // (it is unequal because it is the only one that ends in the implicit sentinel,
	 522  // and the text comparison would be problematic since the implicit sentinel
	 523  // is not actually present at text[len(text)]).
	 524  //
	 525  // Second, to avoid text comparison entirely, if an LMS-substring is very short,
	 526  // sa[j/2] records its actual text instead of its length, so that if two such
	 527  // substrings have matching “length,” the text need not be read at all.
	 528  // The definition of “very short” is that the text bytes must pack into an uint32,
	 529  // and the unsigned encoding e must be ≥ len(text), so that it can be
	 530  // distinguished from a valid length.
	 531  func length_8_32(text []byte, sa []int32, numLMS int) {
	 532  	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
	 533  
	 534  	// The encoding of N text bytes into a “length” word
	 535  	// adds 1 to each byte, packs them into the bottom
	 536  	// N*8 bits of a word, and then bitwise inverts the result.
	 537  	// That is, the text sequence A B C (hex 41 42 43)
	 538  	// encodes as ^uint32(0x42_43_44).
	 539  	// LMS-substrings can never start or end with 0xFF.
	 540  	// Adding 1 ensures the encoded byte sequence never
	 541  	// starts or ends with 0x00, so that present bytes can be
	 542  	// distinguished from zero-padding in the top bits,
	 543  	// so the length need not be separately encoded.
	 544  	// Inverting the bytes increases the chance that a
	 545  	// 4-byte encoding will still be ≥ len(text).
	 546  	// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
	 547  	// then the high bit of the inversion will be set,
	 548  	// making it clearly not a valid length (it would be a negative one).
	 549  	//
	 550  	// cx holds the pre-inverted encoding (the packed incremented bytes).
	 551  	cx := uint32(0) // byte-only
	 552  
	 553  	// This stanza (until the blank line) is the "LMS-substring iterator",
	 554  	// described in placeLMS_8_32 above, with one line added to maintain cx.
	 555  	c0, c1, isTypeS := byte(0), byte(0), false
	 556  	for i := len(text) - 1; i >= 0; i-- {
	 557  		c0, c1 = text[i], c0
	 558  		cx = cx<<8 | uint32(c1+1) // byte-only
	 559  		if c0 < c1 {
	 560  			isTypeS = true
	 561  		} else if c0 > c1 && isTypeS {
	 562  			isTypeS = false
	 563  
	 564  			// Index j = i+1 is the start of an LMS-substring.
	 565  			// Compute length or encoded text to store in sa[j/2].
	 566  			j := i + 1
	 567  			var code int32
	 568  			if end == 0 {
	 569  				code = 0
	 570  			} else {
	 571  				code = int32(end - j)
	 572  				if code <= 32/8 && ^cx >= uint32(len(text)) { // byte-only
	 573  					code = int32(^cx) // byte-only
	 574  				} // byte-only
	 575  			}
	 576  			sa[j>>1] = code
	 577  			end = j + 1
	 578  			cx = uint32(c1 + 1) // byte-only
	 579  		}
	 580  	}
	 581  }
	 582  
	 583  // assignID_8_32 assigns a dense ID numbering to the
	 584  // set of LMS-substrings respecting string ordering and equality,
	 585  // returning the maximum assigned ID.
	 586  // For example given the input "ababab", the LMS-substrings
	 587  // are "aba", "aba", and "ab", renumbered as 2 2 1.
	 588  // sa[len(sa)-numLMS:] holds the LMS-substring indexes
	 589  // sorted in string order, so to assign numbers we can
	 590  // consider each in turn, removing adjacent duplicates.
	 591  // The new ID for the LMS-substring at index j is written to sa[j/2],
	 592  // overwriting the length previously stored there (by length_8_32 above).
	 593  func assignID_8_32(text []byte, sa []int32, numLMS int) int {
	 594  	id := 0
	 595  	lastLen := int32(-1) // impossible
	 596  	lastPos := int32(0)
	 597  	for _, j := range sa[len(sa)-numLMS:] {
	 598  		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
	 599  		n := sa[j/2]
	 600  		if n != lastLen {
	 601  			goto New
	 602  		}
	 603  		if uint32(n) >= uint32(len(text)) {
	 604  			// “Length” is really encoded full text, and they match.
	 605  			goto Same
	 606  		}
	 607  		{
	 608  			// Compare actual texts.
	 609  			n := int(n)
	 610  			this := text[j:][:n]
	 611  			last := text[lastPos:][:n]
	 612  			for i := 0; i < n; i++ {
	 613  				if this[i] != last[i] {
	 614  					goto New
	 615  				}
	 616  			}
	 617  			goto Same
	 618  		}
	 619  	New:
	 620  		id++
	 621  		lastPos = j
	 622  		lastLen = n
	 623  	Same:
	 624  		sa[j/2] = int32(id)
	 625  	}
	 626  	return id
	 627  }
	 628  
	 629  // map_32 maps the LMS-substrings in text to their new IDs,
	 630  // producing the subproblem for the recursion.
	 631  // The mapping itself was mostly applied by assignID_8_32:
	 632  // sa[i] is either 0, the ID for the LMS-substring at index 2*i,
	 633  // or the ID for the LMS-substring at index 2*i+1.
	 634  // To produce the subproblem we need only remove the zeros
	 635  // and change ID into ID-1 (our IDs start at 1, but text chars start at 0).
	 636  //
	 637  // map_32 packs the result, which is the input to the recursion,
	 638  // into the top of sa, so that the recursion result can be stored
	 639  // in the bottom of sa, which sets up for expand_8_32 well.
	 640  func map_32(sa []int32, numLMS int) {
	 641  	w := len(sa)
	 642  	for i := len(sa) / 2; i >= 0; i-- {
	 643  		j := sa[i]
	 644  		if j > 0 {
	 645  			w--
	 646  			sa[w] = j - 1
	 647  		}
	 648  	}
	 649  }
	 650  
	 651  // recurse_32 calls sais_32 recursively to solve the subproblem we've built.
	 652  // The subproblem is at the right end of sa, the suffix array result will be
	 653  // written at the left end of sa, and the middle of sa is available for use as
	 654  // temporary frequency and bucket storage.
	 655  func recurse_32(sa, oldTmp []int32, numLMS, maxID int) {
	 656  	dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
	 657  
	 658  	// Set up temporary space for recursive call.
	 659  	// We must pass sais_32 a tmp buffer wiith at least maxID entries.
	 660  	//
	 661  	// The subproblem is guaranteed to have length at most len(sa)/2,
	 662  	// so that sa can hold both the subproblem and its suffix array.
	 663  	// Nearly all the time, however, the subproblem has length < len(sa)/3,
	 664  	// in which case there is a subproblem-sized middle of sa that
	 665  	// we can reuse for temporary space (saTmp).
	 666  	// When recurse_32 is called from sais_8_32, oldTmp is length 512
	 667  	// (from text_32), and saTmp will typically be much larger, so we'll use saTmp.
	 668  	// When deeper recursions come back to recurse_32, now oldTmp is
	 669  	// the saTmp from the top-most recursion, it is typically larger than
	 670  	// the current saTmp (because the current sa gets smaller and smaller
	 671  	// as the recursion gets deeper), and we keep reusing that top-most
	 672  	// large saTmp instead of the offered smaller ones.
	 673  	//
	 674  	// Why is the subproblem length so often just under len(sa)/3?
	 675  	// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
	 676  	// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
	 677  	// in the input, perfect alternation of larger and smaller input bytes.
	 678  	// Real text doesn't do that. If each L-type index is randomly followed
	 679  	// by either an L-type or S-type index, then half the substrings will
	 680  	// be of the form SLS, but the other half will be longer. Of that half,
	 681  	// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
	 682  	// Not counting the final S in each (which overlaps the first S in the next),
	 683  	// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
	 684  	// The space we need is further reduced by the fact that many of the
	 685  	// short patterns like SLS will often be the same character sequences
	 686  	// repeated throughout the text, reducing maxID relative to numLMS.
	 687  	//
	 688  	// For short inputs, the averages may not run in our favor, but then we
	 689  	// can often fall back to using the length-512 tmp available in the
	 690  	// top-most call. (Also a short allocation would not be a big deal.)
	 691  	//
	 692  	// For pathological inputs, we fall back to allocating a new tmp of length
	 693  	// max(maxID, numLMS/2). This level of the recursion needs maxID,
	 694  	// and all deeper levels of the recursion will need no more than numLMS/2,
	 695  	// so this one allocation is guaranteed to suffice for the entire stack
	 696  	// of recursive calls.
	 697  	tmp := oldTmp
	 698  	if len(tmp) < len(saTmp) {
	 699  		tmp = saTmp
	 700  	}
	 701  	if len(tmp) < numLMS {
	 702  		// TestSAIS/forcealloc reaches this code.
	 703  		n := maxID
	 704  		if n < numLMS/2 {
	 705  			n = numLMS / 2
	 706  		}
	 707  		tmp = make([]int32, n)
	 708  	}
	 709  
	 710  	// sais_32 requires that the caller arrange to clear dst,
	 711  	// because in general the caller may know dst is
	 712  	// freshly-allocated and already cleared. But this one is not.
	 713  	for i := range dst {
	 714  		dst[i] = 0
	 715  	}
	 716  	sais_32(text, maxID, dst, tmp)
	 717  }
	 718  
	 719  // unmap_8_32 unmaps the subproblem back to the original.
	 720  // sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore.
	 721  // sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers.
	 722  // The key part is that if the list says K that means the K'th substring.
	 723  // We can replace sa[:numLMS] with the indexes of the LMS-substrings.
	 724  // Then if the list says K it really means sa[K].
	 725  // Having mapped the list back to LMS-substring indexes,
	 726  // we can place those into the right buckets.
	 727  func unmap_8_32(text []byte, sa []int32, numLMS int) {
	 728  	unmap := sa[len(sa)-numLMS:]
	 729  	j := len(unmap)
	 730  
	 731  	// "LMS-substring iterator" (see placeLMS_8_32 above).
	 732  	c0, c1, isTypeS := byte(0), byte(0), false
	 733  	for i := len(text) - 1; i >= 0; i-- {
	 734  		c0, c1 = text[i], c0
	 735  		if c0 < c1 {
	 736  			isTypeS = true
	 737  		} else if c0 > c1 && isTypeS {
	 738  			isTypeS = false
	 739  
	 740  			// Populate inverse map.
	 741  			j--
	 742  			unmap[j] = int32(i + 1)
	 743  		}
	 744  	}
	 745  
	 746  	// Apply inverse map to subproblem suffix array.
	 747  	sa = sa[:numLMS]
	 748  	for i := 0; i < len(sa); i++ {
	 749  		sa[i] = unmap[sa[i]]
	 750  	}
	 751  }
	 752  
	 753  // expand_8_32 distributes the compacted, sorted LMS-suffix indexes
	 754  // from sa[:numLMS] into the tops of the appropriate buckets in sa,
	 755  // preserving the sorted order and making room for the L-type indexes
	 756  // to be slotted into the sorted sequence by induceL_8_32.
	 757  func expand_8_32(text []byte, freq, bucket, sa []int32, numLMS int) {
	 758  	bucketMax_8_32(text, freq, bucket)
	 759  	bucket = bucket[:256] // eliminate bound check for bucket[c] below
	 760  
	 761  	// Loop backward through sa, always tracking
	 762  	// the next index to populate from sa[:numLMS].
	 763  	// When we get to one, populate it.
	 764  	// Zero the rest of the slots; they have dead values in them.
	 765  	x := numLMS - 1
	 766  	saX := sa[x]
	 767  	c := text[saX]
	 768  	b := bucket[c] - 1
	 769  	bucket[c] = b
	 770  
	 771  	for i := len(sa) - 1; i >= 0; i-- {
	 772  		if i != int(b) {
	 773  			sa[i] = 0
	 774  			continue
	 775  		}
	 776  		sa[i] = saX
	 777  
	 778  		// Load next entry to put down (if any).
	 779  		if x > 0 {
	 780  			x--
	 781  			saX = sa[x] // TODO bounds check
	 782  			c = text[saX]
	 783  			b = bucket[c] - 1
	 784  			bucket[c] = b
	 785  		}
	 786  	}
	 787  }
	 788  
	 789  // induceL_8_32 inserts L-type text indexes into sa,
	 790  // assuming that the leftmost S-type indexes are inserted
	 791  // into sa, in sorted order, in the right bucket halves.
	 792  // It leaves all the L-type indexes in sa, but the
	 793  // leftmost L-type indexes are negated, to mark them
	 794  // for processing by induceS_8_32.
	 795  func induceL_8_32(text []byte, sa, freq, bucket []int32) {
	 796  	// Initialize positions for left side of character buckets.
	 797  	bucketMin_8_32(text, freq, bucket)
	 798  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
	 799  
	 800  	// This scan is similar to the one in induceSubL_8_32 above.
	 801  	// That one arranges to clear all but the leftmost L-type indexes.
	 802  	// This scan leaves all the L-type indexes and the original S-type
	 803  	// indexes, but it negates the positive leftmost L-type indexes
	 804  	// (the ones that induceS_8_32 needs to process).
	 805  
	 806  	// expand_8_32 left out the implicit entry sa[-1] == len(text),
	 807  	// corresponding to the identified type-L index len(text)-1.
	 808  	// Process it before the left-to-right scan of sa proper.
	 809  	// See body in loop for commentary.
	 810  	k := len(text) - 1
	 811  	c0, c1 := text[k-1], text[k]
	 812  	if c0 < c1 {
	 813  		k = -k
	 814  	}
	 815  
	 816  	// Cache recently used bucket index.
	 817  	cB := c1
	 818  	b := bucket[cB]
	 819  	sa[b] = int32(k)
	 820  	b++
	 821  
	 822  	for i := 0; i < len(sa); i++ {
	 823  		j := int(sa[i])
	 824  		if j <= 0 {
	 825  			// Skip empty or negated entry (including negated zero).
	 826  			continue
	 827  		}
	 828  
	 829  		// Index j was on work queue, meaning k := j-1 is L-type,
	 830  		// so we can now place k correctly into sa.
	 831  		// If k-1 is L-type, queue k for processing later in this loop.
	 832  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
	 833  		// If k is zero, k-1 doesn't exist, so we only need to leave it
	 834  		// for the caller. The caller can't tell the difference between
	 835  		// an empty slot and a non-empty zero, but there's no need
	 836  		// to distinguish them anyway: the final suffix array will end up
	 837  		// with one zero somewhere, and that will be a real zero.
	 838  		k := j - 1
	 839  		c1 := text[k]
	 840  		if k > 0 {
	 841  			if c0 := text[k-1]; c0 < c1 {
	 842  				k = -k
	 843  			}
	 844  		}
	 845  
	 846  		if cB != c1 {
	 847  			bucket[cB] = b
	 848  			cB = c1
	 849  			b = bucket[cB]
	 850  		}
	 851  		sa[b] = int32(k)
	 852  		b++
	 853  	}
	 854  }
	 855  
	 856  func induceS_8_32(text []byte, sa, freq, bucket []int32) {
	 857  	// Initialize positions for right side of character buckets.
	 858  	bucketMax_8_32(text, freq, bucket)
	 859  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
	 860  
	 861  	cB := byte(0)
	 862  	b := bucket[cB]
	 863  
	 864  	for i := len(sa) - 1; i >= 0; i-- {
	 865  		j := int(sa[i])
	 866  		if j >= 0 {
	 867  			// Skip non-flagged entry.
	 868  			// (This loop can't see an empty entry; 0 means the real zero index.)
	 869  			continue
	 870  		}
	 871  
	 872  		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
	 873  		j = -j
	 874  		sa[i] = int32(j)
	 875  
	 876  		// Index j was on work queue (encoded as -j but now decoded),
	 877  		// meaning k := j-1 is L-type,
	 878  		// so we can now place k correctly into sa.
	 879  		// If k-1 is S-type, queue -k for processing later in this loop.
	 880  		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
	 881  		// If k is zero, k-1 doesn't exist, so we only need to leave it
	 882  		// for the caller.
	 883  		k := j - 1
	 884  		c1 := text[k]
	 885  		if k > 0 {
	 886  			if c0 := text[k-1]; c0 <= c1 {
	 887  				k = -k
	 888  			}
	 889  		}
	 890  
	 891  		if cB != c1 {
	 892  			bucket[cB] = b
	 893  			cB = c1
	 894  			b = bucket[cB]
	 895  		}
	 896  		b--
	 897  		sa[b] = int32(k)
	 898  	}
	 899  }
	 900  

View as plain text