// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Suffix array construction by induced sorting (SAIS).
// See Ge Nong, Sen Zhang, and Wai Hong Chen,
// "Two Efficient Algorithms for Linear Time Suffix Array Construction",
// especially section 3 (https://ieeexplore.ieee.org/document/5582081).
// See also http://zork.net/~st/jottings/sais.html.
//
// With optimizations inspired by Yuta Mori's sais-lite
// (https://sites.google.com/site/yuta256/sais).
//
// And with other new optimizations.

// Many of these functions are parameterized by the sizes of
// the types they operate on. The generator gen.go makes
// copies of these functions for use with other sizes.
// Specifically:
//
// - A function with a name ending in _8_32 takes []byte and []int32 arguments
// and is duplicated into _32_32, _8_64, and _64_64 forms.
// The _32_32 and _64_64 suffixes are shortened to plain _32 and _64.
// Any lines in the function body that contain the text "byte-only" or "256"
// are stripped when creating _32_32 and _64_64 forms.
// (Those lines are typically 8-bit-specific optimizations.)
//
// - A function with a name ending only in _32 operates on []int32
// and is duplicated into a _64 form. (Note that it may still take a []byte,
// but there is no need for a version of the function in which the []byte
// is widened to a full integer array.)

// The overall runtime of this code is linear in the input size:
// it runs a sequence of linear passes to reduce the problem to
// a subproblem at most half as big, invokes itself recursively,
// and then runs a sequence of linear passes to turn the answer
// for the subproblem into the answer for the original problem.
// This gives T(N) = O(N) + T(N/2) = O(N) + O(N/2) + O(N/4) + ... = O(N).
//
// The outline of the code, with the forward and backward scans
// through O(N)-sized arrays called out, is:
//
//	sais_I_N
//		placeLMS_I_B
//			bucketMax_I_B
//				freq_I_B
//					<scan +text> (1)
//				<scan +freq> (2)
//			<scan -text, random bucket> (3)
//		induceSubL_I_B
//			bucketMin_I_B
//				freq_I_B
//					<scan +text, often optimized away> (4)
//				<scan +freq> (5)
//			<scan +sa, random text, random bucket> (6)
//		induceSubS_I_B
//			bucketMax_I_B
//				freq_I_B
//					<scan +text, often optimized away> (7)
//				<scan +freq> (8)
//			<scan -sa, random text, random bucket> (9)
//		assignID_I_B
//			<scan +sa, random text substrings> (10)
//		map_B
//			<scan -sa> (11)
//		recurse_B
//			(recursive call to sais_B_B for a subproblem of size at most 1/2 input, often much smaller)
//		unmap_I_B
//			<scan -text> (12)
//			<scan +sa> (13)
//		expand_I_B
//			bucketMax_I_B
//				freq_I_B
//					<scan +text, often optimized away> (14)
//				<scan +freq> (15)
//			<scan -sa, random text, random bucket> (16)
//		induceL_I_B
//			bucketMin_I_B
//				freq_I_B
//					<scan +text, often optimized away> (17)
//				<scan +freq> (18)
//			<scan +sa, random text, random bucket> (19)
//		induceS_I_B
//			bucketMax_I_B
//				freq_I_B
//					<scan +text, often optimized away> (20)
//				<scan +freq> (21)
//			<scan -sa, random text, random bucket> (22)
//
// Here, _B indicates the suffix array size (_32 or _64) and _I the input size (_8 or _B).
//
// The outline shows there are in general 22 scans through
// O(N)-sized arrays for a given level of the recursion.
// In the top level, operating on 8-bit input text,
// the six freq scans are fixed size (256) instead of potentially
// input-sized.
// Also, the frequency is counted once and cached
// whenever there is room to do so (there is nearly always room in general,
// and always room at the top level), which eliminates all but
// the first freq_I_B text scans (that is, 5 of the 6).
// So the top level of the recursion only does 22 - 6 - 5 = 11
// input-sized scans and a typical level does 16 scans.
//
// The linear scans do not cost anywhere near as much as
// the random accesses to the text made during a few of
// the scans (specifically #6, #9, #16, #19, #22 marked above).
// In real texts, there is not much but some locality to
// the accesses, due to the repetitive structure of the text
// (the same reason Burrows-Wheeler compression is so effective).
// For random inputs, there is no locality, which makes those
// accesses even more expensive, especially once the text
// no longer fits in cache.
// For example, running on 50 MB of Go source code, induceSubL_8_32
// (which runs only once, at the top level of the recursion)
// takes 0.44s, while on 50 MB of random input, it takes 2.55s.
// Nearly all the relative slowdown is explained by the text access:
//
//	c0, c1 := text[k-1], text[k]
//
// That line runs for 0.23s on the Go text and 2.02s on random text.

//go:generate go run gen.go

package suffixarray

// text_32 returns the suffix array for the input text.
// It requires that len(text) fit in an int32,
// that len(sa) == len(text),
// and that the caller zero sa.
// It panics if either length requirement is violated.
//
// The temporary space it allocates for sais_8_32 is twice the
// alphabet size, which is enough for sais_8_32 to keep separate
// frequency and bucket tables.
func text_32(text []byte, sa []int32) {
	if int(int32(len(text))) != len(text) || len(text) != len(sa) {
		panic("suffixarray: misuse of text_32")
	}
	sais_8_32(text, 256, sa, make([]int32, 2*256))
}

// sais_8_32 computes the suffix array of text.
// The text must contain only values in [0, textMax).
// The suffix array is stored in sa, which the caller
// must ensure is already zeroed.
// The caller must also provide temporary space tmp
// with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax
// then the algorithm runs a little faster.
// If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return.
// (For texts of length 0 or 1 it returns before touching tmp.)
//
// sais_8_32 panics if len(sa) != len(text) or len(tmp) < textMax.
func sais_8_32(text []byte, textMax int, sa, tmp []int32) {
	if len(sa) != len(text) || len(tmp) < int(textMax) {
		panic("suffixarray: misuse of sais_8_32")
	}

	// Trivial base cases. Sorting 0 or 1 things is easy.
	if len(text) == 0 {
		return
	}
	if len(text) == 1 {
		sa[0] = 0
		return
	}

	// Establish slices indexed by text character
	// holding character frequency and bucket-sort offsets.
	// If there's only enough tmp for one slice,
	// we make it the bucket offsets and recompute
	// the character frequency each time we need it.
	var freq, bucket []int32
	if len(tmp) >= 2*textMax {
		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
		freq[0] = -1 // mark as uninitialized
	} else {
		freq, bucket = nil, tmp[:textMax]
	}

	// The SAIS algorithm.
	// Each of these calls makes one scan through sa.
	// See the individual functions for documentation
	// about each's role in the algorithm.
	numLMS := placeLMS_8_32(text, sa, freq, bucket)
	if numLMS <= 1 {
		// 0 or 1 items are already sorted. Do nothing.
	} else {
		induceSubL_8_32(text, sa, freq, bucket)
		induceSubS_8_32(text, sa, freq, bucket)
		length_8_32(text, sa, numLMS)
		maxID := assignID_8_32(text, sa, numLMS)
		if maxID < numLMS {
			// Some LMS-substrings are equal to each other, so the
			// LMS-substring order does not yet determine the LMS-suffix
			// order. Build the reduced problem, solve it recursively,
			// and translate the answer back to text indexes.
			map_32(sa, numLMS)
			recurse_32(sa, tmp, numLMS, maxID)
			unmap_8_32(text, sa, numLMS)
		} else {
			// If maxID == numLMS, then each LMS-substring
			// is unique, so the relative ordering of two LMS-suffixes
			// is determined by just the leading LMS-substring.
			// That is, the LMS-suffix sort order matches the
			// (simpler) LMS-substring sort order.
			// Copy the original LMS-substring order into the
			// suffix array destination.
			copy(sa, sa[len(sa)-numLMS:])
		}
		expand_8_32(text, freq, bucket, sa, numLMS)
	}
	induceL_8_32(text, sa, freq, bucket)
	induceS_8_32(text, sa, freq, bucket)

	// Mark for caller that we overwrote tmp.
	tmp[0] = -1
}

// freq_8_32 returns the character frequencies
// for text, as a slice indexed by character value.
// If freq is nil, freq_8_32 uses and returns bucket.
// If freq is non-nil, freq_8_32 assumes that freq[0] >= 0
// means the frequencies are already computed.
// If the frequency data is overwritten or uninitialized,
// the caller must set freq[0] = -1 to force recomputation
// the next time it is needed.
//
// The returned slice always aliases either freq or bucket;
// no new storage is allocated.
func freq_8_32(text []byte, freq, bucket []int32) []int32 {
	if freq != nil && freq[0] >= 0 {
		return freq // already computed
	}
	if freq == nil {
		freq = bucket
	}

	freq = freq[:256] // eliminate bounds check for freq[c] below
	for i := range freq {
		freq[i] = 0
	}
	for _, c := range text {
		freq[c]++
	}
	return freq
}

// bucketMin_8_32 stores into bucket[c] the minimum index
// in the bucket for character c in a bucket-sort of text.
// That is, bucket[c] becomes the number of characters in text
// that are smaller than c.
func bucketMin_8_32(text []byte, freq, bucket []int32) {
	// When the caller has no separate freq slice, freq_8_32 returns
	// bucket itself. The loop below is still correct in that case:
	// each count n is read by the range clause before bucket[i]
	// is overwritten with the running total.
	freq = freq_8_32(text, freq, bucket)
	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
	total := int32(0)
	for i, n := range freq {
		bucket[i] = total
		total += n
	}
}

// bucketMax_8_32 stores into bucket[c] the maximum index
// in the bucket for character c in a bucket-sort of text.
// The bucket indexes for c are [min, max).
// That is, max is one past the final index in that bucket.
func bucketMax_8_32(text []byte, freq, bucket []int32) {
	// When the caller has no separate freq slice, freq_8_32 returns
	// bucket itself. The loop below is still correct in that case:
	// each count n is read by the range clause before bucket[i]
	// is overwritten with the running total.
	freq = freq_8_32(text, freq, bucket)
	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
	total := int32(0)
	for i, n := range freq {
		total += n
		bucket[i] = total
	}
}

// The SAIS algorithm proceeds in a sequence of scans through sa.
// Each of the following functions implements one scan,
// and the functions appear here in the order they execute in the algorithm.

// placeLMS_8_32 places into sa the indexes of the
// final characters of the LMS substrings of text,
// sorted into the rightmost ends of their correct buckets
// in the suffix array.
// It returns the number of LMS-substring starts it found in text.
//
// The imaginary sentinel character at the end of the text
// is the final character of the final LMS substring, but there
// is no bucket for the imaginary sentinel character,
// which has a smaller value than any real character.
// The caller must therefore pretend that sa[-1] == len(text).
//
// The text indexes of LMS-substring characters are always ≥ 1
// (the first LMS-substring must be preceded by one or more L-type
// characters that are not part of any LMS-substring),
// so using 0 as a “not present” suffix array entry is safe,
// both in this function and in most later functions
// (until induceL_8_32 below).
func placeLMS_8_32(text []byte, sa, freq, bucket []int32) int {
	bucketMax_8_32(text, freq, bucket)

	numLMS := 0
	lastB := int32(-1)
	bucket = bucket[:256] // eliminate bounds check for bucket[c1] below

	// The next stanza of code (until the blank line) loops backward
	// over text, stopping to execute a code body at each position i
	// such that text[i] is an L-character and text[i+1] is an S-character.
	// That is, i+1 is the position of the start of an LMS-substring.
	// These could be hoisted out into a function with a callback,
	// but at a significant speed cost. Instead, we just write these
	// seven lines a few times in this source file. The copies below
	// refer back to the pattern established by this original as the
	// "LMS-substring iterator".
	//
	// In every scan through the text, c0, c1 are successive characters of text.
	// In this backward scan, c0 == text[i] and c1 == text[i+1].
	// By scanning backward, we can keep track of whether the current
	// position is type-S or type-L according to the usual definition:
	//
	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
	//
	// The backward scan lets us maintain the current type,
	// update it when we see c0 != c1, and otherwise leave it alone.
	// We want to identify all S positions with a preceding L.
	// Position len(text) is one such position by definition, but we have
	// nowhere to write it down, so we eliminate it by untruthfully
	// setting isTypeS = false at the start of the loop.
	c0, c1, isTypeS := byte(0), byte(0), false
	for i := len(text) - 1; i >= 0; i-- {
		c0, c1 = text[i], c0
		if c0 < c1 {
			isTypeS = true
		} else if c0 > c1 && isTypeS {
			isTypeS = false

			// Bucket the index i+1 for the start of an LMS-substring.
			b := bucket[c1] - 1
			bucket[c1] = b
			sa[b] = int32(i + 1)
			lastB = b
			numLMS++
		}
	}

	// We recorded the LMS-substring starts but really want the ends.
	// Luckily, with two differences, the start indexes and the end indexes are the same.
	// The first difference is that the rightmost LMS-substring's end index is len(text),
	// so the caller must pretend that sa[-1] == len(text), as noted above.
	// The second difference is that the first leftmost LMS-substring start index
	// does not end an earlier LMS-substring, so as an optimization we can omit
	// that leftmost LMS-substring start index (the last one we wrote).
	//
	// Exception: if numLMS <= 1, the caller is not going to bother with
	// the recursion at all and will treat the result as containing LMS-substring starts.
	// In that case, we don't remove the final entry.
	if numLMS > 1 {
		sa[lastB] = 0
	}
	return numLMS
}

// induceSubL_8_32 inserts the L-type text indexes of LMS-substrings
// into sa, assuming that the final characters of the LMS-substrings
// are already inserted into sa, sorted by final character, and at the
// right (not left) end of the corresponding character bucket.
// Each LMS-substring has the form (as a regexp) /S+L+S/:
// one or more S-type, one or more L-type, final S-type.
// induceSubL_8_32 leaves behind only the leftmost L-type text
// index for each LMS-substring. That is, it removes the final S-type
// indexes that are present on entry, and it inserts but then removes
// the interior L-type indexes too.
// (Only the leftmost L-type index is needed by induceSubS_8_32.)
func induceSubL_8_32(text []byte, sa, freq, bucket []int32) {
	// Initialize positions for left side of character buckets.
	bucketMin_8_32(text, freq, bucket)
	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below

	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
	// Because j-1 is type L, inserting it into sa now will sort it correctly.
	// But we want to distinguish a j-1 with j-2 of type L from type S.
	// We can process the former but want to leave the latter for the caller.
	// We record the difference by negating j-1 if it is preceded by type S.
	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
	// and so on, in sorted but not necessarily adjacent order, until it finds
	// one preceded by an index of type S, at which point it must stop.
	//
	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
	// only the indexes of the leftmost L-type indexes for each LMS-substring.
	//
	// The suffix array sa therefore serves simultaneously as input, output,
	// and a miraculously well-tailored work queue.

	// placeLMS_8_32 left out the implicit entry sa[-1] == len(text),
	// corresponding to the identified type-L index len(text)-1.
	// Process it before the left-to-right scan of sa proper.
	// See body in loop for commentary.
	k := len(text) - 1
	c0, c1 := text[k-1], text[k]
	if c0 < c1 {
		k = -k
	}

	// Cache recently used bucket index:
	// we're processing suffixes in sorted order
	// and accessing buckets indexed by the
	// byte before the sorted order, which still
	// has very good locality.
	// Invariant: b is cached, possibly dirty copy of bucket[cB].
	cB := c1
	b := bucket[cB]
	sa[b] = int32(k)
	b++

	for i := 0; i < len(sa); i++ {
		j := int(sa[i])
		if j == 0 {
			// Skip empty entry.
			continue
		}
		if j < 0 {
			// Leave discovered type-S index for caller.
			sa[i] = int32(-j)
			continue
		}
		sa[i] = 0

		// Index j was on work queue, meaning k := j-1 is L-type,
		// so we can now place k correctly into sa.
		// If k-1 is L-type, queue k for processing later in this loop.
		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
		k := j - 1
		c0, c1 := text[k-1], text[k]
		if c0 < c1 {
			k = -k
		}

		// Switch the cached bucket if this insertion targets a different one,
		// writing the dirty cached position back first.
		if cB != c1 {
			bucket[cB] = b
			cB = c1
			b = bucket[cB]
		}
		sa[b] = int32(k)
		b++
	}
}

// induceSubS_8_32 inserts the S-type text indexes of LMS-substrings
// into sa, assuming that the leftmost L-type text indexes are already
// inserted into sa, sorted by LMS-substring suffix, and at the
// left end of the corresponding character bucket.
// Each LMS-substring has the form (as a regexp) /S+L+S/:
// one or more S-type, one or more L-type, final S-type.
// induceSubS_8_32 leaves behind only the leftmost S-type text
// index for each LMS-substring, in sorted order, at the right end of sa.
// That is, it removes the L-type indexes that are present on entry,
// and it inserts but then removes the interior S-type indexes too,
// leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:].
// (Only the LMS-substring start indexes are processed by the recursion.)
func induceSubS_8_32(text []byte, sa, freq, bucket []int32) {
	// Initialize positions for right side of character buckets.
	bucketMax_8_32(text, freq, bucket)
	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below

	// Analogous to induceSubL_8_32 above,
	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
	// Because j-1 is type S, inserting it into sa now will sort it correctly.
	// But we want to distinguish a j-1 with j-2 of type S from type L.
	// We can process the former but want to leave the latter for the caller.
	// We record the difference by negating j-1 if it is preceded by type L.
	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
	// and so on, in sorted but not necessarily adjacent order, until it finds
	// one preceded by an index of type L, at which point it must stop.
	// That index (preceded by one of type L) is an LMS-substring start.
	//
	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
	// so that the loop finishes with the top of sa containing exactly
	// the LMS-substring start indexes, sorted by LMS-substring.

	// Cache recently used bucket index:
	cB := byte(0)
	b := bucket[cB]

	top := len(sa)
	for i := len(sa) - 1; i >= 0; i-- {
		j := int(sa[i])
		if j == 0 {
			// Skip empty entry.
			continue
		}
		sa[i] = 0
		if j < 0 {
			// Leave discovered LMS-substring start index for caller.
			top--
			sa[top] = int32(-j)
			continue
		}

		// Index j was on work queue, meaning k := j-1 is S-type,
		// so we can now place k correctly into sa.
		// If k-1 is S-type, queue k for processing later in this loop.
		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
		k := j - 1
		c1 := text[k]
		c0 := text[k-1]
		if c0 > c1 {
			k = -k
		}

		// Switch the cached bucket if this insertion targets a different one,
		// writing the dirty cached position back first.
		if cB != c1 {
			bucket[cB] = b
			cB = c1
			b = bucket[cB]
		}
		b--
		sa[b] = int32(k)
	}
}

// length_8_32 computes and records the length of each LMS-substring in text.
// The length of the LMS-substring at index j is stored at sa[j/2],
// avoiding the LMS-substring indexes already stored in the top half of sa.
// (If index j is an LMS-substring start, then index j-1 is type L and cannot be.)
// There are two exceptions, made for optimizations in assignID_8_32 below.
//
// First, the final LMS-substring is recorded as having length 0, which is otherwise
// impossible, instead of giving it a length that includes the implicit sentinel.
// This ensures the final LMS-substring has length unequal to all others
// and therefore can be detected as different without text comparison
// (it is unequal because it is the only one that ends in the implicit sentinel,
// and the text comparison would be problematic since the implicit sentinel
// is not actually present at text[len(text)]).
//
// Second, to avoid text comparison entirely, if an LMS-substring is very short,
// sa[j/2] records its actual text instead of its length, so that if two such
// substrings have matching “length,” the text need not be read at all.
// The definition of “very short” is that the text bytes must pack into an uint32,
// and the unsigned encoding e must be ≥ len(text), so that it can be
// distinguished from a valid length.
//
// The numLMS parameter is not used by the function body.
func length_8_32(text []byte, sa []int32, numLMS int) {
	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)

	// The encoding of N text bytes into a “length” word
	// adds 1 to each byte, packs them into the bottom
	// N*8 bits of a word, and then bitwise inverts the result.
	// That is, the text sequence A B C (hex 41 42 43)
	// encodes as ^uint32(0x42_43_44).
	// LMS-substrings can never start or end with 0xFF.
	// Adding 1 ensures the encoded byte sequence never
	// starts or ends with 0x00, so that present bytes can be
	// distinguished from zero-padding in the top bits,
	// so the length need not be separately encoded.
	// Inverting the bytes increases the chance that a
	// 4-byte encoding will still be ≥ len(text).
	// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
	// then the high bit of the inversion will be set,
	// making it clearly not a valid length (it would be a negative one).
	//
	// cx holds the pre-inverted encoding (the packed incremented bytes).
	cx := uint32(0) // byte-only

	// This stanza (until the blank line) is the "LMS-substring iterator",
	// described in placeLMS_8_32 above, with one line added to maintain cx.
	c0, c1, isTypeS := byte(0), byte(0), false
	for i := len(text) - 1; i >= 0; i-- {
		c0, c1 = text[i], c0
		cx = cx<<8 | uint32(c1+1) // byte-only
		if c0 < c1 {
			isTypeS = true
		} else if c0 > c1 && isTypeS {
			isTypeS = false

			// Index j = i+1 is the start of an LMS-substring.
			// Compute length or encoded text to store in sa[j/2].
			j := i + 1
			var code int32
			if end == 0 {
				code = 0
			} else {
				code = int32(end - j)
				if code <= 32/8 && ^cx >= uint32(len(text)) { // byte-only
					code = int32(^cx) // byte-only
				} // byte-only
			}
			sa[j>>1] = code
			end = j + 1
			cx = uint32(c1 + 1) // byte-only
		}
	}
}

// assignID_8_32 assigns a dense ID numbering to the
// set of LMS-substrings respecting string ordering and equality,
// returning the maximum assigned ID.
// For example given the input "ababab", the LMS-substrings
// are "aba", "aba", and "ab", renumbered as 2 2 1.
// sa[len(sa)-numLMS:] holds the LMS-substring indexes
// sorted in string order, so to assign numbers we can
// consider each in turn, removing adjacent duplicates.
// The new ID for the LMS-substring at index j is written to sa[j/2],
// overwriting the length previously stored there (by length_8_32 above).
func assignID_8_32(text []byte, sa []int32, numLMS int) int {
	id := 0
	lastLen := int32(-1) // impossible
	lastPos := int32(0)
	for _, j := range sa[len(sa)-numLMS:] {
		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
		n := sa[j/2]
		if n != lastLen {
			goto New
		}
		if uint32(n) >= uint32(len(text)) {
			// “Length” is really encoded full text, and they match.
			goto Same
		}
		{
			// Compare actual texts.
			n := int(n)
			this := text[j:][:n]
			last := text[lastPos:][:n]
			for i := 0; i < n; i++ {
				if this[i] != last[i] {
					goto New
				}
			}
			goto Same
		}
	New:
		// A substring not equal to its predecessor gets the next ID
		// and becomes the reference for the comparisons that follow.
		id++
		lastPos = j
		lastLen = n
	Same:
		sa[j/2] = int32(id)
	}
	return id
}

// map_32 maps the LMS-substrings in text to their new IDs,
// producing the subproblem for the recursion.
// The mapping itself was mostly applied by assignID_8_32:
// sa[i] is either 0, the ID for the LMS-substring at index 2*i,
// or the ID for the LMS-substring at index 2*i+1.
// To produce the subproblem we need only remove the zeros
// and change ID into ID-1 (our IDs start at 1, but text chars start at 0).
//
// map_32 packs the result, which is the input to the recursion,
// into the top of sa, so that the recursion result can be stored
// in the bottom of sa, which sets up for expand_8_32 well.
//
// The numLMS parameter is not used by the function body.
func map_32(sa []int32, numLMS int) {
	w := len(sa)
	// IDs were stored at sa[j/2] for text indexes j < len(sa),
	// so none can be at an index above len(sa)/2.
	for i := len(sa) / 2; i >= 0; i-- {
		j := sa[i]
		if j > 0 {
			w--
			sa[w] = j - 1
		}
	}
}

// recurse_32 calls sais_32 recursively to solve the subproblem we've built.
// The subproblem is at the right end of sa, the suffix array result will be
// written at the left end of sa, and the middle of sa is available for use as
// temporary frequency and bucket storage.
func recurse_32(sa, oldTmp []int32, numLMS, maxID int) {
	dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]

	// Set up temporary space for recursive call.
	// We must pass sais_32 a tmp buffer with at least maxID entries.
	//
	// The subproblem is guaranteed to have length at most len(sa)/2,
	// so that sa can hold both the subproblem and its suffix array.
	// Nearly all the time, however, the subproblem has length < len(sa)/3,
	// in which case there is a subproblem-sized middle of sa that
	// we can reuse for temporary space (saTmp).
	// When recurse_32 is called from sais_8_32, oldTmp is length 512
	// (from text_32), and saTmp will typically be much larger, so we'll use saTmp.
	// When deeper recursions come back to recurse_32, now oldTmp is
	// the saTmp from the top-most recursion, it is typically larger than
	// the current saTmp (because the current sa gets smaller and smaller
	// as the recursion gets deeper), and we keep reusing that top-most
	// large saTmp instead of the offered smaller ones.
	//
	// Why is the subproblem length so often just under len(sa)/3?
	// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
	// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
	// in the input, perfect alternation of larger and smaller input bytes.
	// Real text doesn't do that. If each L-type index is randomly followed
	// by either an L-type or S-type index, then half the substrings will
	// be of the form SLS, but the other half will be longer. Of that half,
	// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
	// Not counting the final S in each (which overlaps the first S in the next),
	// this works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
	// The space we need is further reduced by the fact that many of the
	// short patterns like SLS will often be the same character sequences
	// repeated throughout the text, reducing maxID relative to numLMS.
	//
	// For short inputs, the averages may not run in our favor, but then we
	// can often fall back to using the length-512 tmp available in the
	// top-most call. (Also a short allocation would not be a big deal.)
	//
	// For pathological inputs, we fall back to allocating a new tmp of length
	// max(maxID, numLMS/2). This level of the recursion needs maxID,
	// and all deeper levels of the recursion will need no more than numLMS/2,
	// so this one allocation is guaranteed to suffice for the entire stack
	// of recursive calls.
	tmp := oldTmp
	if len(tmp) < len(saTmp) {
		tmp = saTmp
	}
	if len(tmp) < numLMS {
		// TestSAIS/forcealloc reaches this code.
		n := maxID
		if n < numLMS/2 {
			n = numLMS / 2
		}
		tmp = make([]int32, n)
	}

	// sais_32 requires that the caller arrange to clear dst,
	// because in general the caller may know dst is
	// freshly-allocated and already cleared. But this one is not.
	for i := range dst {
		dst[i] = 0
	}
	sais_32(text, maxID, dst, tmp)
}

// unmap_8_32 unmaps the subproblem back to the original.
// sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore.
// sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers.
// The key part is that if the list says K that means the K'th substring.
// We can replace sa[:numLMS] with the indexes of the LMS-substrings.
// Then if the list says K it really means sa[K].
// Having mapped the list back to LMS-substring indexes,
// we can place those into the right buckets.
func unmap_8_32(text []byte, sa []int32, numLMS int) {
	unmap := sa[len(sa)-numLMS:]
	j := len(unmap)

	// "LMS-substring iterator" (see placeLMS_8_32 above).
	c0, c1, isTypeS := byte(0), byte(0), false
	for i := len(text) - 1; i >= 0; i-- {
		c0, c1 = text[i], c0
		if c0 < c1 {
			isTypeS = true
		} else if c0 > c1 && isTypeS {
			isTypeS = false

			// Populate inverse map.
			// The scan runs backward over text, so filling unmap from
			// its end leaves the start indexes in increasing text order.
			j--
			unmap[j] = int32(i + 1)
		}
	}

	// Apply inverse map to subproblem suffix array.
	sa = sa[:numLMS]
	for i := 0; i < len(sa); i++ {
		sa[i] = unmap[sa[i]]
	}
}

// expand_8_32 distributes the compacted, sorted LMS-suffix indexes
// from sa[:numLMS] into the tops of the appropriate buckets in sa,
// preserving the sorted order and making room for the L-type indexes
// to be slotted into the sorted sequence by induceL_8_32.
func expand_8_32(text []byte, freq, bucket, sa []int32, numLMS int) {
	bucketMax_8_32(text, freq, bucket)
	bucket = bucket[:256] // eliminate bounds check for bucket[c] below

	// Loop backward through sa, always tracking
	// the next index to populate from sa[:numLMS].
	// When we get to one, populate it.
	// Zero the rest of the slots; they have dead values in them.
	x := numLMS - 1
	saX := sa[x]
	c := text[saX]
	b := bucket[c] - 1
	bucket[c] = b

	for i := len(sa) - 1; i >= 0; i-- {
		if i != int(b) {
			sa[i] = 0
			continue
		}
		sa[i] = saX

		// Load next entry to put down (if any).
		if x > 0 {
			x--
			saX = sa[x] // TODO bounds check
			c = text[saX]
			b = bucket[c] - 1
			bucket[c] = b
		}
	}
}

// induceL_8_32 inserts L-type text indexes into sa,
// assuming that the leftmost S-type indexes are inserted
// into sa, in sorted order, in the right bucket halves.
// It leaves all the L-type indexes in sa, but the
// leftmost L-type indexes are negated, to mark them
// for processing by induceS_8_32.
func induceL_8_32(text []byte, sa, freq, bucket []int32) {
	// Initialize positions for left side of character buckets.
	bucketMin_8_32(text, freq, bucket)
	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below

	// This scan is similar to the one in induceSubL_8_32 above.
	// That one arranges to clear all but the leftmost L-type indexes.
	// This scan leaves all the L-type indexes and the original S-type
	// indexes, but it negates the positive leftmost L-type indexes
	// (the ones that induceS_8_32 needs to process).

	// expand_8_32 left out the implicit entry sa[-1] == len(text),
	// corresponding to the identified type-L index len(text)-1.
	// Process it before the left-to-right scan of sa proper.
	// See body in loop for commentary.
	k := len(text) - 1
	c0, c1 := text[k-1], text[k]
	if c0 < c1 {
		k = -k
	}

	// Cache recently used bucket index.
	cB := c1
	b := bucket[cB]
	sa[b] = int32(k)
	b++

	for i := 0; i < len(sa); i++ {
		j := int(sa[i])
		if j <= 0 {
			// Skip empty or negated entry (including negated zero).
			continue
		}

		// Index j was on work queue, meaning k := j-1 is L-type,
		// so we can now place k correctly into sa.
		// If k-1 is L-type, queue k for processing later in this loop.
		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
		// If k is zero, k-1 doesn't exist, so we only need to leave it
		// for the caller. The caller can't tell the difference between
		// an empty slot and a non-empty zero, but there's no need
		// to distinguish them anyway: the final suffix array will end up
		// with one zero somewhere, and that will be a real zero.
		k := j - 1
		c1 := text[k]
		if k > 0 {
			if c0 := text[k-1]; c0 < c1 {
				k = -k
			}
		}

		// Switch the cached bucket if this insertion targets a different one,
		// writing the dirty cached position back first.
		if cB != c1 {
			bucket[cB] = b
			cB = c1
			b = bucket[cB]
		}
		sa[b] = int32(k)
		b++
	}
}

// induceS_8_32 inserts S-type text indexes into sa,
// assuming that the L-type indexes are already in sa and that
// the leftmost L-type indexes are negated to mark them for
// processing (as induceL_8_32 leaves them).
// It scans sa from right to left. Each negated entry it visits is
// rewritten to its positive value, and the preceding text index is
// placed at the right end of its character bucket.
// It is the last pass run by sais_8_32.
func induceS_8_32(text []byte, sa, freq, bucket []int32) {
	// Initialize positions for right side of character buckets.
	bucketMax_8_32(text, freq, bucket)
	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below

	// Cache recently used bucket index.
	cB := byte(0)
	b := bucket[cB]

	for i := len(sa) - 1; i >= 0; i-- {
		j := int(sa[i])
		if j >= 0 {
			// Skip non-flagged entry.
			// (This loop can't see an empty entry; 0 means the real zero index.)
			continue
		}

		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
		j = -j
		sa[i] = int32(j)

		// Index j was on work queue (encoded as -j but now decoded),
		// meaning k := j-1 is S-type,
		// so we can now place k correctly into sa.
		// If k-1 is S-type, queue -k for processing later in this loop.
		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
		// If k is zero, k-1 doesn't exist, so we only need to leave it
		// for the caller.
		k := j - 1
		c1 := text[k]
		if k > 0 {
			if c0 := text[k-1]; c0 <= c1 {
				k = -k
			}
		}

		// Switch the cached bucket if this insertion targets a different one,
		// writing the dirty cached position back first.
		if cB != c1 {
			bucket[cB] = b
			cB = c1
			b = bucket[cB]
		}
		b--
		sa[b] = int32(k)
	}
}