...

Source file src/hash/maphash/smhasher_test.go

Documentation: hash/maphash

		 1  // Copyright 2019 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  package maphash
		 6  
		 7  import (
		 8  	"fmt"
		 9  	"math"
		10  	"math/rand"
		11  	"runtime"
		12  	"strings"
		13  	"testing"
		14  	"unsafe"
		15  )
		16  
		17  // Smhasher is a torture test for hash functions.
		18  // https://code.google.com/p/smhasher/
		19  // This code is a port of some of the Smhasher tests to Go.
		20  
		21  var fixedSeed = MakeSeed()
		22  
		23  // Sanity checks.
		24  // hash should not depend on values outside key.
		25  // hash should not depend on alignment.
		26  func TestSmhasherSanity(t *testing.T) {
		27  	r := rand.New(rand.NewSource(1234))
		28  	const REP = 10
		29  	const KEYMAX = 128
		30  	const PAD = 16
		31  	const OFFMAX = 16
		32  	for k := 0; k < REP; k++ {
		33  		for n := 0; n < KEYMAX; n++ {
		34  			for i := 0; i < OFFMAX; i++ {
		35  				var b [KEYMAX + OFFMAX + 2*PAD]byte
		36  				var c [KEYMAX + OFFMAX + 2*PAD]byte
		37  				randBytes(r, b[:])
		38  				randBytes(r, c[:])
		39  				copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
		40  				if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) {
		41  					t.Errorf("hash depends on bytes outside key")
		42  				}
		43  			}
		44  		}
		45  	}
		46  }
		47  
		48  func bytesHash(b []byte) uint64 {
		49  	var h Hash
		50  	h.SetSeed(fixedSeed)
		51  	h.Write(b)
		52  	return h.Sum64()
		53  }
		54  func stringHash(s string) uint64 {
		55  	var h Hash
		56  	h.SetSeed(fixedSeed)
		57  	h.WriteString(s)
		58  	return h.Sum64()
		59  }
		60  
		61  const hashSize = 64
		62  
		63  func randBytes(r *rand.Rand, b []byte) {
		64  	r.Read(b) // can't fail
		65  }
		66  
		67  // A hashSet measures the frequency of hash collisions.
		68  type hashSet struct {
		69  	m map[uint64]struct{} // set of hashes added
		70  	n int								 // number of hashes added
		71  }
		72  
		73  func newHashSet() *hashSet {
		74  	return &hashSet{make(map[uint64]struct{}), 0}
		75  }
		76  func (s *hashSet) add(h uint64) {
		77  	s.m[h] = struct{}{}
		78  	s.n++
		79  }
		80  func (s *hashSet) addS(x string) {
		81  	s.add(stringHash(x))
		82  }
		83  func (s *hashSet) addB(x []byte) {
		84  	s.add(bytesHash(x))
		85  }
		86  func (s *hashSet) addS_seed(x string, seed Seed) {
		87  	var h Hash
		88  	h.SetSeed(seed)
		89  	h.WriteString(x)
		90  	s.add(h.Sum64())
		91  }
		92  func (s *hashSet) check(t *testing.T) {
		93  	const SLOP = 10.0
		94  	collisions := s.n - len(s.m)
		95  	pairs := int64(s.n) * int64(s.n-1) / 2
		96  	expected := float64(pairs) / math.Pow(2.0, float64(hashSize))
		97  	stddev := math.Sqrt(expected)
		98  	if float64(collisions) > expected+SLOP*(3*stddev+1) {
		99  		t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev)
	 100  	}
	 101  }
	 102  
	 103  // a string plus adding zeros must make distinct hashes
	 104  func TestSmhasherAppendedZeros(t *testing.T) {
	 105  	s := "hello" + strings.Repeat("\x00", 256)
	 106  	h := newHashSet()
	 107  	for i := 0; i <= len(s); i++ {
	 108  		h.addS(s[:i])
	 109  	}
	 110  	h.check(t)
	 111  }
	 112  
	 113  // All 0-3 byte strings have distinct hashes.
	 114  func TestSmhasherSmallKeys(t *testing.T) {
	 115  	h := newHashSet()
	 116  	var b [3]byte
	 117  	for i := 0; i < 256; i++ {
	 118  		b[0] = byte(i)
	 119  		h.addB(b[:1])
	 120  		for j := 0; j < 256; j++ {
	 121  			b[1] = byte(j)
	 122  			h.addB(b[:2])
	 123  			if !testing.Short() {
	 124  				for k := 0; k < 256; k++ {
	 125  					b[2] = byte(k)
	 126  					h.addB(b[:3])
	 127  				}
	 128  			}
	 129  		}
	 130  	}
	 131  	h.check(t)
	 132  }
	 133  
	 134  // Different length strings of all zeros have distinct hashes.
	 135  func TestSmhasherZeros(t *testing.T) {
	 136  	N := 256 * 1024
	 137  	if testing.Short() {
	 138  		N = 1024
	 139  	}
	 140  	h := newHashSet()
	 141  	b := make([]byte, N)
	 142  	for i := 0; i <= N; i++ {
	 143  		h.addB(b[:i])
	 144  	}
	 145  	h.check(t)
	 146  }
	 147  
	 148  // Strings with up to two nonzero bytes all have distinct hashes.
	 149  func TestSmhasherTwoNonzero(t *testing.T) {
	 150  	if runtime.GOARCH == "wasm" {
	 151  		t.Skip("Too slow on wasm")
	 152  	}
	 153  	if testing.Short() {
	 154  		t.Skip("Skipping in short mode")
	 155  	}
	 156  	h := newHashSet()
	 157  	for n := 2; n <= 16; n++ {
	 158  		twoNonZero(h, n)
	 159  	}
	 160  	h.check(t)
	 161  }
	 162  func twoNonZero(h *hashSet, n int) {
	 163  	b := make([]byte, n)
	 164  
	 165  	// all zero
	 166  	h.addB(b)
	 167  
	 168  	// one non-zero byte
	 169  	for i := 0; i < n; i++ {
	 170  		for x := 1; x < 256; x++ {
	 171  			b[i] = byte(x)
	 172  			h.addB(b)
	 173  			b[i] = 0
	 174  		}
	 175  	}
	 176  
	 177  	// two non-zero bytes
	 178  	for i := 0; i < n; i++ {
	 179  		for x := 1; x < 256; x++ {
	 180  			b[i] = byte(x)
	 181  			for j := i + 1; j < n; j++ {
	 182  				for y := 1; y < 256; y++ {
	 183  					b[j] = byte(y)
	 184  					h.addB(b)
	 185  					b[j] = 0
	 186  				}
	 187  			}
	 188  			b[i] = 0
	 189  		}
	 190  	}
	 191  }
	 192  
	 193  // Test strings with repeats, like "abcdabcdabcdabcd..."
	 194  func TestSmhasherCyclic(t *testing.T) {
	 195  	if testing.Short() {
	 196  		t.Skip("Skipping in short mode")
	 197  	}
	 198  	r := rand.New(rand.NewSource(1234))
	 199  	const REPEAT = 8
	 200  	const N = 1000000
	 201  	for n := 4; n <= 12; n++ {
	 202  		h := newHashSet()
	 203  		b := make([]byte, REPEAT*n)
	 204  		for i := 0; i < N; i++ {
	 205  			b[0] = byte(i * 79 % 97)
	 206  			b[1] = byte(i * 43 % 137)
	 207  			b[2] = byte(i * 151 % 197)
	 208  			b[3] = byte(i * 199 % 251)
	 209  			randBytes(r, b[4:n])
	 210  			for j := n; j < n*REPEAT; j++ {
	 211  				b[j] = b[j-n]
	 212  			}
	 213  			h.addB(b)
	 214  		}
	 215  		h.check(t)
	 216  	}
	 217  }
	 218  
	 219  // Test strings with only a few bits set
	 220  func TestSmhasherSparse(t *testing.T) {
	 221  	if runtime.GOARCH == "wasm" {
	 222  		t.Skip("Too slow on wasm")
	 223  	}
	 224  	if testing.Short() {
	 225  		t.Skip("Skipping in short mode")
	 226  	}
	 227  	sparse(t, 32, 6)
	 228  	sparse(t, 40, 6)
	 229  	sparse(t, 48, 5)
	 230  	sparse(t, 56, 5)
	 231  	sparse(t, 64, 5)
	 232  	sparse(t, 96, 4)
	 233  	sparse(t, 256, 3)
	 234  	sparse(t, 2048, 2)
	 235  }
	 236  func sparse(t *testing.T, n int, k int) {
	 237  	b := make([]byte, n/8)
	 238  	h := newHashSet()
	 239  	setbits(h, b, 0, k)
	 240  	h.check(t)
	 241  }
	 242  
	 243  // set up to k bits at index i and greater
	 244  func setbits(h *hashSet, b []byte, i int, k int) {
	 245  	h.addB(b)
	 246  	if k == 0 {
	 247  		return
	 248  	}
	 249  	for j := i; j < len(b)*8; j++ {
	 250  		b[j/8] |= byte(1 << uint(j&7))
	 251  		setbits(h, b, j+1, k-1)
	 252  		b[j/8] &= byte(^(1 << uint(j&7)))
	 253  	}
	 254  }
	 255  
	 256  // Test all possible combinations of n blocks from the set s.
	 257  // "permutation" is a bad name here, but it is what Smhasher uses.
	 258  func TestSmhasherPermutation(t *testing.T) {
	 259  	if runtime.GOARCH == "wasm" {
	 260  		t.Skip("Too slow on wasm")
	 261  	}
	 262  	if testing.Short() {
	 263  		t.Skip("Skipping in short mode")
	 264  	}
	 265  	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8)
	 266  	permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8)
	 267  	permutation(t, []uint32{0, 1}, 20)
	 268  	permutation(t, []uint32{0, 1 << 31}, 20)
	 269  	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6)
	 270  }
	 271  func permutation(t *testing.T, s []uint32, n int) {
	 272  	b := make([]byte, n*4)
	 273  	h := newHashSet()
	 274  	genPerm(h, b, s, 0)
	 275  	h.check(t)
	 276  }
	 277  func genPerm(h *hashSet, b []byte, s []uint32, n int) {
	 278  	h.addB(b[:n])
	 279  	if n == len(b) {
	 280  		return
	 281  	}
	 282  	for _, v := range s {
	 283  		b[n] = byte(v)
	 284  		b[n+1] = byte(v >> 8)
	 285  		b[n+2] = byte(v >> 16)
	 286  		b[n+3] = byte(v >> 24)
	 287  		genPerm(h, b, s, n+4)
	 288  	}
	 289  }
	 290  
	 291  type key interface {
	 292  	clear()							// set bits all to 0
	 293  	random(r *rand.Rand) // set key to something random
	 294  	bits() int					 // how many bits key has
	 295  	flipBit(i int)			 // flip bit i of the key
	 296  	hash() uint64				// hash the key
	 297  	name() string				// for error reporting
	 298  }
	 299  
	 300  type bytesKey struct {
	 301  	b []byte
	 302  }
	 303  
	 304  func (k *bytesKey) clear() {
	 305  	for i := range k.b {
	 306  		k.b[i] = 0
	 307  	}
	 308  }
	 309  func (k *bytesKey) random(r *rand.Rand) {
	 310  	randBytes(r, k.b)
	 311  }
	 312  func (k *bytesKey) bits() int {
	 313  	return len(k.b) * 8
	 314  }
	 315  func (k *bytesKey) flipBit(i int) {
	 316  	k.b[i>>3] ^= byte(1 << uint(i&7))
	 317  }
	 318  func (k *bytesKey) hash() uint64 {
	 319  	return bytesHash(k.b)
	 320  }
	 321  func (k *bytesKey) name() string {
	 322  	return fmt.Sprintf("bytes%d", len(k.b))
	 323  }
	 324  
	 325  // Flipping a single bit of a key should flip each output bit with 50% probability.
	 326  func TestSmhasherAvalanche(t *testing.T) {
	 327  	if runtime.GOARCH == "wasm" {
	 328  		t.Skip("Too slow on wasm")
	 329  	}
	 330  	if testing.Short() {
	 331  		t.Skip("Skipping in short mode")
	 332  	}
	 333  	avalancheTest1(t, &bytesKey{make([]byte, 2)})
	 334  	avalancheTest1(t, &bytesKey{make([]byte, 4)})
	 335  	avalancheTest1(t, &bytesKey{make([]byte, 8)})
	 336  	avalancheTest1(t, &bytesKey{make([]byte, 16)})
	 337  	avalancheTest1(t, &bytesKey{make([]byte, 32)})
	 338  	avalancheTest1(t, &bytesKey{make([]byte, 200)})
	 339  }
	 340  func avalancheTest1(t *testing.T, k key) {
	 341  	const REP = 100000
	 342  	r := rand.New(rand.NewSource(1234))
	 343  	n := k.bits()
	 344  
	 345  	// grid[i][j] is a count of whether flipping
	 346  	// input bit i affects output bit j.
	 347  	grid := make([][hashSize]int, n)
	 348  
	 349  	for z := 0; z < REP; z++ {
	 350  		// pick a random key, hash it
	 351  		k.random(r)
	 352  		h := k.hash()
	 353  
	 354  		// flip each bit, hash & compare the results
	 355  		for i := 0; i < n; i++ {
	 356  			k.flipBit(i)
	 357  			d := h ^ k.hash()
	 358  			k.flipBit(i)
	 359  
	 360  			// record the effects of that bit flip
	 361  			g := &grid[i]
	 362  			for j := 0; j < hashSize; j++ {
	 363  				g[j] += int(d & 1)
	 364  				d >>= 1
	 365  			}
	 366  		}
	 367  	}
	 368  
	 369  	// Each entry in the grid should be about REP/2.
	 370  	// More precisely, we did N = k.bits() * hashSize experiments where
	 371  	// each is the sum of REP coin flips. We want to find bounds on the
	 372  	// sum of coin flips such that a truly random experiment would have
	 373  	// all sums inside those bounds with 99% probability.
	 374  	N := n * hashSize
	 375  	var c float64
	 376  	// find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999
	 377  	for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 {
	 378  	}
	 379  	c *= 4.0 // allowed slack - we don't need to be perfectly random
	 380  	mean := .5 * REP
	 381  	stddev := .5 * math.Sqrt(REP)
	 382  	low := int(mean - c*stddev)
	 383  	high := int(mean + c*stddev)
	 384  	for i := 0; i < n; i++ {
	 385  		for j := 0; j < hashSize; j++ {
	 386  			x := grid[i][j]
	 387  			if x < low || x > high {
	 388  				t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP)
	 389  			}
	 390  		}
	 391  	}
	 392  }
	 393  
	 394  // All bit rotations of a set of distinct keys
	 395  func TestSmhasherWindowed(t *testing.T) {
	 396  	windowed(t, &bytesKey{make([]byte, 128)})
	 397  }
	 398  func windowed(t *testing.T, k key) {
	 399  	if runtime.GOARCH == "wasm" {
	 400  		t.Skip("Too slow on wasm")
	 401  	}
	 402  	if testing.Short() {
	 403  		t.Skip("Skipping in short mode")
	 404  	}
	 405  	const BITS = 16
	 406  
	 407  	for r := 0; r < k.bits(); r++ {
	 408  		h := newHashSet()
	 409  		for i := 0; i < 1<<BITS; i++ {
	 410  			k.clear()
	 411  			for j := 0; j < BITS; j++ {
	 412  				if i>>uint(j)&1 != 0 {
	 413  					k.flipBit((j + r) % k.bits())
	 414  				}
	 415  			}
	 416  			h.add(k.hash())
	 417  		}
	 418  		h.check(t)
	 419  	}
	 420  }
	 421  
	 422  // All keys of the form prefix + [A-Za-z0-9]*N + suffix.
	 423  func TestSmhasherText(t *testing.T) {
	 424  	if testing.Short() {
	 425  		t.Skip("Skipping in short mode")
	 426  	}
	 427  	text(t, "Foo", "Bar")
	 428  	text(t, "FooBar", "")
	 429  	text(t, "", "FooBar")
	 430  }
	 431  func text(t *testing.T, prefix, suffix string) {
	 432  	const N = 4
	 433  	const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789"
	 434  	const L = len(S)
	 435  	b := make([]byte, len(prefix)+N+len(suffix))
	 436  	copy(b, prefix)
	 437  	copy(b[len(prefix)+N:], suffix)
	 438  	h := newHashSet()
	 439  	c := b[len(prefix):]
	 440  	for i := 0; i < L; i++ {
	 441  		c[0] = S[i]
	 442  		for j := 0; j < L; j++ {
	 443  			c[1] = S[j]
	 444  			for k := 0; k < L; k++ {
	 445  				c[2] = S[k]
	 446  				for x := 0; x < L; x++ {
	 447  					c[3] = S[x]
	 448  					h.addB(b)
	 449  				}
	 450  			}
	 451  		}
	 452  	}
	 453  	h.check(t)
	 454  }
	 455  
	 456  // Make sure different seed values generate different hashes.
	 457  func TestSmhasherSeed(t *testing.T) {
	 458  	if unsafe.Sizeof(uintptr(0)) == 4 {
	 459  		t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)")
	 460  	}
	 461  	h := newHashSet()
	 462  	const N = 100000
	 463  	s := "hello"
	 464  	for i := 0; i < N; i++ {
	 465  		h.addS_seed(s, Seed{s: uint64(i + 1)})
	 466  		h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used
	 467  	}
	 468  	h.check(t)
	 469  }
	 470  

View as plain text