...

Source file src/runtime/mbitmap.go

		 1  // Copyright 2009 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Garbage collector: type and heap bitmaps.
		 6  //
		 7  // Stack, data, and bss bitmaps
		 8  //
		 9  // Stack frames and global variables in the data and bss sections are
		10  // described by bitmaps with 1 bit per pointer-sized word. A "1" bit
		11  // means the word is a live pointer to be visited by the GC (referred to
		12  // as "pointer"). A "0" bit means the word should be ignored by GC
		13  // (referred to as "scalar", though it could be a dead pointer value).
		14  //
		15  // Heap bitmap
		16  //
		17  // The heap bitmap comprises 2 bits for each pointer-sized word in the heap,
		18  // stored in the heapArena metadata backing each heap arena.
		19  // That is, if ha is the heapArena for the arena starting at start,
		20  // then ha.bitmap[0] holds the 2-bit entries for the four words start
		21  // through start+3*ptrSize, ha.bitmap[1] holds the entries for
		22  // start+4*ptrSize through start+7*ptrSize, and so on.
		23  //
		24  // In each 2-bit entry, the lower bit is a pointer/scalar bit, just
		25  // like in the stack/data bitmaps described above. The upper bit
		26  // indicates scan/dead: a "1" value ("scan") indicates that there may
		27  // be pointers in later words of the allocation, and a "0" value
		28  // ("dead") indicates there are no more pointers in the allocation. If
		29  // the upper bit is 0, the lower bit must also be 0, and this
		30  // indicates scanning can ignore the rest of the allocation.
		31  //
		32  // The 2-bit entries are split when written into the byte, so that the top half
		33  // of the byte contains 4 high (scan) bits and the bottom half contains 4 low
		34  // (pointer) bits. This form allows a copy from the 1-bit to the 4-bit form to
		35  // keep the pointer bits contiguous, instead of having to space them out.
		36  //
		37  // The code makes use of the fact that the zero value for a heap
		38  // bitmap means scalar/dead. This property must be preserved when
		39  // modifying the encoding.
		40  //
		41  // The bitmap for noscan spans is not maintained. Code must ensure
		42  // that an object is scannable before consulting its bitmap by
		43  // checking either the noscan bit in the span or by consulting its
		44  // type's information.
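
An illustrative, standalone sketch (not part of mbitmap.go; all names are hypothetical) of the split encoding described above: each bitmap byte covers four heap words, with entry i keeping its pointer bit at bit i of the low nibble and its scan bit at bit i+4 of the high nibble.

package main

import "fmt"

// decodeEntry extracts the pointer and scan bits for word i (0..3)
// of the four words covered by one heap bitmap byte.
func decodeEntry(b uint8, i uint) (isPointer, scan bool) {
	isPointer = b>>i&1 != 0 // low nibble: pointer/scalar bit
	scan = b>>(i+4)&1 != 0  // high nibble: scan/dead bit
	return
}

func main() {
	// 0x31 = 0011_0001: word 0 is a pointer, words 0 and 1 are "scan",
	// words 2 and 3 are scalar/dead.
	for i := uint(0); i < 4; i++ {
		p, s := decodeEntry(0x31, i)
		fmt.Printf("word %d: pointer=%v scan=%v\n", i, p, s)
	}
}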
		45  
		46  package runtime
		47  
		48  import (
		49  	"runtime/internal/atomic"
		50  	"runtime/internal/sys"
		51  	"unsafe"
		52  )
		53  
		54  const (
		55  	bitPointer = 1 << 0
		56  	bitScan		= 1 << 4
		57  
		58  	heapBitsShift			= 1		 // shift offset between successive bitPointer or bitScan entries
		59  	wordsPerBitmapByte = 8 / 2 // heap words described by one bitmap byte
		60  
		61  	// all scan/pointer bits in a byte
		62  	bitScanAll		= bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
		63  	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
		64  )
		65  
		66  // addb returns the byte pointer p+n.
		67  //go:nowritebarrier
		68  //go:nosplit
		69  func addb(p *byte, n uintptr) *byte {
		70  	// Note: wrote out full expression instead of calling add(p, n)
		71  	// to reduce the number of temporaries generated by the
		72  	// compiler for this trivial expression during inlining.
		73  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
		74  }
		75  
		76  // subtractb returns the byte pointer p-n.
		77  //go:nowritebarrier
		78  //go:nosplit
		79  func subtractb(p *byte, n uintptr) *byte {
		80  	// Note: wrote out full expression instead of calling add(p, -n)
		81  	// to reduce the number of temporaries generated by the
		82  	// compiler for this trivial expression during inlining.
		83  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
		84  }
		85  
		86  // add1 returns the byte pointer p+1.
		87  //go:nowritebarrier
		88  //go:nosplit
		89  func add1(p *byte) *byte {
		90  	// Note: wrote out full expression instead of calling addb(p, 1)
		91  	// to reduce the number of temporaries generated by the
		92  	// compiler for this trivial expression during inlining.
		93  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
		94  }
		95  
		96  // subtract1 returns the byte pointer p-1.
		97  //go:nowritebarrier
		98  //
		99  // nosplit because it is used during write barriers and must not be preempted.
	 100  //go:nosplit
	 101  func subtract1(p *byte) *byte {
	 102  	// Note: wrote out full expression instead of calling subtractb(p, 1)
	 103  	// to reduce the number of temporaries generated by the
	 104  	// compiler for this trivial expression during inlining.
	 105  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
	 106  }
	 107  
	 108  // heapBits provides access to the bitmap bits for a single heap word.
	 109  // The methods on heapBits take value receivers so that the compiler
	 110  // can more easily inline calls to those methods and registerize the
	 111  // struct fields independently.
	 112  type heapBits struct {
	 113  	bitp	*uint8
	 114  	shift uint32
	 115  	arena uint32 // Index of heap arena containing bitp
	 116  	last	*uint8 // Last byte of the arena's bitmap
	 117  }
	 118  
	 119  // Make the compiler check that heapBits.arena is large enough to hold
	 120  // the maximum arena frame number.
	 121  var _ = heapBits{arena: (1<<heapAddrBits)/heapArenaBytes - 1}
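
The blank declaration above is a compile-time assertion: the composite literal is a constant expression, so the build fails if the largest arena index stops fitting in the uint32 arena field. A minimal sketch of the same idiom, with hypothetical names:

package main

type counter struct{ idx uint8 }

const maxIdx = 200 // hypothetical limit

// Compilation fails ("constant ... overflows uint8") if maxIdx ever
// grows beyond what the idx field can hold.
var _ = counter{idx: maxIdx}

func main() {}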
	 122  
	 123  // markBits provides access to the mark bit for an object in the heap.
	 124  // bytep points to the byte holding the mark bit.
	 125  // mask is a byte with a single bit set that can be &ed with *bytep
	 126  // to see if the bit has been set.
	 127  // *m.bytep&m.mask != 0 indicates the mark bit is set.
	 128  // index can be used along with span information to generate
	 129  // the address of the object in the heap.
	 130  // We maintain one set of mark bits for allocation and one for
	 131  // marking purposes.
	 132  type markBits struct {
	 133  	bytep *uint8
	 134  	mask	uint8
	 135  	index uintptr
	 136  }
	 137  
	 138  //go:nosplit
	 139  func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
	 140  	bytep, mask := s.allocBits.bitp(allocBitIndex)
	 141  	return markBits{bytep, mask, allocBitIndex}
	 142  }
	 143  
	 144  // refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
	 145  // and negates them so that ctz (count trailing zeros) instructions
	 146  // can be used. It then places these 8 bytes into the cached 64 bit
	 147  // s.allocCache.
	 148  func (s *mspan) refillAllocCache(whichByte uintptr) {
	 149  	bytes := (*[8]uint8)(unsafe.Pointer(s.allocBits.bytep(whichByte)))
	 150  	aCache := uint64(0)
	 151  	aCache |= uint64(bytes[0])
	 152  	aCache |= uint64(bytes[1]) << (1 * 8)
	 153  	aCache |= uint64(bytes[2]) << (2 * 8)
	 154  	aCache |= uint64(bytes[3]) << (3 * 8)
	 155  	aCache |= uint64(bytes[4]) << (4 * 8)
	 156  	aCache |= uint64(bytes[5]) << (5 * 8)
	 157  	aCache |= uint64(bytes[6]) << (6 * 8)
	 158  	aCache |= uint64(bytes[7]) << (7 * 8)
	 159  	s.allocCache = ^aCache
	 160  }
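
A standalone sketch of the trick used here and in nextFreeIndex below, with hypothetical values and math/bits.TrailingZeros64 standing in for sys.Ctz64: because the cached bits are the complement of the alloc bits, a count-trailing-zeros finds the next free object directly.

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// Alloc bits for 8 objects: bit i set means object i is allocated.
	allocBits := uint8(0b0001_0111) // objects 0, 1, 2 and 4 allocated

	// Cache the complement so free slots become 1 bits.
	cache := ^uint64(allocBits)

	// Trailing zeros of the complement = index of the first free object.
	fmt.Println(bits.TrailingZeros64(cache)) // 3

	// Consuming that slot and shifting, as nextFreeIndex does, exposes
	// the next free index relative to the new position.
	cache >>= 3 + 1
	fmt.Println(bits.TrailingZeros64(cache)) // 1, i.e. object 5 is free
}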
	 161  
	 162  // nextFreeIndex returns the index of the next free object in s at
	 163  // or after s.freeindex.
	 164  // There are hardware instructions that can be used to make this
	 165  // faster if profiling warrants it.
	 166  func (s *mspan) nextFreeIndex() uintptr {
	 167  	sfreeindex := s.freeindex
	 168  	snelems := s.nelems
	 169  	if sfreeindex == snelems {
	 170  		return sfreeindex
	 171  	}
	 172  	if sfreeindex > snelems {
	 173  		throw("s.freeindex > s.nelems")
	 174  	}
	 175  
	 176  	aCache := s.allocCache
	 177  
	 178  	bitIndex := sys.Ctz64(aCache)
	 179  	for bitIndex == 64 {
	 180  		// Move index to start of next cached bits.
	 181  		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
	 182  		if sfreeindex >= snelems {
	 183  			s.freeindex = snelems
	 184  			return snelems
	 185  		}
	 186  		whichByte := sfreeindex / 8
	 187  		// Refill s.allocCache with the next 64 alloc bits.
	 188  		s.refillAllocCache(whichByte)
	 189  		aCache = s.allocCache
	 190  		bitIndex = sys.Ctz64(aCache)
	 191  		// nothing available in cached bits
	 192  		// grab the next 8 bytes and try again.
	 193  	}
	 194  	result := sfreeindex + uintptr(bitIndex)
	 195  	if result >= snelems {
	 196  		s.freeindex = snelems
	 197  		return snelems
	 198  	}
	 199  
	 200  	s.allocCache >>= uint(bitIndex + 1)
	 201  	sfreeindex = result + 1
	 202  
	 203  	if sfreeindex%64 == 0 && sfreeindex != snelems {
	 204  		// We just incremented s.freeindex so it isn't 0.
	 205  		// As each 1 in s.allocCache was encountered and used for allocation
	 206  		// it was shifted away. At this point s.allocCache contains all 0s.
	 207  		// Refill s.allocCache so that it corresponds
	 208  		// to the bits at s.allocBits starting at s.freeindex.
	 209  		whichByte := sfreeindex / 8
	 210  		s.refillAllocCache(whichByte)
	 211  	}
	 212  	s.freeindex = sfreeindex
	 213  	return result
	 214  }
	 215  
	 216  // isFree reports whether the index'th object in s is unallocated.
	 217  //
	 218  // The caller must ensure s.state is mSpanInUse, and there must have
	 219  // been no preemption points since ensuring this (which could allow a
	 220  // GC transition, which would allow the state to change).
	 221  func (s *mspan) isFree(index uintptr) bool {
	 222  	if index < s.freeindex {
	 223  		return false
	 224  	}
	 225  	bytep, mask := s.allocBits.bitp(index)
	 226  	return *bytep&mask == 0
	 227  }
	 228  
	 229  // divideByElemSize returns n/s.elemsize.
	 230  // n must be within [0, s.npages*_PageSize),
	 231  // or may be exactly s.npages*_PageSize
	 232  // if s.elemsize is from sizeclasses.go.
	 233  func (s *mspan) divideByElemSize(n uintptr) uintptr {
	 234  	const doubleCheck = false
	 235  
	 236  	// See explanation in mksizeclasses.go's computeDivMagic.
	 237  	q := uintptr((uint64(n) * uint64(s.divMul)) >> 32)
	 238  
	 239  	if doubleCheck && q != n/s.elemsize {
	 240  		println(n, "/", s.elemsize, "should be", n/s.elemsize, "but got", q)
	 241  		throw("bad magic division")
	 242  	}
	 243  	return q
	 244  }
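
The divMul multiplier comes from the size class tables generated by mksizeclasses.go (computeDivMagic); the idea is reciprocal multiplication: with m roughly ceil(2^32/elemsize), (n*m)>>32 equals n/elemsize for every offset that can occur inside a span. A standalone sketch that checks this property for one hypothetical size class:

package main

import "fmt"

func main() {
	const elemsize = 48    // hypothetical size class
	const spanBytes = 8192 // one-page span

	divMul := uint32((1<<32 + elemsize - 1) / elemsize) // ceil(2^32 / elemsize)

	for n := uintptr(0); n <= spanBytes; n++ {
		q := uintptr((uint64(n) * uint64(divMul)) >> 32)
		if q != n/elemsize {
			fmt.Println("mismatch at", n)
			return
		}
	}
	fmt.Println("magic multiply matches n/elemsize for every offset")
}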
	 245  
	 246  func (s *mspan) objIndex(p uintptr) uintptr {
	 247  	return s.divideByElemSize(p - s.base())
	 248  }
	 249  
	 250  func markBitsForAddr(p uintptr) markBits {
	 251  	s := spanOf(p)
	 252  	objIndex := s.objIndex(p)
	 253  	return s.markBitsForIndex(objIndex)
	 254  }
	 255  
	 256  func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
	 257  	bytep, mask := s.gcmarkBits.bitp(objIndex)
	 258  	return markBits{bytep, mask, objIndex}
	 259  }
	 260  
	 261  func (s *mspan) markBitsForBase() markBits {
	 262  	return markBits{(*uint8)(s.gcmarkBits), uint8(1), 0}
	 263  }
	 264  
	 265  // isMarked reports whether mark bit m is set.
	 266  func (m markBits) isMarked() bool {
	 267  	return *m.bytep&m.mask != 0
	 268  }
	 269  
	 270  // setMarked sets the marked bit in the markbits, atomically.
	 271  func (m markBits) setMarked() {
	 272  	// Might be racing with other updates, so use atomic update always.
	 273  	// We used to be clever here and use a non-atomic update in certain
	 274  	// cases, but it's not worth the risk.
	 275  	atomic.Or8(m.bytep, m.mask)
	 276  }
	 277  
	 278  // setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
	 279  func (m markBits) setMarkedNonAtomic() {
	 280  	*m.bytep |= m.mask
	 281  }
	 282  
	 283  // clearMarked clears the marked bit in the markbits, atomically.
	 284  func (m markBits) clearMarked() {
	 285  	// Might be racing with other updates, so use atomic update always.
	 286  	// We used to be clever here and use a non-atomic update in certain
	 287  	// cases, but it's not worth the risk.
	 288  	atomic.And8(m.bytep, ^m.mask)
	 289  }
	 290  
	 291  // markBitsForSpan returns the markBits for the span base address base.
	 292  func markBitsForSpan(base uintptr) (mbits markBits) {
	 293  	mbits = markBitsForAddr(base)
	 294  	if mbits.mask != 1 {
	 295  		throw("markBitsForSpan: unaligned start")
	 296  	}
	 297  	return mbits
	 298  }
	 299  
	 300  // advance advances the markBits to the next object in the span.
	 301  func (m *markBits) advance() {
	 302  	if m.mask == 1<<7 {
	 303  		m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
	 304  		m.mask = 1
	 305  	} else {
	 306  		m.mask = m.mask << 1
	 307  	}
	 308  	m.index++
	 309  }
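
A standalone sketch of how markBitsForBase and advance walk a span's mark bitmap, one bit per object, moving to the next bitmap byte every eight objects. The bitmap contents and object count are hypothetical, and an index replaces the runtime's pointer arithmetic.

package main

import "fmt"

func main() {
	// One mark bit per object: bit i%8 of byte i/8 belongs to object i.
	gcmarkBits := []uint8{0b1010_0001, 0b0000_0010}
	nelems := 10

	bytep, mask := &gcmarkBits[0], uint8(1) // markBitsForBase
	byteIdx := 0
	for i := 0; i < nelems; i++ {
		if *bytep&mask != 0 { // isMarked
			fmt.Println("object", i, "is marked")
		}
		// advance
		if mask == 1<<7 {
			byteIdx++
			bytep = &gcmarkBits[byteIdx]
			mask = 1
		} else {
			mask <<= 1
		}
	}
}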
	 310  
	 311  // heapBitsForAddr returns the heapBits for the address addr.
	 312  // The caller must ensure addr is in an allocated span.
	 313  // In particular, be careful not to point past the end of an object.
	 314  //
	 315  // nosplit because it is used during write barriers and must not be preempted.
	 316  //go:nosplit
	 317  func heapBitsForAddr(addr uintptr) (h heapBits) {
	 318  	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
	 319  	arena := arenaIndex(addr)
	 320  	ha := mheap_.arenas[arena.l1()][arena.l2()]
	 321  	// The compiler uses a load for nil checking ha, but in this
	 322  	// case we'll almost never hit that cache line again, so it
	 323  	// makes more sense to do a value check.
	 324  	if ha == nil {
	 325  		// addr is not in the heap. Return nil heapBits, which
	 326  		// we expect to crash in the caller.
	 327  		return
	 328  	}
	 329  	h.bitp = &ha.bitmap[(addr/(sys.PtrSize*4))%heapArenaBitmapBytes]
	 330  	h.shift = uint32((addr / sys.PtrSize) & 3)
	 331  	h.arena = uint32(arena)
	 332  	h.last = &ha.bitmap[len(ha.bitmap)-1]
	 333  	return
	 334  }
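
The index arithmetic above packs four words per bitmap byte: within an arena, the byte index is (addr / (ptrSize*4)) modulo heapArenaBitmapBytes, and the shift selects one of that byte's four entries. A standalone sketch with hypothetical 64-bit values (64 MiB arenas, as on linux/amd64):

package main

import "fmt"

func main() {
	const (
		ptrSize              = 8
		heapArenaBytes       = 64 << 20                       // 64 MiB arenas
		heapArenaBitmapBytes = heapArenaBytes / (ptrSize * 4) // 2 bits per word
	)

	addr := uintptr(0xc000010038) // some pointer-aligned heap address
	byteIdx := (addr / (ptrSize * 4)) % heapArenaBitmapBytes
	shift := (addr / ptrSize) & 3

	fmt.Printf("bitmap byte %d, entry %d within that byte\n", byteIdx, shift)
}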
	 335  
	 336  // clobberdeadPtr is a special value that is used by the compiler to
	 337  // clobber dead stack slots, when -clobberdead flag is set.
	 338  const clobberdeadPtr = uintptr(0xdeaddead | 0xdeaddead<<((^uintptr(0)>>63)*32))
	 339  
	 340  // badPointer throws a fatal error reporting a bad pointer found in the Go heap.
	 341  func badPointer(s *mspan, p, refBase, refOff uintptr) {
	 342  	// Typically this indicates an incorrect use
	 343  	// of unsafe or cgo to store a bad pointer in
	 344  	// the Go heap. It may also indicate a runtime
	 345  	// bug.
	 346  	//
	 347  	// TODO(austin): We could be more aggressive
	 348  	// and detect pointers to unallocated objects
	 349  	// in allocated spans.
	 350  	printlock()
	 351  	print("runtime: pointer ", hex(p))
	 352  	if s != nil {
	 353  		state := s.state.get()
	 354  		if state != mSpanInUse {
	 355  			print(" to unallocated span")
	 356  		} else {
	 357  			print(" to unused region of span")
	 358  		}
	 359  		print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", state)
	 360  	}
	 361  	print("\n")
	 362  	if refBase != 0 {
	 363  		print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
	 364  		gcDumpObject("object", refBase, refOff)
	 365  	}
	 366  	getg().m.traceback = 2
	 367  	throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
	 368  }
	 369  
	 370  // findObject returns the base address for the heap object containing
	 371  // the address p, the object's span, and the index of the object in s.
	 372  // If p does not point into a heap object, it returns base == 0.
	 373  //
	 374  // If p is an invalid heap pointer and debug.invalidptr != 0,
	 375  // findObject panics.
	 376  //
	 377  // refBase and refOff optionally give the base address of the object
	 378  // in which the pointer p was found and the byte offset at which it
	 379  // was found. These are used for error reporting.
	 380  //
	 381  // It is nosplit so it is safe for p to be a pointer to the current goroutine's stack.
	 382  // Since p is a uintptr, it would not be adjusted if the stack were to move.
	 383  //go:nosplit
	 384  func findObject(p, refBase, refOff uintptr) (base uintptr, s *mspan, objIndex uintptr) {
	 385  	s = spanOf(p)
	 386  	// If s is nil, the virtual address has never been part of the heap.
	 387  	// This pointer may be to some mmap'd region, so we allow it.
	 388  	if s == nil {
	 389  		if GOARCH == "amd64" && p == clobberdeadPtr && debug.invalidptr != 0 {
	 390  			// Crash if clobberdeadPtr is seen. Only on AMD64 for now, as
	 391  			// it is the only platform where compiler's clobberdead mode is
	 392  			// implemented. On AMD64 clobberdeadPtr cannot be a valid address.
	 393  			badPointer(s, p, refBase, refOff)
	 394  		}
	 395  		return
	 396  	}
	 397  	// If p is a bad pointer, it may not be in s's bounds.
	 398  	//
	 399  	// Check s.state to synchronize with span initialization
	 400  	// before checking other fields. See also spanOfHeap.
	 401  	if state := s.state.get(); state != mSpanInUse || p < s.base() || p >= s.limit {
	 402  		// Pointers into stacks are also ok, the runtime manages these explicitly.
	 403  		if state == mSpanManual {
	 404  			return
	 405  		}
	 406  		// The following ensures that we are rigorous about what data
	 407  		// structures hold valid pointers.
	 408  		if debug.invalidptr != 0 {
	 409  			badPointer(s, p, refBase, refOff)
	 410  		}
	 411  		return
	 412  	}
	 413  
	 414  	objIndex = s.objIndex(p)
	 415  	base = s.base() + objIndex*s.elemsize
	 416  	return
	 417  }
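
Putting objIndex and the span layout together: once the containing span is known, the object's base comes back from one multiply. A tiny standalone sketch with hypothetical numbers, using a plain division where the runtime uses divideByElemSize:

package main

import "fmt"

func main() {
	spanBase := uintptr(0xc000010000) // hypothetical span base
	elemsize := uintptr(48)           // hypothetical size class

	p := spanBase + 100 // interior pointer into the span
	objIndex := (p - spanBase) / elemsize
	base := spanBase + objIndex*elemsize

	fmt.Println(objIndex, base-spanBase) // 2 96: p points into object 2
}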
	 418  
	 419  // next returns the heapBits describing the next pointer-sized word in memory.
	 420  // That is, if h describes address p, h.next() describes p+ptrSize.
	 421  // Note that next does not modify h. The caller must record the result.
	 422  //
	 423  // nosplit because it is used during write barriers and must not be preempted.
	 424  //go:nosplit
	 425  func (h heapBits) next() heapBits {
	 426  	if h.shift < 3*heapBitsShift {
	 427  		h.shift += heapBitsShift
	 428  	} else if h.bitp != h.last {
	 429  		h.bitp, h.shift = add1(h.bitp), 0
	 430  	} else {
	 431  		// Move to the next arena.
	 432  		return h.nextArena()
	 433  	}
	 434  	return h
	 435  }
	 436  
	 437  // nextArena advances h to the beginning of the next heap arena.
	 438  //
	 439  // This is a slow-path helper to next. gc's inliner knows that
	 440  // heapBits.next can be inlined even though it calls this. This is
	 441  // marked noinline so it doesn't get inlined into next and cause next
	 442  // to be too big to inline.
	 443  //
	 444  //go:nosplit
	 445  //go:noinline
	 446  func (h heapBits) nextArena() heapBits {
	 447  	h.arena++
	 448  	ai := arenaIdx(h.arena)
	 449  	l2 := mheap_.arenas[ai.l1()]
	 450  	if l2 == nil {
	 451  		// We just passed the end of the object, which
	 452  		// was also the end of the heap. Poison h. It
	 453  		// should never be dereferenced at this point.
	 454  		return heapBits{}
	 455  	}
	 456  	ha := l2[ai.l2()]
	 457  	if ha == nil {
	 458  		return heapBits{}
	 459  	}
	 460  	h.bitp, h.shift = &ha.bitmap[0], 0
	 461  	h.last = &ha.bitmap[len(ha.bitmap)-1]
	 462  	return h
	 463  }
	 464  
	 465  // forward returns the heapBits describing n pointer-sized words ahead of h in memory.
	 466  // That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
	 467  // h.forward(1) is equivalent to h.next(), just slower.
	 468  // Note that forward does not modify h. The caller must record the result.
	 469  // bits returns the heap bits for the current word.
	 470  //go:nosplit
	 471  func (h heapBits) forward(n uintptr) heapBits {
	 472  	n += uintptr(h.shift) / heapBitsShift
	 473  	nbitp := uintptr(unsafe.Pointer(h.bitp)) + n/4
	 474  	h.shift = uint32(n%4) * heapBitsShift
	 475  	if nbitp <= uintptr(unsafe.Pointer(h.last)) {
	 476  		h.bitp = (*uint8)(unsafe.Pointer(nbitp))
	 477  		return h
	 478  	}
	 479  
	 480  	// We're in a new heap arena.
	 481  	past := nbitp - (uintptr(unsafe.Pointer(h.last)) + 1)
	 482  	h.arena += 1 + uint32(past/heapArenaBitmapBytes)
	 483  	ai := arenaIdx(h.arena)
	 484  	if l2 := mheap_.arenas[ai.l1()]; l2 != nil && l2[ai.l2()] != nil {
	 485  		a := l2[ai.l2()]
	 486  		h.bitp = &a.bitmap[past%heapArenaBitmapBytes]
	 487  		h.last = &a.bitmap[len(a.bitmap)-1]
	 488  	} else {
	 489  		h.bitp, h.last = nil, nil
	 490  	}
	 491  	return h
	 492  }
	 493  
	 494  // forwardOrBoundary is like forward, but stops at boundaries between
	 495  // contiguous sections of the bitmap. It returns the number of words
	 496  // advanced over, which will be <= n.
	 497  func (h heapBits) forwardOrBoundary(n uintptr) (heapBits, uintptr) {
	 498  	maxn := 4 * ((uintptr(unsafe.Pointer(h.last)) + 1) - uintptr(unsafe.Pointer(h.bitp)))
	 499  	if n > maxn {
	 500  		n = maxn
	 501  	}
	 502  	return h.forward(n), n
	 503  }
	 504  
	 505  // The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
	 506  // The result includes in its higher bits the bits for subsequent words
	 507  // described by the same bitmap byte.
	 508  //
	 509  // nosplit because it is used during write barriers and must not be preempted.
	 510  //go:nosplit
	 511  func (h heapBits) bits() uint32 {
	 512  	// The (shift & 31) eliminates a test and conditional branch
	 513  	// from the generated code.
	 514  	return uint32(*h.bitp) >> (h.shift & 31)
	 515  }
	 516  
	 517  // morePointers reports whether this word or any later word in this object
	 518  // may contain a pointer (that is, whether the scan bit is set).
	 519  // h must not describe the second word of the object.
	 520  func (h heapBits) morePointers() bool {
	 521  	return h.bits()&bitScan != 0
	 522  }
	 523  
	 524  // isPointer reports whether the heap bits describe a pointer word.
	 525  //
	 526  // nosplit because it is used during write barriers and must not be preempted.
	 527  //go:nosplit
	 528  func (h heapBits) isPointer() bool {
	 529  	return h.bits()&bitPointer != 0
	 530  }
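
A standalone sketch, in the spirit of scanobject, of how a scanner consumes these two queries: visit pointer words and stop as soon as the scan bit says the rest of the object is dead. The bit is not meaningful at word 1, so that word is never used to terminate. The bitmap layout follows the split-nibble encoding from the top of this file; all names here are hypothetical.

package main

import "fmt"

// scanObjectWords reports the word indexes holding pointers, reading a
// 2-bits-per-word bitmap packed four entries per byte (low nibble =
// pointer bits, high nibble = scan bits).
func scanObjectWords(bitmap []uint8, nwords int) []int {
	var ptrs []int
	for i := 0; i < nwords; i++ {
		entry := bitmap[i/4] >> uint(i%4)
		if i != 1 && entry&0x10 == 0 {
			break // scan bit clear: no pointers here or later
		}
		if entry&0x01 != 0 {
			ptrs = append(ptrs, i)
		}
	}
	return ptrs
}

func main() {
	// A 6-word object with pointers at words 0 and 2 and nothing after.
	bitmap := []uint8{0b0111_0101, 0b0000_0000}
	fmt.Println(scanObjectWords(bitmap, 6)) // [0 2]
}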
	 531  
	 532  // bulkBarrierPreWrite executes a write barrier
	 533  // for every pointer slot in the memory range [src, src+size),
	 534  // using pointer/scalar information from [dst, dst+size).
	 535  // This executes the write barriers necessary before a memmove.
	 536  // src, dst, and size must be pointer-aligned.
	 537  // The range [dst, dst+size) must lie within a single object.
	 538  // It does not perform the actual writes.
	 539  //
	 540  // As a special case, src == 0 indicates that this is being used for a
	 541  // memclr. bulkBarrierPreWrite will pass 0 for the src of each write
	 542  // barrier.
	 543  //
	 544  // Callers should call bulkBarrierPreWrite immediately before
	 545  // calling memmove(dst, src, size). This function is marked nosplit
	 546  // to avoid being preempted; the GC must not stop the goroutine
	 547  // between the memmove and the execution of the barriers.
	 548  // The caller is also responsible for cgo pointer checks if this
	 549  // may be writing Go pointers into non-Go memory.
	 550  //
	 551  // The pointer bitmap is not maintained for allocations containing
	 552  // no pointers at all; any caller of bulkBarrierPreWrite must first
	 553  // make sure the underlying allocation contains pointers, usually
	 554  // by checking typ.ptrdata.
	 555  //
	 556  // Callers must perform cgo checks if writeBarrier.cgo.
	 557  //
	 558  //go:nosplit
	 559  func bulkBarrierPreWrite(dst, src, size uintptr) {
	 560  	if (dst|src|size)&(sys.PtrSize-1) != 0 {
	 561  		throw("bulkBarrierPreWrite: unaligned arguments")
	 562  	}
	 563  	if !writeBarrier.needed {
	 564  		return
	 565  	}
	 566  	if s := spanOf(dst); s == nil {
	 567  		// If dst is a global, use the data or BSS bitmaps to
	 568  		// execute write barriers.
	 569  		for _, datap := range activeModules() {
	 570  			if datap.data <= dst && dst < datap.edata {
	 571  				bulkBarrierBitmap(dst, src, size, dst-datap.data, datap.gcdatamask.bytedata)
	 572  				return
	 573  			}
	 574  		}
	 575  		for _, datap := range activeModules() {
	 576  			if datap.bss <= dst && dst < datap.ebss {
	 577  				bulkBarrierBitmap(dst, src, size, dst-datap.bss, datap.gcbssmask.bytedata)
	 578  				return
	 579  			}
	 580  		}
	 581  		return
	 582  	} else if s.state.get() != mSpanInUse || dst < s.base() || s.limit <= dst {
	 583  		// dst was heap memory at some point, but isn't now.
	 584  		// It can't be a global. It must be either our stack,
	 585  		// or in the case of direct channel sends, it could be
	 586  		// another stack. Either way, no need for barriers.
	 587  		// This will also catch if dst is in a freed span,
	 588  		// though that should never happen.
	 589  		return
	 590  	}
	 591  
	 592  	buf := &getg().m.p.ptr().wbBuf
	 593  	h := heapBitsForAddr(dst)
	 594  	if src == 0 {
	 595  		for i := uintptr(0); i < size; i += sys.PtrSize {
	 596  			if h.isPointer() {
	 597  				dstx := (*uintptr)(unsafe.Pointer(dst + i))
	 598  				if !buf.putFast(*dstx, 0) {
	 599  					wbBufFlush(nil, 0)
	 600  				}
	 601  			}
	 602  			h = h.next()
	 603  		}
	 604  	} else {
	 605  		for i := uintptr(0); i < size; i += sys.PtrSize {
	 606  			if h.isPointer() {
	 607  				dstx := (*uintptr)(unsafe.Pointer(dst + i))
	 608  				srcx := (*uintptr)(unsafe.Pointer(src + i))
	 609  				if !buf.putFast(*dstx, *srcx) {
	 610  					wbBufFlush(nil, 0)
	 611  				}
	 612  			}
	 613  			h = h.next()
	 614  		}
	 615  	}
	 616  }
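
For context, the usual calling pattern is the one typedmemmove follows: run the pre-write barriers over the destination's pointer-holding prefix, then perform the move. A simplified standalone sketch of that pattern; the types and functions below are hypothetical stand-ins for the runtime internals, and the real typedmemmove has additional race/msan and nosplit considerations.

package main

import "unsafe"

type _type struct{ size, ptrdata uintptr }

// Stand-ins for the runtime's bulkBarrierPreWrite and memmove.
func bulkBarrierPreWrite(dst, src, size uintptr) { /* enqueue write barriers */ }
func memmove(dst, src unsafe.Pointer, n uintptr) { /* copy n bytes */ }

// sketchTypedMemmove shows the ordering described above: barriers for
// the pointer-holding prefix first, then the actual move.
func sketchTypedMemmove(typ *_type, dst, src unsafe.Pointer) {
	if typ.ptrdata != 0 {
		bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.ptrdata)
	}
	memmove(dst, src, typ.size)
}

func main() {
	var a, b [2]*int
	t := &_type{size: unsafe.Sizeof(a), ptrdata: unsafe.Sizeof(a)}
	sketchTypedMemmove(t, unsafe.Pointer(&a), unsafe.Pointer(&b))
}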
	 617  
	 618  // bulkBarrierPreWriteSrcOnly is like bulkBarrierPreWrite but
	 619  // does not execute write barriers for [dst, dst+size).
	 620  //
	 621  // In addition to the requirements of bulkBarrierPreWrite
	 622  // callers need to ensure [dst, dst+size) is zeroed.
	 623  //
	 624  // This is used for special cases where e.g. dst was just
	 625  // created and zeroed with malloc.
	 626  //go:nosplit
	 627  func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) {
	 628  	if (dst|src|size)&(sys.PtrSize-1) != 0 {
	 629  		throw("bulkBarrierPreWrite: unaligned arguments")
	 630  	}
	 631  	if !writeBarrier.needed {
	 632  		return
	 633  	}
	 634  	buf := &getg().m.p.ptr().wbBuf
	 635  	h := heapBitsForAddr(dst)
	 636  	for i := uintptr(0); i < size; i += sys.PtrSize {
	 637  		if h.isPointer() {
	 638  			srcx := (*uintptr)(unsafe.Pointer(src + i))
	 639  			if !buf.putFast(0, *srcx) {
	 640  				wbBufFlush(nil, 0)
	 641  			}
	 642  		}
	 643  		h = h.next()
	 644  	}
	 645  }
	 646  
	 647  // bulkBarrierBitmap executes write barriers for copying from [src,
	 648  // src+size) to [dst, dst+size) using a 1-bit pointer bitmap. src is
	 649  // assumed to start maskOffset bytes into the data covered by the
	 650  // bitmap in bits (which may not be a multiple of 8).
	 651  //
	 652  // This is used by bulkBarrierPreWrite for writes to data and BSS.
	 653  //
	 654  //go:nosplit
	 655  func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
	 656  	word := maskOffset / sys.PtrSize
	 657  	bits = addb(bits, word/8)
	 658  	mask := uint8(1) << (word % 8)
	 659  
	 660  	buf := &getg().m.p.ptr().wbBuf
	 661  	for i := uintptr(0); i < size; i += sys.PtrSize {
	 662  		if mask == 0 {
	 663  			bits = addb(bits, 1)
	 664  			if *bits == 0 {
	 665  				// Skip 8 words.
	 666  				i += 7 * sys.PtrSize
	 667  				continue
	 668  			}
	 669  			mask = 1
	 670  		}
	 671  		if *bits&mask != 0 {
	 672  			dstx := (*uintptr)(unsafe.Pointer(dst + i))
	 673  			if src == 0 {
	 674  				if !buf.putFast(*dstx, 0) {
	 675  					wbBufFlush(nil, 0)
	 676  				}
	 677  			} else {
	 678  				srcx := (*uintptr)(unsafe.Pointer(src + i))
	 679  				if !buf.putFast(*dstx, *srcx) {
	 680  					wbBufFlush(nil, 0)
	 681  				}
	 682  			}
	 683  		}
	 684  		mask <<= 1
	 685  	}
	 686  }
	 687  
	 688  // typeBitsBulkBarrier executes a write barrier for every
	 689  // pointer that would be copied from [src, src+size) to [dst,
	 690  // dst+size) by a memmove using the type bitmap to locate those
	 691  // pointer slots.
	 692  //
	 693  // The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
	 694  // dst, src, and size must be pointer-aligned.
	 695  // The type typ must have a plain bitmap, not a GC program.
	 696  // The only use of this function is in channel sends, and the
	 697  // 64 kB channel element limit takes care of this for us.
	 698  //
	 699  // Must not be preempted because it typically runs right before memmove,
	 700  // and the GC must observe them as an atomic action.
	 701  //
	 702  // Callers must perform cgo checks if writeBarrier.cgo.
	 703  //
	 704  //go:nosplit
	 705  func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
	 706  	if typ == nil {
	 707  		throw("runtime: typeBitsBulkBarrier without type")
	 708  	}
	 709  	if typ.size != size {
	 710  		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " of size ", typ.size, " but memory size", size)
	 711  		throw("runtime: invalid typeBitsBulkBarrier")
	 712  	}
	 713  	if typ.kind&kindGCProg != 0 {
	 714  		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " with GC prog")
	 715  		throw("runtime: invalid typeBitsBulkBarrier")
	 716  	}
	 717  	if !writeBarrier.needed {
	 718  		return
	 719  	}
	 720  	ptrmask := typ.gcdata
	 721  	buf := &getg().m.p.ptr().wbBuf
	 722  	var bits uint32
	 723  	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
	 724  		if i&(sys.PtrSize*8-1) == 0 {
	 725  			bits = uint32(*ptrmask)
	 726  			ptrmask = addb(ptrmask, 1)
	 727  		} else {
	 728  			bits = bits >> 1
	 729  		}
	 730  		if bits&1 != 0 {
	 731  			dstx := (*uintptr)(unsafe.Pointer(dst + i))
	 732  			srcx := (*uintptr)(unsafe.Pointer(src + i))
	 733  			if !buf.putFast(*dstx, *srcx) {
	 734  				wbBufFlush(nil, 0)
	 735  			}
	 736  		}
	 737  	}
	 738  }
	 739  
	 740  // The methods operating on spans all require that h has been returned
	 741  // by heapBitsForSpan and that size, n, total are the span layout description
	 742  // returned by the mspan's layout method.
	 743  // If total > size*n, it means that there is extra leftover memory in the span,
	 744  // usually due to rounding.
	 745  //
	 746  // TODO(rsc): Perhaps introduce a different heapBitsSpan type.
	 747  
	 748  // initSpan initializes the heap bitmap for a span.
	 749  // If this is a span of pointer-sized objects, it initializes all
	 750  // words to pointer/scan.
	 751  // Otherwise, it initializes all words to scalar/dead.
	 752  func (h heapBits) initSpan(s *mspan) {
	 753  	// Clear bits corresponding to objects.
	 754  	nw := (s.npages << _PageShift) / sys.PtrSize
	 755  	if nw%wordsPerBitmapByte != 0 {
	 756  		throw("initSpan: unaligned length")
	 757  	}
	 758  	if h.shift != 0 {
	 759  		throw("initSpan: unaligned base")
	 760  	}
	 761  	isPtrs := sys.PtrSize == 8 && s.elemsize == sys.PtrSize
	 762  	for nw > 0 {
	 763  		hNext, anw := h.forwardOrBoundary(nw)
	 764  		nbyte := anw / wordsPerBitmapByte
	 765  		if isPtrs {
	 766  			bitp := h.bitp
	 767  			for i := uintptr(0); i < nbyte; i++ {
	 768  				*bitp = bitPointerAll | bitScanAll
	 769  				bitp = add1(bitp)
	 770  			}
	 771  		} else {
	 772  			memclrNoHeapPointers(unsafe.Pointer(h.bitp), nbyte)
	 773  		}
	 774  		h = hNext
	 775  		nw -= anw
	 776  	}
	 777  }
	 778  
	 779  // countAlloc returns the number of objects allocated in span s by
	 780  // scanning the allocation bitmap.
	 781  func (s *mspan) countAlloc() int {
	 782  	count := 0
	 783  	bytes := divRoundUp(s.nelems, 8)
	 784  	// Iterate over each 8-byte chunk and count allocations
	 785  	// with an intrinsic. Note that newMarkBits guarantees that
	 786  	// gcmarkBits will be 8-byte aligned, so we don't have to
	 787  	// worry about edge cases, irrelevant bits will simply be zero.
	 788  	for i := uintptr(0); i < bytes; i += 8 {
	 789  		// Extract 64 bits from the byte pointer and get a OnesCount.
	 790  		// Note that the unsafe cast here doesn't preserve endianness,
	 791  		// but that's OK. We only care about how many bits are 1, not
	 792  		// about the order we discover them in.
	 793  		mrkBits := *(*uint64)(unsafe.Pointer(s.gcmarkBits.bytep(i)))
	 794  		count += sys.OnesCount64(mrkBits)
	 795  	}
	 796  	return count
	 797  }
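
The endianness remark above can be checked directly: the population count of eight bytes is the same whether the bytes are counted one at a time or reinterpreted as a single uint64 through unsafe, whatever the byte order. A standalone sketch (math/bits stands in for the runtime's sys.OnesCount64):

package main

import (
	"fmt"
	"math/bits"
	"unsafe"
)

func main() {
	word := uint64(0x007e020001ff000b) // hypothetical mark bits

	// Count per byte...
	bytes := (*[8]uint8)(unsafe.Pointer(&word))
	perByte := 0
	for _, b := range bytes {
		perByte += bits.OnesCount8(b)
	}

	// ...and as one 64-bit load, the way countAlloc does.
	asWord := bits.OnesCount64(word)

	fmt.Println(perByte, asWord) // equal (19 19) regardless of byte order
}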
	 798  
	 799  // heapBitsSetType records that the new allocation [x, x+size)
	 800  // holds in [x, x+dataSize) one or more values of type typ.
	 801  // (The number of values is given by dataSize / typ.size.)
	 802  // If dataSize < size, the fragment [x+dataSize, x+size) is
	 803  // recorded as non-pointer data.
	 804  // It is known that the type has pointers somewhere;
	 805  // malloc does not call heapBitsSetType when there are no pointers,
	 806  // because all free objects are marked as noscan during
	 807  // heapBitsSweepSpan.
	 808  //
	 809  // There can only be one allocation from a given span active at a time,
	 810  // and the bitmap for a span always falls on byte boundaries,
	 811  // so there are no write-write races for access to the heap bitmap.
	 812  // Hence, heapBitsSetType can access the bitmap without atomics.
	 813  //
	 814  // There can be read-write races between heapBitsSetType and things
	 815  // that read the heap bitmap like scanobject. However, since
	 816  // heapBitsSetType is only used for objects that have not yet been
	 817  // made reachable, readers will ignore bits being modified by this
	 818  // function. This does mean this function cannot transiently modify
	 819  // bits that belong to neighboring objects. Also, on weakly-ordered
	 820  // machines, callers must execute a store/store (publication) barrier
	 821  // between calling this function and making the object reachable.
	 822  func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
	 823  	const doubleCheck = false // slow but helpful; enable to test modifications to this code
	 824  
	 825  	const (
	 826  		mask1 = bitPointer | bitScan												// 00010001
	 827  		mask2 = bitPointer | bitScan | mask1<<heapBitsShift // 00110011
	 828  		mask3 = bitPointer | bitScan | mask2<<heapBitsShift // 01110111
	 829  	)
	 830  
	 831  	// dataSize is always size rounded up to the next malloc size class,
	 832  	// except in the case of allocating a defer block, in which case
	 833  	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
	 834  	// arbitrarily larger.
	 835  	//
	 836  	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
	 837  	// assume that dataSize == size without checking it explicitly.
	 838  
	 839  	if sys.PtrSize == 8 && size == sys.PtrSize {
	 840  		// It's one word and it has pointers, it must be a pointer.
	 841  		// Since all allocated one-word objects are pointers
	 842  		// (non-pointers are aggregated into tinySize allocations),
	 843  		// initSpan sets the pointer bits for us. Nothing to do here.
	 844  		if doubleCheck {
	 845  			h := heapBitsForAddr(x)
	 846  			if !h.isPointer() {
	 847  				throw("heapBitsSetType: pointer bit missing")
	 848  			}
	 849  			if !h.morePointers() {
	 850  				throw("heapBitsSetType: scan bit missing")
	 851  			}
	 852  		}
	 853  		return
	 854  	}
	 855  
	 856  	h := heapBitsForAddr(x)
	 857  	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
	 858  
	 859  	// 2-word objects only have 4 bitmap bits and 3-word objects only have 6 bitmap bits.
	 860  	// Therefore, these objects share a heap bitmap byte with the objects next to them.
	 861  	// These are called out as a special case primarily so the code below can assume all
	 862  	// objects are at least 4 words long and that their bitmaps start either at the beginning
	 863  	// of a bitmap byte, or half-way in (h.shift of 0 and 2 respectively).
	 864  
	 865  	if size == 2*sys.PtrSize {
	 866  		if typ.size == sys.PtrSize {
	 867  			// We're allocating a block big enough to hold two pointers.
	 868  			// On 64-bit, that means the actual object must be two pointers,
	 869  			// or else we'd have used the one-pointer-sized block.
	 870  			// On 32-bit, however, this is the 8-byte block, the smallest one.
	 871  			// So it could be that we're allocating one pointer and this was
	 872  			// just the smallest block available. Distinguish by checking dataSize.
	 873  			// (In general the number of instances of typ being allocated is
	 874  			// dataSize/typ.size.)
	 875  			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
	 876  				// 1 pointer object. On 32-bit machines clear the bit for the
	 877  				// unused second word.
	 878  				*h.bitp &^= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift
	 879  				*h.bitp |= (bitPointer | bitScan) << h.shift
	 880  			} else {
	 881  				// 2-element array of pointer.
	 882  				*h.bitp |= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift
	 883  			}
	 884  			return
	 885  		}
	 886  		// Otherwise typ.size must be 2*sys.PtrSize,
	 887  		// and typ.kind&kindGCProg == 0.
	 888  		if doubleCheck {
	 889  			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
	 890  				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
	 891  				throw("heapBitsSetType")
	 892  			}
	 893  		}
	 894  		b := uint32(*ptrmask)
	 895  		hb := b & 3
	 896  		hb |= bitScanAll & ((bitScan << (typ.ptrdata / sys.PtrSize)) - 1)
	 897  		// Clear the bits for this object so we can set the
	 898  		// appropriate ones.
	 899  		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
	 900  		*h.bitp |= uint8(hb << h.shift)
	 901  		return
	 902  	} else if size == 3*sys.PtrSize {
	 903  		b := uint8(*ptrmask)
	 904  		if doubleCheck {
	 905  			if b == 0 {
	 906  				println("runtime: invalid type ", typ.string())
	 907  				throw("heapBitsSetType: called with non-pointer type")
	 908  			}
	 909  			if sys.PtrSize != 8 {
	 910  				throw("heapBitsSetType: unexpected 3 pointer wide size class on 32 bit")
	 911  			}
	 912  			if typ.kind&kindGCProg != 0 {
	 913  				throw("heapBitsSetType: unexpected GC prog for 3 pointer wide size class")
	 914  			}
	 915  			if typ.size == 2*sys.PtrSize {
	 916  				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, "\n")
	 917  				throw("heapBitsSetType: inconsistent object sizes")
	 918  			}
	 919  		}
	 920  		if typ.size == sys.PtrSize {
	 921  			// The type contains a pointer otherwise heapBitsSetType wouldn't have been called.
	 922  			// Since the type is only 1 pointer wide and contains a pointer, its gcdata must be exactly 1.
	 923  			if doubleCheck && *typ.gcdata != 1 {
	 924  				print("runtime: heapBitsSetType size=", size, " typ.size=", typ.size, "but *typ.gcdata", *typ.gcdata, "\n")
	 925  				throw("heapBitsSetType: unexpected gcdata for 1 pointer wide type size in 3 pointer wide size class")
	 926  			}
	 927  			// 3 element array of pointers. Unrolling ptrmask 3 times into p yields 00000111.
	 928  			b = 7
	 929  		}
	 930  
	 931  		hb := b & 7
	 932  		// Set bitScan bits for all pointers.
	 933  		hb |= hb << wordsPerBitmapByte
	 934  		// First bitScan bit is always set since the type contains pointers.
	 935  		hb |= bitScan
	 936  		// Second bitScan bit needs to also be set if the third bitScan bit is set.
	 937  		hb |= hb & (bitScan << (2 * heapBitsShift)) >> 1
	 938  
	 939  		// For h.shift > 1 heap bits cross a byte boundary and need to be written part
	 940  		// to h.bitp and part to the next h.bitp.
	 941  		switch h.shift {
	 942  		case 0:
	 943  			*h.bitp &^= mask3 << 0
	 944  			*h.bitp |= hb << 0
	 945  		case 1:
	 946  			*h.bitp &^= mask3 << 1
	 947  			*h.bitp |= hb << 1
	 948  		case 2:
	 949  			*h.bitp &^= mask2 << 2
	 950  			*h.bitp |= (hb & mask2) << 2
	 951  			// Two words written to the first byte.
	 952  			// Advance two words to get to the next byte.
	 953  			h = h.next().next()
	 954  			*h.bitp &^= mask1
	 955  			*h.bitp |= (hb >> 2) & mask1
	 956  		case 3:
	 957  			*h.bitp &^= mask1 << 3
	 958  			*h.bitp |= (hb & mask1) << 3
	 959  			// One word written to the first byte.
	 960  			// Advance one word to get to the next byte.
	 961  			h = h.next()
	 962  			*h.bitp &^= mask2
	 963  			*h.bitp |= (hb >> 1) & mask2
	 964  		}
	 965  		return
	 966  	}
	 967  
	 968  	// Copy from 1-bit ptrmask into 2-bit bitmap.
	 969  	// The basic approach is to use a single uintptr as a bit buffer,
	 970  	// alternating between reloading the buffer and writing bitmap bytes.
	 971  	// In general, one load can supply two bitmap byte writes.
	 972  	// This is a lot of lines of code, but it compiles into relatively few
	 973  	// machine instructions.
	 974  
	 975  	outOfPlace := false
	 976  	if arenaIndex(x+size-1) != arenaIdx(h.arena) || (doubleCheck && fastrand()%2 == 0) {
	 977  		// This object spans heap arenas, so the bitmap may be
	 978  		// discontiguous. Unroll it into the object instead
	 979  		// and then copy it out.
	 980  		//
	 981  		// In doubleCheck mode, we randomly do this anyway to
	 982  		// stress test the bitmap copying path.
	 983  		outOfPlace = true
	 984  		h.bitp = (*uint8)(unsafe.Pointer(x))
	 985  		h.last = nil
	 986  	}
	 987  
	 988  	var (
	 989  		// Ptrmask input.
	 990  		p		 *byte	 // last ptrmask byte read
	 991  		b		 uintptr // ptrmask bits already loaded
	 992  		nb		uintptr // number of bits in b at next read
	 993  		endp	*byte	 // final ptrmask byte to read (then repeat)
	 994  		endnb uintptr // number of valid bits in *endp
	 995  		pbits uintptr // alternate source of bits
	 996  
	 997  		// Heap bitmap output.
	 998  		w		 uintptr // words processed
	 999  		nw		uintptr // number of words to process
	1000  		hbitp *byte	 // next heap bitmap byte to write
	1001  		hb		uintptr // bits being prepared for *hbitp
	1002  	)
	1003  
	1004  	hbitp = h.bitp
	1005  
	1006  	// Handle GC program. Delayed until this part of the code
	1007  	// so that we can use the same double-checking mechanism
	1008  	// as the 1-bit case. Nothing above could have encountered
	1009  	// GC programs: the cases were all too small.
	1010  	if typ.kind&kindGCProg != 0 {
	1011  		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
	1012  		if doubleCheck {
	1013  			// Double-check the heap bits written by GC program
	1014  			// by running the GC program to create a 1-bit pointer mask
	1015  			// and then jumping to the double-check code below.
	1016  			// This doesn't catch bugs shared between the 1-bit and 4-bit
	1017  			// GC program execution, but it does catch mistakes specific
	1018  			// to just one of those and bugs in heapBitsSetTypeGCProg's
	1019  			// implementation of arrays.
	1020  			lock(&debugPtrmask.lock)
	1021  			if debugPtrmask.data == nil {
	1022  				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
	1023  			}
	1024  			ptrmask = debugPtrmask.data
	1025  			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
	1026  		}
	1027  		goto Phase4
	1028  	}
	1029  
	1030  	// Note about sizes:
	1031  	//
	1032  	// typ.size is the number of words in the object,
	1033  	// and typ.ptrdata is the number of words in the prefix
	1034  	// of the object that contains pointers. That is, the final
	1035  	// typ.size - typ.ptrdata words contain no pointers.
	1036  	// This allows optimization of a common pattern where
	1037  	// an object has a small header followed by a large scalar
	1038  	// buffer. If we know the pointers are over, we don't have
	1039  	// to scan the buffer's heap bitmap at all.
	1040  	// The 1-bit ptrmasks are sized to contain only bits for
	1041  	// the typ.ptrdata prefix, zero padded out to a full byte
	1042  	// of bitmap. This code sets nw (below) so that heap bitmap
	1043  	// bits are only written for the typ.ptrdata prefix; if there is
	1044  	// more room in the allocated object, the next heap bitmap
	1045  	// entry is a 00, indicating that there are no more pointers
	1046  	// to scan. So only the ptrmask for the ptrdata bytes is needed.
	1047  	//
	1048  	// Replicated copies are not as nice: if there is an array of
	1049  	// objects with scalar tails, all but the last tail do have to
	1050  	// be initialized, because there is no way to say "skip forward".
	1051  	// However, because of the possibility of a repeated type with
	1052  	// size not a multiple of 4 pointers (one heap bitmap byte),
	1053  	// the code already must handle the last ptrmask byte specially
	1054  	// by treating it as containing only the bits for endnb pointers,
	1055  	// where endnb <= 4. We represent large scalar tails that must
	1056  	// be expanded in the replication by setting endnb larger than 4.
	1057  	// This will have the effect of reading many bits out of b,
	1058  	// but once the real bits are shifted out, b will supply as many
	1059  	// zero bits as we try to read, which is exactly what we need.
	1060  
	1061  	p = ptrmask
	1062  	if typ.size < dataSize {
	1063  		// Filling in bits for an array of typ.
	1064  		// Set up for repetition of ptrmask during main loop.
	1065  		// Note that ptrmask describes only a prefix of the type (the typ.ptrdata bytes that can contain pointers).
	1066  		const maxBits = sys.PtrSize*8 - 7
	1067  		if typ.ptrdata/sys.PtrSize <= maxBits {
	1068  			// Entire ptrmask fits in uintptr with room for a byte fragment.
	1069  			// Load into pbits and never read from ptrmask again.
	1070  			// This is especially important when the ptrmask has
	1071  			// fewer than 8 bits in it; otherwise the reload in the middle
	1072  			// of the Phase 2 loop would itself need to loop to gather
	1073  			// at least 8 bits.
	1074  
	1075  			// Accumulate ptrmask into b.
	1076  			// ptrmask is sized to describe only typ.ptrdata, but we record
	1077  			// it as describing typ.size bytes, since all the high bits are zero.
	1078  			nb = typ.ptrdata / sys.PtrSize
	1079  			for i := uintptr(0); i < nb; i += 8 {
	1080  				b |= uintptr(*p) << i
	1081  				p = add1(p)
	1082  			}
	1083  			nb = typ.size / sys.PtrSize
	1084  
	1085  			// Replicate ptrmask to fill entire pbits uintptr.
	1086  			// Doubling and truncating is fewer steps than
	1087  			// iterating by nb each time. (nb could be 1.)
	1088  			// Since we loaded typ.ptrdata/sys.PtrSize bits
	1089  			// but are pretending to have typ.size/sys.PtrSize,
	1090  			// there might be no replication necessary/possible.
	1091  			pbits = b
	1092  			endnb = nb
	1093  			if nb+nb <= maxBits {
	1094  				for endnb <= sys.PtrSize*8 {
	1095  					pbits |= pbits << endnb
	1096  					endnb += endnb
	1097  				}
	1098  				// Truncate to a multiple of original ptrmask.
	1099  				// Because nb+nb <= maxBits, nb fits in a byte.
	1100  				// Byte division is cheaper than uintptr division.
	1101  				endnb = uintptr(maxBits/byte(nb)) * nb
	1102  				pbits &= 1<<endnb - 1
	1103  				b = pbits
	1104  				nb = endnb
	1105  			}
	1106  
	1107  			// Clear p and endp as sentinel for using pbits.
	1108  			// Checked during Phase 2 loop.
	1109  			p = nil
	1110  			endp = nil
	1111  		} else {
	1112  			// Ptrmask is larger. Read it multiple times.
	1113  			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
	1114  			endp = addb(ptrmask, n)
	1115  			endnb = typ.size/sys.PtrSize - n*8
	1116  		}
	1117  	}
	1118  	if p != nil {
	1119  		b = uintptr(*p)
	1120  		p = add1(p)
	1121  		nb = 8
	1122  	}
	1123  
	1124  	if typ.size == dataSize {
	1125  		// Single entry: can stop once we reach the non-pointer data.
	1126  		nw = typ.ptrdata / sys.PtrSize
	1127  	} else {
	1128  		// Repeated instances of typ in an array.
	1129  		// Have to process first N-1 entries in full, but can stop
	1130  		// once we reach the non-pointer data in the final entry.
	1131  		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
	1132  	}
	1133  	if nw == 0 {
	1134  		// No pointers! Caller was supposed to check.
	1135  		println("runtime: invalid type ", typ.string())
	1136  		throw("heapBitsSetType: called with non-pointer type")
	1137  		return
	1138  	}
	1139  
	1140  	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==2).
	1141  	// The leading byte is special because it contains the bits for word 1,
	1142  	// which does not have the scan bit set.
	1143  	// The leading half-byte is special because it's a half a byte,
	1144  	// so we have to be careful with the bits already there.
	1145  	switch {
	1146  	default:
	1147  		throw("heapBitsSetType: unexpected shift")
	1148  
	1149  	case h.shift == 0:
	1150  		// Ptrmask and heap bitmap are aligned.
	1151  		//
	1152  		// This is a fast path for small objects.
	1153  		//
	1154  		// The first byte we write out covers the first four
	1155  		// words of the object. The scan/dead bit on the first
	1156  		// word must be set to scan since there are pointers
	1157  		// somewhere in the object.
	1158  		// In all following words, we set the scan/dead
	1159  		// appropriately to indicate that the object continues
	1160  		// to the next 2-bit entry in the bitmap.
	1161  		//
	1162  		// We set four bits at a time here, but if the object
	1163  		// is fewer than four words, phase 3 will clear
	1164  		// unnecessary bits.
	1165  		hb = b & bitPointerAll
	1166  		hb |= bitScanAll
	1167  		if w += 4; w >= nw {
	1168  			goto Phase3
	1169  		}
	1170  		*hbitp = uint8(hb)
	1171  		hbitp = add1(hbitp)
	1172  		b >>= 4
	1173  		nb -= 4
	1174  
	1175  	case h.shift == 2:
	1176  		// Ptrmask and heap bitmap are misaligned.
	1177  		//
	1178  		// On 32 bit architectures only the 6-word object that corresponds
	1179  		// to a 24 bytes size class can start with h.shift of 2 here since
	1180  		// all other non 16 byte aligned size classes have been handled by
	1181  		// special code paths at the beginning of heapBitsSetType on 32 bit.
	1182  		//
	1183  		// Many size classes are only 16 byte aligned. On 64 bit architectures
	1184  		// this results in a heap bitmap position starting with a h.shift of 2.
	1185  		//
	1186  		// The bits for the first two words are in a byte shared
	1187  		// with another object, so we must be careful with the bits
	1188  		// already there.
	1189  		//
	1190  		// We took care of 1-word, 2-word, and 3-word objects above,
	1191  		// so this is at least a 6-word object.
	1192  		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
	1193  		hb |= bitScan << (2 * heapBitsShift)
	1194  		if nw > 1 {
	1195  			hb |= bitScan << (3 * heapBitsShift)
	1196  		}
	1197  		b >>= 2
	1198  		nb -= 2
	1199  		*hbitp &^= uint8((bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << (2 * heapBitsShift))
	1200  		*hbitp |= uint8(hb)
	1201  		hbitp = add1(hbitp)
	1202  		if w += 2; w >= nw {
	1203  			// We know that there is more data, because we handled 2-word and 3-word objects above.
	1204  			// This must be at least a 6-word object. If we're out of pointer words,
	1205  			// mark no scan in next bitmap byte and finish.
	1206  			hb = 0
	1207  			w += 4
	1208  			goto Phase3
	1209  		}
	1210  	}
	1211  
	1212  	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
	1213  	// The loop computes the bits for that last write but does not execute the write;
	1214  	// it leaves the bits in hb for processing by phase 3.
	1215  	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
	1216  	// use in the first half of the loop right now, and then we only adjust nb explicitly
	1217  	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
	1218  	nb -= 4
	1219  	for {
	1220  		// Emit bitmap byte.
	1221  		// b has at least nb+4 bits, with one exception:
	1222  		// if w+4 >= nw, then b has only nw-w bits,
	1223  		// but we'll stop at the break and then truncate
	1224  		// appropriately in Phase 3.
	1225  		hb = b & bitPointerAll
	1226  		hb |= bitScanAll
	1227  		if w += 4; w >= nw {
	1228  			break
	1229  		}
	1230  		*hbitp = uint8(hb)
	1231  		hbitp = add1(hbitp)
	1232  		b >>= 4
	1233  
	1234  		// Load more bits. b has nb right now.
	1235  		if p != endp {
	1236  			// Fast path: keep reading from ptrmask.
	1237  			// nb unmodified: we just loaded 8 bits,
	1238  			// and the next iteration will consume 8 bits,
	1239  			// leaving us with the same nb the next time we're here.
	1240  			if nb < 8 {
	1241  				b |= uintptr(*p) << nb
	1242  				p = add1(p)
	1243  			} else {
	1244  				// Reduce the number of bits in b.
	1245  				// This is important if we skipped
	1246  				// over a scalar tail, since nb could
	1247  				// be larger than the bit width of b.
	1248  				nb -= 8
	1249  			}
	1250  		} else if p == nil {
	1251  			// Almost as fast path: track bit count and refill from pbits.
	1252  			// For short repetitions.
	1253  			if nb < 8 {
	1254  				b |= pbits << nb
	1255  				nb += endnb
	1256  			}
	1257  			nb -= 8 // for next iteration
	1258  		} else {
	1259  			// Slow path: reached end of ptrmask.
	1260  			// Process final partial byte and rewind to start.
	1261  			b |= uintptr(*p) << nb
	1262  			nb += endnb
	1263  			if nb < 8 {
	1264  				b |= uintptr(*ptrmask) << nb
	1265  				p = add1(ptrmask)
	1266  			} else {
	1267  				nb -= 8
	1268  				p = ptrmask
	1269  			}
	1270  		}
	1271  
	1272  		// Emit bitmap byte.
	1273  		hb = b & bitPointerAll
	1274  		hb |= bitScanAll
	1275  		if w += 4; w >= nw {
	1276  			break
	1277  		}
	1278  		*hbitp = uint8(hb)
	1279  		hbitp = add1(hbitp)
	1280  		b >>= 4
	1281  	}
	1282  
	1283  Phase3:
	1284  	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
	1285  	if w > nw {
	1286  		// Counting the 4 entries in hb not yet written to memory,
	1287  		// there are more entries than possible pointer slots.
	1288  		// Discard the excess entries (can't be more than 3).
	1289  		mask := uintptr(1)<<(4-(w-nw)) - 1
	1290  		hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
	1291  	}
	1292  
	1293  	// Change nw from counting possibly-pointer words to total words in allocation.
	1294  	nw = size / sys.PtrSize
	1295  
	1296  	// Write whole bitmap bytes.
	1297  	// The first is hb, the rest are zero.
	1298  	if w <= nw {
	1299  		*hbitp = uint8(hb)
	1300  		hbitp = add1(hbitp)
	1301  		hb = 0 // for possible final half-byte below
	1302  		for w += 4; w <= nw; w += 4 {
	1303  			*hbitp = 0
	1304  			hbitp = add1(hbitp)
	1305  		}
	1306  	}
	1307  
	1308  	// Write final partial bitmap byte if any.
	1309  	// We know w > nw, or else we'd still be in the loop above.
	1310  	// It can be bigger only due to the 4 entries in hb that it counts.
	1311  	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
	1312  	// and can discard the 4 sitting in hb.
	1313  	// But if w == nw+2, we need to write the first two entries in hb.
	1314  	// The byte is shared with the next object, so be careful with
	1315  	// existing bits.
	1316  	if w == nw+2 {
	1317  		*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
	1318  	}
	1319  
	1320  Phase4:
	1321  	// Phase 4: Copy unrolled bitmap to per-arena bitmaps, if necessary.
	1322  	if outOfPlace {
	1323  		// TODO: We could probably make this faster by
	1324  		// handling [x+dataSize, x+size) specially.
	1325  		h := heapBitsForAddr(x)
	1326  		// cnw is the number of heap words, or bit pairs
	1327  		// remaining (like nw above).
	1328  		cnw := size / sys.PtrSize
	1329  		src := (*uint8)(unsafe.Pointer(x))
	1330  		// We know the first and last byte of the bitmap are
	1331  		// not the same, but it's still possible for a small
	1332  		// object to span arenas, so it may share bitmap bytes
	1333  		// with neighboring objects.
	1334  		//
	1335  		// Handle the first byte specially if it's shared. See
	1336  		// Phase 1 for why this is the only special case we need.
	1337  		if doubleCheck {
	1338  			if !(h.shift == 0 || h.shift == 2) {
	1339  				print("x=", x, " size=", size, " cnw=", h.shift, "\n")
	1340  				throw("bad start shift")
	1341  			}
	1342  		}
	1343  		if h.shift == 2 {
	1344  			*h.bitp = *h.bitp&^((bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift)<<(2*heapBitsShift)) | *src
	1345  			h = h.next().next()
	1346  			cnw -= 2
	1347  			src = addb(src, 1)
	1348  		}
	1349  		// We're now byte aligned. Copy out to per-arena
	1350  		// bitmaps until the last byte (which may again be
	1351  		// partial).
	1352  		for cnw >= 4 {
	1353  			// This loop processes four words at a time,
	1354  			// so round cnw down accordingly.
	1355  			hNext, words := h.forwardOrBoundary(cnw / 4 * 4)
	1356  
	1357  			// n is the number of bitmap bytes to copy.
	1358  			n := words / 4
	1359  			memmove(unsafe.Pointer(h.bitp), unsafe.Pointer(src), n)
	1360  			cnw -= words
	1361  			h = hNext
	1362  			src = addb(src, n)
	1363  		}
	1364  		if doubleCheck && h.shift != 0 {
	1365  			print("cnw=", cnw, " h.shift=", h.shift, "\n")
	1366  			throw("bad shift after block copy")
	1367  		}
	1368  		// Handle the last byte if it's shared.
	1369  		if cnw == 2 {
	1370  			*h.bitp = *h.bitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | *src
	1371  			src = addb(src, 1)
	1372  			h = h.next().next()
	1373  		}
	1374  		if doubleCheck {
	1375  			if uintptr(unsafe.Pointer(src)) > x+size {
	1376  				throw("copy exceeded object size")
	1377  			}
	1378  			if !(cnw == 0 || cnw == 2) {
	1379  				print("x=", x, " size=", size, " cnw=", cnw, "\n")
	1380  				throw("bad number of remaining words")
	1381  			}
	1382  			// Set up hbitp so doubleCheck code below can check it.
	1383  			hbitp = h.bitp
	1384  		}
	1385  		// Zero the object where we wrote the bitmap.
	1386  		memclrNoHeapPointers(unsafe.Pointer(x), uintptr(unsafe.Pointer(src))-x)
	1387  	}
	1388  
	1389  	// Double check the whole bitmap.
	1390  	if doubleCheck {
	1391  		// x+size may not point to the heap, so back up one
	1392  		// word and then advance it the way we do above.
	1393  		end := heapBitsForAddr(x + size - sys.PtrSize)
	1394  		if outOfPlace {
	1395  			// In out-of-place copying, we just advance
	1396  			// using next.
	1397  			end = end.next()
	1398  		} else {
	1399  			// Don't use next because that may advance to
	1400  			// the next arena and the in-place logic
	1401  			// doesn't do that.
	1402  			end.shift += heapBitsShift
	1403  			if end.shift == 4*heapBitsShift {
	1404  				end.bitp, end.shift = add1(end.bitp), 0
	1405  			}
	1406  		}
	1407  		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
	1408  			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
	1409  			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
	1410  			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
	1411  			h0 := heapBitsForAddr(x)
	1412  			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
	1413  			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
	1414  			throw("bad heapBitsSetType")
	1415  		}
	1416  
	1417  		// Double-check that bits to be written were written correctly.
	1418  		// Does not check that other bits were not written, unfortunately.
	1419  		h := heapBitsForAddr(x)
	1420  		nptr := typ.ptrdata / sys.PtrSize
	1421  		ndata := typ.size / sys.PtrSize
	1422  		count := dataSize / typ.size
	1423  		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
	1424  		for i := uintptr(0); i < size/sys.PtrSize; i++ {
	1425  			j := i % ndata
	1426  			var have, want uint8
	1427  			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
	1428  			if i >= totalptr {
	1429  				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
	1430  					// heapBitsSetTypeGCProg always fills
	1431  					// in full nibbles of bitScan.
	1432  					want = bitScan
	1433  				}
	1434  			} else {
	1435  				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
	1436  					want |= bitPointer
	1437  				}
	1438  				want |= bitScan
	1439  			}
	1440  			if have != want {
	1441  				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
	1442  				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
	1443  				print("kindGCProg=", typ.kind&kindGCProg != 0, " outOfPlace=", outOfPlace, "\n")
	1444  				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
	1445  				h0 := heapBitsForAddr(x)
	1446  				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
	1447  				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
	1448  				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
	1449  				println("at word", i, "offset", i*sys.PtrSize, "have", hex(have), "want", hex(want))
	1450  				if typ.kind&kindGCProg != 0 {
	1451  					println("GC program:")
	1452  					dumpGCProg(addb(typ.gcdata, 4))
	1453  				}
	1454  				throw("bad heapBitsSetType")
	1455  			}
	1456  			h = h.next()
	1457  		}
	1458  		if ptrmask == debugPtrmask.data {
	1459  			unlock(&debugPtrmask.lock)
	1460  		}
	1461  	}
	1462  }
	1463  
	1464  var debugPtrmask struct {
	1465  	lock mutex
	1466  	data *byte
	1467  }
	1468  
	1469  // heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
	1470  // progSize is the size of the memory described by the program.
	1471  // elemSize is the size of the element; the GC program describes a prefix of it.
	1472  // dataSize is the total size of the intended data, a multiple of elemSize.
	1473  // allocSize is the total size of the allocated memory.
	1474  //
	1475  // GC programs are only used for large allocations.
	1476  // heapBitsSetType requires that allocSize is a multiple of 4 words,
	1477  // so that the relevant bitmap bytes are not shared with surrounding
	1478  // objects.
	1479  func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
	1480  	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
	1481  		// Alignment will be wrong.
	1482  		throw("heapBitsSetTypeGCProg: small allocation")
	1483  	}
	1484  	var totalBits uintptr
	1485  	if elemSize == dataSize {
	1486  		totalBits = runGCProg(prog, nil, h.bitp, 2)
	1487  		if totalBits*sys.PtrSize != progSize {
	1488  			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
	1489  			throw("heapBitsSetTypeGCProg: unexpected bit count")
	1490  		}
	1491  	} else {
	1492  		count := dataSize / elemSize
	1493  
	1494  		// Piece together program trailer to run after prog that does:
	1495  		//	literal(0)
	1496  		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
	1497  		//	repeat(elemSize, count-1) // repeat that element for count
	1498  		// This zero-pads the data remaining in the first element and then
	1499  		// repeats that first element to fill the array.
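		// For example (editorial illustration, assuming 8-byte words,
		// elemSize = 32, progSize = 16, count = 3), the trailer built
		// below would be:
		//	01 00      literal(0): one explicit zero bit
		//	81 01      repeat(1, 1): one more zero bit, filling the 4-word element
		//	80 04 02   repeat(4, 2): two more copies of the 4-word element
		//	00         stop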
	1500  		var trailer [40]byte // 3 varints (max 10 each) + some bytes
	1501  		i := 0
	1502  		if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
	1503  			// literal(0)
	1504  			trailer[i] = 0x01
	1505  			i++
	1506  			trailer[i] = 0
	1507  			i++
	1508  			if n > 1 {
	1509  				// repeat(1, n-1)
	1510  				trailer[i] = 0x81
	1511  				i++
	1512  				n--
	1513  				for ; n >= 0x80; n >>= 7 {
	1514  					trailer[i] = byte(n | 0x80)
	1515  					i++
	1516  				}
	1517  				trailer[i] = byte(n)
	1518  				i++
	1519  			}
	1520  		}
	1521  		// repeat(elemSize/ptrSize, count-1)
	1522  		trailer[i] = 0x80
	1523  		i++
	1524  		n := elemSize / sys.PtrSize
	1525  		for ; n >= 0x80; n >>= 7 {
	1526  			trailer[i] = byte(n | 0x80)
	1527  			i++
	1528  		}
	1529  		trailer[i] = byte(n)
	1530  		i++
	1531  		n = count - 1
	1532  		for ; n >= 0x80; n >>= 7 {
	1533  			trailer[i] = byte(n | 0x80)
	1534  			i++
	1535  		}
	1536  		trailer[i] = byte(n)
	1537  		i++
	1538  		trailer[i] = 0
	1539  		i++
	1540  
	1541  		runGCProg(prog, &trailer[0], h.bitp, 2)
	1542  
	1543  		// Even though we filled in the full array just now,
	1544  		// record that we only filled in up to the ptrdata of the
	1545  		// last element. This will cause the code below to
	1546  		// memclr the dead section of the final array element,
	1547  		// so that scanobject can stop early in the final element.
	1548  		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
	1549  	}
	1550  	endProg := unsafe.Pointer(addb(h.bitp, (totalBits+3)/4))
	1551  	endAlloc := unsafe.Pointer(addb(h.bitp, allocSize/sys.PtrSize/wordsPerBitmapByte))
	1552  	memclrNoHeapPointers(endProg, uintptr(endAlloc)-uintptr(endProg))
	1553  }
	1554  
	1555  // progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
	1556  // size is the size of the region described by prog, in bytes.
	1557  // The resulting bitvector will have no more than size/sys.PtrSize bits.
	1558  func progToPointerMask(prog *byte, size uintptr) bitvector {
	1559  	n := (size/sys.PtrSize + 7) / 8
	1560  	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
	1561  	x[len(x)-1] = 0xa1 // overflow check sentinel
	1562  	n = runGCProg(prog, nil, &x[0], 1)
	1563  	if x[len(x)-1] != 0xa1 {
	1564  		throw("progToPointerMask: overflow")
	1565  	}
	1566  	return bitvector{int32(n), &x[0]}
	1567  }
	1568  
	1569  // Packed GC pointer bitmaps, aka GC programs.
	1570  //
	1571  // For large types containing arrays, the type information has a
	1572  // natural repetition that can be encoded to save space in the
	1573  // binary and in the memory representation of the type information.
	1574  //
	1575  // The encoding is a simple Lempel-Ziv style bytecode machine
	1576  // with the following instructions:
	1577  //
	1578  //	00000000: stop
	1579  //	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
	1580  //	10000000 n c: repeat the previous n bits c times; n, c are varints
	1581  //	1nnnnnnn c: repeat the previous n bits c times; c is a varint
	1582  
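The bytecode above is compact but easiest to follow by expanding a program by hand. The sketch below is an editorial illustration, not part of mbitmap.go: it decodes a program into one bool per pointer-sized word, ignoring the 2-bit heap-bitmap output mode and the byte-at-a-time buffering that runGCProg (below) performs. The names decodeGCProg and varint are invented for the example.

	func decodeGCProg(prog []byte) []bool {
		var out []bool
		pc := 0
		varint := func() int {
			v, shift := 0, 0
			for {
				b := prog[pc]
				pc++
				v |= int(b&0x7f) << shift
				if b&0x80 == 0 {
					return v
				}
				shift += 7
			}
		}
		for {
			inst := prog[pc]
			pc++
			n := int(inst & 0x7f)
			if inst&0x80 == 0 {
				if n == 0 {
					return out // 00000000: stop
				}
				// 0nnnnnnn: emit n literal bits from the next (n+7)/8
				// bytes, least significant bit of each byte first.
				for i := 0; i < n; i++ {
					out = append(out, prog[pc+i/8]>>(i%8)&1 != 0)
				}
				pc += (n + 7) / 8
				continue
			}
			// 10000000 n c / 1nnnnnnn c: repeat the previous n bits
			// c more times; n and c are varints when not in the opcode.
			if n == 0 {
				n = varint()
			}
			c := varint()
			pattern := append([]bool(nil), out[len(out)-n:]...)
			for ; c > 0; c-- {
				out = append(out, pattern...)
			}
		}
	}

For example, decodeGCProg([]byte{0x02, 0x01, 0x80, 0x02, 0x03, 0x00}) emits the two literal bits 1,0 and then repeats them three more times, producing the mask 1,0,1,0,1,0,1,0.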
	1583  // runGCProg executes the GC program prog, and then trailer if non-nil,
	1584  // writing to dst with entries of the given size.
	1585  // If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
	1586  // If size == 2, dst is the 2-bit heap bitmap, and writes move backward
	1587  // starting at dst (because the heap bitmap does). In this case, the caller guarantees
	1588  // that only whole bytes in dst need to be written.
	1589  //
	1590  // runGCProg returns the number of 1- or 2-bit entries written to memory.
	1591  func runGCProg(prog, trailer, dst *byte, size int) uintptr {
	1592  	dstStart := dst
	1593  
	1594  	// Bits waiting to be written to memory.
	1595  	var bits uintptr
	1596  	var nbits uintptr
	1597  
	1598  	p := prog
	1599  Run:
	1600  	for {
	1601  		// Flush accumulated full bytes.
	1602  		// The rest of the loop assumes that nbits <= 7.
	1603  		for ; nbits >= 8; nbits -= 8 {
	1604  			if size == 1 {
	1605  				*dst = uint8(bits)
	1606  				dst = add1(dst)
	1607  				bits >>= 8
	1608  			} else {
	1609  				v := bits&bitPointerAll | bitScanAll
	1610  				*dst = uint8(v)
	1611  				dst = add1(dst)
	1612  				bits >>= 4
	1613  				v = bits&bitPointerAll | bitScanAll
	1614  				*dst = uint8(v)
	1615  				dst = add1(dst)
	1616  				bits >>= 4
	1617  			}
	1618  		}
	1619  
	1620  		// Process one instruction.
	1621  		inst := uintptr(*p)
	1622  		p = add1(p)
	1623  		n := inst & 0x7F
	1624  		if inst&0x80 == 0 {
	1625  			// Literal bits; n == 0 means end of program.
	1626  			if n == 0 {
	1627  				// Program is over; continue in trailer if present.
	1628  				if trailer != nil {
	1629  					p = trailer
	1630  					trailer = nil
	1631  					continue
	1632  				}
	1633  				break Run
	1634  			}
	1635  			nbyte := n / 8
	1636  			for i := uintptr(0); i < nbyte; i++ {
	1637  				bits |= uintptr(*p) << nbits
	1638  				p = add1(p)
	1639  				if size == 1 {
	1640  					*dst = uint8(bits)
	1641  					dst = add1(dst)
	1642  					bits >>= 8
	1643  				} else {
	1644  					v := bits&0xf | bitScanAll
	1645  					*dst = uint8(v)
	1646  					dst = add1(dst)
	1647  					bits >>= 4
	1648  					v = bits&0xf | bitScanAll
	1649  					*dst = uint8(v)
	1650  					dst = add1(dst)
	1651  					bits >>= 4
	1652  				}
	1653  			}
	1654  			if n %= 8; n > 0 {
	1655  				bits |= uintptr(*p) << nbits
	1656  				p = add1(p)
	1657  				nbits += n
	1658  			}
	1659  			continue Run
	1660  		}
	1661  
	1662  		// Repeat. If n == 0, it is encoded in a varint in the next bytes.
	1663  		if n == 0 {
	1664  			for off := uint(0); ; off += 7 {
	1665  				x := uintptr(*p)
	1666  				p = add1(p)
	1667  				n |= (x & 0x7F) << off
	1668  				if x&0x80 == 0 {
	1669  					break
	1670  				}
	1671  			}
	1672  		}
	1673  
	1674  		// Count is encoded in a varint in the next bytes.
	1675  		c := uintptr(0)
	1676  		for off := uint(0); ; off += 7 {
	1677  			x := uintptr(*p)
	1678  			p = add1(p)
	1679  			c |= (x & 0x7F) << off
	1680  			if x&0x80 == 0 {
	1681  				break
	1682  			}
	1683  		}
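		// (Editorial note: varints are 7 bits per byte, least significant
		// group first; e.g. the bytes 0xe8 0x07 decode to 0x68 + 0x07<<7 = 1000.)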
	1684  		c *= n // now total number of bits to copy
	1685  
	1686  		// If the number of bits being repeated is small, load them
	1687  		// into a register and use that register for the entire loop
	1688  		// instead of repeatedly reading from memory.
	1689  		// Handling fewer than 8 bits here makes the general loop simpler.
	1690  		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
	1691  		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
	1692  		// it will not overflow.
	1693  		src := dst
	1694  		const maxBits = sys.PtrSize*8 - 7
	1695  		if n <= maxBits {
	1696  			// Start with bits in output buffer.
	1697  			pattern := bits
	1698  			npattern := nbits
	1699  
	1700  			// If we need more bits, fetch them from memory.
	1701  			if size == 1 {
	1702  				src = subtract1(src)
	1703  				for npattern < n {
	1704  					pattern <<= 8
	1705  					pattern |= uintptr(*src)
	1706  					src = subtract1(src)
	1707  					npattern += 8
	1708  				}
	1709  			} else {
	1710  				src = subtract1(src)
	1711  				for npattern < n {
	1712  					pattern <<= 4
	1713  					pattern |= uintptr(*src) & 0xf
	1714  					src = subtract1(src)
	1715  					npattern += 4
	1716  				}
	1717  			}
	1718  
	1719  			// We started with the whole bit output buffer,
	1720  			// and then we loaded bits from whole bytes.
	1721  			// Either way, we might now have too many instead of too few.
	1722  			// Discard the extra.
	1723  			if npattern > n {
	1724  				pattern >>= npattern - n
	1725  				npattern = n
	1726  			}
	1727  
	1728  			// Replicate pattern to at most maxBits.
	1729  			if npattern == 1 {
	1730  				// One bit being repeated.
	1731  				// If the bit is 1, make the pattern all 1s.
	1732  				// If the bit is 0, the pattern is already all 0s,
	1733  				// but we can claim that the number of bits
	1734  				// in the word is equal to the number we need (c),
	1735  				// because right shift of bits will zero fill.
	1736  				if pattern == 1 {
	1737  					pattern = 1<<maxBits - 1
	1738  					npattern = maxBits
	1739  				} else {
	1740  					npattern = c
	1741  				}
	1742  			} else {
	1743  				b := pattern
	1744  				nb := npattern
	1745  				if nb+nb <= maxBits {
	1746  					// Double pattern until the whole uintptr is filled.
	1747  					for nb <= sys.PtrSize*8 {
	1748  						b |= b << nb
	1749  						nb += nb
	1750  					}
	1751  					// Trim away incomplete copy of original pattern in high bits.
	1752  					// TODO(rsc): Replace with table lookup or loop on systems without divide?
	1753  					nb = maxBits / npattern * npattern
	1754  					b &= 1<<nb - 1
	1755  					pattern = b
	1756  					npattern = nb
	1757  				}
	1758  			}
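			// (Editorial note: on 64-bit, maxBits is 8*8-7 = 57, so e.g. a
			// 3-bit pattern is replicated to 57/3*3 = 57 bits, 19 whole
			// copies, before the bulk flush loop below.)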
	1759  
	1760  			// Add pattern to bit buffer and flush bit buffer, c/npattern times.
	1761  			// Since pattern contains >8 bits, there will be full bytes to flush
	1762  			// on each iteration.
	1763  			for ; c >= npattern; c -= npattern {
	1764  				bits |= pattern << nbits
	1765  				nbits += npattern
	1766  				if size == 1 {
	1767  					for nbits >= 8 {
	1768  						*dst = uint8(bits)
	1769  						dst = add1(dst)
	1770  						bits >>= 8
	1771  						nbits -= 8
	1772  					}
	1773  				} else {
	1774  					for nbits >= 4 {
	1775  						*dst = uint8(bits&0xf | bitScanAll)
	1776  						dst = add1(dst)
	1777  						bits >>= 4
	1778  						nbits -= 4
	1779  					}
	1780  				}
	1781  			}
	1782  
	1783  			// Add final fragment to bit buffer.
	1784  			if c > 0 {
	1785  				pattern &= 1<<c - 1
	1786  				bits |= pattern << nbits
	1787  				nbits += c
	1788  			}
	1789  			continue Run
	1790  		}
	1791  
	1792  		// Repeat; n too large to fit in a register.
	1793  		// Since nbits <= 7, we know the first few bytes of repeated data
	1794  		// are already written to memory.
	1795  		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
	1796  		if size == 1 {
	1797  			// Leading src fragment.
	1798  			src = subtractb(src, (off+7)/8)
	1799  			if frag := off & 7; frag != 0 {
	1800  				bits |= uintptr(*src) >> (8 - frag) << nbits
	1801  				src = add1(src)
	1802  				nbits += frag
	1803  				c -= frag
	1804  			}
	1805  			// Main loop: load one byte, write another.
	1806  			// The bits are rotating through the bit buffer.
	1807  			for i := c / 8; i > 0; i-- {
	1808  				bits |= uintptr(*src) << nbits
	1809  				src = add1(src)
	1810  				*dst = uint8(bits)
	1811  				dst = add1(dst)
	1812  				bits >>= 8
	1813  			}
	1814  			// Final src fragment.
	1815  			if c %= 8; c > 0 {
	1816  				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
	1817  				nbits += c
	1818  			}
	1819  		} else {
	1820  			// Leading src fragment.
	1821  			src = subtractb(src, (off+3)/4)
	1822  			if frag := off & 3; frag != 0 {
	1823  				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
	1824  				src = add1(src)
	1825  				nbits += frag
	1826  				c -= frag
	1827  			}
	1828  			// Main loop: load one byte, write another.
	1829  			// The bits are rotating through the bit buffer.
	1830  			for i := c / 4; i > 0; i-- {
	1831  				bits |= (uintptr(*src) & 0xf) << nbits
	1832  				src = add1(src)
	1833  				*dst = uint8(bits&0xf | bitScanAll)
	1834  				dst = add1(dst)
	1835  				bits >>= 4
	1836  			}
	1837  			// Final src fragment.
	1838  			if c %= 4; c > 0 {
	1839  				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
	1840  				nbits += c
	1841  			}
	1842  		}
	1843  	}
	1844  
	1845  	// Write any final bits out, using full-byte writes, even for the final byte.
	1846  	var totalBits uintptr
	1847  	if size == 1 {
	1848  		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
	1849  		nbits += -nbits & 7
	1850  		for ; nbits > 0; nbits -= 8 {
	1851  			*dst = uint8(bits)
	1852  			dst = add1(dst)
	1853  			bits >>= 8
	1854  		}
	1855  	} else {
	1856  		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*4 + nbits
	1857  		nbits += -nbits & 3
	1858  		for ; nbits > 0; nbits -= 4 {
	1859  			v := bits&0xf | bitScanAll
	1860  			*dst = uint8(v)
	1861  			dst = add1(dst)
	1862  			bits >>= 4
	1863  		}
	1864  	}
	1865  	return totalBits
	1866  }
	1867  
	1868  // materializeGCProg allocates space for the (1-bit) pointer bitmask
	1869  // for an object of size ptrdata. Then it fills that space with the
	1870  // pointer bitmask specified by the program prog.
	1871  // The bitmask starts at s.startAddr.
	1872  // The result must be deallocated with dematerializeGCProg.
	1873  func materializeGCProg(ptrdata uintptr, prog *byte) *mspan {
	1874  	// Each word of ptrdata needs one bit in the bitmap.
	1875  	bitmapBytes := divRoundUp(ptrdata, 8*sys.PtrSize)
	1876  	// Compute the number of pages needed for bitmapBytes.
	1877  	pages := divRoundUp(bitmapBytes, pageSize)
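	// (Editorial illustration: with 8-byte words and 8KB runtime pages,
	// ptrdata = 1<<20 needs (1<<20)/64 = 16384 bitmap bytes, i.e. 2 pages.)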
	1878  	s := mheap_.allocManual(pages, spanAllocPtrScalarBits)
	1879  	runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1)
	1880  	return s
	1881  }
	1882  func dematerializeGCProg(s *mspan) {
	1883  	mheap_.freeManual(s, spanAllocPtrScalarBits)
	1884  }
	1885  
	1886  func dumpGCProg(p *byte) {
	1887  	nptr := 0
	1888  	for {
	1889  		x := *p
	1890  		p = add1(p)
	1891  		if x == 0 {
	1892  			print("\t", nptr, " end\n")
	1893  			break
	1894  		}
	1895  		if x&0x80 == 0 {
	1896  			print("\t", nptr, " lit ", x, ":")
	1897  			n := int(x+7) / 8
	1898  			for i := 0; i < n; i++ {
	1899  				print(" ", hex(*p))
	1900  				p = add1(p)
	1901  			}
	1902  			print("\n")
	1903  			nptr += int(x)
	1904  		} else {
	1905  			nbit := int(x &^ 0x80)
	1906  			if nbit == 0 {
	1907  				for nb := uint(0); ; nb += 7 {
	1908  					x := *p
	1909  					p = add1(p)
	1910  					nbit |= int(x&0x7f) << nb
	1911  					if x&0x80 == 0 {
	1912  						break
	1913  					}
	1914  				}
	1915  			}
	1916  			count := 0
	1917  			for nb := uint(0); ; nb += 7 {
	1918  				x := *p
	1919  				p = add1(p)
	1920  				count |= int(x&0x7f) << nb
	1921  				if x&0x80 == 0 {
	1922  					break
	1923  				}
	1924  			}
	1925  			print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
	1926  			nptr += nbit * count
	1927  		}
	1928  	}
	1929  }
	1930  
	1931  // Testing.
	1932  
	1933  func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool {
	1934  	target := (*stkframe)(ctxt)
	1935  	if frame.sp <= target.sp && target.sp < frame.varp {
	1936  		*target = *frame
	1937  		return false
	1938  	}
	1939  	return true
	1940  }
	1941  
	1942  // gcbits returns the GC type info for x, for testing.
	1943  // The result is the bitmap entries (0 or 1), one entry per pointer-sized word, each stored in its own byte.
	1944  //go:linkname reflect_gcbits reflect.gcbits
	1945  func reflect_gcbits(x interface{}) []byte {
	1946  	ret := getgcmask(x)
	1947  	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
	1948  	nptr := typ.ptrdata / sys.PtrSize
	1949  	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
	1950  		ret = ret[:len(ret)-1]
	1951  	}
	1952  	return ret
	1953  }
	1954  
	1955  // Returns GC type info for the pointer stored in ep for testing.
	1956  // If ep points to the stack, only static liveness information is returned
	1957  // (i.e. it does not reflect stack objects that are live only dynamically).
	1958  func getgcmask(ep interface{}) (mask []byte) {
	1959  	e := *efaceOf(&ep)
	1960  	p := e.data
	1961  	t := e._type
	1962  	// data or bss
	1963  	for _, datap := range activeModules() {
	1964  		// data
	1965  		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
	1966  			bitmap := datap.gcdatamask.bytedata
	1967  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
	1968  			mask = make([]byte, n/sys.PtrSize)
	1969  			for i := uintptr(0); i < n; i += sys.PtrSize {
	1970  				off := (uintptr(p) + i - datap.data) / sys.PtrSize
	1971  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
	1972  			}
	1973  			return
	1974  		}
	1975  
	1976  		// bss
	1977  		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
	1978  			bitmap := datap.gcbssmask.bytedata
	1979  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
	1980  			mask = make([]byte, n/sys.PtrSize)
	1981  			for i := uintptr(0); i < n; i += sys.PtrSize {
	1982  				off := (uintptr(p) + i - datap.bss) / sys.PtrSize
	1983  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
	1984  			}
	1985  			return
	1986  		}
	1987  	}
	1988  
	1989  	// heap
	1990  	if base, s, _ := findObject(uintptr(p), 0, 0); base != 0 {
	1991  		hbits := heapBitsForAddr(base)
	1992  		n := s.elemsize
	1993  		mask = make([]byte, n/sys.PtrSize)
	1994  		for i := uintptr(0); i < n; i += sys.PtrSize {
	1995  			if hbits.isPointer() {
	1996  				mask[i/sys.PtrSize] = 1
	1997  			}
	1998  			if !hbits.morePointers() {
	1999  				mask = mask[:i/sys.PtrSize]
	2000  				break
	2001  			}
	2002  			hbits = hbits.next()
	2003  		}
	2004  		return
	2005  	}
	2006  
	2007  	// stack
	2008  	if _g_ := getg(); _g_.m.curg.stack.lo <= uintptr(p) && uintptr(p) < _g_.m.curg.stack.hi {
	2009  		var frame stkframe
	2010  		frame.sp = uintptr(p)
	2011  		_g_ := getg()
	2012  		gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
	2013  		if frame.fn.valid() {
	2014  			locals, _, _ := getStackMap(&frame, nil, false)
	2015  			if locals.n == 0 {
	2016  				return
	2017  			}
	2018  			size := uintptr(locals.n) * sys.PtrSize
	2019  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
	2020  			mask = make([]byte, n/sys.PtrSize)
	2021  			for i := uintptr(0); i < n; i += sys.PtrSize {
	2022  				off := (uintptr(p) + i - frame.varp + size) / sys.PtrSize
	2023  				mask[i/sys.PtrSize] = locals.ptrbit(off)
	2024  			}
	2025  		}
	2026  		return
	2027  	}
	2028  
	2029  	// otherwise, not something the GC knows about.
	2030  	// possibly read-only data, like malloc(0).
	2031  	// must not have pointers
	2032  	return
	2033  }
	2034  
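As an editorial footnote to getgcmask above: the data, bss, and stack branches all reduce to the same 1-bit mask lookup, reading bit (w % 8) of byte (w / 8) for word offset w. A minimal restatement (ptrBit is not a runtime function, just an illustration of that indexing):

	func ptrBit(mask []byte, w uintptr) byte {
		// Word w of the described memory is bit w%8 of mask byte w/8;
		// e.g. word 10 is bit 2 of mask[1].
		return (mask[w/8] >> (w % 8)) & 1
	}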
