memclr_amd64.s

Documentation: runtime

		 1// Copyright 2014 The Go Authors. All rights reserved.
		 2// Use of this source code is governed by a BSD-style
		 3// license that can be found in the LICENSE file.
		 4
		 5//go:build !plan9
		 6// +build !plan9
		 7
		 8#include "go_asm.h"
		 9#include "textflag.h"
		10
		11// See memclrNoHeapPointers Go doc for important implementation constraints.
		12
		13// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
		14// ABIInternal for performance.
		//
		// Writes zeros to the n bytes starting at ptr.
		//
		// Register roles once the entry sequence has run:
		//	DI  = address of the next byte to clear
		//	BX  = number of bytes still to clear
		//	AX  = 0, source operand for the 1/2/4/8-byte stores
		//	X15 = 0, source operand for the 16-byte MOVOU stores
		//	Y0  = 0, source operand for the 32-byte stores (AVX2 paths only)
		//
		// Sizes up to 256 bytes are cleared by straight-line code: a run of stores
		// anchored at the start of the region plus a run anchored at its end
		// (addressed as DI+BX-k). The two runs may overlap, so one code sequence
		// serves a whole range of sizes without a loop.
		// Larger sizes use a 256-byte-per-iteration MOVOU loop or, when the AVX2
		// feature byte is set, a 128-byte-per-iteration loop of 32-byte stores.
		15TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
		16#ifdef GOEXPERIMENT_regabiargs
		17	// AX = ptr
		18	// BX = n
		19	MOVQ	AX, DI	// DI = ptr
		20#else
		21	MOVQ	ptr+0(FP), DI	// DI = ptr, loaded from the argument frame
		22	MOVQ	n+8(FP), BX	// BX = n, loaded from the argument frame
		23#endif
		24	XORQ	AX, AX	// AX = 0 in both configurations (ptr was already copied to DI)
		25
		26	// MOVOU seems always faster than REP STOSQ.
		27tail:
		28	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
			// Dispatch on BX using unsigned comparisons. This label is also the
			// target of the JMP at the end of loop, which arrives with BX < 256.
		29	TESTQ	BX, BX
		30	JEQ	_0	// BX == 0
		31	CMPQ	BX, $2
		32	JBE	_1or2	// BX in [1, 2]
		33	CMPQ	BX, $4
		34	JBE	_3or4	// BX in [3, 4]
		35	CMPQ	BX, $8
		36	JB	_5through7	// BX in [5, 7]
		37	JE	_8	// BX == 8; reuses the flags set by the CMPQ above
		38	CMPQ	BX, $16
		39	JBE	_9through16	// BX in [9, 16]
		40#ifndef GOEXPERIMENT_regabig
		41	PXOR	X15, X15	// X15 = 0 for every MOVOU store below
		42#endif
			// NOTE(review): when GOEXPERIMENT_regabig is defined nothing in this
			// function zeroes X15, so X15 must already be zero on entry; presumably
			// ABIInternal guarantees that. Confirm against the internal ABI document.
		43	CMPQ	BX, $32
		44	JBE	_17through32	// BX in [17, 32]
		45	CMPQ	BX, $64
		46	JBE	_33through64	// BX in [33, 64]
		47	CMPQ	BX, $128
		48	JBE	_65through128	// BX in [65, 128]
		49	CMPQ	BX, $256
		50	JBE	_129through256	// BX in [129, 256]
		51	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1	// BX > 256 from here on
		52	JE loop_preheader_avx2	// feature byte == 1: take the AVX2 path
		53	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
		54
		55loop:
			// SSE path: sixteen 16-byte stores clear 256 bytes per iteration.
		56	MOVOU	X15, 0(DI)
		57	MOVOU	X15, 16(DI)
		58	MOVOU	X15, 32(DI)
		59	MOVOU	X15, 48(DI)
		60	MOVOU	X15, 64(DI)
		61	MOVOU	X15, 80(DI)
		62	MOVOU	X15, 96(DI)
		63	MOVOU	X15, 112(DI)
		64	MOVOU	X15, 128(DI)
		65	MOVOU	X15, 144(DI)
		66	MOVOU	X15, 160(DI)
		67	MOVOU	X15, 176(DI)
		68	MOVOU	X15, 192(DI)
		69	MOVOU	X15, 208(DI)
		70	MOVOU	X15, 224(DI)
		71	MOVOU	X15, 240(DI)
		72	SUBQ	$256, BX	// 256 fewer bytes remain
		73	ADDQ	$256, DI	// advance past the bytes just cleared
		74	CMPQ	BX, $256
		75	JAE	loop	// a full 256-byte chunk still remains
		76	JMP	tail	// BX < 256: let the dispatch above clear the remainder
		77
		78loop_preheader_avx2:
		79	VPXOR Y0, Y0, Y0	// Y0 = 0, source for the 32-byte stores
		80	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
		81	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
		82	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
		83	CMPQ		BX, $0x2000000	// 0x2000000 bytes = 32 MiB
		84	JAE		 loop_preheader_avx2_huge	// at or above the threshold: non-temporal path
		85loop_avx2:
			// Four 32-byte stores clear 128 bytes per iteration.
		86	VMOVDQU	Y0, 0(DI)
		87	VMOVDQU	Y0, 32(DI)
		88	VMOVDQU	Y0, 64(DI)
		89	VMOVDQU	Y0, 96(DI)
		90	SUBQ	$128, BX
		91	ADDQ	$128, DI
		92	CMPQ	BX, $128
		93	JAE	loop_avx2
			// BX < 128 here, and the loop ran at least once, so DI+BX-128 is still
			// inside the region. Clear the last 128 bytes addressed from the end
			// (DI+BX); any overlap with earlier stores only rewrites zeros.
		94	VMOVDQU	Y0, -32(DI)(BX*1)
		95	VMOVDQU	Y0, -64(DI)(BX*1)
		96	VMOVDQU	Y0, -96(DI)(BX*1)
		97	VMOVDQU	Y0, -128(DI)(BX*1)
		98	VZEROUPPER	// finished with the YMM registers
		99	RET
	 100loop_preheader_avx2_huge:
	 101	// Align to 32 byte boundary
	 102	VMOVDQU	Y0, 0(DI)	// unaligned store covers the 1..32 bytes skipped by the round-up below
	 103	MOVQ	DI, SI	// SI = unaligned start address
	 104	ADDQ	$32, DI
	 105	ANDQ	$~31, DI	// DI = first 32-byte boundary strictly above the old DI
	 106	SUBQ	DI, SI	// SI = old DI - new DI, a value in [-32, -1]
	 107	ADDQ	SI, BX	// reduce the remaining count by the bytes skipped
	 108loop_avx2_huge:
		// Same shape as loop_avx2, but with VMOVNTDQ stores to the aligned DI.
	 109	VMOVNTDQ	Y0, 0(DI)
	 110	VMOVNTDQ	Y0, 32(DI)
	 111	VMOVNTDQ	Y0, 64(DI)
	 112	VMOVNTDQ	Y0, 96(DI)
	 113	SUBQ	$128, BX
	 114	ADDQ	$128, DI
	 115	CMPQ	BX, $128
	 116	JAE	loop_avx2_huge
	 117	// In the description of MOVNTDQ in [1]
	 118	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	 119	// should be used in conjunction with MOVNTDQ instructions..."
	 120	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	 121	SFENCE
		// BX < 128: clear the last 128 bytes of the region with ordinary
		// unaligned stores addressed from its end (DI+BX).
	 122	VMOVDQU	Y0, -32(DI)(BX*1)
	 123	VMOVDQU	Y0, -64(DI)(BX*1)
	 124	VMOVDQU	Y0, -96(DI)(BX*1)
	 125	VMOVDQU	Y0, -128(DI)(BX*1)
	 126	VZEROUPPER	// finished with the YMM registers
	 127	RET
	 128
	 129_1or2:
	 130	MOVB	AX, (DI)	// first byte
	 131	MOVB	AX, -1(DI)(BX*1)	// last byte (the same byte when BX == 1)
	 132	RET
	 133_0:
	 134	RET
	 135_3or4:
	 136	MOVW	AX, (DI)	// first 2 bytes
	 137	MOVW	AX, -2(DI)(BX*1)	// last 2 bytes (overlaps the first store when BX == 3)
	 138	RET
	 139_5through7:
	 140	MOVL	AX, (DI)	// first 4 bytes
	 141	MOVL	AX, -4(DI)(BX*1)	// last 4 bytes (always overlaps the first store here)
	 142	RET
	 143_8:
	 144	// We need a separate case for 8 to make sure we clear pointers atomically.
	 145	MOVQ	AX, (DI)	// one 8-byte store
	 146	RET
	 147_9through16:
	 148	MOVQ	AX, (DI)	// first 8 bytes
	 149	MOVQ	AX, -8(DI)(BX*1)	// last 8 bytes (overlaps the first store when BX < 16)
	 150	RET
	 151_17through32:
	 152	MOVOU	X15, (DI)	// first 16 bytes
	 153	MOVOU	X15, -16(DI)(BX*1)	// last 16 bytes
	 154	RET
	 155_33through64:
		// First 32 bytes, then last 32 bytes.
	 156	MOVOU	X15, (DI)
	 157	MOVOU	X15, 16(DI)
	 158	MOVOU	X15, -32(DI)(BX*1)
	 159	MOVOU	X15, -16(DI)(BX*1)
	 160	RET
	 161_65through128:
		// First 64 bytes, then last 64 bytes.
	 162	MOVOU	X15, (DI)
	 163	MOVOU	X15, 16(DI)
	 164	MOVOU	X15, 32(DI)
	 165	MOVOU	X15, 48(DI)
	 166	MOVOU	X15, -64(DI)(BX*1)
	 167	MOVOU	X15, -48(DI)(BX*1)
	 168	MOVOU	X15, -32(DI)(BX*1)
	 169	MOVOU	X15, -16(DI)(BX*1)
	 170	RET
	 171_129through256:
		// First 128 bytes, then last 128 bytes.
	 172	MOVOU	X15, (DI)
	 173	MOVOU	X15, 16(DI)
	 174	MOVOU	X15, 32(DI)
	 175	MOVOU	X15, 48(DI)
	 176	MOVOU	X15, 64(DI)
	 177	MOVOU	X15, 80(DI)
	 178	MOVOU	X15, 96(DI)
	 179	MOVOU	X15, 112(DI)
	 180	MOVOU	X15, -128(DI)(BX*1)
	 181	MOVOU	X15, -112(DI)(BX*1)
	 182	MOVOU	X15, -96(DI)(BX*1)
	 183	MOVOU	X15, -80(DI)(BX*1)
	 184	MOVOU	X15, -64(DI)(BX*1)
	 185	MOVOU	X15, -48(DI)(BX*1)
	 186	MOVOU	X15, -32(DI)(BX*1)
	 187	MOVOU	X15, -16(DI)(BX*1)
	 188	RET
View as plain text