Text file
src/runtime/memclr_amd64.s
Documentation: runtime
1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !plan9
6// +build !plan9
7
8#include "go_asm.h"
9#include "textflag.h"
10
11// See memclrNoHeapPointers Go doc for important implementation constraints.
12
13// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
14// ABIInternal for performance.
//
// Stores zero to the n bytes starting at ptr.
//
// Register roles once the prologue has run:
//   DI  = current destination pointer
//   BX  = number of bytes still to clear
//   AX  = 0, source for the 1/2/4/8-byte stores
//   X15 = 0, source for the 16-byte MOVOU stores
//   Y0  = 0, source for the 32-byte stores (AVX2 paths only)
//
// Strategy: lengths up to 256 bytes are handled by straight-line cases.
// Each case writes a fixed group of stores forward from the start of the
// buffer and a same-sized group addressed backwards from its end (DI+BX);
// the two groups are allowed to overlap, so no branch on the exact length
// is needed inside a case. Longer lengths go through one of three loops:
// SSE (256 bytes/iteration), AVX2 (128 bytes/iteration), or AVX2 with
// non-temporal stores for very large clears.
15TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
16#ifdef GOEXPERIMENT_regabiargs
17 // AX = ptr (arguments arrive in registers in this configuration)
18 // BX = n
19 MOVQ AX, DI // DI = ptr; AX is reused as the zero source below
20#else
21 MOVQ ptr+0(FP), DI // DI = ptr, loaded from the argument frame
22 MOVQ n+8(FP), BX // BX = n
23#endif
24 XORQ AX, AX // AX = 0
25
26 // MOVOU seems always faster than REP STOSQ.
27tail:
// Size dispatch on BX. Entered once from the prologue and again from the
// SSE loop with whatever remainder (0..255 bytes) that loop leaves behind.
28 // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
29 TESTQ BX, BX
30 JEQ _0 // n == 0: nothing to clear
31 CMPQ BX, $2
32 JBE _1or2
33 CMPQ BX, $4
34 JBE _3or4
35 CMPQ BX, $8
36 JB _5through7 // this branch and the next both use the flags of the single compare with 8
37 JE _8
38 CMPQ BX, $16
39 JBE _9through16
// Everything past this point stores from X15, so it has to be zero.
// NOTE(review): when GOEXPERIMENT_regabig is defined X15 is not zeroed here,
// so it is presumably guaranteed zero on entry by Go's internal ABI — confirm.
40#ifndef GOEXPERIMENT_regabig
41 PXOR X15, X15 // X15 = 0
42#endif
43 CMPQ BX, $32
44 JBE _17through32
45 CMPQ BX, $64
46 JBE _33through64
47 CMPQ BX, $128
48 JBE _65through128
49 CMPQ BX, $256
50 JBE _129through256
// n > 256 from here on. Take the AVX2 loops when the X86.HasAVX2 byte is 1,
// otherwise fall through into the SSE loop.
51 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
52 JE loop_preheader_avx2
53 // TODO: for really big clears, use MOVNTDQ, even without AVX2.
54
55loop: // SSE loop: sixteen unaligned 16-byte stores clear 256 bytes per iteration
56 MOVOU X15, 0(DI)
57 MOVOU X15, 16(DI)
58 MOVOU X15, 32(DI)
59 MOVOU X15, 48(DI)
60 MOVOU X15, 64(DI)
61 MOVOU X15, 80(DI)
62 MOVOU X15, 96(DI)
63 MOVOU X15, 112(DI)
64 MOVOU X15, 128(DI)
65 MOVOU X15, 144(DI)
66 MOVOU X15, 160(DI)
67 MOVOU X15, 176(DI)
68 MOVOU X15, 192(DI)
69 MOVOU X15, 208(DI)
70 MOVOU X15, 224(DI)
71 MOVOU X15, 240(DI)
72 SUBQ $256, BX // 256 fewer bytes remain
73 ADDQ $256, DI // advance past the bytes just cleared
74 CMPQ BX, $256
75 JAE loop // another full 256-byte chunk is available
76 JMP tail // 0..255 bytes left: finish through the size dispatch
77
78loop_preheader_avx2:
79 VPXOR Y0, Y0, Y0 // Y0 = 0
80 // For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
81 // For larger sizes it is always faster, even on dual Xeons with 30M cache.
82 // TODO take into account actual LLC size. E.g. glibc uses LLC size/2.
83 CMPQ BX, $0x2000000 // threshold: 0x2000000 bytes = 32 MiB
84 JAE loop_preheader_avx2_huge
85loop_avx2: // AVX2 loop: four unaligned 32-byte stores clear 128 bytes per iteration
86 VMOVDQU Y0, 0(DI)
87 VMOVDQU Y0, 32(DI)
88 VMOVDQU Y0, 64(DI)
89 VMOVDQU Y0, 96(DI)
90 SUBQ $128, BX
91 ADDQ $128, DI
92 CMPQ BX, $128
93 JAE loop_avx2
// Fewer than 128 bytes remain. Clear the final 128 bytes of the buffer
// (ending at DI+BX) with four stores that may overlap what the loop already
// wrote. This path is only reached with n > 256, so the stores stay inside
// the buffer.
94 VMOVDQU Y0, -32(DI)(BX*1)
95 VMOVDQU Y0, -64(DI)(BX*1)
96 VMOVDQU Y0, -96(DI)(BX*1)
97 VMOVDQU Y0, -128(DI)(BX*1)
98 VZEROUPPER // zero the upper YMM halves before returning to non-AVX code
99 RET
100loop_preheader_avx2_huge:
101 // Align to 32 byte boundary
102 VMOVDQU Y0, 0(DI) // clear the first 32 bytes unaligned; this covers the bytes skipped by the round-up
103 MOVQ DI, SI // SI = original pointer
104 ADDQ $32, DI
105 ANDQ $~31, DI // DI = first 32-byte boundary strictly above the original pointer
106 SUBQ DI, SI // SI = original - aligned = -(bytes skipped), i.e. -32..-1
107 ADDQ SI, BX // BX -= bytes skipped
108loop_avx2_huge: // non-temporal loop: four 32-byte VMOVNTDQ stores to the aligned DI, 128 bytes per iteration
109 VMOVNTDQ Y0, 0(DI)
110 VMOVNTDQ Y0, 32(DI)
111 VMOVNTDQ Y0, 64(DI)
112 VMOVNTDQ Y0, 96(DI)
113 SUBQ $128, BX
114 ADDQ $128, DI
115 CMPQ BX, $128
116 JAE loop_avx2_huge
117 // In the description of MOVNTDQ in [1]
118 // "... fencing operation implemented with the SFENCE or MFENCE instruction
119 // should be used in conjunction with MOVNTDQ instructions..."
120 // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
121 SFENCE
// Same overlapping tail as the regular AVX2 loop: ordinary unaligned stores
// clear the final 128 bytes ending at DI+BX.
122 VMOVDQU Y0, -32(DI)(BX*1)
123 VMOVDQU Y0, -64(DI)(BX*1)
124 VMOVDQU Y0, -96(DI)(BX*1)
125 VMOVDQU Y0, -128(DI)(BX*1)
126 VZEROUPPER // zero the upper YMM halves before returning to non-AVX code
127 RET
128
// Fixed-size cases. In each one the second group of stores is addressed
// relative to the end of the buffer (DI+BX) and may overlap the first group.
129_1or2:
130 MOVB AX, (DI) // first byte
131 MOVB AX, -1(DI)(BX*1) // last byte (the same byte when n == 1)
132 RET
133_0:
134 RET
135_3or4:
136 MOVW AX, (DI) // first 2 bytes
137 MOVW AX, -2(DI)(BX*1) // last 2 bytes (overlaps the first store when n == 3)
138 RET
139_5through7:
140 MOVL AX, (DI) // first 4 bytes
141 MOVL AX, -4(DI)(BX*1) // last 4 bytes (always overlaps the first store here)
142 RET
143_8:
144 // We need a separate case for 8 to make sure we clear pointers atomically.
145 MOVQ AX, (DI) // one 8-byte store instead of two overlapping 4-byte stores
146 RET
147_9through16:
148 MOVQ AX, (DI) // first 8 bytes
149 MOVQ AX, -8(DI)(BX*1) // last 8 bytes
150 RET
151_17through32:
152 MOVOU X15, (DI) // first 16 bytes
153 MOVOU X15, -16(DI)(BX*1) // last 16 bytes
154 RET
155_33through64: // first 32 bytes, then last 32 bytes
156 MOVOU X15, (DI)
157 MOVOU X15, 16(DI)
158 MOVOU X15, -32(DI)(BX*1)
159 MOVOU X15, -16(DI)(BX*1)
160 RET
161_65through128: // first 64 bytes, then last 64 bytes
162 MOVOU X15, (DI)
163 MOVOU X15, 16(DI)
164 MOVOU X15, 32(DI)
165 MOVOU X15, 48(DI)
166 MOVOU X15, -64(DI)(BX*1)
167 MOVOU X15, -48(DI)(BX*1)
168 MOVOU X15, -32(DI)(BX*1)
169 MOVOU X15, -16(DI)(BX*1)
170 RET
171_129through256: // first 128 bytes, then last 128 bytes
172 MOVOU X15, (DI)
173 MOVOU X15, 16(DI)
174 MOVOU X15, 32(DI)
175 MOVOU X15, 48(DI)
176 MOVOU X15, 64(DI)
177 MOVOU X15, 80(DI)
178 MOVOU X15, 96(DI)
179 MOVOU X15, 112(DI)
180 MOVOU X15, -128(DI)(BX*1)
181 MOVOU X15, -112(DI)(BX*1)
182 MOVOU X15, -96(DI)(BX*1)
183 MOVOU X15, -80(DI)(BX*1)
184 MOVOU X15, -64(DI)(BX*1)
185 MOVOU X15, -48(DI)(BX*1)
186 MOVOU X15, -32(DI)(BX*1)
187 MOVOU X15, -16(DI)(BX*1)
188 RET
View as plain text