src/runtime/memmove_amd64.s
// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build !plan9
// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.
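// (In brief: pointer-aligned, pointer-sized words must be copied with
// indivisible writes so the garbage collector never observes a partially
// written pointer; see the move_8 case below, and the memmove Go doc for
// the full list of constraints.)
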
// func memmove(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
#ifdef GOEXPERIMENT_regabiargs
	// AX = to
	// BX = from
	// CX = n
	MOVQ	AX, DI
	MOVQ	BX, SI
	MOVQ	CX, BX
#else
	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX
#endif

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straight-line code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// The move_129through256 and smaller cases work whether or not the
	// source and destination memory regions overlap, because they load all
	// data into registers before writing any of it back. move_256through2048,
	// on the other hand, can be used only when the memory regions don't
	// overlap or the copy direction is forward.
	//
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
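	// The ladder above amounts to this Go-like pseudocode (an illustrative
	// sketch, not part of the build):
	//
	//	switch {
	//	case n == 0:
	//		return
	//	case n <= 2:
	//		goto move_1or2
	//	case n == 3:
	//		goto move_3
	//	case n == 4:
	//		goto move_4
	//	case n <= 7:
	//		goto move_5through7
	//	case n == 8:
	//		goto move_8
	//	case n <= 16:
	//		goto move_9through16
	//	// ... and so on, doubling up to 256 ...
	//	}
	//
	// Note the flag reuse: after CMPQ BX, $4, the JB takes n == 3 and the
	// following JBE takes n == 4 without a second compare; CMPQ BX, $8
	// similarly feeds both JB and JE.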

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET
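	// The alignment test above is, in Go-like pseudocode (illustrative
	// only):
	//
	//	if (uintptr(si)|uintptr(di))&7 != 0 {
	//		// At least one pointer is not 8-byte aligned. With ERMS,
	//		// byte-granular REP MOVSB is still fast, so copy n bytes
	//		// directly rather than splitting into qwords.
	//	}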

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail
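	// In effect (Go-like sketch, illustrative only):
	//
	//	words := n >> 3 // copied 8 bytes at a time by REP MOVSQ
	//	n &= 7          // 0..7 bytes left over
	//	goto tail       // the small-size ladder finishes the job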

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
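	// Together with the earlier CMPQ SI, DI / JLS, the test above is
	// (Go-like sketch, illustrative only):
	//
	//	if uintptr(si) <= uintptr(di) && uintptr(si)+n > uintptr(di) {
	//		// The source overlaps the start of the destination; a
	//		// forward copy would clobber source bytes before they
	//		// are read, so fall through and copy backward.
	//	}
	//
	// If si+n <= di the regions are disjoint and we branch back to the
	// forward path.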
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail
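	// With the direction flag set (STD), REP MOVSQ walks downward.
	// A Go-like sketch of this backward path (illustrative only):
	//
	//	si += n; di += n                  // one past the ends
	//	for words := n >> 3; words > 0; words-- {
	//		si -= 8; di -= 8
	//		*(*[8]byte)(di) = *(*[8]byte)(si)
	//	}
	//	n &= 7     // 0..7 leading bytes remain uncopied
	//	goto tail  // the small-size ladder handles them overlap-safely
	//
	// The ADDQ/SUBQ fixups after CLD rewind SI and DI to the start of
	// those remaining bytes before redispatching.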

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
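	// move_1or2 and the sized cases below all use the same trick: load the
	// first and last chunks of the region into registers, then store both.
	// The two loads may overlap or even coincide, but the result is correct
	// either way, and because every load happens before any store, the
	// trick is safe for overlapping src/dst. A Go-like sketch (illustrative
	// only):
	//
	//	a := src[0]
	//	b := src[n-1]
	//	dst[0] = a
	//	dst[n-1] = b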
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
#ifdef GOEXPERIMENT_regabig
	// X15 must be zero on return
	PXOR	X15, X15
#endif
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
#ifdef GOEXPERIMENT_regabig
	// X15 must be zero on return
	PXOR	X15, X15
#endif
	JMP	tail
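	// Loop accounting above (Go-like sketch, illustrative only):
	//
	//	for {
	//		n -= 256
	//		copy 256 bytes via sixteen MOVOU load/store pairs
	//		src += 256; dst += 256
	//		if n < 256 {
	//			break
	//		}
	//	}
	//	goto tail // finish the remaining 0..255 bytes
	//
	// The CMPQ sits before the LEAQs because LEAQ does not modify flags,
	// so the JGE still tests the updated count.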

avxUnaligned:
	// There are two implementations of the move algorithm:
	// one for non-overlapping memory regions (forward copying),
	// and one for overlapping regions (backward copying).
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward
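	// The single unsigned compare handles both orderings (Go-like sketch,
	// illustrative only):
	//
	//	if uintptr(di)-uintptr(si) < n {
	//		goto copy_backward
	//	}
	//
	// When DI is below SI the subtraction wraps to a huge unsigned value,
	// so the test fails and the forward path is used, which is safe even
	// for overlap when the destination precedes the source.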

	// Non-temporal copy would be better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Store the head in its new place
	// 5. Store the tail in its new place
	// For small sizes it can be important to satisfy the processor's
	// pipeline, since the cost of copying the unaligned head and tail is
	// comparable with the cost of the main loop; the code below therefore
	// interleaves those steps. A cleaner implementation of the same
	// algorithm, for bigger sizes where the cost of the unaligned parts is
	// negligible, follows the gobble_big_data_fwd label.
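	// The head/tail bookkeeping below computes, in Go-like pseudocode
	// (illustrative only):
	//
	//	r11 := ((di &^ 31) + 32) - di // 1..32 bytes to the next 32-byte
	//	                              // destination boundary
	//	si += r11; di += r11          // body copy is destination-aligned
	//	n -= r11 + 128                // head (r11 bytes) and tail (128
	//	                              // bytes) are stored separately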
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we reach back into it with
	// negative offsets.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy count, since the unaligned head is handled separately.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be stored in place after the main body is copied.
	// Now save the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying loop:
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
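	// Loop-exit accounting: the final SUBQ leaves BX equal to the bytes
	// still unwritten minus 0x80. Adding AX (0x80) back gives the distance
	// from the current DI to the end of the destination, and adding DI
	// then turns BX into a pointer one past that end, so the tail saved in
	// X5..X12 can be stored at -0x80(BX)..-0x10(BX).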
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal store
	// instructions. The details of the algorithm are commented above for
	// the small-size version.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA	0x1C0(SI)
	PREFETCHNTA	0x280(SI)
	// Prefetch distances were chosen empirically, following the approach
	// to prefetch usage described in section 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ	Y0, (DI)
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// Non-temporal stores are weakly ordered and don't follow the normal
	// memory-ordering rules, so an SFENCE is needed here to make the
	// copied data visible to other cores in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET


copy_backward:
	MOVQ	DI, AX
	// Backward copying works much like the forward version. First, save
	// the unaligned tail at the beginning of the region (it is stored
	// last).
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
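	// Alignment trick above: R11 holds DI & 0x1F, so the XORQ clears those
	// low bits, rounding the end-of-destination pointer down to a 32-byte
	// boundary (equivalent to DI &^= 31). The VMOVDQA stores in the loops
	// below then land on 32-byte-aligned addresses.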
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Store the unaligned head and tail.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA	-0x1C0(SI)
	PREFETCHNTA	-0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET