
Text file src/runtime/memmove_amd64.s


// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build !plan9
// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
#ifdef GOEXPERIMENT_regabiargs
	// AX = to
	// BX = from
	// CX = n
	MOVQ	AX, DI
	MOVQ	BX, SI
	MOVQ	CX, BX
#else
	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX
#endif

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straight-line code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 and smaller work whether or not the source and the
	// destination memory regions overlap, because they load all the data into
	// registers before writing any of it back. move_256through2048, on the
	// other hand, can be used only when the memory regions don't overlap or
	// the copy direction is forward.
	//
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
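
	// The ladder above is a chain of unsigned compares. A hedged Go sketch
	// of the same dispatch (illustration only, not runtime code):
	//
	//	switch {
	//	case n == 0:
	//		return
	//	case n <= 2:
	//		// move_1or2
	//	case n < 4:
	//		// move_3
	//	case n == 4:
	//		// move_4
	//	case n < 8:
	//		// move_5through7
	//	case n == 8:
	//		// move_8 (atomic pointer-sized write)
	//	case n <= 16:
	//		// move_9through16
	//	case n <= 32:
	//		// move_17through32
	//	// ... and so on, doubling up to move_129through256
	//	}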

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET
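
	// A hedged Go sketch of the alignment test above (illustration only):
	// ORing the two addresses and masking the low three bits checks both
	// pointers for 8-byte alignment in one step.
	//
	//	if (uintptr(src)|uintptr(dst))&7 != 0 {
	//		// at least one pointer is misaligned: with ERMS,
	//		// REP MOVSB handles that well, one byte at a time
	//	}
	//	// both 8-byte aligned: fall through to fwdBy8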
	 112
	 113fwdBy8:
	 114	// Do 8 bytes at a time
	 115	MOVQ	BX, CX
	 116	SHRQ	$3, CX
	 117	ANDQ	$7, BX
	 118	REP;	MOVSQ
	 119	JMP	tail
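
	// fwdBy8 splits the count into a quadword quotient for REP MOVSQ and a
	// byte remainder that re-enters the small-size dispatch. A hedged sketch:
	//
	//	q := n >> 3 // quadwords copied by REP MOVSQ (CX)
	//	n &= 7      // 0..7 leftover bytes, finished at tail (BX)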

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail
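
	// A hedged Go sketch of the backward path above: we only get here when
	// src <= dst, and we still prefer a forward copy unless the regions
	// truly overlap.
	//
	//	if src+n <= dst {
	//		// no overlap: forward copy is safe
	//	} else {
	//		// overlap with dst ahead of src: set the direction flag (STD)
	//		// and REP MOVSQ down from the high ends of both regions
	//	}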

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
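
	// The fixed-size cases above all use one overlapping-window trick:
	// load the first and last k bytes into registers, then store both.
	// For k <= n <= 2k the two windows cover every byte, and since all
	// loads happen before any store, overlapping src/dst is safe. A hedged
	// Go sketch of move_9through16 (illustration only):
	//
	//	a := *(*uint64)(src)                  // first 8 bytes
	//	b := *(*uint64)(unsafe.Add(src, n-8)) // last 8 bytes
	//	*(*uint64)(dst) = a
	//	*(*uint64)(unsafe.Add(dst, n-8)) = b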

move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
#ifdef GOEXPERIMENT_regabig
	// X15 must be zero on return
	PXOR	X15, X15
#endif
	RET
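
	// Note on the PXOR above: under the register-ABI experiments,
	// ABIInternal treats X15 as a fixed zero register on amd64, so a
	// routine that clobbers it must zero it again before returning.
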
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
#ifdef GOEXPERIMENT_regabig
	// X15 must be zero on return
	PXOR	X15, X15
#endif
	JMP	tail
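
	// A hedged Go sketch of the loop above: peel off 256-byte blocks with
	// sixteen 16-byte register copies per iteration, then let the tail
	// dispatch finish the remainder (illustration only):
	//
	//	for n > 256 {
	//		copy256(dst, src) // the sixteen MOVOU load/store pairs
	//		src, dst, n = src+256, dst+256, n-256
	//	}
	//	// n <= 256 now; handled by the small-size cases at tail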

avxUnaligned:
	// There are two implementations of the move algorithm:
	// the first, for non-overlapping memory regions, copies forward;
	// the second, for overlapping regions, copies backward.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward
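
	// The JC above exploits unsigned wraparound: dst-src computed as an
	// unsigned number is huge whenever dst < src, so one unsigned compare
	// against n answers "does dst land inside [src, src+n)?" for both
	// orderings. A hedged Go sketch (illustration only):
	//
	//	if uintptr(dst)-uintptr(src) < n {
	//		// dst is ahead of src and inside the source region:
	//		// copy backward
	//	}
	//	// otherwise a forward copy never reads a byte it has written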

	// Non-temporal copy would be better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of the body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// It can be important to satisfy the processor's pipeline requirements
	// for small sizes, because the cost of copying the unaligned regions is
	// comparable to the cost of the main loop, so the code here is slightly
	// tangled. There is a cleaner implementation of this algorithm for
	// bigger sizes, where the cost of copying the unaligned parts is
	// negligible. You can see it after the gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly.
	// We will use negative offsets there.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy count, now that the unaligned head is
	// accounted for.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in place after the main body is copied.
	// It's time for the unaligned heading part.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying here
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET
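
	// A hedged sketch of the head/body/tail scheme above (names are
	// hypothetical, for illustration only). Both unaligned edges are read
	// into registers before the aligned body is copied, so nothing is lost
	// when the body copy slides over them:
	//
	//	tail := load128(src + n - 128)     // X5..X12
	//	head := load32(src)                // Y4
	//	r11 := alignUp32(dst) - dst        // distance to aligned dst
	//	copyAligned(dst+r11, src+r11, ...) // gobble_128_loop
	//	store32(dst, head)                 // step 4
	//	store128(dst+n-128, tail)          // step 5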

gobble_big_data_fwd:
	// Forward copying for big regions.
	// It uses non-temporal mov instructions.
	// Details of this algorithm are commented previously for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// Prefetch distances were chosen empirically.
	// The approach to prefetch usage follows 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data visible in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET
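
	// Why non-temporal stores for copies of 1MB and up: a region that big
	// will not stay resident in cache anyway, so VMOVNTDQ bypasses the
	// cache and avoids evicting data the caller still needs. The SFENCE is
	// what orders the streaming stores with subsequent ordinary accesses.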

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as the forward one.
	// First we load the unaligned tail at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA		gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA		gobble_mem_bwd_loop
	// Let's store the unaligned data
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
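
	// A note on the alignment trick above: R11 = DI & 0x1F captures the
	// low five bits of the end-of-destination pointer, and XORing a value
	// with its own low bits clears them, so XORQ R11, DI rounds DI down to
	// a 32-byte boundary (the backward twin of the ANDQ $-32 / ADDQ $32
	// sequence in the forward path).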

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
