...

Text file src/math/big/arith_amd64.s

Documentation: math/big

		 1// Copyright 2009 The Go Authors. All rights reserved.
		 2// Use of this source code is governed by a BSD-style
		 3// license that can be found in the LICENSE file.
		 4
		 5//go:build !math_big_pure_go
		 6// +build !math_big_pure_go
		 7
		 8#include "textflag.h"
		 9
		10// This file provides fast assembly versions for the elementary
		11// arithmetic operations on vectors implemented in arith.go.
		12
		13// func mulWW(x, y Word) (z1, z0 Word)
		  // Multiplies x by y with one MULQ and returns the double-width product:
		  // z1 receives DX (the high word) and z0 receives AX (the low word).
		14TEXT ·mulWW(SB),NOSPLIT,$0
		15	MOVQ x+0(FP), AX	// AX is the implicit first operand of MULQ
		16	MULQ y+8(FP)	// DX:AX = AX * y
		17	MOVQ DX, z1+16(FP)	// z1 = high word
		18	MOVQ AX, z0+24(FP)	// z0 = low word
		19	RET
		20
		21
		22
		23// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
		24// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
		25// This is faster than using rotate instructions.
		26
		27// func addVV(z, x, y []Word) (c Word)
		  // Sets z[i] = x[i] + y[i] + carry for every i in [0, len(z)) and returns
		  // the final carry in c (0 or 1, 0 when len(z) is 0).
		  // Only z_len is read: x and y are indexed up to that length without any
		  // check of their own lengths.
		  //
		  // Register roles:
		  //   DI  = remaining element count n
		  //   SI  = element index i
		  //   R8  = &x[0], R9 = &y[0], R10 = &z[0]
		  //   CX  = saved carry between iterations (0 or -1)
		  //   R11-R14 = words being summed
		28TEXT ·addVV(SB),NOSPLIT,$0
		29	MOVQ z_len+8(FP), DI
		30	MOVQ x+24(FP), R8
		31	MOVQ y+48(FP), R9
		32	MOVQ z+0(FP), R10
		33
		34	MOVQ $0, CX		// c = 0
		35	MOVQ $0, SI		// i = 0
		36
		37	// s/JL/JMP/ below to disable the unrolled loop
		38	SUBQ $4, DI		// n -= 4
		39	JL V1			// if n < 0 goto V1
		40
		41U1:	// n >= 0
		42	// regular loop body unrolled 4x
		  	// The loads and stores are MOVQs, which leave the flags alone, so the
		  	// carry flows through the four ADCQs and is still valid at the SBBQ.
		43	ADDQ CX, CX		// restore CF
		44	MOVQ 0(R8)(SI*8), R11
		45	MOVQ 8(R8)(SI*8), R12
		46	MOVQ 16(R8)(SI*8), R13
		47	MOVQ 24(R8)(SI*8), R14
		48	ADCQ 0(R9)(SI*8), R11
		49	ADCQ 8(R9)(SI*8), R12
		50	ADCQ 16(R9)(SI*8), R13
		51	ADCQ 24(R9)(SI*8), R14
		52	MOVQ R11, 0(R10)(SI*8)
		53	MOVQ R12, 8(R10)(SI*8)
		54	MOVQ R13, 16(R10)(SI*8)
		55	MOVQ R14, 24(R10)(SI*8)
		56	SBBQ CX, CX		// save CF
		57
		  	// The loop bookkeeping below overwrites CF, which is why the carry
		  	// was parked in CX first.
		58	ADDQ $4, SI		// i += 4
		59	SUBQ $4, DI		// n -= 4
		60	JGE U1			// if n >= 0 goto U1
		61
		62V1:	ADDQ $4, DI		// n += 4
		63	JLE E1			// if n <= 0 goto E1
		64
		  	// Remainder loop: handles the last 1 to 3 words one at a time.
		65L1:	// n > 0
		66	ADDQ CX, CX		// restore CF
		67	MOVQ 0(R8)(SI*8), R11
		68	ADCQ 0(R9)(SI*8), R11
		69	MOVQ R11, 0(R10)(SI*8)
		70	SBBQ CX, CX		// save CF
		71
		72	ADDQ $1, SI		// i++
		73	SUBQ $1, DI		// n--
		74	JG L1			// if n > 0 goto L1
		75
		76E1:	NEGQ CX	// turn the saved carry (0 or -1) into 0 or 1
		77	MOVQ CX, c+72(FP)	// return c
		78	RET
		79
		80
		81// func subVV(z, x, y []Word) (c Word)
		82// (same as addVV except for SBBQ instead of ADCQ and label names)
		  // Sets z[i] = x[i] - y[i] - borrow for every i in [0, len(z)) and returns
		  // the final borrow in c (0 or 1, 0 when len(z) is 0).
		  // Only z_len is read: x and y are indexed up to that length without any
		  // check of their own lengths.
		  //
		  // Register roles:
		  //   DI  = remaining element count n
		  //   SI  = element index i
		  //   R8  = &x[0], R9 = &y[0], R10 = &z[0]
		  //   CX  = saved borrow between iterations (0 or -1)
		  //   R11-R14 = words being subtracted from
		83TEXT ·subVV(SB),NOSPLIT,$0
		84	MOVQ z_len+8(FP), DI
		85	MOVQ x+24(FP), R8
		86	MOVQ y+48(FP), R9
		87	MOVQ z+0(FP), R10
		88
		89	MOVQ $0, CX		// c = 0
		90	MOVQ $0, SI		// i = 0
		91
		92	// s/JL/JMP/ below to disable the unrolled loop
		93	SUBQ $4, DI		// n -= 4
		94	JL V2			// if n < 0 goto V2
		95
		96U2:	// n >= 0
		97	// regular loop body unrolled 4x
		  	// The loads and stores are MOVQs, which leave the flags alone, so the
		  	// borrow flows through the four SBBQs and is still valid when saved.
		98	ADDQ CX, CX		// restore CF
		99	MOVQ 0(R8)(SI*8), R11
	 100	MOVQ 8(R8)(SI*8), R12
	 101	MOVQ 16(R8)(SI*8), R13
	 102	MOVQ 24(R8)(SI*8), R14
	 103	SBBQ 0(R9)(SI*8), R11
	 104	SBBQ 8(R9)(SI*8), R12
	 105	SBBQ 16(R9)(SI*8), R13
	 106	SBBQ 24(R9)(SI*8), R14
	 107	MOVQ R11, 0(R10)(SI*8)
	 108	MOVQ R12, 8(R10)(SI*8)
	 109	MOVQ R13, 16(R10)(SI*8)
	 110	MOVQ R14, 24(R10)(SI*8)
	 111	SBBQ CX, CX		// save CF
	 112
	    	// The loop bookkeeping below overwrites CF, which is why the borrow
	    	// was parked in CX first.
	 113	ADDQ $4, SI		// i += 4
	 114	SUBQ $4, DI		// n -= 4
	 115	JGE U2			// if n >= 0 goto U2
	 116
	 117V2:	ADDQ $4, DI		// n += 4
	 118	JLE E2			// if n <= 0 goto E2
	 119
	    	// Remainder loop: handles the last 1 to 3 words one at a time.
	 120L2:	// n > 0
	 121	ADDQ CX, CX		// restore CF
	 122	MOVQ 0(R8)(SI*8), R11
	 123	SBBQ 0(R9)(SI*8), R11
	 124	MOVQ R11, 0(R10)(SI*8)
	 125	SBBQ CX, CX		// save CF
	 126
	 127	ADDQ $1, SI		// i++
	 128	SUBQ $1, DI		// n--
	 129	JG L2			// if n > 0 goto L2
	 130
	 131E2:	NEGQ CX	// turn the saved borrow (0 or -1) into 0 or 1
	 132	MOVQ CX, c+72(FP)	// return c
	 133	RET
	 134
	 135
	 136// func addVW(z, x []Word, y Word) (c Word)
	    // Adds the single word y to the vector x, writing the sum to z, and
	    // returns the carry out of the top word in c. When len(z) is 0 nothing
	    // is written and c is y itself.
	    // Vectors longer than 32 words are not handled here: the routine
	    // tail-jumps to addVWlarge, which is not defined in this section
	    // (presumably it takes the same arguments, to be confirmed there).
	    //
	    // Register roles:
	    //   DI  = remaining element count n
	    //   SI  = element index i
	    //   R8  = &x[0], R10 = &z[0]
	    //   CX  = word still to be added: y at first, then the carry as 0 or 1
	    //   R11-R14 = words being summed
	 137TEXT ·addVW(SB),NOSPLIT,$0
	 138	MOVQ z_len+8(FP), DI
	 139	CMPQ DI, $32
	 140	JG large	// len(z) > 32: hand off to addVWlarge
	 141	MOVQ x+24(FP), R8
	 142	MOVQ y+48(FP), CX	// c = y
	 143	MOVQ z+0(FP), R10
	 144
	 145	MOVQ $0, SI		// i = 0
	 146
	 147	// s/JL/JMP/ below to disable the unrolled loop
	 148	SUBQ $4, DI		// n -= 4
	 149	JL V3			// if n < 4 goto V3
	 150
	 151U3:	// n >= 0
	 152	// regular loop body unrolled 4x
	 153	MOVQ 0(R8)(SI*8), R11
	 154	MOVQ 8(R8)(SI*8), R12
	 155	MOVQ 16(R8)(SI*8), R13
	 156	MOVQ 24(R8)(SI*8), R14
	 157	ADDQ CX, R11	// x[i] + c
	 158	ADCQ $0, R12	// ripple the carry through the next three words
	 159	ADCQ $0, R13
	 160	ADCQ $0, R14
	 161	SBBQ CX, CX		// save CF
	 162	NEGQ CX	// c = carry as 0 or 1, ready to be added as a value
	 163	MOVQ R11, 0(R10)(SI*8)
	 164	MOVQ R12, 8(R10)(SI*8)
	 165	MOVQ R13, 16(R10)(SI*8)
	 166	MOVQ R14, 24(R10)(SI*8)
	 167
	 168	ADDQ $4, SI		// i += 4
	 169	SUBQ $4, DI		// n -= 4
	 170	JGE U3			// if n >= 0 goto U3
	 171
	 172V3:	ADDQ $4, DI		// n += 4
	 173	JLE E3			// if n <= 0 goto E3
	 174
	    	// Remainder loop: CX doubles as the accumulator, so the sum is
	    	// formed in CX, stored, and then CX is rebuilt from the carry flag.
	 175L3:	// n > 0
	 176	ADDQ 0(R8)(SI*8), CX
	 177	MOVQ CX, 0(R10)(SI*8)
	 178	SBBQ CX, CX		// save CF
	 179	NEGQ CX
	 180
	 181	ADDQ $1, SI		// i++
	 182	SUBQ $1, DI		// n--
	 183	JG L3			// if n > 0 goto L3
	 184
	 185E3:	MOVQ CX, c+56(FP)	// return c
	 186	RET
	 187large:
	 188	JMP ·addVWlarge(SB)
	 189
	 190
	 191// func subVW(z, x []Word, y Word) (c Word)
	 192// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
	    // Subtracts the single word y from the vector x, writing the difference
	    // to z, and returns the borrow out of the top word in c. When len(z) is
	    // 0 nothing is written and c is y itself.
	    // Vectors longer than 32 words are not handled here: the routine
	    // tail-jumps to subVWlarge, which is not defined in this section
	    // (presumably it takes the same arguments, to be confirmed there).
	    //
	    // Register roles:
	    //   DI  = remaining element count n
	    //   SI  = element index i
	    //   R8  = &x[0], R10 = &z[0]
	    //   CX  = word still to be subtracted: y at first, then the borrow as 0 or 1
	    //   R11-R14 = words being subtracted from
	 193TEXT ·subVW(SB),NOSPLIT,$0
	 194	MOVQ z_len+8(FP), DI
	 195	CMPQ DI, $32
	 196	JG large	// len(z) > 32: hand off to subVWlarge
	 197	MOVQ x+24(FP), R8
	 198	MOVQ y+48(FP), CX	// c = y
	 199	MOVQ z+0(FP), R10
	 200
	 201	MOVQ $0, SI		// i = 0
	 202
	 203	// s/JL/JMP/ below to disable the unrolled loop
	 204	SUBQ $4, DI		// n -= 4
	 205	JL V4			// if n < 4 goto V4
	 206
	 207U4:	// n >= 0
	 208	// regular loop body unrolled 4x
	 209	MOVQ 0(R8)(SI*8), R11
	 210	MOVQ 8(R8)(SI*8), R12
	 211	MOVQ 16(R8)(SI*8), R13
	 212	MOVQ 24(R8)(SI*8), R14
	 213	SUBQ CX, R11	// x[i] - c
	 214	SBBQ $0, R12	// ripple the borrow through the next three words
	 215	SBBQ $0, R13
	 216	SBBQ $0, R14
	 217	SBBQ CX, CX		// save CF
	 218	NEGQ CX	// c = borrow as 0 or 1, ready to be subtracted as a value
	 219	MOVQ R11, 0(R10)(SI*8)
	 220	MOVQ R12, 8(R10)(SI*8)
	 221	MOVQ R13, 16(R10)(SI*8)
	 222	MOVQ R14, 24(R10)(SI*8)
	 223
	 224	ADDQ $4, SI		// i += 4
	 225	SUBQ $4, DI		// n -= 4
	 226	JGE U4			// if n >= 0 goto U4
	 227
	 228V4:	ADDQ $4, DI		// n += 4
	 229	JLE E4			// if n <= 0 goto E4
	 230
	    	// Remainder loop: one word per iteration, x[i] - c, then CX is
	    	// rebuilt from the borrow flag.
	 231L4:	// n > 0
	 232	MOVQ 0(R8)(SI*8), R11
	 233	SUBQ CX, R11
	 234	MOVQ R11, 0(R10)(SI*8)
	 235	SBBQ CX, CX		// save CF
	 236	NEGQ CX
	 237
	 238	ADDQ $1, SI		// i++
	 239	SUBQ $1, DI		// n--
	 240	JG L4			// if n > 0 goto L4
	 241
	 242E4:	MOVQ CX, c+56(FP)	// return c
	 243	RET
	 244large:
	 245	JMP ·subVWlarge(SB)
	 246
	 247
	 248// func shlVU(z, x []Word, s uint) (c Word)
	    // Shifts the vector x left by s bits into z and returns in c the bits
	    // shifted out of the top word x[n-1]. Returns c = 0 without touching z
	    // when len(z) <= 0. Words are processed from the highest index down.
	    // In the comments below, ŝ stands for the complementary shift count
	    // (64 - s), and the three-operand SHLQ is the double-precision shift:
	    // it shifts DX left by CX, filling the vacated low bits from the top of AX.
	    //
	    // Register roles:
	    //   BX  = index i, counting down from n-1
	    //   R8  = &x[0], R10 = &z[0]
	    //   CX  = shift count s
	    //   AX  = w1, the lower-indexed word feeding bits upward
	    //   DX  = w, the word being assembled for z[i]
	 249TEXT ·shlVU(SB),NOSPLIT,$0
	 250	MOVQ z_len+8(FP), BX	// i = len(z)
	 251	SUBQ $1, BX		// i--
	 252	JL X8b			// i < 0	(n <= 0)
	 253
	 254	// n > 0
	 255	MOVQ z+0(FP), R10
	 256	MOVQ x+24(FP), R8
	 257	MOVQ s+48(FP), CX
	 258	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	 259	MOVQ $0, DX
	 260	SHLQ CX, AX, DX		// w1>>ŝ
	 261	MOVQ DX, c+56(FP)
	 262
	 263	CMPQ BX, $0
	 264	JLE X8a			// i <= 0
	 265
	 266	// i > 0
	 267L8:	MOVQ AX, DX		// w = w1
	 268	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	 269	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
	 270	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	 271	SUBQ $1, BX		// i--
	 272	JG L8			// i > 0
	 273
	 274	// i <= 0
	 275X8a:	SHLQ CX, AX		// w1<<s
	 276	MOVQ AX, (R10)		// z[0] = w1<<s
	 277	RET
	 278
	 279X8b:	MOVQ $0, c+56(FP)
	 280	RET
	 281
	 282
	 283// func shrVU(z, x []Word, s uint) (c Word)
	    // Shifts the vector x right by s bits into z and returns in c the bits
	    // shifted out of the bottom word x[0]. Returns c = 0 without touching z
	    // when len(z) <= 0. Words are processed from index 0 upward.
	    // In the comments below, ŝ stands for the complementary shift count
	    // (64 - s), and the three-operand SHRQ is the double-precision shift:
	    // it shifts DX right by CX, filling the vacated high bits from the bottom of AX.
	    //
	    // Register roles:
	    //   R11 = n-1, the index of the last word
	    //   BX  = index i, counting up from 0
	    //   R8  = &x[0], R10 = &z[0]
	    //   CX  = shift count s
	    //   AX  = w1, the higher-indexed word feeding bits downward
	    //   DX  = w, the word being assembled for z[i]
	 284TEXT ·shrVU(SB),NOSPLIT,$0
	 285	MOVQ z_len+8(FP), R11
	 286	SUBQ $1, R11		// n--
	 287	JL X9b			// n < 0	(n <= 0)
	 288
	 289	// n > 0
	 290	MOVQ z+0(FP), R10
	 291	MOVQ x+24(FP), R8
	 292	MOVQ s+48(FP), CX
	 293	MOVQ (R8), AX		// w1 = x[0]
	 294	MOVQ $0, DX
	 295	SHRQ CX, AX, DX		// w1<<ŝ
	 296	MOVQ DX, c+56(FP)
	 297
	 298	MOVQ $0, BX		// i = 0
	 299	JMP E9
	 300
	 301	// i < n-1
	 302L9:	MOVQ AX, DX		// w = w1
	 303	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	 304	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
	 305	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	 306	ADDQ $1, BX		// i++
	 307
	    	// R11 holds n-1 here, so the loop stops one word short and the
	    	// top word is finished separately at X9a.
	 308E9:	CMPQ BX, R11
	 309	JL L9			// i < n-1
	 310
	 311	// i >= n-1
	 312X9a:	SHRQ CX, AX		// w1>>s
	 313	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	 314	RET
	 315
	 316X9b:	MOVQ $0, c+56(FP)
	 317	RET
	 318
	 319
	 320// func mulAddVWW(z, x []Word, y, r Word) (c Word)
	    // Multiplies the vector x by the single word y with r as the initial
	    // carry: each step forms x[i]*y + c, stores the low word in z[i] and
	    // keeps the high word as the next c. Returns the final c, which is r
	    // itself when len(z) is 0.
	    //
	    // Register roles:
	    //   R8  = &x[0], R10 = &z[0]
	    //   R9  = y
	    //   R11 = n (len(z))
	    //   BX  = element index i
	    //   CX  = running carry word c
	    //   DX:AX = product written by MULQ
	 321TEXT ·mulAddVWW(SB),NOSPLIT,$0
	 322	MOVQ z+0(FP), R10
	 323	MOVQ x+24(FP), R8
	 324	MOVQ y+48(FP), R9
	 325	MOVQ r+56(FP), CX	// c = r
	 326	MOVQ z_len+8(FP), R11
	 327	MOVQ $0, BX		// i = 0
	 328
	 329	CMPQ R11, $4
	 330	JL E5	// fewer than 4 words: use only the one-word loop
	 331
	 332U5:	// i+4 <= n
	 333	// regular loop body unrolled 4x
	    	// Each of the four steps below is the same sequence as L5, with a
	    	// fixed displacement instead of an index increment.
	 334	MOVQ (0*8)(R8)(BX*8), AX
	 335	MULQ R9
	 336	ADDQ CX, AX
	 337	ADCQ $0, DX
	 338	MOVQ AX, (0*8)(R10)(BX*8)
	 339	MOVQ DX, CX
	 340	MOVQ (1*8)(R8)(BX*8), AX
	 341	MULQ R9
	 342	ADDQ CX, AX
	 343	ADCQ $0, DX
	 344	MOVQ AX, (1*8)(R10)(BX*8)
	 345	MOVQ DX, CX
	 346	MOVQ (2*8)(R8)(BX*8), AX
	 347	MULQ R9
	 348	ADDQ CX, AX
	 349	ADCQ $0, DX
	 350	MOVQ AX, (2*8)(R10)(BX*8)
	 351	MOVQ DX, CX
	 352	MOVQ (3*8)(R8)(BX*8), AX
	 353	MULQ R9
	 354	ADDQ CX, AX
	 355	ADCQ $0, DX
	 356	MOVQ AX, (3*8)(R10)(BX*8)
	 357	MOVQ DX, CX
	 358	ADDQ $4, BX		// i += 4
	 359
	    	// DX is free as scratch here because the carry was already moved to CX.
	 360	LEAQ 4(BX), DX
	 361	CMPQ DX, R11
	 362	JLE U5	// another full group of 4 still fits
	 363	JMP E5
	 364
	 365L5:	MOVQ (R8)(BX*8), AX
	 366	MULQ R9	// DX:AX = x[i] * y
	 367	ADDQ CX, AX	// add the incoming carry to the low word
	 368	ADCQ $0, DX	// and propagate into the high word
	 369	MOVQ AX, (R10)(BX*8)	// z[i] = low word
	 370	MOVQ DX, CX	// c = high word
	 371	ADDQ $1, BX		// i++
	 372
	 373E5:	CMPQ BX, R11		// i < n
	 374	JL L5
	 375
	 376	MOVQ CX, c+64(FP)
	 377	RET
	 378
	 379
	 380// func addMulVVW(z, x []Word, y Word) (c Word)
	    // Adds x*y to z in place: each step forms z[i] + x[i]*y + c, stores the
	    // low word back to z[i] and keeps the high word as the next c. Returns
	    // the final c (0 when len(z) is 0).
	    //
	    // Two implementations are selected by the byte flag support_adx:
	    //   - flag != 1: MULQ/ADDQ/ADCQ, unrolled 2x (A6) plus a one-word loop (L6).
	    //   - flag == 1: MULXQ/ADCXQ/ADOXQ, unrolled 8x (adx_loop) plus a
	    //     one-word loop (adx_short).
	    //
	    // Register roles, MULQ path:
	    //   R8 = &x[0], R10 = &z[0], R9 = y, R11 = n, R12 = n rounded down to even,
	    //   BX = index i, CX = carry word c, DX:AX = product.
	    // Register roles, ADX path:
	    //   R8/R10 = moving pointers into x/z, DX = y (implicit MULXQ operand),
	    //   R11 = n, R13 = n rounded down to a multiple of 8, BX = index i,
	    //   CX = carry word c, SI/AX = low product words, DI/CX = high product
	    //   words, R9 = zero.
	 381TEXT ·addMulVVW(SB),NOSPLIT,$0
	 382	CMPB		·support_adx(SB), $1
	 383	JEQ adx
	 384	MOVQ z+0(FP), R10
	 385	MOVQ x+24(FP), R8
	 386	MOVQ y+48(FP), R9
	 387	MOVQ z_len+8(FP), R11
	 388	MOVQ $0, BX		// i = 0
	 389	MOVQ $0, CX		// c = 0
	 390	MOVQ R11, R12
	 391	ANDQ $-2, R12	// R12 = n with the low bit cleared (end of the 2x loop)
	 392	CMPQ R11, $2
	 393	JAE A6
	 394	JMP E6
	 395
	    	// 2x unrolled body. For each word: add z[i] to the low product word,
	    	// then add c, folding each carry into the high word with ADCQ $0.
	 396A6:
	 397	MOVQ (R8)(BX*8), AX
	 398	MULQ R9
	 399	ADDQ (R10)(BX*8), AX
	 400	ADCQ $0, DX
	 401	ADDQ CX, AX
	 402	ADCQ $0, DX
	 403	MOVQ DX, CX
	 404	MOVQ AX, (R10)(BX*8)
	 405
	 406	MOVQ (8)(R8)(BX*8), AX
	 407	MULQ R9
	 408	ADDQ (8)(R10)(BX*8), AX
	 409	ADCQ $0, DX
	 410	ADDQ CX, AX
	 411	ADCQ $0, DX
	 412	MOVQ DX, CX
	 413	MOVQ AX, (8)(R10)(BX*8)
	 414
	 415	ADDQ $2, BX
	 416	CMPQ BX, R12
	 417	JL A6
	 418	JMP E6
	 419
	    	// One-word loop: here c is added to the low word first and the sum
	    	// is then added directly into z[i] in memory.
	 420L6:	MOVQ (R8)(BX*8), AX
	 421	MULQ R9
	 422	ADDQ CX, AX
	 423	ADCQ $0, DX
	 424	ADDQ AX, (R10)(BX*8)
	 425	ADCQ $0, DX
	 426	MOVQ DX, CX
	 427	ADDQ $1, BX		// i++
	 428
	 429E6:	CMPQ BX, R11		// i < n
	 430	JL L6
	 431
	 432	MOVQ CX, c+56(FP)
	 433	RET
	 434
	 435adx:
	 436	MOVQ z_len+8(FP), R11
	 437	MOVQ z+0(FP), R10
	 438	MOVQ x+24(FP), R8
	 439	MOVQ y+48(FP), DX	// y stays in DX, the implicit multiplicand of MULXQ
	 440	MOVQ $0, BX	 // i = 0
	 441	MOVQ $0, CX	 // carry
	 442	CMPQ R11, $8
	 443	JAE	adx_loop_header
	 444	CMPQ BX, R11
	 445	JL adx_short
	 446	MOVQ CX, c+56(FP)
	 447	RET
	 448
	 449adx_loop_header:
	 450	MOVQ	R11, R13
	 451	ANDQ	$-8, R13	// R13 = n rounded down to a multiple of 8
	    	// 8x unrolled body with two independent carry chains: ADCXQ carries
	    	// through CF and links each high product word into the next low word,
	    	// ADOXQ carries through OF and adds in the existing z word. The high
	    	// word alternates between DI and CX, so the last step leaves it in CX.
	 452adx_loop:
	 453	XORQ	R9, R9	// unset flags
	    	// The XORQ also leaves R9 = 0, which is used as a zero addend after
	    	// the eighth step.
	 454	MULXQ (R8), SI, DI
	 455	ADCXQ CX,SI
	 456	ADOXQ (R10), SI
	 457	MOVQ	SI,(R10)
	 458
	 459	MULXQ 8(R8), AX, CX
	 460	ADCXQ DI, AX
	 461	ADOXQ 8(R10), AX
	 462	MOVQ	AX, 8(R10)
	 463
	 464	MULXQ 16(R8), SI, DI
	 465	ADCXQ CX, SI
	 466	ADOXQ 16(R10), SI
	 467	MOVQ	SI, 16(R10)
	 468
	 469	MULXQ 24(R8), AX, CX
	 470	ADCXQ DI, AX
	 471	ADOXQ 24(R10), AX
	 472	MOVQ	AX, 24(R10)
	 473
	 474	MULXQ 32(R8), SI, DI
	 475	ADCXQ CX, SI
	 476	ADOXQ 32(R10), SI
	 477	MOVQ	SI, 32(R10)
	 478
	 479	MULXQ 40(R8), AX, CX
	 480	ADCXQ DI, AX
	 481	ADOXQ 40(R10), AX
	 482	MOVQ	AX, 40(R10)
	 483
	 484	MULXQ 48(R8), SI, DI
	 485	ADCXQ CX, SI
	 486	ADOXQ 48(R10), SI
	 487	MOVQ	SI, 48(R10)
	 488
	 489	MULXQ 56(R8), AX, CX
	 490	ADCXQ DI, AX
	 491	ADOXQ 56(R10), AX
	 492	MOVQ	AX, 56(R10)
	 493
	    	// Fold the leftover CF and OF into the carry word by adding zero
	    	// on each chain.
	 494	ADCXQ R9, CX
	 495	ADOXQ R9, CX
	 496
	    	// Advance both pointers by 8 words (64 bytes), BX only counts progress.
	 497	ADDQ $64, R8
	 498	ADDQ $64, R10
	 499	ADDQ $8, BX
	 500
	 501	CMPQ BX, R13
	 502	JL adx_loop
	    	// adx_short addresses elements as base + BX*8, so the pointers
	    	// advanced above are reset to the start of z and x first.
	 503	MOVQ z+0(FP), R10
	 504	MOVQ x+24(FP), R8
	 505	CMPQ BX, R11
	 506	JL adx_short
	 507	MOVQ CX, c+56(FP)
	 508	RET
	 509
	    	// One-word loop for the remaining (or fewer than 8) words, using
	    	// MULXQ for the product and the ordinary ADDQ/ADCQ carry handling.
	 510adx_short:
	 511	MULXQ (R8)(BX*8), SI, DI
	 512	ADDQ CX, SI
	 513	ADCQ $0, DI
	 514	ADDQ SI, (R10)(BX*8)
	 515	ADCQ $0, DI
	 516	MOVQ DI, CX
	 517	ADDQ $1, BX		// i++
	 518
	 519	CMPQ BX, R11
	 520	JL adx_short
	 521
	 522	MOVQ CX, c+56(FP)
	 523	RET
	 524
	 525
	 526

View as plain text