Text file
src/math/big/arith_amd64.s
1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go
6// +build !math_big_pure_go
7
8#include "textflag.h"
9
10// This file provides fast assembly versions for the elementary
11// arithmetic operations on vectors implemented in arith.go.
12
// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW returns the double-word unsigned product of x and y:
// z1 receives the high word and z0 the low word.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ y+8(FP), AX	// AX = y
	MULQ x+0(FP)		// DX:AX = y * x (unsigned)
	MOVQ AX, z0+24(FP)	// z0 = low word of the product
	MOVQ DX, z1+16(FP)	// z1 = high word of the product
	RET
20
21
22
23// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
24// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
25// This is faster than using rotate instructions.
26
// func addVV(z, x, y []Word) (c Word)
//
// addVV sets z[i] = x[i] + y[i] + carry for every i in [0, len(z)) and
// returns the carry out of the last word (0 or 1). Only len(z) is read;
// x and y are indexed with the same i as z.
//
// Between instructions that clobber the flags, the running carry is parked
// in CX as 0 or -1: SBBQ CX, CX captures CF and ADDQ CX, CX re-creates it.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10	// R10 = base of z
	MOVQ z_len+8(FP), DI	// DI  = n = len(z)
	MOVQ x+24(FP), R8	// R8  = base of x
	MOVQ y+48(FP), R9	// R9  = base of y

	XORQ SI, SI		// i = 0
	XORQ CX, CX		// parked carry = 0

	// To bypass the unrolled loop, turn the JLT below into a JMP.
	SUBQ $4, DI		// n -= 4
	JLT tail_check		// fewer than 4 words in total

loop4:
	// At least 4 words remain (n >= 0): process x[i..i+3] in one carry chain.
	ADDQ CX, CX		// CF = parked carry
	MOVQ (R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ (R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, (R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// park CF as 0 / -1

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE loop4		// another full group of 4 is available

tail_check:
	ADDQ $4, DI		// undo the bias: DI = words still to process
	JLE done		// none left

tail:
	// One word per iteration while n > 0.
	ADDQ CX, CX		// CF = parked carry
	MOVQ (R8)(SI*8), R11
	ADCQ (R9)(SI*8), R11
	MOVQ R11, (R10)(SI*8)
	SBBQ CX, CX		// park CF as 0 / -1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JGT tail

done:
	NEGQ CX			// 0 / -1  ->  0 / 1
	MOVQ CX, c+72(FP)	// return c
	RET
79
80
// func subVV(z, x, y []Word) (c Word)
//
// subVV sets z[i] = x[i] - y[i] - borrow for every i in [0, len(z)) and
// returns the borrow out of the last word (0 or 1). Only len(z) is read;
// x and y are indexed with the same i as z.
//
// The structure mirrors addVV with SBBQ in place of ADCQ. Between
// instructions that clobber the flags, the running borrow is parked in CX
// as 0 or -1: SBBQ CX, CX captures CF and ADDQ CX, CX re-creates it.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10	// R10 = base of z
	MOVQ z_len+8(FP), DI	// DI  = n = len(z)
	MOVQ x+24(FP), R8	// R8  = base of x
	MOVQ y+48(FP), R9	// R9  = base of y

	XORQ SI, SI		// i = 0
	XORQ CX, CX		// parked borrow = 0

	// To bypass the unrolled loop, turn the JLT below into a JMP.
	SUBQ $4, DI		// n -= 4
	JLT tail_check		// fewer than 4 words in total

loop4:
	// At least 4 words remain (n >= 0): process x[i..i+3] in one borrow chain.
	ADDQ CX, CX		// CF = parked borrow
	MOVQ (R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ (R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, (R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// park CF as 0 / -1

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE loop4		// another full group of 4 is available

tail_check:
	ADDQ $4, DI		// undo the bias: DI = words still to process
	JLE done		// none left

tail:
	// One word per iteration while n > 0.
	ADDQ CX, CX		// CF = parked borrow
	MOVQ (R8)(SI*8), R11
	SBBQ (R9)(SI*8), R11
	MOVQ R11, (R10)(SI*8)
	SBBQ CX, CX		// park CF as 0 / -1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JGT tail

done:
	NEGQ CX			// 0 / -1  ->  0 / 1
	MOVQ CX, c+72(FP)	// return c
	RET
134
135
// func addVW(z, x []Word, y Word) (c Word)
//
// addVW adds the single word y to the vector x, storing the sum in z, and
// returns the carry out of the last word. y acts as the carry into z[0];
// when len(z) is 0 the value returned is y itself.
//
// When len(z) > 32 control is handed to addVWlarge with a tail jump
// (presumably the same operation tuned for long vectors; it is defined
// outside this file).
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI	// DI = n = len(z)
	CMPQ DI, $32
	JGT big_input		// n > 32

	MOVQ z+0(FP), R10	// R10 = base of z
	MOVQ x+24(FP), R8	// R8  = base of x
	MOVQ y+48(FP), CX	// c = y

	XORQ SI, SI		// i = 0

	// To bypass the unrolled loop, turn the JLT below into a JMP.
	SUBQ $4, DI		// n -= 4
	JLT tail_check		// fewer than 4 words in total

loop4:
	// At least 4 words remain (n >= 0): add c into x[i] and ripple the
	// carry through x[i+1..i+3].
	MOVQ (R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11
	ADCQ $0, R12
	ADCQ $0, R13
	ADCQ $0, R14
	MOVQ R11, (R10)(SI*8)	// MOVQ leaves CF untouched
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// CX = -CF
	NEGQ CX			// c = carry out (0 or 1)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE loop4		// another full group of 4 is available

tail_check:
	ADDQ $4, DI		// undo the bias: DI = words still to process
	JLE done		// none left

tail:
	// One word per iteration while n > 0.
	ADDQ (R8)(SI*8), CX	// CX = x[i] + c, CF = carry out
	MOVQ CX, (R10)(SI*8)	// z[i] = sum
	SBBQ CX, CX		// CX = -CF
	NEGQ CX			// c = carry out (0 or 1)

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JGT tail

done:
	MOVQ CX, c+56(FP)	// return c
	RET

big_input:
	JMP ·addVWlarge(SB)
189
190
// func subVW(z, x []Word, y Word) (c Word)
//
// subVW subtracts the single word y from the vector x, storing the
// difference in z, and returns the borrow out of the last word. y acts as
// the borrow into z[0]; when len(z) is 0 the value returned is y itself.
//
// The structure mirrors addVW with SUBQ/SBBQ in place of ADDQ/ADCQ.
// When len(z) > 32 control is handed to subVWlarge with a tail jump
// (presumably the same operation tuned for long vectors; it is defined
// outside this file).
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI	// DI = n = len(z)
	CMPQ DI, $32
	JGT big_input		// n > 32

	MOVQ z+0(FP), R10	// R10 = base of z
	MOVQ x+24(FP), R8	// R8  = base of x
	MOVQ y+48(FP), CX	// c = y

	XORQ SI, SI		// i = 0

	// To bypass the unrolled loop, turn the JLT below into a JMP.
	SUBQ $4, DI		// n -= 4
	JLT tail_check		// fewer than 4 words in total

loop4:
	// At least 4 words remain (n >= 0): subtract c from x[i] and ripple
	// the borrow through x[i+1..i+3].
	MOVQ (R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11
	SBBQ $0, R12
	SBBQ $0, R13
	SBBQ $0, R14
	MOVQ R11, (R10)(SI*8)	// MOVQ leaves CF untouched
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// CX = -CF
	NEGQ CX			// c = borrow out (0 or 1)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE loop4		// another full group of 4 is available

tail_check:
	ADDQ $4, DI		// undo the bias: DI = words still to process
	JLE done		// none left

tail:
	// One word per iteration while n > 0.
	MOVQ (R8)(SI*8), R11
	SUBQ CX, R11		// R11 = x[i] - c, CF = borrow out
	MOVQ R11, (R10)(SI*8)	// z[i] = difference
	SBBQ CX, CX		// CX = -CF
	NEGQ CX			// c = borrow out (0 or 1)

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JGT tail

done:
	MOVQ CX, c+56(FP)	// return c
	RET

big_input:
	JMP ·subVWlarge(SB)
246
247
// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU shifts the vector x left by s bits into z, working from the most
// significant word down to z[0]. It returns the bits shifted out of the
// top word x[len(z)-1]. For an empty z it returns 0 and writes nothing.
//
// The double-register form SHLQ CX, AX, DX shifts DX left by CX bits and
// fills the vacated low bits with the top bits of AX.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i = n-1
	JLT empty		// n <= 0

	// n > 0
	MOVQ s+48(FP), CX	// CX  = shift count
	MOVQ x+24(FP), R8	// R8  = base of x
	MOVQ z+0(FP), R10	// R10 = base of z
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	XORQ DX, DX
	SHLQ CX, AX, DX		// DX = the top s bits of w1
	MOVQ DX, c+56(FP)	// c = bits shifted out of the top word

	TESTQ BX, BX
	JLE lowest		// i <= 0: only z[0] is left

shift:
	// i > 0: combine x[i] with the top bits of x[i-1].
	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, AX, DX		// w<<s | top s bits of w1
	MOVQ DX, (R10)(BX*8)	// z[i] = combined word
	SUBQ $1, BX		// i--
	JGT shift		// while i > 0

lowest:
	// i <= 0: the lowest word is filled with zeros from the right.
	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

empty:
	MOVQ $0, c+56(FP)	// c = 0
	RET
281
282
// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU shifts the vector x right by s bits into z, working from z[0] up
// to the most significant word. It returns the bits shifted out of x[0],
// positioned at the high end of the result word. For an empty z it
// returns 0 and writes nothing.
//
// The double-register form SHRQ CX, AX, DX shifts DX right by CX bits and
// fills the vacated high bits with the low bits of AX.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// R11 = n-1
	JLT empty		// n <= 0

	// n > 0
	MOVQ s+48(FP), CX	// CX  = shift count
	MOVQ x+24(FP), R8	// R8  = base of x
	MOVQ z+0(FP), R10	// R10 = base of z
	MOVQ (R8), AX		// w1 = x[0]
	XORQ DX, DX
	SHRQ CX, AX, DX		// DX = the low s bits of w1, moved to the top
	MOVQ DX, c+56(FP)	// c = bits shifted out of the bottom word

	XORQ BX, BX		// i = 0
	JMP check

shift:
	// i < n-1: combine x[i] with the low bits of x[i+1].
	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, AX, DX		// w>>s | low s bits of w1 at the top
	MOVQ DX, (R10)(BX*8)	// z[i] = combined word
	ADDQ $1, BX		// i++

check:
	CMPQ BX, R11
	JLT shift		// while i < n-1

	// i >= n-1: the top word is filled with zeros from the left.
	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

empty:
	MOVQ $0, c+56(FP)	// c = 0
	RET
318
319
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes z = x*y + r one word at a time. The running carry c
// starts as r; for each i in [0, len(z)), z[i] receives the low word of
// x[i]*y + c and c becomes the high word. The final c is returned.
//
// Register use: R8 = base of x, R10 = base of z, R9 = y, R11 = n,
// BX = i, CX = c; AX and DX are consumed by MULQ.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11	// n = len(z)
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	XORQ BX, BX		// i = 0

	CMPQ R11, $4
	JLT tail_check		// n < 4: skip the unrolled loop

loop4:
	// i+4 <= n: four multiply-add steps, each feeding its high word
	// into the next step as the carry.
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (R10)(BX*8)

	MOVQ 8(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i+1] * y
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, 8(R10)(BX*8)

	MOVQ 16(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i+2] * y
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, 16(R10)(BX*8)

	MOVQ 24(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i+3] * y
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, 24(R10)(BX*8)

	ADDQ $4, BX		// i += 4
	LEAQ 4(BX), DX		// DX = i+4 (LEAQ leaves the flags alone)
	CMPQ DX, R11
	JLE loop4		// while i+4 <= n

tail_check:
	CMPQ BX, R11
	JGE done		// i >= n: nothing left

tail:
	// One word per iteration while i < n.
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (R10)(BX*8)
	ADDQ $1, BX		// i++
	CMPQ BX, R11
	JLT tail

done:
	MOVQ CX, c+64(FP)	// return c
	RET
378
379
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW computes z += x*y one word at a time over i in [0, len(z)):
// z[i] receives the low word of z[i] + x[i]*y + c and c becomes the high
// word. c starts at 0 and its final value is returned.
//
// Two implementations are selected at run time by the byte support_adx:
//   - generic: MULQ based, two words per iteration plus a one-word tail;
//   - adx:     MULXQ with the independent ADCXQ (CF) and ADOXQ (OF) carry
//              chains, eight words per iteration plus a one-word tail.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	CMPB ·support_adx(SB), $1
	JEQ adx

	// ---- generic path ----
	// R8 = base of x, R10 = base of z, R9 = y, R11 = n, BX = i, CX = c.
	MOVQ z_len+8(FP), R11
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	XORQ BX, BX		// i = 0
	XORQ CX, CX		// c = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n rounded down to a multiple of 2
	CMPQ R11, $2
	JCS tail_check		// n < 2 (unsigned): no pair to process

pairs:
	// Two words per iteration while i < R12.
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ (R10)(BX*8), AX	// + z[i]
	ADCQ $0, DX
	ADDQ CX, AX		// + c
	ADCQ $0, DX
	MOVQ DX, CX		// c = high word
	MOVQ AX, (R10)(BX*8)	// z[i] = low word

	MOVQ 8(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i+1] * y
	ADDQ 8(R10)(BX*8), AX	// + z[i+1]
	ADCQ $0, DX
	ADDQ CX, AX		// + c
	ADCQ $0, DX
	MOVQ DX, CX		// c = high word
	MOVQ AX, 8(R10)(BX*8)	// z[i+1] = low word

	ADDQ $2, BX		// i += 2
	CMPQ BX, R12
	JLT pairs

tail_check:
	CMPQ BX, R11
	JGE generic_done	// i >= n: nothing left

tail:
	// One word per iteration while i < n.
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// + c
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DX
	MOVQ DX, CX		// c = high word
	ADDQ $1, BX		// i++
	CMPQ BX, R11
	JLT tail

generic_done:
	MOVQ CX, c+56(FP)	// return c
	RET

	// ---- ADX path ----
	// R8 / R10 walk x / z, DX = y (implicit MULXQ operand), R11 = n,
	// BX = i, CX = c.
adx:
	MOVQ z_len+8(FP), R11
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), DX
	XORQ BX, BX		// i = 0
	XORQ CX, CX		// c = 0
	CMPQ R11, $8
	JAE adx_setup		// n >= 8: run the 8-word loop first
	CMPQ BX, R11
	JLT adx_tail		// 0 < n < 8: one-word loop only
	JMP adx_done		// n == 0

adx_setup:
	MOVQ R11, R13
	ANDQ $-8, R13		// R13 = n rounded down to a multiple of 8

adx_loop:
	// Eight words per iteration. MULXQ and MOVQ leave the flags alone,
	// so CF carries the product chain and OF the "+ z[i]" chain across
	// the whole group. The high word of each product alternates between
	// DI and CX.
	XORQ R9, R9		// R9 = 0 and CF = OF = 0
	MULXQ (R8), SI, DI
	ADCXQ CX, SI
	ADOXQ (R10), SI
	MOVQ SI, (R10)

	MULXQ 8(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 8(R10), AX
	MOVQ AX, 8(R10)

	MULXQ 16(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 16(R10), SI
	MOVQ SI, 16(R10)

	MULXQ 24(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 24(R10), AX
	MOVQ AX, 24(R10)

	MULXQ 32(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 32(R10), SI
	MOVQ SI, 32(R10)

	MULXQ 40(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 40(R10), AX
	MOVQ AX, 40(R10)

	MULXQ 48(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 48(R10), SI
	MOVQ SI, 48(R10)

	MULXQ 56(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 56(R10), AX
	MOVQ AX, 56(R10)

	// Fold the two pending carries (CF and OF) into the last high word.
	ADCXQ R9, CX
	ADOXQ R9, CX

	ADDQ $64, R8		// advance x by 8 words
	ADDQ $64, R10		// advance z by 8 words
	ADDQ $8, BX		// i += 8

	CMPQ BX, R13
	JLT adx_loop

	// The tail indexes by i from the slice bases, so reload the
	// pointers the loop advanced.
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	CMPQ BX, R11
	JGE adx_done		// n was a multiple of 8

adx_tail:
	// One word per iteration while i < n.
	MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i] * y
	ADDQ CX, SI		// + c
	ADCQ $0, DI
	ADDQ SI, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DI
	MOVQ DI, CX		// c = high word
	ADDQ $1, BX		// i++

	CMPQ BX, R11
	JLT adx_tail

adx_done:
	MOVQ CX, c+56(FP)	// return c
	RET
524
525
526
View as plain text