...

Text file src/crypto/aes/gcm_amd64.s

Documentation: crypto/aes

		 1// Copyright 2015 The Go Authors. All rights reserved.
		 2// Use of this source code is governed by a BSD-style
		 3// license that can be found in the LICENSE file.
		 4
		 5// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
		 6// The implementation uses some of the optimizations described in:
		 7// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
		 8//		 Instruction and its Usage for Computing the GCM Mode rev. 2.02
		 9// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
		10//		 Hardware
		11
		12#include "textflag.h"
		13
			// Register aliases shared by the routines in this file.
			// B0-B7 are the eight block registers; the bulk paths work on eight
			// 16-byte blocks at a time.
		14#define B0 X0
		15#define B1 X1
		16#define B2 X2
		17#define B3 X3
		18#define B4 X4
		19#define B5 X5
		20#define B6 X6
		21#define B7 X7
		22
			// Accumulators for the partial products of the carry-less multiplications:
			// ACC0 collects the low*low products (PCLMULQDQ $0x00), ACC1 the high*high
			// products (PCLMULQDQ $0x11) and ACCM the middle (Karatsuba) products.
		23#define ACC0 X8
		24#define ACC1 X9
		25#define ACCM X10
		26
			// T0-T2 are scratch registers. POLY and BSWAP are loaded from gcmPoly and
			// bswapMask at the start of each routine.
		27#define T0 X11
		28#define T1 X12
		29#define T2 X13
		30#define POLY X14
		31#define BSWAP X15
		32
			// PSHUFB control mask: destination byte i takes source byte 15-i, i.e. it
			// reverses the order of the 16 bytes of a register.
		33DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
		34DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
		35
			// Constant used by the reduction steps (multiplied in via PCLMULQDQ $0x01)
			// and, in gcmAesInit, XORed in conditionally when doubling H.
		36DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
		37DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
		38
			// Table of fifteen 16-byte AND masks. The entry at offset 16*k (k = 0..14)
			// has its low k+1 bytes set to 0xff and the rest zero. The tail code indexes
			// it as -16 + 16*len, so a partial block of len bytes (1..15) keeps exactly
			// its low len bytes.
		39DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
		40DATA andMask<>+0x08(SB)/8, $0x0000000000000000
		41DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
		42DATA andMask<>+0x18(SB)/8, $0x0000000000000000
		43DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
		44DATA andMask<>+0x28(SB)/8, $0x0000000000000000
		45DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
		46DATA andMask<>+0x38(SB)/8, $0x0000000000000000
		47DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
		48DATA andMask<>+0x48(SB)/8, $0x0000000000000000
		49DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
		50DATA andMask<>+0x58(SB)/8, $0x0000000000000000
		51DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
		52DATA andMask<>+0x68(SB)/8, $0x0000000000000000
		53DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
		54DATA andMask<>+0x78(SB)/8, $0x0000000000000000
		55DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
		56DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
		57DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
		58DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
		59DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
		60DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
		61DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
		62DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
		63DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
		64DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
		65DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
		66DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
		67DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
		68DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
		69
			// All three tables are read-only data without pointers.
			// Sizes: 16, 16 and 15*16 = 240 bytes.
		70GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
		71GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
		72GLOBL andMask<>(SB), (NOPTR+RODATA), $240
		73
		74// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
			//
			// gcmAesFinish completes the tag computation. It builds a 16-byte block from
			// pLen*8 (low qword) and dLen*8 (high qword), XORs it with the running hash
			// read from T, multiplies by the productTable entry at offsets 16*14/16*15,
			// reduces, byte-reverses the result and XORs it with tagMask. The 16-byte
			// result overwrites T. productTable and tagMask are only read.
		75TEXT ·gcmAesFinish(SB),NOSPLIT,$0
		76#define pTbl DI
		77#define tMsk SI
		78#define tPtr DX
		79#define plen AX
		80#define dlen CX
		81
		82	MOVQ productTable+0(FP), pTbl
		83	MOVQ tagMask+8(FP), tMsk
		84	MOVQ T+16(FP), tPtr
		85	MOVQ pLen+24(FP), plen
		86	MOVQ dLen+32(FP), dlen
		87
			// ACC0 = running hash, T2 = tag mask (kept until the final XOR).
		88	MOVOU (tPtr), ACC0
		89	MOVOU (tMsk), T2
		90
		91	MOVOU bswapMask<>(SB), BSWAP
		92	MOVOU gcmPoly<>(SB), POLY
		93
			// Multiply both lengths by 8 (presumably byte counts -> bit counts, as
			// required for the GCM length block).
		94	SHLQ $3, plen
		95	SHLQ $3, dlen
		96
			// B0 = plen in the low qword, dlen in the high qword.
		97	MOVQ plen, B0
		98	PINSRQ $1, dlen, B0
		99
			// Fold the running hash into the length block.
	 100	PXOR ACC0, B0
	 101
			// Multiply B0 by the table entry at 16*14 (with its pre-computed Karatsuba
			// operand at 16*15): three carry-less multiplies give low, high and middle.
	 102	MOVOU (16*14)(pTbl), ACC0
	 103	MOVOU (16*15)(pTbl), ACCM
	 104	MOVOU ACC0, ACC1
	 105
	 106	PCLMULQDQ $0x00, B0, ACC0
	 107	PCLMULQDQ $0x11, B0, ACC1
			// T0 = B0 with its qword halves swapped, XOR B0 (Karatsuba operand of B0).
	 108	PSHUFD $78, B0, T0
	 109	PXOR B0, T0
	 110	PCLMULQDQ $0x00, T0, ACCM
	 111
			// Recover the middle term and split it across the low (ACC0) and high
			// (ACC1) halves of the 256-bit product.
	 112	PXOR ACC0, ACCM
	 113	PXOR ACC1, ACCM
	 114	MOVOU ACCM, T0
	 115	PSRLDQ $8, ACCM
	 116	PSLLDQ $8, T0
	 117	PXOR ACCM, ACC1
	 118	PXOR T0, ACC0
	 119
			// Two reduction steps with POLY (same sequence as the reduceRound macro
			// defined in gcmAesData), then combine with the high half.
	 120	MOVOU POLY, T0
	 121	PCLMULQDQ $0x01, ACC0, T0
	 122	PSHUFD $78, ACC0, ACC0
	 123	PXOR T0, ACC0
	 124
	 125	MOVOU POLY, T0
	 126	PCLMULQDQ $0x01, ACC0, T0
	 127	PSHUFD $78, ACC0, ACC0
	 128	PXOR T0, ACC0
	 129
	 130	PXOR ACC1, ACC0
	 131
			// Byte-reverse the hash, apply the tag mask and store the result in T.
	 132	PSHUFB BSWAP, ACC0
	 133	PXOR T2, ACC0
	 134	MOVOU ACC0, (tPtr)
	 135
	 136	RET
	 137#undef pTbl
	 138#undef tMsk
	 139#undef tPtr
	 140#undef plen
	 141#undef dlen
	 142
	 143// func gcmAesInit(productTable *[256]byte, ks []uint32)
			//
			// gcmAesInit derives the hash key H by running the AES rounds over the round
			// keys in ks starting from round key 0 alone, and fills productTable with
			// eight 32-byte entries. Each entry is a power of H followed by the XOR of
			// its two qword halves (the Karatsuba pre-computation). H itself (after the
			// "H * 2" step) is stored at offsets 16*14/16*15, and each loop iteration
			// stores the next power 32 bytes lower, ending at offsets 0/16.
	 144TEXT ·gcmAesInit(SB),NOSPLIT,$0
	 145#define dst DI
	 146#define KS SI
	 147#define NR DX
	 148
	 149	MOVQ productTable+0(FP), dst
	 150	MOVQ ks_base+8(FP), KS
	 151	MOVQ ks_len+16(FP), NR
	 152
			// NR = len(ks)/4 - 1. It is compared against 12 below to choose between
			// 10 rounds (NR < 12), 12 rounds (NR == 12) and 14 rounds (NR > 12).
	 153	SHRQ $2, NR
	 154	DECQ NR
	 155
	 156	MOVOU bswapMask<>(SB), BSWAP
	 157	MOVOU gcmPoly<>(SB), POLY
	 158
	 159	// Encrypt block 0, with the AES key to generate the hash key H
			// Loading round key 0 directly into B0 equals XORing it into an all-zero block.
	 160	MOVOU (16*0)(KS), B0
	 161	MOVOU (16*1)(KS), T0
	 162	AESENC T0, B0
	 163	MOVOU (16*2)(KS), T0
	 164	AESENC T0, B0
	 165	MOVOU (16*3)(KS), T0
	 166	AESENC T0, B0
	 167	MOVOU (16*4)(KS), T0
	 168	AESENC T0, B0
	 169	MOVOU (16*5)(KS), T0
	 170	AESENC T0, B0
	 171	MOVOU (16*6)(KS), T0
	 172	AESENC T0, B0
	 173	MOVOU (16*7)(KS), T0
	 174	AESENC T0, B0
	 175	MOVOU (16*8)(KS), T0
	 176	AESENC T0, B0
	 177	MOVOU (16*9)(KS), T0
	 178	AESENC T0, B0
	 179	MOVOU (16*10)(KS), T0
	 180	CMPQ NR, $12
	 181	JB initEncLast
	 182	AESENC T0, B0
	 183	MOVOU (16*11)(KS), T0
	 184	AESENC T0, B0
	 185	MOVOU (16*12)(KS), T0
			// Still testing the flags of the CMPQ above: the AESENC and MOVOU
			// instructions in between do not modify the flags.
	 186	JE initEncLast
	 187	AESENC T0, B0
	 188	MOVOU (16*13)(KS), T0
	 189	AESENC T0, B0
	 190	MOVOU (16*14)(KS), T0
	 191initEncLast:
	 192	AESENCLAST T0, B0
	 193
	 194	PSHUFB BSWAP, B0
	 195	// H * 2
			// 128-bit shift left by one bit built from 32-bit shifts: T1 carries the top
			// bit of each dword into the next dword, and T0 is POLY if the top bit of
			// the whole register was set (sign-extended via PSRAL), otherwise zero.
	 196	PSHUFD $0xff, B0, T0
	 197	MOVOU B0, T1
	 198	PSRAL $31, T0
	 199	PAND POLY, T0
	 200	PSRLL $31, T1
	 201	PSLLDQ $4, T1
	 202	PSLLL $1, B0
	 203	PXOR T0, B0
	 204	PXOR T1, B0
	 205	// Karatsuba pre-computations
			// Entry at 16*14 = H; entry at 16*15 = H with halves swapped, XOR H.
	 206	MOVOU B0, (16*14)(dst)
	 207	PSHUFD $78, B0, B1
	 208	PXOR B0, B1
	 209	MOVOU B1, (16*15)(dst)
	 210
			// B2/B3 hold the most recent power of H and its Karatsuba operand;
			// B0/B1 keep H itself for the whole loop.
	 211	MOVOU B0, B2
	 212	MOVOU B1, B3
	 213	// Now prepare powers of H and pre-computations for them
	 214	MOVQ $7, AX
	 215
	 216initLoop:
			// T0:T1 = B2 * B0 (low, high), T2 = middle product.
	 217		MOVOU B2, T0
	 218		MOVOU B2, T1
	 219		MOVOU B3, T2
	 220		PCLMULQDQ $0x00, B0, T0
	 221		PCLMULQDQ $0x11, B0, T1
	 222		PCLMULQDQ $0x00, B1, T2
	 223
	 224		PXOR T0, T2
	 225		PXOR T1, T2
	 226		MOVOU T2, B4
	 227		PSLLDQ $8, B4
	 228		PSRLDQ $8, T2
	 229		PXOR B4, T0
	 230		PXOR T2, T1
	 231
			// Two reduction steps with POLY; the reduced product ends up in B2.
	 232		MOVOU POLY, B2
	 233		PCLMULQDQ $0x01, T0, B2
	 234		PSHUFD $78, T0, T0
	 235		PXOR B2, T0
	 236		MOVOU POLY, B2
	 237		PCLMULQDQ $0x01, T0, B2
	 238		PSHUFD $78, T0, T0
	 239		PXOR T0, B2
	 240		PXOR T1, B2
	 241
			// Store the new power and its Karatsuba operand. dst moves down by 32 bytes
			// per iteration, so the seven iterations fill offsets 16*12/16*13 down to 0/16.
	 242		MOVOU B2, (16*12)(dst)
	 243		PSHUFD $78, B2, B3
	 244		PXOR B2, B3
	 245		MOVOU B3, (16*13)(dst)
	 246
			// JNE tests the flags set by DECQ; LEAQ leaves the flags untouched.
	 247		DECQ AX
	 248		LEAQ (-16*2)(dst), dst
	 249	JNE initLoop
	 250
	 251	RET
	 252#undef NR
	 253#undef KS
	 254#undef dst
	 255
	 256// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
			//
			// gcmAesData hashes data with the pre-computed productTable and writes the
			// 16-byte result to T. The accumulator starts at zero, so the previous
			// contents of T are never read. Data is consumed 128 bytes at a time while
			// possible, then 16 bytes at a time, and a final partial block (1..15 bytes)
			// is zero-padded. A length of exactly 13 takes a dedicated load path; empty
			// data stores an all-zero T.
	 257TEXT ·gcmAesData(SB),NOSPLIT,$0
	 258#define pTbl DI
	 259#define aut SI
	 260#define tPtr CX
	 261#define autLen DX
	 262
			// reduceRound(a): one reduction step. Multiplies the low qword of a by the
			// high qword of POLY, swaps the qword halves of a and XORs the product in.
			// Clobbers T0. Not #undef'd here; gcmAesEnc and gcmAesDec use it as well.
	 263#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
			// mulRoundAAD(X, i): multiplies block register X by the table entry pair at
			// offsets 16*(2i) / 16*(2i+1) and XORs the low/high/middle products into
			// ACC0/ACC1/ACCM. Clobbers T1 and T2 and overwrites X.
	 264#define mulRoundAAD(X ,i) \
	 265	MOVOU (16*(i*2))(pTbl), T1;\
	 266	MOVOU T1, T2;\
	 267	PCLMULQDQ $0x00, X, T1;\
	 268	PXOR T1, ACC0;\
	 269	PCLMULQDQ $0x11, X, T2;\
	 270	PXOR T2, ACC1;\
	 271	PSHUFD $78, X, T1;\
	 272	PXOR T1, X;\
	 273	MOVOU (16*(i*2+1))(pTbl), T1;\
	 274	PCLMULQDQ $0x00, X, T1;\
	 275	PXOR T1, ACCM
	 276
			// data is a slice (base, len, cap at offsets 8, 16, 24), so T is at offset 32.
	 277	MOVQ productTable+0(FP), pTbl
	 278	MOVQ data_base+8(FP), aut
	 279	MOVQ data_len+16(FP), autLen
	 280	MOVQ T+32(FP), tPtr
	 281
			// Start from a zero hash.
	 282	PXOR ACC0, ACC0
	 283	MOVOU bswapMask<>(SB), BSWAP
	 284	MOVOU gcmPoly<>(SB), POLY
	 285
	 286	TESTQ autLen, autLen
	 287	JEQ dataBail
	 288
	 289	CMPQ autLen, $13	// optimize the TLS case
	 290	JE dataTLS
	 291	CMPQ autLen, $128
	 292	JB startSinglesLoop
	 293	JMP dataOctaLoop
	 294
	 295dataTLS:
			// Exactly 13 bytes: load 8 + 4 + 1 bytes into a zeroed B0, set the remaining
			// length to 0 and share the single-block multiply at dataMul.
	 296	MOVOU (16*14)(pTbl), T1
	 297	MOVOU (16*15)(pTbl), T2
	 298	PXOR B0, B0
	 299	MOVQ (aut), B0
	 300	PINSRD $2, 8(aut), B0
	 301	PINSRB $12, 12(aut), B0
	 302	XORQ autLen, autLen
	 303	JMP dataMul
	 304
	 305dataOctaLoop:
			// Eight blocks per iteration: block i is multiplied by the table entry pair
			// at offsets 16*(2i)/16*(2i+1), and all products are reduced together once.
	 306		CMPQ autLen, $128
	 307		JB startSinglesLoop
	 308		SUBQ $128, autLen
	 309
	 310		MOVOU (16*0)(aut), X0
	 311		MOVOU (16*1)(aut), X1
	 312		MOVOU (16*2)(aut), X2
	 313		MOVOU (16*3)(aut), X3
	 314		MOVOU (16*4)(aut), X4
	 315		MOVOU (16*5)(aut), X5
	 316		MOVOU (16*6)(aut), X6
	 317		MOVOU (16*7)(aut), X7
	 318		LEAQ (16*8)(aut), aut
	 319		PSHUFB BSWAP, X0
	 320		PSHUFB BSWAP, X1
	 321		PSHUFB BSWAP, X2
	 322		PSHUFB BSWAP, X3
	 323		PSHUFB BSWAP, X4
	 324		PSHUFB BSWAP, X5
	 325		PSHUFB BSWAP, X6
	 326		PSHUFB BSWAP, X7
			// The running hash is folded into the first block only.
	 327		PXOR ACC0, X0
	 328
			// First block: initialises ACC0/ACC1/ACCM instead of accumulating into them.
	 329		MOVOU (16*0)(pTbl), ACC0
	 330		MOVOU (16*1)(pTbl), ACCM
	 331		MOVOU ACC0, ACC1
	 332		PSHUFD $78, X0, T1
	 333		PXOR X0, T1
	 334		PCLMULQDQ $0x00, X0, ACC0
	 335		PCLMULQDQ $0x11, X0, ACC1
	 336		PCLMULQDQ $0x00, T1, ACCM
	 337
	 338		mulRoundAAD(X1, 1)
	 339		mulRoundAAD(X2, 2)
	 340		mulRoundAAD(X3, 3)
	 341		mulRoundAAD(X4, 4)
	 342		mulRoundAAD(X5, 5)
	 343		mulRoundAAD(X6, 6)
	 344		mulRoundAAD(X7, 7)
	 345
			// Recover the middle term, split it across ACC0/ACC1 and reduce.
	 346		PXOR ACC0, ACCM
	 347		PXOR ACC1, ACCM
	 348		MOVOU ACCM, T0
	 349		PSRLDQ $8, ACCM
	 350		PSLLDQ $8, T0
	 351		PXOR ACCM, ACC1
	 352		PXOR T0, ACC0
	 353		reduceRound(ACC0)
	 354		reduceRound(ACC0)
	 355		PXOR ACC1, ACC0
	 356	JMP dataOctaLoop
	 357
	 358startSinglesLoop:
			// The single-block path always multiplies by the entry at 16*14/16*15;
			// keep it in T1/T2 across iterations.
	 359	MOVOU (16*14)(pTbl), T1
	 360	MOVOU (16*15)(pTbl), T2
	 361
	 362dataSinglesLoop:
	 363
	 364		CMPQ autLen, $16
	 365		JB dataEnd
	 366		SUBQ $16, autLen
	 367
	 368		MOVOU (aut), B0
	 369dataMul:
			// Also entered from dataTLS and from the partial-block path with B0 already
			// loaded and autLen == 0, so the loop exits through dataEnd afterwards.
	 370		PSHUFB BSWAP, B0
	 371		PXOR ACC0, B0
	 372
	 373		MOVOU T1, ACC0
	 374		MOVOU T2, ACCM
	 375		MOVOU T1, ACC1
	 376
	 377		PSHUFD $78, B0, T0
	 378		PXOR B0, T0
	 379		PCLMULQDQ $0x00, B0, ACC0
	 380		PCLMULQDQ $0x11, B0, ACC1
	 381		PCLMULQDQ $0x00, T0, ACCM
	 382
	 383		PXOR ACC0, ACCM
	 384		PXOR ACC1, ACCM
	 385		MOVOU ACCM, T0
	 386		PSRLDQ $8, ACCM
	 387		PSLLDQ $8, T0
	 388		PXOR ACCM, ACC1
	 389		PXOR T0, ACC0
	 390
			// Same two-step reduction as reduceRound, written out.
	 391		MOVOU POLY, T0
	 392		PCLMULQDQ $0x01, ACC0, T0
	 393		PSHUFD $78, ACC0, ACC0
	 394		PXOR T0, ACC0
	 395
	 396		MOVOU POLY, T0
	 397		PCLMULQDQ $0x01, ACC0, T0
	 398		PSHUFD $78, ACC0, ACC0
	 399		PXOR T0, ACC0
	 400		PXOR ACC1, ACC0
	 401
	 402		LEAQ 16(aut), aut
	 403
	 404	JMP dataSinglesLoop
	 405
	 406dataEnd:
	 407
	 408	TESTQ autLen, autLen
	 409	JEQ dataBail
	 410
			// 1..15 bytes left. Point aut at the last byte and build the block from the
			// end: each step shifts B0 up one byte and inserts the next byte at position
			// 0, so the bytes end up in memory order with the high bytes left at zero.
	 411	PXOR B0, B0
	 412	LEAQ -1(aut)(autLen*1), aut
	 413
	 414dataLoadLoop:
	 415
	 416		PSLLDQ $1, B0
	 417		PINSRB $0, (aut), B0
	 418
	 419		LEAQ -1(aut), aut
	 420		DECQ autLen
	 421		JNE dataLoadLoop
	 422
			// autLen is 0 here, so after dataMul the loop falls through to dataBail.
	 423	JMP dataMul
	 424
	 425dataBail:
	 426	MOVOU ACC0, (tPtr)
	 427	RET
	 428#undef pTbl
	 429#undef aut
	 430#undef tPtr
	 431#undef autLen
	 432
	 433// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
			//
			// gcmAesEnc encrypts src into dst in counter mode and folds the produced
			// ciphertext into the running hash, which is read from T at entry and written
			// back to T at exit. The counter block at ctr is only read; its last 32-bit
			// word is incremented before each block is encrypted, so the first block
			// uses the counter value + 1.
			//
			// Stack frame (256 bytes):
			//   (16*i)(SP),        i = 0..7: byte-reversed ciphertext blocks waiting to be hashed
			//   (8*16 + 16*i)(SP), i = 0..7: counter blocks, already XORed with round key 0
			//
			// The tail path stores a full 16 bytes to dst even for a partial block; see
			// the comment at that store.
	 434TEXT ·gcmAesEnc(SB),0,$256-96
	 435#define pTbl DI
	 436#define ctx DX
	 437#define ctrPtr CX
	 438#define ptx SI
	 439#define ks AX
	 440#define tPtr R8
	 441#define ptxLen R9
	 442#define aluCTR R10
	 443#define aluTMP R11
	 444#define aluK R12
	 445#define NR R13
	 446
			// increment(i): bumps the 32-bit counter in aluCTR, XORs it with aluK (the
			// byte-swapped last word of round key 0), swaps it back and writes it into
			// the last word of counter slot i. Clobbers aluTMP and the flags.
	 447#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
			// aesRnd(k) / aesRndLast(k): one AES round / final round on B0..B7 with the
			// round key already in register k. aesRound(i) loads round key i into T0 first.
	 448#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
	 449#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
	 450#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
			// combinedRound(i): AES round i on B0..B7 interleaved with the multiplication
			// of saved block i, (16*i)(SP), by the table entry pair at 16*(2i)/16*(2i+1).
			// Products are XORed into ACC0/ACC1/ACCM. Clobbers T0, T1 and T2.
	 451#define combinedRound(i) \
	 452	MOVOU (16*i)(ks), T0;\
	 453	AESENC T0, B0;\
	 454	AESENC T0, B1;\
	 455	AESENC T0, B2;\
	 456	AESENC T0, B3;\
	 457	 MOVOU (16*(i*2))(pTbl), T1;\
	 458	 MOVOU T1, T2;\
	 459	AESENC T0, B4;\
	 460	AESENC T0, B5;\
	 461	AESENC T0, B6;\
	 462	AESENC T0, B7;\
	 463	 MOVOU (16*i)(SP), T0;\
	 464	 PCLMULQDQ $0x00, T0, T1;\
	 465	 PXOR T1, ACC0;\
	 466	 PSHUFD $78, T0, T1;\
	 467	 PCLMULQDQ $0x11, T0, T2;\
	 468	 PXOR T1, T0;\
	 469	 PXOR T2, ACC1;\
	 470	 MOVOU (16*(i*2+1))(pTbl), T2;\
	 471	 PCLMULQDQ $0x00, T2, T0;\
	 472	 PXOR T0, ACCM
			// mulRound(i): the multiplication half of combinedRound(i) on its own.
	 473#define mulRound(i) \
	 474	MOVOU (16*i)(SP), T0;\
	 475	MOVOU (16*(i*2))(pTbl), T1;\
	 476	MOVOU T1, T2;\
	 477	PCLMULQDQ $0x00, T0, T1;\
	 478	PXOR T1, ACC0;\
	 479	PCLMULQDQ $0x11, T0, T2;\
	 480	PXOR T2, ACC1;\
	 481	PSHUFD $78, T0, T1;\
	 482	PXOR T1, T0;\
	 483	MOVOU (16*(i*2+1))(pTbl), T1;\
	 484	PCLMULQDQ $0x00, T0, T1;\
	 485	PXOR T1, ACCM
	 486
			// Only dst's base pointer is loaded; the length used throughout is len(src).
	 487	MOVQ productTable+0(FP), pTbl
	 488	MOVQ dst+8(FP), ctx
	 489	MOVQ src_base+32(FP), ptx
	 490	MOVQ src_len+40(FP), ptxLen
	 491	MOVQ ctr+56(FP), ctrPtr
	 492	MOVQ T+64(FP), tPtr
	 493	MOVQ ks_base+72(FP), ks
	 494	MOVQ ks_len+80(FP), NR
	 495
			// NR = len(ks)/4 - 1; compared against 12 to select 10, 12 or 14 rounds.
	 496	SHRQ $2, NR
	 497	DECQ NR
	 498
	 499	MOVOU bswapMask<>(SB), BSWAP
	 500	MOVOU gcmPoly<>(SB), POLY
	 501
			// ACC0 = running hash. aluCTR = last counter word and aluK = last word of
			// round key 0, both byte-swapped so that ADDL/XORL operate on them directly.
	 502	MOVOU (tPtr), ACC0
	 503	PXOR ACC1, ACC1
	 504	PXOR ACCM, ACCM
	 505	MOVOU (ctrPtr), B0
	 506	MOVL (3*4)(ctrPtr), aluCTR
	 507	MOVOU (ks), T0
	 508	MOVL (3*4)(ks), aluK
	 509	BSWAPL aluCTR
	 510	BSWAPL aluK
	 511
			// T0 = counter block XOR round key 0. Every counter slot is initialised from
			// T0, and increment() then rewrites only the last word of the slot.
	 512	PXOR B0, T0
	 513	MOVOU T0, (8*16 + 0*16)(SP)
	 514	increment(0)
	 515
	 516	CMPQ ptxLen, $128
	 517	JB gcmAesEncSingles
	 518	SUBQ $128, ptxLen
	 519
	 520	// We have at least 8 blocks to encrypt, prepare the rest of the counters
	 521	MOVOU T0, (8*16 + 1*16)(SP)
	 522	increment(1)
	 523	MOVOU T0, (8*16 + 2*16)(SP)
	 524	increment(2)
	 525	MOVOU T0, (8*16 + 3*16)(SP)
	 526	increment(3)
	 527	MOVOU T0, (8*16 + 4*16)(SP)
	 528	increment(4)
	 529	MOVOU T0, (8*16 + 5*16)(SP)
	 530	increment(5)
	 531	MOVOU T0, (8*16 + 6*16)(SP)
	 532	increment(6)
	 533	MOVOU T0, (8*16 + 7*16)(SP)
	 534	increment(7)
	 535
			// First group of eight blocks: encrypt only, since there is no ciphertext to
			// hash yet. The increment() calls between rounds already prepare the counter
			// slots for the next group.
	 536	MOVOU (8*16 + 0*16)(SP), B0
	 537	MOVOU (8*16 + 1*16)(SP), B1
	 538	MOVOU (8*16 + 2*16)(SP), B2
	 539	MOVOU (8*16 + 3*16)(SP), B3
	 540	MOVOU (8*16 + 4*16)(SP), B4
	 541	MOVOU (8*16 + 5*16)(SP), B5
	 542	MOVOU (8*16 + 6*16)(SP), B6
	 543	MOVOU (8*16 + 7*16)(SP), B7
	 544
	 545	aesRound(1)
	 546	increment(0)
	 547	aesRound(2)
	 548	increment(1)
	 549	aesRound(3)
	 550	increment(2)
	 551	aesRound(4)
	 552	increment(3)
	 553	aesRound(5)
	 554	increment(4)
	 555	aesRound(6)
	 556	increment(5)
	 557	aesRound(7)
	 558	increment(6)
	 559	aesRound(8)
	 560	increment(7)
	 561	aesRound(9)
	 562	MOVOU (16*10)(ks), T0
	 563	CMPQ NR, $12
	 564	JB encLast1
	 565	aesRnd(T0)
	 566	aesRound(11)
	 567	MOVOU (16*12)(ks), T0
			// Flags are still those of the CMPQ above (AESENC/MOVOU do not change them).
	 568	JE encLast1
	 569	aesRnd(T0)
	 570	aesRound(13)
	 571	MOVOU (16*14)(ks), T0
	 572encLast1:
	 573	aesRndLast(T0)
	 574
			// XOR the keystream with the plaintext.
	 575	MOVOU (16*0)(ptx), T0
	 576	PXOR T0, B0
	 577	MOVOU (16*1)(ptx), T0
	 578	PXOR T0, B1
	 579	MOVOU (16*2)(ptx), T0
	 580	PXOR T0, B2
	 581	MOVOU (16*3)(ptx), T0
	 582	PXOR T0, B3
	 583	MOVOU (16*4)(ptx), T0
	 584	PXOR T0, B4
	 585	MOVOU (16*5)(ptx), T0
	 586	PXOR T0, B5
	 587	MOVOU (16*6)(ptx), T0
	 588	PXOR T0, B6
	 589	MOVOU (16*7)(ptx), T0
	 590	PXOR T0, B7
	 591
			// Write the ciphertext out, then byte-reverse each block for hashing. The
			// running hash is folded into the first block only.
	 592	MOVOU B0, (16*0)(ctx)
	 593	PSHUFB BSWAP, B0
	 594	PXOR ACC0, B0
	 595	MOVOU B1, (16*1)(ctx)
	 596	PSHUFB BSWAP, B1
	 597	MOVOU B2, (16*2)(ctx)
	 598	PSHUFB BSWAP, B2
	 599	MOVOU B3, (16*3)(ctx)
	 600	PSHUFB BSWAP, B3
	 601	MOVOU B4, (16*4)(ctx)
	 602	PSHUFB BSWAP, B4
	 603	MOVOU B5, (16*5)(ctx)
	 604	PSHUFB BSWAP, B5
	 605	MOVOU B6, (16*6)(ctx)
	 606	PSHUFB BSWAP, B6
	 607	MOVOU B7, (16*7)(ctx)
	 608	PSHUFB BSWAP, B7
	 609
			// Park the eight blocks on the stack; they are hashed while the next group
			// is being encrypted (or at gcmAesEncOctetsEnd if there is no next group).
	 610	MOVOU B0, (16*0)(SP)
	 611	MOVOU B1, (16*1)(SP)
	 612	MOVOU B2, (16*2)(SP)
	 613	MOVOU B3, (16*3)(SP)
	 614	MOVOU B4, (16*4)(SP)
	 615	MOVOU B5, (16*5)(SP)
	 616	MOVOU B6, (16*6)(SP)
	 617	MOVOU B7, (16*7)(SP)
	 618
	 619	LEAQ 128(ptx), ptx
	 620	LEAQ 128(ctx), ctx
	 621
	 622gcmAesEncOctetsLoop:
			// Each iteration encrypts the next eight blocks and, interleaved with the
			// AES rounds, hashes the eight ciphertext blocks parked on the stack.
	 623
	 624		CMPQ ptxLen, $128
	 625		JB gcmAesEncOctetsEnd
	 626		SUBQ $128, ptxLen
	 627
	 628		MOVOU (8*16 + 0*16)(SP), B0
	 629		MOVOU (8*16 + 1*16)(SP), B1
	 630		MOVOU (8*16 + 2*16)(SP), B2
	 631		MOVOU (8*16 + 3*16)(SP), B3
	 632		MOVOU (8*16 + 4*16)(SP), B4
	 633		MOVOU (8*16 + 5*16)(SP), B5
	 634		MOVOU (8*16 + 6*16)(SP), B6
	 635		MOVOU (8*16 + 7*16)(SP), B7
	 636
			// Parked block 0 initialises the accumulators; blocks 1..7 are added by
			// combinedRound(1..7).
	 637		MOVOU (16*0)(SP), T0
	 638		PSHUFD $78, T0, T1
	 639		PXOR T0, T1
	 640
	 641		MOVOU (16*0)(pTbl), ACC0
	 642		MOVOU (16*1)(pTbl), ACCM
	 643		MOVOU ACC0, ACC1
	 644
	 645		PCLMULQDQ $0x00, T1, ACCM
	 646		PCLMULQDQ $0x00, T0, ACC0
	 647		PCLMULQDQ $0x11, T0, ACC1
	 648
	 649		combinedRound(1)
	 650		increment(0)
	 651		combinedRound(2)
	 652		increment(1)
	 653		combinedRound(3)
	 654		increment(2)
	 655		combinedRound(4)
	 656		increment(3)
	 657		combinedRound(5)
	 658		increment(4)
	 659		combinedRound(6)
	 660		increment(5)
	 661		combinedRound(7)
	 662		increment(6)
	 663
	 664		aesRound(8)
	 665		increment(7)
	 666
			// Recover the middle term and split it across ACC0/ACC1.
	 667		PXOR ACC0, ACCM
	 668		PXOR ACC1, ACCM
	 669		MOVOU ACCM, T0
	 670		PSRLDQ $8, ACCM
	 671		PSLLDQ $8, T0
	 672		PXOR ACCM, ACC1
	 673		PXOR T0, ACC0
	 674
			// The two reduction steps are interleaved with AES round 9.
	 675		reduceRound(ACC0)
	 676		aesRound(9)
	 677
	 678		reduceRound(ACC0)
	 679		PXOR ACC1, ACC0
	 680
	 681		MOVOU (16*10)(ks), T0
	 682		CMPQ NR, $12
	 683		JB encLast2
	 684		aesRnd(T0)
	 685		aesRound(11)
	 686		MOVOU (16*12)(ks), T0
	 687		JE encLast2
	 688		aesRnd(T0)
	 689		aesRound(13)
	 690		MOVOU (16*14)(ks), T0
	 691encLast2:
	 692		aesRndLast(T0)
	 693
	 694		MOVOU (16*0)(ptx), T0
	 695		PXOR T0, B0
	 696		MOVOU (16*1)(ptx), T0
	 697		PXOR T0, B1
	 698		MOVOU (16*2)(ptx), T0
	 699		PXOR T0, B2
	 700		MOVOU (16*3)(ptx), T0
	 701		PXOR T0, B3
	 702		MOVOU (16*4)(ptx), T0
	 703		PXOR T0, B4
	 704		MOVOU (16*5)(ptx), T0
	 705		PXOR T0, B5
	 706		MOVOU (16*6)(ptx), T0
	 707		PXOR T0, B6
	 708		MOVOU (16*7)(ptx), T0
	 709		PXOR T0, B7
	 710
	 711		MOVOU B0, (16*0)(ctx)
	 712		PSHUFB BSWAP, B0
	 713		PXOR ACC0, B0
	 714		MOVOU B1, (16*1)(ctx)
	 715		PSHUFB BSWAP, B1
	 716		MOVOU B2, (16*2)(ctx)
	 717		PSHUFB BSWAP, B2
	 718		MOVOU B3, (16*3)(ctx)
	 719		PSHUFB BSWAP, B3
	 720		MOVOU B4, (16*4)(ctx)
	 721		PSHUFB BSWAP, B4
	 722		MOVOU B5, (16*5)(ctx)
	 723		PSHUFB BSWAP, B5
	 724		MOVOU B6, (16*6)(ctx)
	 725		PSHUFB BSWAP, B6
	 726		MOVOU B7, (16*7)(ctx)
	 727		PSHUFB BSWAP, B7
	 728
	 729		MOVOU B0, (16*0)(SP)
	 730		MOVOU B1, (16*1)(SP)
	 731		MOVOU B2, (16*2)(SP)
	 732		MOVOU B3, (16*3)(SP)
	 733		MOVOU B4, (16*4)(SP)
	 734		MOVOU B5, (16*5)(SP)
	 735		MOVOU B6, (16*6)(SP)
	 736		MOVOU B7, (16*7)(SP)
	 737
	 738		LEAQ 128(ptx), ptx
	 739		LEAQ 128(ctx), ctx
	 740
	 741		JMP gcmAesEncOctetsLoop
	 742
	 743gcmAesEncOctetsEnd:
			// Hash the last eight parked ciphertext blocks; there is no further
			// encryption to interleave with.
	 744
	 745	MOVOU (16*0)(SP), T0
	 746	MOVOU (16*0)(pTbl), ACC0
	 747	MOVOU (16*1)(pTbl), ACCM
	 748	MOVOU ACC0, ACC1
	 749	PSHUFD $78, T0, T1
	 750	PXOR T0, T1
	 751	PCLMULQDQ $0x00, T0, ACC0
	 752	PCLMULQDQ $0x11, T0, ACC1
	 753	PCLMULQDQ $0x00, T1, ACCM
	 754
	 755	mulRound(1)
	 756	mulRound(2)
	 757	mulRound(3)
	 758	mulRound(4)
	 759	mulRound(5)
	 760	mulRound(6)
	 761	mulRound(7)
	 762
	 763	PXOR ACC0, ACCM
	 764	PXOR ACC1, ACCM
	 765	MOVOU ACCM, T0
	 766	PSRLDQ $8, ACCM
	 767	PSLLDQ $8, T0
	 768	PXOR ACCM, ACC1
	 769	PXOR T0, ACC0
	 770
	 771	reduceRound(ACC0)
	 772	reduceRound(ACC0)
	 773	PXOR ACC1, ACC0
	 774
	 775	TESTQ ptxLen, ptxLen
	 776	JE gcmAesEncDone
	 777
			// Eight counter slots were prepared ahead but the remaining paths use only
			// slot 0. Rewind aluCTR by 7 so it matches the counter stored in slot 0
			// before the next increment(0).
	 778	SUBQ $7, aluCTR
	 779
	 780gcmAesEncSingles:
	 781
			// Keep round keys 1..7 in B1..B7 and the table entry at 16*14 in T2 for the
			// single-block loop and the tail.
	 782	MOVOU (16*1)(ks), B1
	 783	MOVOU (16*2)(ks), B2
	 784	MOVOU (16*3)(ks), B3
	 785	MOVOU (16*4)(ks), B4
	 786	MOVOU (16*5)(ks), B5
	 787	MOVOU (16*6)(ks), B6
	 788	MOVOU (16*7)(ks), B7
	 789
	 790	MOVOU (16*14)(pTbl), T2
	 791
	 792gcmAesEncSinglesLoop:
	 793
	 794		CMPQ ptxLen, $16
	 795		JB gcmAesEncTail
	 796		SUBQ $16, ptxLen
	 797
			// Take the counter block from slot 0 and prepare the slot for the next block.
	 798		MOVOU (8*16 + 0*16)(SP), B0
	 799		increment(0)
	 800
	 801		AESENC B1, B0
	 802		AESENC B2, B0
	 803		AESENC B3, B0
	 804		AESENC B4, B0
	 805		AESENC B5, B0
	 806		AESENC B6, B0
	 807		AESENC B7, B0
	 808		MOVOU (16*8)(ks), T0
	 809		AESENC T0, B0
	 810		MOVOU (16*9)(ks), T0
	 811		AESENC T0, B0
	 812		MOVOU (16*10)(ks), T0
	 813		CMPQ NR, $12
	 814		JB encLast3
	 815		AESENC T0, B0
	 816		MOVOU (16*11)(ks), T0
	 817		AESENC T0, B0
	 818		MOVOU (16*12)(ks), T0
	 819		JE encLast3
	 820		AESENC T0, B0
	 821		MOVOU (16*13)(ks), T0
	 822		AESENC T0, B0
	 823		MOVOU (16*14)(ks), T0
	 824encLast3:
	 825		AESENCLAST T0, B0
	 826
	 827		MOVOU (ptx), T0
	 828		PXOR T0, B0
	 829		MOVOU B0, (ctx)
	 830
			// Hash the ciphertext block just written.
	 831		PSHUFB BSWAP, B0
	 832		PXOR ACC0, B0
	 833
	 834		MOVOU T2, ACC0
	 835		MOVOU T2, ACC1
	 836		MOVOU (16*15)(pTbl), ACCM
	 837
	 838		PSHUFD $78, B0, T0
	 839		PXOR B0, T0
	 840		PCLMULQDQ $0x00, B0, ACC0
	 841		PCLMULQDQ $0x11, B0, ACC1
	 842		PCLMULQDQ $0x00, T0, ACCM
	 843
	 844		PXOR ACC0, ACCM
	 845		PXOR ACC1, ACCM
	 846		MOVOU ACCM, T0
	 847		PSRLDQ $8, ACCM
	 848		PSLLDQ $8, T0
	 849		PXOR ACCM, ACC1
	 850		PXOR T0, ACC0
	 851
	 852		reduceRound(ACC0)
	 853		reduceRound(ACC0)
	 854		PXOR ACC1, ACC0
	 855
	 856		LEAQ (16*1)(ptx), ptx
	 857		LEAQ (16*1)(ctx), ctx
	 858
	 859	JMP gcmAesEncSinglesLoop
	 860
	 861gcmAesEncTail:
			// Final partial block of 1..15 bytes, if any.
	 862	TESTQ ptxLen, ptxLen
	 863	JE gcmAesEncDone
	 864
	 865	MOVOU (8*16 + 0*16)(SP), B0
	 866	AESENC B1, B0
	 867	AESENC B2, B0
	 868	AESENC B3, B0
	 869	AESENC B4, B0
	 870	AESENC B5, B0
	 871	AESENC B6, B0
	 872	AESENC B7, B0
	 873	MOVOU (16*8)(ks), T0
	 874	AESENC T0, B0
	 875	MOVOU (16*9)(ks), T0
	 876	AESENC T0, B0
	 877	MOVOU (16*10)(ks), T0
	 878	CMPQ NR, $12
	 879	JB encLast4
	 880	AESENC T0, B0
	 881	MOVOU (16*11)(ks), T0
	 882	AESENC T0, B0
	 883	MOVOU (16*12)(ks), T0
	 884	JE encLast4
	 885	AESENC T0, B0
	 886	MOVOU (16*13)(ks), T0
	 887	AESENC T0, B0
	 888	MOVOU (16*14)(ks), T0
	 889encLast4:
	 890	AESENCLAST T0, B0
			// T0 = keystream block.
	 891	MOVOU B0, T0
	 892
			// ptx -> last plaintext byte; the load loop below walks backwards.
	 893	LEAQ -1(ptx)(ptxLen*1), ptx
	 894
			// T1 = andMask entry keeping the low ptxLen bytes. aluCTR is reused as the
			// table base; the counter is not needed any more.
	 895	MOVQ ptxLen, aluTMP
	 896	SHLQ $4, aluTMP
	 897
	 898	LEAQ andMask<>(SB), aluCTR
	 899	MOVOU -16(aluCTR)(aluTMP*1), T1
	 900
			// Assemble the partial plaintext block byte by byte, last byte first, so
			// that nothing beyond the end of src is read.
	 901	PXOR B0, B0
	 902ptxLoadLoop:
	 903		PSLLDQ $1, B0
	 904		PINSRB $0, (ptx), B0
	 905		LEAQ -1(ptx), ptx
	 906		DECQ ptxLen
	 907	JNE ptxLoadLoop
	 908
			// Encrypt and clear the bytes beyond the message length. Note that the
			// store below still writes a full 16 bytes to dst.
	 909	PXOR T0, B0
	 910	PAND T1, B0
	 911	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
	 912
			// Hash the zero-padded final ciphertext block.
	 913	PSHUFB BSWAP, B0
	 914	PXOR ACC0, B0
	 915
	 916	MOVOU T2, ACC0
	 917	MOVOU T2, ACC1
	 918	MOVOU (16*15)(pTbl), ACCM
	 919
	 920	PSHUFD $78, B0, T0
	 921	PXOR B0, T0
	 922	PCLMULQDQ $0x00, B0, ACC0
	 923	PCLMULQDQ $0x11, B0, ACC1
	 924	PCLMULQDQ $0x00, T0, ACCM
	 925
	 926	PXOR ACC0, ACCM
	 927	PXOR ACC1, ACCM
	 928	MOVOU ACCM, T0
	 929	PSRLDQ $8, ACCM
	 930	PSLLDQ $8, T0
	 931	PXOR ACCM, ACC1
	 932	PXOR T0, ACC0
	 933
	 934	reduceRound(ACC0)
	 935	reduceRound(ACC0)
	 936	PXOR ACC1, ACC0
	 937
	 938gcmAesEncDone:
			// Write the updated running hash back to T.
	 939	MOVOU ACC0, (tPtr)
	 940	RET
			// Only increment is undefined here (gcmAesDec redefines it); the register
			// aliases and the other macros stay defined for gcmAesDec.
	 941#undef increment
	 942
	 943// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
			//
			// gcmAesDec hashes the ciphertext in src into the running hash (read from T at
			// entry, written back at exit) and decrypts it into dst in counter mode. The
			// counter block at ctr is only read; its last 32-bit word is incremented
			// before each block, so the first block uses the counter value + 1.
			//
			// Because the data being hashed is the input itself, no ciphertext needs to be
			// parked: the 128-byte stack frame holds only the eight counter blocks
			// (already XORed with round key 0) at (16*i)(SP), i = 0..7.
			//
			// The register aliases (pTbl, ctx, ptx, ...) and the aesRound/aesRnd/
			// aesRndLast/reduceRound macros are the ones defined earlier in this file.
			// Here ptx receives dst (plaintext out) and ctx receives src (ciphertext in).
			//
			// The tail path reads a full 16 bytes from src even for a partial block; see
			// the comment at that load.
	 944TEXT ·gcmAesDec(SB),0,$128-96
			// increment(i): same as in gcmAesEnc, but the counter slots start at 0(SP).
	 945#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
			// combinedDecRound(i): AES round i on B0..B7 interleaved with the
			// multiplication of ciphertext block i, loaded from (16*i)(ctx) and
			// byte-reversed, by the table entry pair at 16*(2i)/16*(2i+1).
			// Products are XORed into ACC0/ACC1/ACCM. Clobbers T0, T1 and T2.
	 946#define combinedDecRound(i) \
	 947	MOVOU (16*i)(ks), T0;\
	 948	AESENC T0, B0;\
	 949	AESENC T0, B1;\
	 950	AESENC T0, B2;\
	 951	AESENC T0, B3;\
	 952	MOVOU (16*(i*2))(pTbl), T1;\
	 953	MOVOU T1, T2;\
	 954	AESENC T0, B4;\
	 955	AESENC T0, B5;\
	 956	AESENC T0, B6;\
	 957	AESENC T0, B7;\
	 958	MOVOU (16*i)(ctx), T0;\
	 959	PSHUFB BSWAP, T0;\
	 960	PCLMULQDQ $0x00, T0, T1;\
	 961	PXOR T1, ACC0;\
	 962	PSHUFD $78, T0, T1;\
	 963	PCLMULQDQ $0x11, T0, T2;\
	 964	PXOR T1, T0;\
	 965	PXOR T2, ACC1;\
	 966	MOVOU (16*(i*2+1))(pTbl), T2;\
	 967	PCLMULQDQ $0x00, T2, T0;\
	 968	PXOR T0, ACCM
	 969
	 970	MOVQ productTable+0(FP), pTbl
	 971	MOVQ dst+8(FP), ptx
	 972	MOVQ src_base+32(FP), ctx
	 973	MOVQ src_len+40(FP), ptxLen
	 974	MOVQ ctr+56(FP), ctrPtr
	 975	MOVQ T+64(FP), tPtr
	 976	MOVQ ks_base+72(FP), ks
	 977	MOVQ ks_len+80(FP), NR
	 978
			// NR = len(ks)/4 - 1; compared against 12 to select 10, 12 or 14 rounds.
	 979	SHRQ $2, NR
	 980	DECQ NR
	 981
	 982	MOVOU bswapMask<>(SB), BSWAP
	 983	MOVOU gcmPoly<>(SB), POLY
	 984
			// ACC0 = running hash. aluCTR = last counter word and aluK = last word of
			// round key 0, both byte-swapped so that ADDL/XORL operate on them directly.
	 985	MOVOU (tPtr), ACC0
	 986	PXOR ACC1, ACC1
	 987	PXOR ACCM, ACCM
	 988	MOVOU (ctrPtr), B0
	 989	MOVL (3*4)(ctrPtr), aluCTR
	 990	MOVOU (ks), T0
	 991	MOVL (3*4)(ks), aluK
	 992	BSWAPL aluCTR
	 993	BSWAPL aluK
	 994
			// T0 = counter block XOR round key 0; increment() rewrites only the last
			// word of a slot.
	 995	PXOR B0, T0
	 996	MOVOU T0, (0*16)(SP)
	 997	increment(0)
	 998
	 999	CMPQ ptxLen, $128
	1000	JB gcmAesDecSingles
	1001
			// At least eight blocks: prepare the remaining seven counter slots.
	1002	MOVOU T0, (1*16)(SP)
	1003	increment(1)
	1004	MOVOU T0, (2*16)(SP)
	1005	increment(2)
	1006	MOVOU T0, (3*16)(SP)
	1007	increment(3)
	1008	MOVOU T0, (4*16)(SP)
	1009	increment(4)
	1010	MOVOU T0, (5*16)(SP)
	1011	increment(5)
	1012	MOVOU T0, (6*16)(SP)
	1013	increment(6)
	1014	MOVOU T0, (7*16)(SP)
	1015	increment(7)
	1016
	1017gcmAesDecOctetsLoop:
			// Each iteration hashes eight ciphertext blocks and decrypts the same eight
			// blocks, with the multiplications interleaved between the AES rounds.
	1018
	1019		CMPQ ptxLen, $128
	1020		JB gcmAesDecEndOctets
	1021		SUBQ $128, ptxLen
	1022
	1023		MOVOU (0*16)(SP), B0
	1024		MOVOU (1*16)(SP), B1
	1025		MOVOU (2*16)(SP), B2
	1026		MOVOU (3*16)(SP), B3
	1027		MOVOU (4*16)(SP), B4
	1028		MOVOU (5*16)(SP), B5
	1029		MOVOU (6*16)(SP), B6
	1030		MOVOU (7*16)(SP), B7
	1031
			// Ciphertext block 0, with the running hash folded in, initialises the
			// accumulators; blocks 1..7 are added by combinedDecRound(1..7).
	1032		MOVOU (16*0)(ctx), T0
	1033		PSHUFB BSWAP, T0
	1034		PXOR ACC0, T0
	1035		PSHUFD $78, T0, T1
	1036		PXOR T0, T1
	1037
	1038		MOVOU (16*0)(pTbl), ACC0
	1039		MOVOU (16*1)(pTbl), ACCM
	1040		MOVOU ACC0, ACC1
	1041
	1042		PCLMULQDQ $0x00, T1, ACCM
	1043		PCLMULQDQ $0x00, T0, ACC0
	1044		PCLMULQDQ $0x11, T0, ACC1
	1045
	1046		combinedDecRound(1)
	1047		increment(0)
	1048		combinedDecRound(2)
	1049		increment(1)
	1050		combinedDecRound(3)
	1051		increment(2)
	1052		combinedDecRound(4)
	1053		increment(3)
	1054		combinedDecRound(5)
	1055		increment(4)
	1056		combinedDecRound(6)
	1057		increment(5)
	1058		combinedDecRound(7)
	1059		increment(6)
	1060
	1061		aesRound(8)
	1062		increment(7)
	1063
			// Recover the middle term and split it across ACC0/ACC1.
	1064		PXOR ACC0, ACCM
	1065		PXOR ACC1, ACCM
	1066		MOVOU ACCM, T0
	1067		PSRLDQ $8, ACCM
	1068		PSLLDQ $8, T0
	1069		PXOR ACCM, ACC1
	1070		PXOR T0, ACC0
	1071
			// The two reduction steps are interleaved with AES round 9.
	1072		reduceRound(ACC0)
	1073		aesRound(9)
	1074
	1075		reduceRound(ACC0)
	1076		PXOR ACC1, ACC0
	1077
	1078		MOVOU (16*10)(ks), T0
	1079		CMPQ NR, $12
	1080		JB decLast1
	1081		aesRnd(T0)
	1082		aesRound(11)
	1083		MOVOU (16*12)(ks), T0
			// Flags are still those of the CMPQ above (AESENC/MOVOU do not change them).
	1084		JE decLast1
	1085		aesRnd(T0)
	1086		aesRound(13)
	1087		MOVOU (16*14)(ks), T0
	1088decLast1:
	1089		aesRndLast(T0)
	1090
			// XOR the keystream with the ciphertext and write the plaintext out.
	1091		MOVOU (16*0)(ctx), T0
	1092		PXOR T0, B0
	1093		MOVOU (16*1)(ctx), T0
	1094		PXOR T0, B1
	1095		MOVOU (16*2)(ctx), T0
	1096		PXOR T0, B2
	1097		MOVOU (16*3)(ctx), T0
	1098		PXOR T0, B3
	1099		MOVOU (16*4)(ctx), T0
	1100		PXOR T0, B4
	1101		MOVOU (16*5)(ctx), T0
	1102		PXOR T0, B5
	1103		MOVOU (16*6)(ctx), T0
	1104		PXOR T0, B6
	1105		MOVOU (16*7)(ctx), T0
	1106		PXOR T0, B7
	1107
	1108		MOVOU B0, (16*0)(ptx)
	1109		MOVOU B1, (16*1)(ptx)
	1110		MOVOU B2, (16*2)(ptx)
	1111		MOVOU B3, (16*3)(ptx)
	1112		MOVOU B4, (16*4)(ptx)
	1113		MOVOU B5, (16*5)(ptx)
	1114		MOVOU B6, (16*6)(ptx)
	1115		MOVOU B7, (16*7)(ptx)
	1116
	1117		LEAQ 128(ptx), ptx
	1118		LEAQ 128(ctx), ctx
	1119
	1120		JMP gcmAesDecOctetsLoop
	1121
	1122gcmAesDecEndOctets:
	1123
			// Eight counter slots were prepared ahead but the remaining paths use only
			// slot 0. Rewind aluCTR by 7 so it matches the counter stored in slot 0
			// before the next increment(0).
	1124	SUBQ $7, aluCTR
	1125
	1126gcmAesDecSingles:
	1127
			// Keep round keys 1..7 in B1..B7 and the table entry at 16*14 in T2 for the
			// single-block loop.
	1128	MOVOU (16*1)(ks), B1
	1129	MOVOU (16*2)(ks), B2
	1130	MOVOU (16*3)(ks), B3
	1131	MOVOU (16*4)(ks), B4
	1132	MOVOU (16*5)(ks), B5
	1133	MOVOU (16*6)(ks), B6
	1134	MOVOU (16*7)(ks), B7
	1135
	1136	MOVOU (16*14)(pTbl), T2
	1137
	1138gcmAesDecSinglesLoop:
	1139
	1140		CMPQ ptxLen, $16
	1141		JB gcmAesDecTail
	1142		SUBQ $16, ptxLen
	1143
			// T1 keeps the raw ciphertext block for the XOR after decryption; B0 is
			// the byte-reversed copy that gets hashed first.
	1144		MOVOU (ctx), B0
	1145		MOVOU B0, T1
	1146		PSHUFB BSWAP, B0
	1147		PXOR ACC0, B0
	1148
	1149		MOVOU T2, ACC0
	1150		MOVOU T2, ACC1
	1151		MOVOU (16*15)(pTbl), ACCM
	1152
	1153		PCLMULQDQ $0x00, B0, ACC0
	1154		PCLMULQDQ $0x11, B0, ACC1
	1155		PSHUFD $78, B0, T0
	1156		PXOR B0, T0
	1157		PCLMULQDQ $0x00, T0, ACCM
	1158
	1159		PXOR ACC0, ACCM
	1160		PXOR ACC1, ACCM
	1161		MOVOU ACCM, T0
	1162		PSRLDQ $8, ACCM
	1163		PSLLDQ $8, T0
	1164		PXOR ACCM, ACC1
	1165		PXOR T0, ACC0
	1166
	1167		reduceRound(ACC0)
	1168		reduceRound(ACC0)
	1169		PXOR ACC1, ACC0
	1170
			// Generate the keystream block from counter slot 0.
	1171		MOVOU (0*16)(SP), B0
	1172		increment(0)
	1173		AESENC B1, B0
	1174		AESENC B2, B0
	1175		AESENC B3, B0
	1176		AESENC B4, B0
	1177		AESENC B5, B0
	1178		AESENC B6, B0
	1179		AESENC B7, B0
	1180		MOVOU (16*8)(ks), T0
	1181		AESENC T0, B0
	1182		MOVOU (16*9)(ks), T0
	1183		AESENC T0, B0
	1184		MOVOU (16*10)(ks), T0
	1185		CMPQ NR, $12
	1186		JB decLast2
	1187		AESENC T0, B0
	1188		MOVOU (16*11)(ks), T0
	1189		AESENC T0, B0
	1190		MOVOU (16*12)(ks), T0
	1191		JE decLast2
	1192		AESENC T0, B0
	1193		MOVOU (16*13)(ks), T0
	1194		AESENC T0, B0
	1195		MOVOU (16*14)(ks), T0
	1196decLast2:
	1197		AESENCLAST T0, B0
	1198
	1199		PXOR T1, B0
	1200		MOVOU B0, (ptx)
	1201
	1202		LEAQ (16*1)(ptx), ptx
	1203		LEAQ (16*1)(ctx), ctx
	1204
	1205	JMP gcmAesDecSinglesLoop
	1206
	1207gcmAesDecTail:
			// Final partial block of 1..15 bytes, if any.
	1208
	1209	TESTQ ptxLen, ptxLen
	1210	JE gcmAesDecDone
	1211
			// T1 = andMask entry keeping the low ptxLen bytes. aluCTR is overwritten
			// with the table address here.
	1212	MOVQ ptxLen, aluTMP
	1213	SHLQ $4, aluTMP
	1214	LEAQ andMask<>(SB), aluCTR
	1215	MOVOU -16(aluCTR)(aluTMP*1), T1
	1216
			// Full 16-byte load from src; the bytes beyond ptxLen are masked off.
	1217	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
	1218	PAND T1, B0
	1219
			// T1 now holds the masked ciphertext for the XOR after decryption.
	1220	MOVOU B0, T1
	1221	PSHUFB BSWAP, B0
	1222	PXOR ACC0, B0
	1223
	1224	MOVOU (16*14)(pTbl), ACC0
	1225	MOVOU (16*15)(pTbl), ACCM
	1226	MOVOU ACC0, ACC1
	1227
	1228	PCLMULQDQ $0x00, B0, ACC0
	1229	PCLMULQDQ $0x11, B0, ACC1
	1230	PSHUFD $78, B0, T0
	1231	PXOR B0, T0
	1232	PCLMULQDQ $0x00, T0, ACCM
	1233
	1234	PXOR ACC0, ACCM
	1235	PXOR ACC1, ACCM
	1236	MOVOU ACCM, T0
	1237	PSRLDQ $8, ACCM
	1238	PSLLDQ $8, T0
	1239	PXOR ACCM, ACC1
	1240	PXOR T0, ACC0
	1241
	1242	reduceRound(ACC0)
	1243	reduceRound(ACC0)
	1244	PXOR ACC1, ACC0
	1245
			// NOTE(review): aluCTR holds the andMask address at this point, and neither
			// slot 0 nor aluCTR is read again before RET, so this increment(0) looks
			// like a harmless dead write - confirm before removing.
	1246	MOVOU (0*16)(SP), B0
	1247	increment(0)
	1248	AESENC B1, B0
	1249	AESENC B2, B0
	1250	AESENC B3, B0
	1251	AESENC B4, B0
	1252	AESENC B5, B0
	1253	AESENC B6, B0
	1254	AESENC B7, B0
	1255	MOVOU (16*8)(ks), T0
	1256	AESENC T0, B0
	1257	MOVOU (16*9)(ks), T0
	1258	AESENC T0, B0
	1259	MOVOU (16*10)(ks), T0
	1260	CMPQ NR, $12
	1261	JB decLast3
	1262	AESENC T0, B0
	1263	MOVOU (16*11)(ks), T0
	1264	AESENC T0, B0
	1265	MOVOU (16*12)(ks), T0
	1266	JE decLast3
	1267	AESENC T0, B0
	1268	MOVOU (16*13)(ks), T0
	1269	AESENC T0, B0
	1270	MOVOU (16*14)(ks), T0
	1271decLast3:
	1272	AESENCLAST T0, B0
	1273	PXOR T1, B0
	1274
			// Store exactly ptxLen plaintext bytes, one at a time, so that nothing
			// beyond the message length is written through ptx.
	1275ptxStoreLoop:
	1276		PEXTRB $0, B0, (ptx)
	1277		PSRLDQ $1, B0
	1278		LEAQ 1(ptx), ptx
	1279		DECQ ptxLen
	1280
	1281	JNE ptxStoreLoop
	1282
	1283gcmAesDecDone:
	1284
			// Write the updated running hash back to T.
	1285	MOVOU ACC0, (tPtr)
	1286	RET

View as plain text