
Text file src/crypto/sha1/sha1block_amd64.s


		 1// Copyright 2013 The Go Authors. All rights reserved.
		 2// Use of this source code is governed by a BSD-style
		 3// license that can be found in the LICENSE file.
		 4
		 5// AVX2 version by Intel, same algorithm as code in Linux kernel:
		 6// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
		 7// Authors:
		 8// Ilya Albrekht <[email protected]>
		 9// Maxim Locktyukhin <[email protected]>
		10// Ronen Zohar <[email protected]>
		11// Chandramouli Narayanan <[email protected]>
		12
		13
		14#include "textflag.h"
		15
		16// SHA-1 block routine. See sha1block.go for Go equivalent.
		17//
		18// There are 80 rounds of 4 types:
		19//	 - rounds 0-15 are type 1 and load data (ROUND1 macro).
		20//	 - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
		21//	 - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
		22//	 - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
		23//	 - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
		24//
		25// Each round loads or shuffles the data, then computes a per-round
		26// function of b, c, d, and then mixes the result into and rotates the
		27// five registers a, b, c, d, e holding the intermediate results.
		28//
		29// The register rotation is implemented by rotating the arguments to
		30// the round macros instead of by explicit move instructions.
		31
		32#define LOAD(index) \
		33	MOVL	(index*4)(SI), R10; \
		34	BSWAPL	R10; \
		35	MOVL	R10, (index*4)(SP)
		36
		37#define SHUFFLE(index) \
		38	MOVL	(((index)&0xf)*4)(SP), R10; \
		39	XORL	(((index-3)&0xf)*4)(SP), R10; \
		40	XORL	(((index-8)&0xf)*4)(SP), R10; \
		41	XORL	(((index-14)&0xf)*4)(SP), R10; \
		42	ROLL	$1, R10; \
		43	MOVL	R10, (((index)&0xf)*4)(SP)
		44
		45#define FUNC1(a, b, c, d, e) \
		46	MOVL	d, R9; \
		47	XORL	c, R9; \
		48	ANDL	b, R9; \
		49	XORL	d, R9
		50
		51#define FUNC2(a, b, c, d, e) \
		52	MOVL	b, R9; \
		53	XORL	c, R9; \
		54	XORL	d, R9
		55
		56#define FUNC3(a, b, c, d, e) \
		57	MOVL	b, R8; \
		58	ORL	c, R8; \
		59	ANDL	d, R8; \
		60	MOVL	b, R9; \
		61	ANDL	c, R9; \
		62	ORL	R8, R9
		63
		64#define FUNC4 FUNC2
		65
		66#define MIX(a, b, c, d, e, const) \
		67	ROLL	$30, b; \
		68	ADDL	R9, e; \
		69	MOVL	a, R8; \
		70	ROLL	$5, R8; \
		71	LEAL	const(e)(R10*1), e; \
		72	ADDL	R8, e
		73
		74#define ROUND1(a, b, c, d, e, index) \
		75	LOAD(index); \
		76	FUNC1(a, b, c, d, e); \
		77	MIX(a, b, c, d, e, 0x5A827999)
		78
		79#define ROUND1x(a, b, c, d, e, index) \
		80	SHUFFLE(index); \
		81	FUNC1(a, b, c, d, e); \
		82	MIX(a, b, c, d, e, 0x5A827999)
		83
		84#define ROUND2(a, b, c, d, e, index) \
		85	SHUFFLE(index); \
		86	FUNC2(a, b, c, d, e); \
		87	MIX(a, b, c, d, e, 0x6ED9EBA1)
		88
		89#define ROUND3(a, b, c, d, e, index) \
		90	SHUFFLE(index); \
		91	FUNC3(a, b, c, d, e); \
		92	MIX(a, b, c, d, e, 0x8F1BBCDC)
		93
		94#define ROUND4(a, b, c, d, e, index) \
		95	SHUFFLE(index); \
		96	FUNC4(a, b, c, d, e); \
		97	MIX(a, b, c, d, e, 0xCA62C1D6)
		98
		99TEXT ·blockAMD64(SB),NOSPLIT,$64-32
	 100	MOVQ	dig+0(FP),	BP
	 101	MOVQ	p_base+8(FP),	SI
	 102	MOVQ	p_len+16(FP),	DX
	 103	SHRQ	$6,		DX
	 104	SHLQ	$6,		DX
	 105
	 106	LEAQ	(SI)(DX*1),	DI
	 107	MOVL	(0*4)(BP),	AX
	 108	MOVL	(1*4)(BP),	BX
	 109	MOVL	(2*4)(BP),	CX
	 110	MOVL	(3*4)(BP),	DX
	 111	MOVL	(4*4)(BP),	BP
	 112
	 113	CMPQ	SI,		DI
	 114	JEQ	end
	 115
	 116loop:
	 117	MOVL	AX,	R11
	 118	MOVL	BX,	R12
	 119	MOVL	CX,	R13
	 120	MOVL	DX,	R14
	 121	MOVL	BP,	R15
	 122
	 123	ROUND1(AX, BX, CX, DX, BP, 0)
	 124	ROUND1(BP, AX, BX, CX, DX, 1)
	 125	ROUND1(DX, BP, AX, BX, CX, 2)
	 126	ROUND1(CX, DX, BP, AX, BX, 3)
	 127	ROUND1(BX, CX, DX, BP, AX, 4)
	 128	ROUND1(AX, BX, CX, DX, BP, 5)
	 129	ROUND1(BP, AX, BX, CX, DX, 6)
	 130	ROUND1(DX, BP, AX, BX, CX, 7)
	 131	ROUND1(CX, DX, BP, AX, BX, 8)
	 132	ROUND1(BX, CX, DX, BP, AX, 9)
	 133	ROUND1(AX, BX, CX, DX, BP, 10)
	 134	ROUND1(BP, AX, BX, CX, DX, 11)
	 135	ROUND1(DX, BP, AX, BX, CX, 12)
	 136	ROUND1(CX, DX, BP, AX, BX, 13)
	 137	ROUND1(BX, CX, DX, BP, AX, 14)
	 138	ROUND1(AX, BX, CX, DX, BP, 15)
	 139
	 140	ROUND1x(BP, AX, BX, CX, DX, 16)
	 141	ROUND1x(DX, BP, AX, BX, CX, 17)
	 142	ROUND1x(CX, DX, BP, AX, BX, 18)
	 143	ROUND1x(BX, CX, DX, BP, AX, 19)
	 144
	 145	ROUND2(AX, BX, CX, DX, BP, 20)
	 146	ROUND2(BP, AX, BX, CX, DX, 21)
	 147	ROUND2(DX, BP, AX, BX, CX, 22)
	 148	ROUND2(CX, DX, BP, AX, BX, 23)
	 149	ROUND2(BX, CX, DX, BP, AX, 24)
	 150	ROUND2(AX, BX, CX, DX, BP, 25)
	 151	ROUND2(BP, AX, BX, CX, DX, 26)
	 152	ROUND2(DX, BP, AX, BX, CX, 27)
	 153	ROUND2(CX, DX, BP, AX, BX, 28)
	 154	ROUND2(BX, CX, DX, BP, AX, 29)
	 155	ROUND2(AX, BX, CX, DX, BP, 30)
	 156	ROUND2(BP, AX, BX, CX, DX, 31)
	 157	ROUND2(DX, BP, AX, BX, CX, 32)
	 158	ROUND2(CX, DX, BP, AX, BX, 33)
	 159	ROUND2(BX, CX, DX, BP, AX, 34)
	 160	ROUND2(AX, BX, CX, DX, BP, 35)
	 161	ROUND2(BP, AX, BX, CX, DX, 36)
	 162	ROUND2(DX, BP, AX, BX, CX, 37)
	 163	ROUND2(CX, DX, BP, AX, BX, 38)
	 164	ROUND2(BX, CX, DX, BP, AX, 39)
	 165
	 166	ROUND3(AX, BX, CX, DX, BP, 40)
	 167	ROUND3(BP, AX, BX, CX, DX, 41)
	 168	ROUND3(DX, BP, AX, BX, CX, 42)
	 169	ROUND3(CX, DX, BP, AX, BX, 43)
	 170	ROUND3(BX, CX, DX, BP, AX, 44)
	 171	ROUND3(AX, BX, CX, DX, BP, 45)
	 172	ROUND3(BP, AX, BX, CX, DX, 46)
	 173	ROUND3(DX, BP, AX, BX, CX, 47)
	 174	ROUND3(CX, DX, BP, AX, BX, 48)
	 175	ROUND3(BX, CX, DX, BP, AX, 49)
	 176	ROUND3(AX, BX, CX, DX, BP, 50)
	 177	ROUND3(BP, AX, BX, CX, DX, 51)
	 178	ROUND3(DX, BP, AX, BX, CX, 52)
	 179	ROUND3(CX, DX, BP, AX, BX, 53)
	 180	ROUND3(BX, CX, DX, BP, AX, 54)
	 181	ROUND3(AX, BX, CX, DX, BP, 55)
	 182	ROUND3(BP, AX, BX, CX, DX, 56)
	 183	ROUND3(DX, BP, AX, BX, CX, 57)
	 184	ROUND3(CX, DX, BP, AX, BX, 58)
	 185	ROUND3(BX, CX, DX, BP, AX, 59)
	 186
	 187	ROUND4(AX, BX, CX, DX, BP, 60)
	 188	ROUND4(BP, AX, BX, CX, DX, 61)
	 189	ROUND4(DX, BP, AX, BX, CX, 62)
	 190	ROUND4(CX, DX, BP, AX, BX, 63)
	 191	ROUND4(BX, CX, DX, BP, AX, 64)
	 192	ROUND4(AX, BX, CX, DX, BP, 65)
	 193	ROUND4(BP, AX, BX, CX, DX, 66)
	 194	ROUND4(DX, BP, AX, BX, CX, 67)
	 195	ROUND4(CX, DX, BP, AX, BX, 68)
	 196	ROUND4(BX, CX, DX, BP, AX, 69)
	 197	ROUND4(AX, BX, CX, DX, BP, 70)
	 198	ROUND4(BP, AX, BX, CX, DX, 71)
	 199	ROUND4(DX, BP, AX, BX, CX, 72)
	 200	ROUND4(CX, DX, BP, AX, BX, 73)
	 201	ROUND4(BX, CX, DX, BP, AX, 74)
	 202	ROUND4(AX, BX, CX, DX, BP, 75)
	 203	ROUND4(BP, AX, BX, CX, DX, 76)
	 204	ROUND4(DX, BP, AX, BX, CX, 77)
	 205	ROUND4(CX, DX, BP, AX, BX, 78)
	 206	ROUND4(BX, CX, DX, BP, AX, 79)
	 207
	 208	ADDL	R11, AX
	 209	ADDL	R12, BX
	 210	ADDL	R13, CX
	 211	ADDL	R14, DX
	 212	ADDL	R15, BP
	 213
	 214	ADDQ	$64, SI
	 215	CMPQ	SI, DI
	 216	JB	loop
	 217
	 218end:
	 219	MOVQ	dig+0(FP), DI
	 220	MOVL	AX, (0*4)(DI)
	 221	MOVL	BX, (1*4)(DI)
	 222	MOVL	CX, (2*4)(DI)
	 223	MOVL	DX, (3*4)(DI)
	 224	MOVL	BP, (4*4)(DI)
	 225	RET
	 226
	 227
	 228// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
	 229// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
	 230// From http://software.intel.com/en-us/articles
	 231// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
	 232// This implementation is 2x unrolled, and interleaves vector instructions,
	 233// used to precompute W, with the scalar computation of the current round,
	 234// for optimal scheduling.
	 235
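// Rough flow of the AVX2 code below (informal sketch, not part of this file;
// bufA and bufB stand for the two W+K scratch areas addressed via R14/R15):
//
//	precompute W+K for the first two blocks into bufA   // PRECALC
//	swap bufA, bufB                                      // XCHGQ R15, R14
//	for more data {
//		80 scalar rounds for one block, reading W+K from bufB and
//		interleaving vector precomputation of the next two blocks into bufA
//		UPDATE_HASH
//		80 scalar rounds for the following block, again from bufB,
//		interleaved with the rest of the precomputation
//		UPDATE_HASH
//		swap bufA, bufB
//	}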
	 236// Trivial helper macros.
	 237#define UPDATE_HASH(A,TB,C,D,E) \
	 238	ADDL	(R9), A \
	 239	MOVL	A, (R9) \
	 240	ADDL	4(R9), TB \
	 241	MOVL	TB, 4(R9) \
	 242	ADDL	8(R9), C \
	 243	MOVL	C, 8(R9) \
	 244	ADDL	12(R9), D \
	 245	MOVL	D, 12(R9) \
	 246	ADDL	16(R9), E \
	 247	MOVL	E, 16(R9)
	 248
	 249
	 250
	 251// Helper macros for PRECALC, which precomputes W+K into the temp buffer
	 252#define PRECALC_0(OFFSET) \
	 253	VMOVDQU	 OFFSET(R10),X0
	 254
	 255#define PRECALC_1(OFFSET) \
	 256	VINSERTI128 $1, OFFSET(R13), Y0, Y0
	 257
	 258#define PRECALC_2(YREG) \
	 259	VPSHUFB Y10, Y0, YREG
	 260
	 261#define PRECALC_4(YREG,K_OFFSET) \
	 262	VPADDD K_OFFSET(R8), YREG, Y0
	 263
	 264#define PRECALC_7(OFFSET) \
	 265	VMOVDQU Y0, (OFFSET*2)(R14)
	 266
	 267
	 268// Message scheduling pre-compute for rounds 0-15
	 269// R13 is a pointer to the even 64-byte block
	 270// R10 is a pointer to the odd 64-byte block
	 271// R14 is a pointer to the temp buffer
	 272// X0 is used as a temp register
	 273// YREG is clobbered as part of the computation
	 274// OFFSET chooses a 16-byte chunk within a block
	 275// R8 is a pointer to the constants block
	 276// K_OFFSET chooses the K constants relevant to this round
	 277// Y10 holds the byte-swap shuffle mask
	 278#define PRECALC_00_15(OFFSET,YREG) \
	 279	PRECALC_0(OFFSET) \
	 280	PRECALC_1(OFFSET) \
	 281	PRECALC_2(YREG) \
	 282	PRECALC_4(YREG,0x0) \
	 283	PRECALC_7(OFFSET)
	 284
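// A rough scalar Go equivalent (not part of this file) of one PRECALC_00_15
// invocation; odd, even, wk, and off are hypothetical names for the blocks at
// R10/R13 and the temp buffer at R14:
//
//	for j := 0; j < 16; j += 4 {
//		w := binary.BigEndian.Uint32(odd[off+j:])
//		binary.LittleEndian.PutUint32(wk[2*off+j:], w+0x5A827999) // low 128-bit lane
//		w = binary.BigEndian.Uint32(even[off+j:])
//		binary.LittleEndian.PutUint32(wk[2*off+16+j:], w+0x5A827999) // high 128-bit lane
//	}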
	 285
	 286// Helper macros for PRECALC_16_31
	 287#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
	 288	VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \	// w[i-14]
	 289	VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
	 290
	 291#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
	 292	VPXOR	REG_SUB_8, REG, REG \
	 293	VPXOR	REG_SUB_16, Y0, Y0
	 294
	 295#define PRECALC_18(REG) \
	 296	VPXOR Y0, REG, REG \
	 297	VPSLLDQ $12, REG, Y9
	 298
	 299#define PRECALC_19(REG) \
	 300	VPSLLD $1, REG, Y0 \
	 301	VPSRLD $31, REG, REG
	 302
	 303#define PRECALC_20(REG) \
	 304	VPOR REG, Y0, Y0 \
	 305	VPSLLD $2, Y9,	REG
	 306
	 307#define PRECALC_21(REG) \
	 308	VPSRLD $30, Y9, Y9 \
	 309	VPXOR REG, Y0, Y0
	 310
	 311#define PRECALC_23(REG,K_OFFSET,OFFSET) \
	 312	VPXOR Y9, Y0, REG \
	 313	VPADDD K_OFFSET(R8), REG, Y0 \
	 314	VMOVDQU Y0, (OFFSET)(R14)
	 315
	 316// Message scheduling pre-compute for rounds 16-31
	 317// calculating last 32 w[i] values in 8 XMM registers
	 318// pre-calculate K+w[i] values and store to mem
	 319// for later load by ALU add instruction.
	 320// "brute force" vectorization for rounds 16-31 only
	 321// due to w[i]->w[i-3] dependency.
	 322// clobbers 5 input ymm registers REG_SUB*
	 323// uses X0 and X9 as temp registers
	 324// As always, R8 is a pointer to constants block
	 325// and R14 is a pointer to temp buffer
	 326#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
	 327	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
	 328	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
	 329	PRECALC_18(REG) \
	 330	PRECALC_19(REG) \
	 331	PRECALC_20(REG) \
	 332	PRECALC_21(REG) \
	 333	PRECALC_23(REG,K_OFFSET,OFFSET)
	 334
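// In scalar terms (informal sketch, not part of this file), one PRECALC_16_31
// produces the next four w values for one block with the standard recurrence
// and stores w+K for the later ALU adds; the vector code does both blocks at
// once in the two 128-bit lanes. w, wk, out, k, and i are hypothetical names:
//
//	for j := i; j < i+4; j++ {
//		w[j&0xf] = bits.RotateLeft32(w[(j-3)&0xf]^w[(j-8)&0xf]^w[(j-14)&0xf]^w[j&0xf], 1)
//		binary.LittleEndian.PutUint32(wk[out+4*(j-i):], w[j&0xf]+k)
//	}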
	 335
	 336// Helper macros for PRECALC_32_79
	 337#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
	 338	VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
	 339
	 340#define PRECALC_33(REG_SUB_28,REG) \
	 341	VPXOR REG_SUB_28, REG, REG
	 342
	 343#define PRECALC_34(REG_SUB_16) \
	 344	VPXOR REG_SUB_16, Y0, Y0
	 345
	 346#define PRECALC_35(REG) \
	 347	VPXOR Y0, REG, REG
	 348
	 349#define PRECALC_36(REG) \
	 350	VPSLLD $2, REG, Y0
	 351
	 352#define PRECALC_37(REG) \
	 353	VPSRLD $30, REG, REG \
	 354	VPOR REG, Y0, REG
	 355
	 356#define PRECALC_39(REG,K_OFFSET,OFFSET) \
	 357	VPADDD K_OFFSET(R8), REG, Y0 \
	 358	VMOVDQU Y0, (OFFSET)(R14)
	 359
	 360// Message scheduling pre-compute for rounds 32-79
	 361// In the SHA-1 specification we have:
	 362// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
	 363// Which is the same as:
	 364// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 365// This allows for more efficient vectorization,
	 366// since the w[i] -> w[i-3] dependency is broken
	 367#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
	 368	PRECALC_32(REG_SUB_8,REG_SUB_4) \
	 369	PRECALC_33(REG_SUB_28,REG) \
	 370	PRECALC_34(REG_SUB_16) \
	 371	PRECALC_35(REG) \
	 372	PRECALC_36(REG) \
	 373	PRECALC_37(REG) \
	 374	PRECALC_39(REG,K_OFFSET,OFFSET)
	 375
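// A scalar sanity check (not part of this file) that the two recurrences
// above generate the same schedule for i >= 32; it assumes "math/bits" and
// sameSchedule is a hypothetical name:
//
//	func sameSchedule(seed [16]uint32) bool {
//		var w1, w2 [80]uint32
//		copy(w1[:], seed[:])
//		copy(w2[:], seed[:])
//		for i := 16; i < 80; i++ {
//			w1[i] = bits.RotateLeft32(w1[i-3]^w1[i-8]^w1[i-14]^w1[i-16], 1)
//			if i < 32 {
//				w2[i] = w1[i]
//			} else {
//				w2[i] = bits.RotateLeft32(w2[i-6]^w2[i-16]^w2[i-28]^w2[i-32], 2)
//			}
//		}
//		return w1 == w2
//	}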
	 376#define PRECALC \
	 377	PRECALC_00_15(0,Y15) \
	 378	PRECALC_00_15(0x10,Y14) \
	 379	PRECALC_00_15(0x20,Y13) \
	 380	PRECALC_00_15(0x30,Y12) \
	 381	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
	 382	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
	 383	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
	 384	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
	 385	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
	 386	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
	 387	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
	 388	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
	 389	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
	 390	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
	 391	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
	 392	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
	 393	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
	 394	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
	 395	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
	 396	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
	 397
	 398// Macros calculating individual rounds have the general form
	 399// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST.
	 400// The CALC_ROUND_{PRE,POST} macros follow.
	 401
	 402#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
	 403	ADDL OFFSET(R15),REG_E \
	 404	ANDNL REG_C,REG_A,BP \
	 405	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
	 406	RORXL $0x1b, REG_A, R12 \
	 407	RORXL $2, REG_A, REG_B				 // for next round
	 408
	 409// Calculate F for the next round
	 410#define CALC_F1_POST(REG_A,REG_B,REG_E) \
	 411	ANDL REG_B,REG_A \						 // b&c
	 412	XORL BP, REG_A \							 // F1 = (b&c) ^ (~b&d)
	 413	LEAL (REG_E)(R12*1), REG_E		 // E += A >>> 5
	 414
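// Informal note (not part of this file): with BMI1, F1 needs no NOT and no
// extra register copy, because ANDNL produces ~b & d directly:
//
//	f1 := func(b, c, d uint32) uint32 {
//		return (b & c) ^ (^b & d) // ANDL gives b&c, ANDNL gives ^b&d, XORL combines
//	}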
	 415
	 416// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
	 417#define CALC_0 \
	 418	MOVL SI, BX \ // Precalculating first round
	 419	RORXL $2, SI, SI \
	 420	ANDNL AX, BX, BP \
	 421	ANDL DI, BX \
	 422	XORL BP, BX \
	 423	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
	 424	PRECALC_0(0x80) \
	 425	CALC_F1_POST(CX,SI,DX)
	 426
	 427#define CALC_1 \
	 428	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
	 429	PRECALC_1(0x80) \
	 430	CALC_F1_POST(DX,BX,AX)
	 431
	 432#define CALC_2 \
	 433	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
	 434	PRECALC_2(Y15) \
	 435	CALC_F1_POST(AX,CX,DI)
	 436
	 437#define CALC_3 \
	 438	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
	 439	CALC_F1_POST(DI,DX,SI)
	 440
	 441#define CALC_4 \
	 442	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
	 443	PRECALC_4(Y15,0x0) \
	 444	CALC_F1_POST(SI,AX,BX)
	 445
	 446#define CALC_5 \
	 447	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
	 448	CALC_F1_POST(BX,DI,CX)
	 449
	 450#define CALC_6 \
	 451	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
	 452	CALC_F1_POST(CX,SI,DX)
	 453
	 454#define CALC_7 \
	 455	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
	 456	PRECALC_7(0x0) \
	 457	CALC_F1_POST(DX,BX,AX)
	 458
	 459#define CALC_8 \
	 460	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
	 461	PRECALC_0(0x90) \
	 462	CALC_F1_POST(AX,CX,DI)
	 463
	 464#define CALC_9 \
	 465	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
	 466	PRECALC_1(0x90) \
	 467	CALC_F1_POST(DI,DX,SI)
	 468
	 469#define CALC_10 \
	 470	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
	 471	PRECALC_2(Y14) \
	 472	CALC_F1_POST(SI,AX,BX)
	 473
	 474#define CALC_11 \
	 475	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
	 476	CALC_F1_POST(BX,DI,CX)
	 477
	 478#define CALC_12 \
	 479	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
	 480	PRECALC_4(Y14,0x0) \
	 481	CALC_F1_POST(CX,SI,DX)
	 482
	 483#define CALC_13 \
	 484	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
	 485	CALC_F1_POST(DX,BX,AX)
	 486
	 487#define CALC_14 \
	 488	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
	 489	CALC_F1_POST(AX,CX,DI)
	 490
	 491#define CALC_15 \
	 492	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
	 493	PRECALC_7(0x10) \
	 494	CALC_F1_POST(DI,DX,SI)
	 495
	 496#define CALC_16 \
	 497	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
	 498	PRECALC_0(0xa0) \
	 499	CALC_F1_POST(SI,AX,BX)
	 500
	 501#define CALC_17 \
	 502	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
	 503	PRECALC_1(0xa0) \
	 504	CALC_F1_POST(BX,DI,CX)
	 505
	 506#define CALC_18 \
	 507	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
	 508	PRECALC_2(Y13) \
	 509	CALC_F1_POST(CX,SI,DX)
	 510
	 511
	 512#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
	 513	ADDL OFFSET(R15),REG_E \
	 514	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
	 515	RORXL $0x1b, REG_A, R12 \
	 516	RORXL $2, REG_A, REG_B				 // for next round
	 517
	 518#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
	 519	XORL REG_B, REG_A \
	 520	ADDL R12, REG_E \
	 521				XORL REG_C, REG_A
	 522
	 523#define CALC_19 \
	 524	CALC_F2_PRE(0x8c,DX,CX,AX) \
	 525	CALC_F2_POST(DX,BX,SI,AX)
	 526
	 527#define CALC_20 \
	 528	CALC_F2_PRE(0xa0,AX,DX,DI) \
	 529	PRECALC_4(Y13,0x0) \
	 530	CALC_F2_POST(AX,CX,BX,DI)
	 531
	 532#define CALC_21 \
	 533	CALC_F2_PRE(0xa4,DI,AX,SI) \
	 534	CALC_F2_POST(DI,DX,CX,SI)
	 535
	 536#define CALC_22 \
	 537	CALC_F2_PRE(0xa8,SI,DI,BX) \
	 538	CALC_F2_POST(SI,AX,DX,BX)
	 539
	 540#define CALC_23 \
	 541	CALC_F2_PRE(0xac,BX,SI,CX) \
	 542	PRECALC_7(0x20) \
	 543	CALC_F2_POST(BX,DI,AX,CX)
	 544
	 545#define CALC_24 \
	 546	CALC_F2_PRE(0xc0,CX,BX,DX) \
	 547	PRECALC_0(0xb0) \
	 548	CALC_F2_POST(CX,SI,DI,DX)
	 549
	 550#define CALC_25 \
	 551	CALC_F2_PRE(0xc4,DX,CX,AX) \
	 552	PRECALC_1(0xb0) \
	 553	CALC_F2_POST(DX,BX,SI,AX)
	 554
	 555#define CALC_26 \
	 556	CALC_F2_PRE(0xc8,AX,DX,DI) \
	 557	PRECALC_2(Y12) \
	 558	CALC_F2_POST(AX,CX,BX,DI)
	 559
	 560#define CALC_27 \
	 561	CALC_F2_PRE(0xcc,DI,AX,SI) \
	 562	CALC_F2_POST(DI,DX,CX,SI)
	 563
	 564#define CALC_28 \
	 565	CALC_F2_PRE(0xe0,SI,DI,BX) \
	 566	PRECALC_4(Y12,0x0) \
	 567	CALC_F2_POST(SI,AX,DX,BX)
	 568
	 569#define CALC_29 \
	 570	CALC_F2_PRE(0xe4,BX,SI,CX) \
	 571	CALC_F2_POST(BX,DI,AX,CX)
	 572
	 573#define CALC_30 \
	 574	CALC_F2_PRE(0xe8,CX,BX,DX) \
	 575	CALC_F2_POST(CX,SI,DI,DX)
	 576
	 577#define CALC_31 \
	 578	CALC_F2_PRE(0xec,DX,CX,AX) \
	 579	PRECALC_7(0x30) \
	 580	CALC_F2_POST(DX,BX,SI,AX)
	 581
	 582#define CALC_32 \
	 583	CALC_F2_PRE(0x100,AX,DX,DI) \
	 584	PRECALC_16(Y15,Y14,Y12,Y8) \
	 585	CALC_F2_POST(AX,CX,BX,DI)
	 586
	 587#define CALC_33 \
	 588	CALC_F2_PRE(0x104,DI,AX,SI) \
	 589	PRECALC_17(Y15,Y13,Y8) \
	 590	CALC_F2_POST(DI,DX,CX,SI)
	 591
	 592#define CALC_34 \
	 593	CALC_F2_PRE(0x108,SI,DI,BX) \
	 594	PRECALC_18(Y8) \
	 595	CALC_F2_POST(SI,AX,DX,BX)
	 596
	 597#define CALC_35 \
	 598	CALC_F2_PRE(0x10c,BX,SI,CX) \
	 599	PRECALC_19(Y8) \
	 600	CALC_F2_POST(BX,DI,AX,CX)
	 601
	 602#define CALC_36 \
	 603	CALC_F2_PRE(0x120,CX,BX,DX) \
	 604	PRECALC_20(Y8) \
	 605	CALC_F2_POST(CX,SI,DI,DX)
	 606
	 607#define CALC_37 \
	 608	CALC_F2_PRE(0x124,DX,CX,AX) \
	 609	PRECALC_21(Y8) \
	 610	CALC_F2_POST(DX,BX,SI,AX)
	 611
	 612#define CALC_38 \
	 613	CALC_F2_PRE(0x128,AX,DX,DI) \
	 614	CALC_F2_POST(AX,CX,BX,DI)
	 615
	 616
	 617#define CALC_F3_PRE(OFFSET,REG_E) \
	 618	ADDL OFFSET(R15),REG_E
	 619
	 620#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
	 621	LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
	 622	MOVL REG_B, BP \
	 623	ORL	REG_A, BP \
	 624	RORXL $0x1b, REG_A, R12 \
	 625	RORXL $2, REG_A, REG_TB \
	 626	ANDL REG_C, BP \		// Calculate F for the next round
	 627	ANDL REG_B, REG_A \
	 628	ORL	BP, REG_A \
	 629	ADDL R12, REG_E
	 630
	 631#define CALC_39 \
	 632	CALC_F3_PRE(0x12c,SI) \
	 633	PRECALC_23(Y8,0x0,0x80) \
	 634	CALC_F3_POST(DI,DX,CX,SI,AX)
	 635
	 636#define CALC_40 \
	 637	CALC_F3_PRE(0x140,BX) \
	 638	PRECALC_16(Y14,Y13,Y8,Y7) \
	 639	CALC_F3_POST(SI,AX,DX,BX,DI)
	 640
	 641#define CALC_41 \
	 642	CALC_F3_PRE(0x144,CX) \
	 643	PRECALC_17(Y14,Y12,Y7) \
	 644	CALC_F3_POST(BX,DI,AX,CX,SI)
	 645
	 646#define CALC_42 \
	 647	CALC_F3_PRE(0x148,DX) \
	 648	PRECALC_18(Y7) \
	 649	CALC_F3_POST(CX,SI,DI,DX,BX)
	 650
	 651#define CALC_43 \
	 652	CALC_F3_PRE(0x14c,AX) \
	 653	PRECALC_19(Y7) \
	 654	CALC_F3_POST(DX,BX,SI,AX,CX)
	 655
	 656#define CALC_44 \
	 657	CALC_F3_PRE(0x160,DI) \
	 658	PRECALC_20(Y7) \
	 659	CALC_F3_POST(AX,CX,BX,DI,DX)
	 660
	 661#define CALC_45 \
	 662	CALC_F3_PRE(0x164,SI) \
	 663	PRECALC_21(Y7) \
	 664	CALC_F3_POST(DI,DX,CX,SI,AX)
	 665
	 666#define CALC_46 \
	 667	CALC_F3_PRE(0x168,BX) \
	 668	CALC_F3_POST(SI,AX,DX,BX,DI)
	 669
	 670#define CALC_47 \
	 671	CALC_F3_PRE(0x16c,CX) \
	 672	VPXOR Y9, Y0, Y7 \
	 673	VPADDD 0x20(R8), Y7, Y0 \
	 674	VMOVDQU Y0, 0xa0(R14) \
	 675	CALC_F3_POST(BX,DI,AX,CX,SI)
	 676
	 677#define CALC_48 \
	 678	CALC_F3_PRE(0x180,DX) \
	 679	PRECALC_16(Y13,Y12,Y7,Y5) \
	 680	CALC_F3_POST(CX,SI,DI,DX,BX)
	 681
	 682#define CALC_49 \
	 683	CALC_F3_PRE(0x184,AX) \
	 684	PRECALC_17(Y13,Y8,Y5) \
	 685	CALC_F3_POST(DX,BX,SI,AX,CX)
	 686
	 687#define CALC_50 \
	 688	CALC_F3_PRE(0x188,DI) \
	 689	PRECALC_18(Y5) \
	 690	CALC_F3_POST(AX,CX,BX,DI,DX)
	 691
	 692#define CALC_51 \
	 693	CALC_F3_PRE(0x18c,SI) \
	 694	PRECALC_19(Y5) \
	 695	CALC_F3_POST(DI,DX,CX,SI,AX)
	 696
	 697#define CALC_52 \
	 698	CALC_F3_PRE(0x1a0,BX) \
	 699	PRECALC_20(Y5) \
	 700	CALC_F3_POST(SI,AX,DX,BX,DI)
	 701
	 702#define CALC_53 \
	 703	CALC_F3_PRE(0x1a4,CX) \
	 704	PRECALC_21(Y5) \
	 705	CALC_F3_POST(BX,DI,AX,CX,SI)
	 706
	 707#define CALC_54 \
	 708	CALC_F3_PRE(0x1a8,DX) \
	 709	CALC_F3_POST(CX,SI,DI,DX,BX)
	 710
	 711#define CALC_55 \
	 712	CALC_F3_PRE(0x1ac,AX) \
	 713	PRECALC_23(Y5,0x20,0xc0) \
	 714	CALC_F3_POST(DX,BX,SI,AX,CX)
	 715
	 716#define CALC_56 \
	 717	CALC_F3_PRE(0x1c0,DI) \
	 718	PRECALC_16(Y12,Y8,Y5,Y3) \
	 719	CALC_F3_POST(AX,CX,BX,DI,DX)
	 720
	 721#define CALC_57 \
	 722	CALC_F3_PRE(0x1c4,SI) \
	 723	PRECALC_17(Y12,Y7,Y3) \
	 724	CALC_F3_POST(DI,DX,CX,SI,AX)
	 725
	 726#define CALC_58 \
	 727	CALC_F3_PRE(0x1c8,BX) \
	 728	PRECALC_18(Y3) \
	 729	CALC_F3_POST(SI,AX,DX,BX,DI)
	 730
	 731#define CALC_59 \
	 732	CALC_F2_PRE(0x1cc,BX,SI,CX) \
	 733	PRECALC_19(Y3) \
	 734	CALC_F2_POST(BX,DI,AX,CX)
	 735
	 736#define CALC_60 \
	 737	CALC_F2_PRE(0x1e0,CX,BX,DX) \
	 738	PRECALC_20(Y3) \
	 739	CALC_F2_POST(CX,SI,DI,DX)
	 740
	 741#define CALC_61 \
	 742	CALC_F2_PRE(0x1e4,DX,CX,AX) \
	 743	PRECALC_21(Y3) \
	 744	CALC_F2_POST(DX,BX,SI,AX)
	 745
	 746#define CALC_62 \
	 747	CALC_F2_PRE(0x1e8,AX,DX,DI) \
	 748	CALC_F2_POST(AX,CX,BX,DI)
	 749
	 750#define CALC_63 \
	 751	CALC_F2_PRE(0x1ec,DI,AX,SI) \
	 752	PRECALC_23(Y3,0x20,0xe0) \
	 753	CALC_F2_POST(DI,DX,CX,SI)
	 754
	 755#define CALC_64 \
	 756	CALC_F2_PRE(0x200,SI,DI,BX) \
	 757	PRECALC_32(Y5,Y3) \
	 758	CALC_F2_POST(SI,AX,DX,BX)
	 759
	 760#define CALC_65 \
	 761	CALC_F2_PRE(0x204,BX,SI,CX) \
	 762	PRECALC_33(Y14,Y15) \
	 763	CALC_F2_POST(BX,DI,AX,CX)
	 764
	 765#define CALC_66 \
	 766	CALC_F2_PRE(0x208,CX,BX,DX) \
	 767	PRECALC_34(Y8) \
	 768	CALC_F2_POST(CX,SI,DI,DX)
	 769
	 770#define CALC_67 \
	 771	CALC_F2_PRE(0x20c,DX,CX,AX) \
	 772	PRECALC_35(Y15) \
	 773	CALC_F2_POST(DX,BX,SI,AX)
	 774
	 775#define CALC_68 \
	 776	CALC_F2_PRE(0x220,AX,DX,DI) \
	 777	PRECALC_36(Y15) \
	 778	CALC_F2_POST(AX,CX,BX,DI)
	 779
	 780#define CALC_69 \
	 781	CALC_F2_PRE(0x224,DI,AX,SI) \
	 782	PRECALC_37(Y15) \
	 783	CALC_F2_POST(DI,DX,CX,SI)
	 784
	 785#define CALC_70 \
	 786	CALC_F2_PRE(0x228,SI,DI,BX) \
	 787	CALC_F2_POST(SI,AX,DX,BX)
	 788
	 789#define CALC_71 \
	 790	CALC_F2_PRE(0x22c,BX,SI,CX) \
	 791	PRECALC_39(Y15,0x20,0x100) \
	 792	CALC_F2_POST(BX,DI,AX,CX)
	 793
	 794#define CALC_72 \
	 795	CALC_F2_PRE(0x240,CX,BX,DX) \
	 796	PRECALC_32(Y3,Y15) \
	 797	CALC_F2_POST(CX,SI,DI,DX)
	 798
	 799#define CALC_73 \
	 800	CALC_F2_PRE(0x244,DX,CX,AX) \
	 801	PRECALC_33(Y13,Y14) \
	 802	CALC_F2_POST(DX,BX,SI,AX)
	 803
	 804#define CALC_74 \
	 805	CALC_F2_PRE(0x248,AX,DX,DI) \
	 806	PRECALC_34(Y7) \
	 807	CALC_F2_POST(AX,CX,BX,DI)
	 808
	 809#define CALC_75 \
	 810	CALC_F2_PRE(0x24c,DI,AX,SI) \
	 811	PRECALC_35(Y14) \
	 812	CALC_F2_POST(DI,DX,CX,SI)
	 813
	 814#define CALC_76 \
	 815	CALC_F2_PRE(0x260,SI,DI,BX) \
	 816	PRECALC_36(Y14) \
	 817	CALC_F2_POST(SI,AX,DX,BX)
	 818
	 819#define CALC_77 \
	 820	CALC_F2_PRE(0x264,BX,SI,CX) \
	 821	PRECALC_37(Y14) \
	 822	CALC_F2_POST(BX,DI,AX,CX)
	 823
	 824#define CALC_78 \
	 825	CALC_F2_PRE(0x268,CX,BX,DX) \
	 826	CALC_F2_POST(CX,SI,DI,DX)
	 827
	 828#define CALC_79 \
	 829	ADDL 0x26c(R15), AX \
	 830	LEAL (AX)(CX*1), AX \
	 831	RORXL $0x1b, DX, R12 \
	 832	PRECALC_39(Y14,0x20,0x120) \
	 833	ADDL R12, AX
	 834
	 835// Similar to CALC_0
	 836#define CALC_80 \
	 837	MOVL CX, DX \
	 838	RORXL $2, CX, CX \
	 839	ANDNL SI, DX, BP \
	 840	ANDL BX, DX \
	 841	XORL BP, DX \
	 842	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
	 843	PRECALC_32(Y15,Y14) \
	 844	CALC_F1_POST(AX,CX,DI)
	 845
	 846#define CALC_81 \
	 847	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
	 848	PRECALC_33(Y12,Y13) \
	 849	CALC_F1_POST(DI,DX,SI)
	 850
	 851#define CALC_82 \
	 852	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
	 853	PRECALC_34(Y5) \
	 854	CALC_F1_POST(SI,AX,BX)
	 855
	 856#define CALC_83 \
	 857	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
	 858	PRECALC_35(Y13) \
	 859	CALC_F1_POST(BX,DI,CX)
	 860
	 861#define CALC_84 \
	 862	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
	 863	PRECALC_36(Y13) \
	 864	CALC_F1_POST(CX,SI,DX)
	 865
	 866#define CALC_85 \
	 867	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
	 868	PRECALC_37(Y13) \
	 869	CALC_F1_POST(DX,BX,AX)
	 870
	 871#define CALC_86 \
	 872	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
	 873	CALC_F1_POST(AX,CX,DI)
	 874
	 875#define CALC_87 \
	 876	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
	 877	PRECALC_39(Y13,0x40,0x140) \
	 878	CALC_F1_POST(DI,DX,SI)
	 879
	 880#define CALC_88 \
	 881	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
	 882	PRECALC_32(Y14,Y13) \
	 883	CALC_F1_POST(SI,AX,BX)
	 884
	 885#define CALC_89 \
	 886	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
	 887	PRECALC_33(Y8,Y12) \
	 888	CALC_F1_POST(BX,DI,CX)
	 889
	 890#define CALC_90 \
	 891	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
	 892	PRECALC_34(Y3) \
	 893	CALC_F1_POST(CX,SI,DX)
	 894
	 895#define CALC_91 \
	 896	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
	 897	PRECALC_35(Y12) \
	 898	CALC_F1_POST(DX,BX,AX)
	 899
	 900#define CALC_92 \
	 901	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
	 902	PRECALC_36(Y12) \
	 903	CALC_F1_POST(AX,CX,DI)
	 904
	 905#define CALC_93 \
	 906	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
	 907	PRECALC_37(Y12) \
	 908	CALC_F1_POST(DI,DX,SI)
	 909
	 910#define CALC_94 \
	 911	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
	 912	CALC_F1_POST(SI,AX,BX)
	 913
	 914#define CALC_95 \
	 915	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
	 916	PRECALC_39(Y12,0x40,0x160) \
	 917	CALC_F1_POST(BX,DI,CX)
	 918
	 919#define CALC_96 \
	 920	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
	 921	PRECALC_32(Y13,Y12) \
	 922	CALC_F1_POST(CX,SI,DX)
	 923
	 924#define CALC_97 \
	 925	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
	 926	PRECALC_33(Y7,Y8) \
	 927	CALC_F1_POST(DX,BX,AX)
	 928
	 929#define CALC_98 \
	 930	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
	 931	PRECALC_34(Y15) \
	 932	CALC_F1_POST(AX,CX,DI)
	 933
	 934#define CALC_99 \
	 935	CALC_F2_PRE(0x9c,DI,AX,SI) \
	 936	PRECALC_35(Y8) \
	 937	CALC_F2_POST(DI,DX,CX,SI)
	 938
	 939#define CALC_100 \
	 940	CALC_F2_PRE(0xb0,SI,DI,BX) \
	 941	PRECALC_36(Y8) \
	 942	CALC_F2_POST(SI,AX,DX,BX)
	 943
	 944#define CALC_101 \
	 945	CALC_F2_PRE(0xb4,BX,SI,CX) \
	 946	PRECALC_37(Y8) \
	 947	CALC_F2_POST(BX,DI,AX,CX)
	 948
	 949#define CALC_102 \
	 950	CALC_F2_PRE(0xb8,CX,BX,DX) \
	 951	CALC_F2_POST(CX,SI,DI,DX)
	 952
	 953#define CALC_103 \
	 954	CALC_F2_PRE(0xbc,DX,CX,AX) \
	 955	PRECALC_39(Y8,0x40,0x180) \
	 956	CALC_F2_POST(DX,BX,SI,AX)
	 957
	 958#define CALC_104 \
	 959	CALC_F2_PRE(0xd0,AX,DX,DI) \
	 960	PRECALC_32(Y12,Y8) \
	 961	CALC_F2_POST(AX,CX,BX,DI)
	 962
	 963#define CALC_105 \
	 964	CALC_F2_PRE(0xd4,DI,AX,SI) \
	 965	PRECALC_33(Y5,Y7) \
	 966	CALC_F2_POST(DI,DX,CX,SI)
	 967
	 968#define CALC_106 \
	 969	CALC_F2_PRE(0xd8,SI,DI,BX) \
	 970	PRECALC_34(Y14) \
	 971	CALC_F2_POST(SI,AX,DX,BX)
	 972
	 973#define CALC_107 \
	 974	CALC_F2_PRE(0xdc,BX,SI,CX) \
	 975	PRECALC_35(Y7) \
	 976	CALC_F2_POST(BX,DI,AX,CX)
	 977
	 978#define CALC_108 \
	 979	CALC_F2_PRE(0xf0,CX,BX,DX) \
	 980	PRECALC_36(Y7) \
	 981	CALC_F2_POST(CX,SI,DI,DX)
	 982
	 983#define CALC_109 \
	 984	CALC_F2_PRE(0xf4,DX,CX,AX) \
	 985	PRECALC_37(Y7) \
	 986	CALC_F2_POST(DX,BX,SI,AX)
	 987
	 988#define CALC_110 \
	 989	CALC_F2_PRE(0xf8,AX,DX,DI) \
	 990	CALC_F2_POST(AX,CX,BX,DI)
	 991
	 992#define CALC_111 \
	 993	CALC_F2_PRE(0xfc,DI,AX,SI) \
	 994	PRECALC_39(Y7,0x40,0x1a0) \
	 995	CALC_F2_POST(DI,DX,CX,SI)
	 996
	 997#define CALC_112 \
	 998	CALC_F2_PRE(0x110,SI,DI,BX) \
	 999	PRECALC_32(Y8,Y7) \
	1000	CALC_F2_POST(SI,AX,DX,BX)
	1001
	1002#define CALC_113 \
	1003	CALC_F2_PRE(0x114,BX,SI,CX) \
	1004	PRECALC_33(Y3,Y5) \
	1005	CALC_F2_POST(BX,DI,AX,CX)
	1006
	1007#define CALC_114 \
	1008	CALC_F2_PRE(0x118,CX,BX,DX) \
	1009	PRECALC_34(Y13) \
	1010	CALC_F2_POST(CX,SI,DI,DX)
	1011
	1012#define CALC_115 \
	1013	CALC_F2_PRE(0x11c,DX,CX,AX) \
	1014	PRECALC_35(Y5) \
	1015	CALC_F2_POST(DX,BX,SI,AX)
	1016
	1017#define CALC_116 \
	1018	CALC_F2_PRE(0x130,AX,DX,DI) \
	1019	PRECALC_36(Y5) \
	1020	CALC_F2_POST(AX,CX,BX,DI)
	1021
	1022#define CALC_117 \
	1023	CALC_F2_PRE(0x134,DI,AX,SI) \
	1024	PRECALC_37(Y5) \
	1025	CALC_F2_POST(DI,DX,CX,SI)
	1026
	1027#define CALC_118 \
	1028	CALC_F2_PRE(0x138,SI,DI,BX) \
	1029	CALC_F2_POST(SI,AX,DX,BX)
	1030
	1031#define CALC_119 \
	1032	CALC_F3_PRE(0x13c,CX) \
	1033	PRECALC_39(Y5,0x40,0x1c0) \
	1034	CALC_F3_POST(BX,DI,AX,CX,SI)
	1035
	1036#define CALC_120 \
	1037	CALC_F3_PRE(0x150,DX) \
	1038	PRECALC_32(Y7,Y5) \
	1039	CALC_F3_POST(CX,SI,DI,DX,BX)
	1040
	1041#define CALC_121 \
	1042	CALC_F3_PRE(0x154,AX) \
	1043	PRECALC_33(Y15,Y3) \
	1044	CALC_F3_POST(DX,BX,SI,AX,CX)
	1045
	1046#define CALC_122 \
	1047	CALC_F3_PRE(0x158,DI) \
	1048	PRECALC_34(Y12) \
	1049	CALC_F3_POST(AX,CX,BX,DI,DX)
	1050
	1051#define CALC_123 \
	1052	CALC_F3_PRE(0x15c,SI) \
	1053	PRECALC_35(Y3) \
	1054	CALC_F3_POST(DI,DX,CX,SI,AX)
	1055
	1056#define CALC_124 \
	1057	CALC_F3_PRE(0x170,BX) \
	1058	PRECALC_36(Y3) \
	1059	CALC_F3_POST(SI,AX,DX,BX,DI)
	1060
	1061#define CALC_125 \
	1062	CALC_F3_PRE(0x174,CX) \
	1063	PRECALC_37(Y3) \
	1064	CALC_F3_POST(BX,DI,AX,CX,SI)
	1065
	1066#define CALC_126 \
	1067	CALC_F3_PRE(0x178,DX) \
	1068	CALC_F3_POST(CX,SI,DI,DX,BX)
	1069
	1070#define CALC_127 \
	1071	CALC_F3_PRE(0x17c,AX) \
	1072	PRECALC_39(Y3,0x60,0x1e0) \
	1073	CALC_F3_POST(DX,BX,SI,AX,CX)
	1074
	1075#define CALC_128 \
	1076	CALC_F3_PRE(0x190,DI) \
	1077	PRECALC_32(Y5,Y3) \
	1078	CALC_F3_POST(AX,CX,BX,DI,DX)
	1079
	1080#define CALC_129 \
	1081	CALC_F3_PRE(0x194,SI) \
	1082	PRECALC_33(Y14,Y15) \
	1083	CALC_F3_POST(DI,DX,CX,SI,AX)
	1084
	1085#define CALC_130 \
	1086	CALC_F3_PRE(0x198,BX) \
	1087	PRECALC_34(Y8) \
	1088	CALC_F3_POST(SI,AX,DX,BX,DI)
	1089
	1090#define CALC_131 \
	1091	CALC_F3_PRE(0x19c,CX) \
	1092	PRECALC_35(Y15) \
	1093	CALC_F3_POST(BX,DI,AX,CX,SI)
	1094
	1095#define CALC_132 \
	1096	CALC_F3_PRE(0x1b0,DX) \
	1097	PRECALC_36(Y15) \
	1098	CALC_F3_POST(CX,SI,DI,DX,BX)
	1099
	1100#define CALC_133 \
	1101	CALC_F3_PRE(0x1b4,AX) \
	1102	PRECALC_37(Y15) \
	1103	CALC_F3_POST(DX,BX,SI,AX,CX)
	1104
	1105#define CALC_134 \
	1106	CALC_F3_PRE(0x1b8,DI) \
	1107	CALC_F3_POST(AX,CX,BX,DI,DX)
	1108
	1109#define CALC_135 \
	1110	CALC_F3_PRE(0x1bc,SI) \
	1111	PRECALC_39(Y15,0x60,0x200) \
	1112	CALC_F3_POST(DI,DX,CX,SI,AX)
	1113
	1114#define CALC_136 \
	1115	CALC_F3_PRE(0x1d0,BX) \
	1116	PRECALC_32(Y3,Y15) \
	1117	CALC_F3_POST(SI,AX,DX,BX,DI)
	1118
	1119#define CALC_137 \
	1120	CALC_F3_PRE(0x1d4,CX) \
	1121	PRECALC_33(Y13,Y14) \
	1122	CALC_F3_POST(BX,DI,AX,CX,SI)
	1123
	1124#define CALC_138 \
	1125	CALC_F3_PRE(0x1d8,DX) \
	1126	PRECALC_34(Y7) \
	1127	CALC_F3_POST(CX,SI,DI,DX,BX)
	1128
	1129#define CALC_139 \
	1130	CALC_F2_PRE(0x1dc,DX,CX,AX) \
	1131	PRECALC_35(Y14) \
	1132	CALC_F2_POST(DX,BX,SI,AX)
	1133
	1134#define CALC_140 \
	1135	CALC_F2_PRE(0x1f0,AX,DX,DI) \
	1136	PRECALC_36(Y14) \
	1137	CALC_F2_POST(AX,CX,BX,DI)
	1138
	1139#define CALC_141 \
	1140	CALC_F2_PRE(0x1f4,DI,AX,SI) \
	1141	PRECALC_37(Y14) \
	1142	CALC_F2_POST(DI,DX,CX,SI)
	1143
	1144#define CALC_142 \
	1145	CALC_F2_PRE(0x1f8,SI,DI,BX) \
	1146	CALC_F2_POST(SI,AX,DX,BX)
	1147
	1148#define CALC_143 \
	1149	CALC_F2_PRE(0x1fc,BX,SI,CX) \
	1150	PRECALC_39(Y14,0x60,0x220) \
	1151	CALC_F2_POST(BX,DI,AX,CX)
	1152
	1153#define CALC_144 \
	1154	CALC_F2_PRE(0x210,CX,BX,DX) \
	1155	PRECALC_32(Y15,Y14) \
	1156	CALC_F2_POST(CX,SI,DI,DX)
	1157
	1158#define CALC_145 \
	1159	CALC_F2_PRE(0x214,DX,CX,AX) \
	1160	PRECALC_33(Y12,Y13) \
	1161	CALC_F2_POST(DX,BX,SI,AX)
	1162
	1163#define CALC_146 \
	1164	CALC_F2_PRE(0x218,AX,DX,DI) \
	1165	PRECALC_34(Y5) \
	1166	CALC_F2_POST(AX,CX,BX,DI)
	1167
	1168#define CALC_147 \
	1169	CALC_F2_PRE(0x21c,DI,AX,SI) \
	1170	PRECALC_35(Y13) \
	1171	CALC_F2_POST(DI,DX,CX,SI)
	1172
	1173#define CALC_148 \
	1174	CALC_F2_PRE(0x230,SI,DI,BX) \
	1175	PRECALC_36(Y13) \
	1176	CALC_F2_POST(SI,AX,DX,BX)
	1177
	1178#define CALC_149 \
	1179	CALC_F2_PRE(0x234,BX,SI,CX) \
	1180	PRECALC_37(Y13) \
	1181	CALC_F2_POST(BX,DI,AX,CX)
	1182
	1183#define CALC_150 \
	1184	CALC_F2_PRE(0x238,CX,BX,DX) \
	1185	CALC_F2_POST(CX,SI,DI,DX)
	1186
	1187#define CALC_151 \
	1188	CALC_F2_PRE(0x23c,DX,CX,AX) \
	1189	PRECALC_39(Y13,0x60,0x240) \
	1190	CALC_F2_POST(DX,BX,SI,AX)
	1191
	1192#define CALC_152 \
	1193	CALC_F2_PRE(0x250,AX,DX,DI) \
	1194	PRECALC_32(Y14,Y13) \
	1195	CALC_F2_POST(AX,CX,BX,DI)
	1196
	1197#define CALC_153 \
	1198	CALC_F2_PRE(0x254,DI,AX,SI) \
	1199	PRECALC_33(Y8,Y12) \
	1200	CALC_F2_POST(DI,DX,CX,SI)
	1201
	1202#define CALC_154 \
	1203	CALC_F2_PRE(0x258,SI,DI,BX) \
	1204	PRECALC_34(Y3) \
	1205	CALC_F2_POST(SI,AX,DX,BX)
	1206
	1207#define CALC_155 \
	1208	CALC_F2_PRE(0x25c,BX,SI,CX) \
	1209	PRECALC_35(Y12) \
	1210	CALC_F2_POST(BX,DI,AX,CX)
	1211
	1212#define CALC_156 \
	1213	CALC_F2_PRE(0x270,CX,BX,DX) \
	1214	PRECALC_36(Y12) \
	1215	CALC_F2_POST(CX,SI,DI,DX)
	1216
	1217#define CALC_157 \
	1218	CALC_F2_PRE(0x274,DX,CX,AX) \
	1219	PRECALC_37(Y12) \
	1220	CALC_F2_POST(DX,BX,SI,AX)
	1221
	1222#define CALC_158 \
	1223	CALC_F2_PRE(0x278,AX,DX,DI) \
	1224	CALC_F2_POST(AX,CX,BX,DI)
	1225
	1226#define CALC_159 \
	1227	ADDL 0x27c(R15),SI \
	1228	LEAL (SI)(AX*1), SI \
	1229	RORXL $0x1b, DI, R12 \
	1230	PRECALC_39(Y12,0x60,0x260) \
	1231	ADDL R12, SI
	1232
	1233
	1234
	1235#define CALC \
	1236	MOVL	(R9), CX \
	1237	MOVL	4(R9), SI \
	1238	MOVL	8(R9), DI \
	1239	MOVL	12(R9), AX \
	1240	MOVL	16(R9), DX \
	1241	MOVQ		SP, R14 \
	1242	LEAQ		(2*4*80+32)(SP), R15 \
	1243	PRECALC \ // Precalc WK for first 2 blocks
	1244	XCHGQ	 R15, R14 \
	1245loop: \	// this loop is unrolled
	1246	CMPQ		R10, R8 \ // we use the R8 value (set below) as a signal of the last block
	1247	JNE	begin \
	1248	VZEROUPPER \
	1249	RET \
	1250begin: \
	1251	CALC_0 \
	1252	CALC_1 \
	1253	CALC_2 \
	1254	CALC_3 \
	1255	CALC_4 \
	1256	CALC_5 \
	1257	CALC_6 \
	1258	CALC_7 \
	1259	CALC_8 \
	1260	CALC_9 \
	1261	CALC_10 \
	1262	CALC_11 \
	1263	CALC_12 \
	1264	CALC_13 \
	1265	CALC_14 \
	1266	CALC_15 \
	1267	CALC_16 \
	1268	CALC_17 \
	1269	CALC_18 \
	1270	CALC_19 \
	1271	CALC_20 \
	1272	CALC_21 \
	1273	CALC_22 \
	1274	CALC_23 \
	1275	CALC_24 \
	1276	CALC_25 \
	1277	CALC_26 \
	1278	CALC_27 \
	1279	CALC_28 \
	1280	CALC_29 \
	1281	CALC_30 \
	1282	CALC_31 \
	1283	CALC_32 \
	1284	CALC_33 \
	1285	CALC_34 \
	1286	CALC_35 \
	1287	CALC_36 \
	1288	CALC_37 \
	1289	CALC_38 \
	1290	CALC_39 \
	1291	CALC_40 \
	1292	CALC_41 \
	1293	CALC_42 \
	1294	CALC_43 \
	1295	CALC_44 \
	1296	CALC_45 \
	1297	CALC_46 \
	1298	CALC_47 \
	1299	CALC_48 \
	1300	CALC_49 \
	1301	CALC_50 \
	1302	CALC_51 \
	1303	CALC_52 \
	1304	CALC_53 \
	1305	CALC_54 \
	1306	CALC_55 \
	1307	CALC_56 \
	1308	CALC_57 \
	1309	CALC_58 \
	1310	CALC_59 \
	1311	ADDQ $128, R10 \ // move to next even-64-byte block
	1312	CMPQ R10, R11 \ // is current block the last one?
	1313	CMOVQCC R8, R10 \ // signal the last iteration smartly
	1314	CALC_60 \
	1315	CALC_61 \
	1316	CALC_62 \
	1317	CALC_63 \
	1318	CALC_64 \
	1319	CALC_65 \
	1320	CALC_66 \
	1321	CALC_67 \
	1322	CALC_68 \
	1323	CALC_69 \
	1324	CALC_70 \
	1325	CALC_71 \
	1326	CALC_72 \
	1327	CALC_73 \
	1328	CALC_74 \
	1329	CALC_75 \
	1330	CALC_76 \
	1331	CALC_77 \
	1332	CALC_78 \
	1333	CALC_79 \
	1334	UPDATE_HASH(AX,DX,BX,SI,DI) \
	1335	CMPQ R10, R8 \ // is current block the last one?
	1336	JE loop\
	1337	MOVL DX, CX \
	1338	CALC_80 \
	1339	CALC_81 \
	1340	CALC_82 \
	1341	CALC_83 \
	1342	CALC_84 \
	1343	CALC_85 \
	1344	CALC_86 \
	1345	CALC_87 \
	1346	CALC_88 \
	1347	CALC_89 \
	1348	CALC_90 \
	1349	CALC_91 \
	1350	CALC_92 \
	1351	CALC_93 \
	1352	CALC_94 \
	1353	CALC_95 \
	1354	CALC_96 \
	1355	CALC_97 \
	1356	CALC_98 \
	1357	CALC_99 \
	1358	CALC_100 \
	1359	CALC_101 \
	1360	CALC_102 \
	1361	CALC_103 \
	1362	CALC_104 \
	1363	CALC_105 \
	1364	CALC_106 \
	1365	CALC_107 \
	1366	CALC_108 \
	1367	CALC_109 \
	1368	CALC_110 \
	1369	CALC_111 \
	1370	CALC_112 \
	1371	CALC_113 \
	1372	CALC_114 \
	1373	CALC_115 \
	1374	CALC_116 \
	1375	CALC_117 \
	1376	CALC_118 \
	1377	CALC_119 \
	1378	CALC_120 \
	1379	CALC_121 \
	1380	CALC_122 \
	1381	CALC_123 \
	1382	CALC_124 \
	1383	CALC_125 \
	1384	CALC_126 \
	1385	CALC_127 \
	1386	CALC_128 \
	1387	CALC_129 \
	1388	CALC_130 \
	1389	CALC_131 \
	1390	CALC_132 \
	1391	CALC_133 \
	1392	CALC_134 \
	1393	CALC_135 \
	1394	CALC_136 \
	1395	CALC_137 \
	1396	CALC_138 \
	1397	CALC_139 \
	1398	ADDQ $128, R13 \ // move to next even-64-byte block
	1399	CMPQ R13, R11 \ // is current block the last one?
	1400	CMOVQCC R8, R10 \
	1401	CALC_140 \
	1402	CALC_141 \
	1403	CALC_142 \
	1404	CALC_143 \
	1405	CALC_144 \
	1406	CALC_145 \
	1407	CALC_146 \
	1408	CALC_147 \
	1409	CALC_148 \
	1410	CALC_149 \
	1411	CALC_150 \
	1412	CALC_151 \
	1413	CALC_152 \
	1414	CALC_153 \
	1415	CALC_154 \
	1416	CALC_155 \
	1417	CALC_156 \
	1418	CALC_157 \
	1419	CALC_158 \
	1420	CALC_159 \
	1421	UPDATE_HASH(SI,DI,DX,CX,BX) \
	1422	MOVL	SI, R12 \ // Reset state for AVX2 reg permutation
	1423	MOVL	DI, SI \
	1424	MOVL	DX, DI \
	1425	MOVL	BX, DX \
	1426	MOVL	CX, AX \
	1427	MOVL	R12, CX \
	1428	XCHGQ	 R15, R14 \
	1429	JMP		 loop
	1430
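// Loop-control trick used above (informal sketch, not part of this file):
// there is no separate counter; once the block pointer passes the end of the
// input it is clamped to the address of the constant table (R8), and
// "pointer == R8" at the top of the loop means "done":
//
//	r10 += 128              // ADDQ $128, R10: advance to the next pair of blocks
//	if r10 >= end {         // CMPQ R10, R11
//		r10 = sentinel  // CMOVQCC R8, R10: flag the last iteration
//	}
//	// and at the "loop" label:
//	if r10 == sentinel {    // CMPQ R10, R8; JNE begin
//		return          // VZEROUPPER; RET
//	}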
	1431
	1432
	1433TEXT ·blockAVX2(SB),$1408-32
	1434
	1435	MOVQ	dig+0(FP),	DI
	1436	MOVQ	p_base+8(FP),	SI
	1437	MOVQ	p_len+16(FP),	DX
	1438	SHRQ	$6,		DX
	1439	SHLQ	$6,		DX
	1440
	1441	MOVQ	$K_XMM_AR<>(SB), R8
	1442
	1443	MOVQ	DI, R9
	1444	MOVQ	SI, R10
	1445	LEAQ	64(SI), R13
	1446
	1447	ADDQ	SI, DX
	1448	ADDQ	$64, DX
	1449	MOVQ	DX, R11
	1450
	1451	CMPQ	R13, R11
	1452	CMOVQCC	R8, R13
	1453
	1454	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10
	1455
	1456	CALC // RET is inside the CALC macro
	1457
	1458DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
	1459DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
	1460DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
	1461DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
	1462DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
	1463DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
	1464DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
	1465DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
	1466DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
	1467DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
	1468DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
	1469DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
	1470DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
	1471DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
	1472DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
	1473DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
	1474DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
	1475DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
	1476DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
	1477DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
	1478DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
	1479DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
	1480DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
	1481DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
	1482DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
	1483DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
	1484DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
	1485DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
	1486DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
	1487DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
	1488DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
	1489DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
	1490GLOBL K_XMM_AR<>(SB),RODATA,$128
	1491
	1492DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
	1493DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
	1494DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
	1495DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
	1496DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
	1497DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
	1498DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
	1499DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
	1500GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
