Text file src/crypto/sha256/sha256block_amd64.s

Documentation: crypto/sha256

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//	https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf

// The AVX2 version is described in an Intel white paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
// (available via http://www.intel.com/p/en_US/embedded; search for the title).
// The AVX2 version is by Intel and uses the same algorithm as the code in
// the Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
//	James Guilford <[email protected]>
//	Kirk Yap <[email protected]>
//	Tim Chen <[email protected]>

// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//	T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//	T2 = BIGSIGMA0(a) + Maj(a,b,c)
//	h = g
//	g = f
//	f = e
//	e = d + T1
//	d = c
//	c = b
//	b = a
//	a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

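// A rough Go sketch of the loop above (illustrative only; the real Go
// version lives in sha256block.go, with _K naming the round constants and
// ch, maj, bigSigma0, bigSigma1 the functions defined in the comments below):
//
//	for t := 0; t < 64; t++ {
//		t1 := h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//		t2 := bigSigma0(a) + maj(a, b, c)
//		h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//	}
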
// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVL	(index*4)(SI), AX; \
	BSWAPL	AX; \
	MOVL	AX, (index*4)(BP)

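// In Go terms MSGSCHEDULE0 is roughly w[index] = binary.BigEndian.Uint32(msg[index*4:]),
// since a BSWAPL after a little-endian load amounts to a big-endian load
// (SI holds the message pointer, BP the schedule array).
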
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//	SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
//	SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
	MOVL	((index-2)*4)(BP), AX; \
	MOVL	AX, CX; \
	RORL	$17, AX; \
	MOVL	CX, DX; \
	RORL	$19, CX; \
	SHRL	$10, DX; \
	MOVL	((index-15)*4)(BP), BX; \
	XORL	CX, AX; \
	MOVL	BX, CX; \
	XORL	DX, AX; \
	RORL	$7, BX; \
	MOVL	CX, DX; \
	SHRL	$3, DX; \
	RORL	$18, CX; \
	ADDL	((index-7)*4)(BP), AX; \
	XORL	CX, BX; \
	XORL	DX, BX; \
	ADDL	((index-16)*4)(BP), BX; \
	ADDL	BX, AX; \
	MOVL	AX, ((index)*4)(BP)

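// A Go sketch of the same recurrence, using math/bits (illustrative only):
//
//	v2, v15 := w[i-2], w[i-15]
//	s1 := bits.RotateLeft32(v2, -17) ^ bits.RotateLeft32(v2, -19) ^ (v2 >> 10)
//	s0 := bits.RotateLeft32(v15, -7) ^ bits.RotateLeft32(v15, -18) ^ (v15 >> 3)
//	w[i] = s1 + w[i-7] + s0 + w[i-16]
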
// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//	T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//	  BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
//	  Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
	ADDL	AX, h; \
	MOVL	e, AX; \
	ADDL	$const, h; \
	MOVL	e, CX; \
	RORL	$6, AX; \
	MOVL	e, DX; \
	RORL	$11, CX; \
	XORL	CX, AX; \
	MOVL	e, CX; \
	RORL	$25, DX; \
	ANDL	f, CX; \
	XORL	AX, DX; \
	MOVL	e, AX; \
	NOTL	AX; \
	ADDL	DX, h; \
	ANDL	g, AX; \
	XORL	CX, AX; \
	ADDL	h, AX

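// A Go sketch (illustrative only):
//
//	t1 = h + (bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^
//		bits.RotateLeft32(e, -25)) + ((e & f) ^ (^e & g)) + k + wt
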
// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//	T2 = BIGSIGMA0(a) + Maj(a, b, c)
//	  BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
//	  Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
	MOVL	a, DI; \
	MOVL	c, BX; \
	RORL	$2, DI; \
	MOVL	a, DX; \
	ANDL	b, BX; \
	RORL	$13, DX; \
	MOVL	a, CX; \
	ANDL	c, CX; \
	XORL	DX, DI; \
	XORL	CX, BX; \
	MOVL	a, DX; \
	MOVL	b, CX; \
	RORL	$22, DX; \
	ANDL	a, CX; \
	XORL	CX, BX; \
	XORL	DX, DI; \
	ADDL	DI, BX

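// A Go sketch (illustrative only):
//
//	t2 = (bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^
//		bits.RotateLeft32(a, -22)) + ((a & b) ^ (a & c) ^ (b & c))
//
// The AVX2 rounds below compute the same Maj in the equivalent form
// ((a|c)&b)|(a&c), which needs one fewer boolean operation.
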
// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA256T1(const, e, f, g, h); \
	SHA256T2(a, b, c); \
	MOVL	BX, h; \
	ADDL	AX, d; \
	ADDL	AX, h

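// Rather than shifting eight registers every round, the 64 unrolled calls
// below rotate the macro's register arguments, so the register holding h in
// one round holds g in the next, and so on.
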
#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)


// Definitions for AVX2 version

// addm(mem, reg): mem += reg, then reg = mem
#define addm(P1, P2) \
	ADDL P2, P1; \
	MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER Y9

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define old_h R11

#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define STACK_SIZE _INP + INP_SIZE

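// Resulting frame layout (an annotation; the offsets follow from the defines):
//	_XFER    = 0    512 bytes of w+K values: 2 blocks x 64 rounds x 4 bytes
//	_INP_END = 512  pointer to the start of the last input block
//	_INP     = 520  saved input pointer
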
#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                     \ // #################### RND N + 0 ####################
	MOVL     a, y3;                       \ // y3 = a                                // MAJA
	RORXL    $25, e, y0;                  \ // y0 = e >> 25                          // S1A
	RORXL    $11, e, y1;                  \ // y1 = e >> 11                          // S1B
	;                                     \
	ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h                         // disp = k + w
	ORL      c, y3;                       \ // y3 = a|c                              // MAJA
	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
	MOVL     f, y2;                       \ // y2 = f                                // CH
	RORXL    $13, a, T1;                  \ // T1 = a >> 13                          // S0B
	;                                     \
	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                // S1
	XORL     g, y2;                       \ // y2 = f^g                              // CH
	VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]
	RORXL    $6, e, y1;                   \ // y1 = (e >> 6)                         // S1
	;                                     \
	ANDL     e, y2;                       \ // y2 = (f^g)&e                          // CH
	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)       // S1
	RORXL    $22, a, y1;                  \ // y1 = a >> 22                          // S0A
	ADDL     h, d;                        \ // d = k + w + h + d                     // --
	;                                     \
	ANDL     b, y3;                       \ // y3 = (a|c)&b                          // MAJA
	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                // S0
	RORXL    $2, a, T1;                   \ // T1 = (a >> 2)                         // S0
	;                                     \
	XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g                 // CH
	VPSRLD   $7, XTMP1, XTMP2;            \
	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)       // S0
	MOVL     a, T1;                       \ // T1 = a                                // MAJB
	ANDL     c, T1;                       \ // T1 = a&c                              // MAJB
	;                                     \
	ADDL     y0, y2;                      \ // y2 = S1 + CH                          // --
	VPSLLD   $(32-7), XTMP1, XTMP3;       \
	ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL     y1, h;                       \ // h = k + w + h + S0                    // --
	;                                     \
	ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
	;                                     \
	VPSRLD   $18, XTMP1, XTMP2;           \
	ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL     y3, h                          // h = t1 + S0 + MAJ                     // --

#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                    \ // #################### RND N + 1 ####################
	MOVL    a, y3;                       \ // y3 = a                                // MAJA
	RORXL   $25, e, y0;                  \ // y0 = e >> 25                          // S1A
	RORXL   $11, e, y1;                  \ // y1 = e >> 11                          // S1B
	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
	ORL     c, y3;                       \ // y3 = a|c                              // MAJA
	;                                    \
	VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
	MOVL    f, y2;                       \ // y2 = f                                // CH
	RORXL   $13, a, T1;                  \ // T1 = a >> 13                          // S0B
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                // S1
	XORL    g, y2;                       \ // y2 = f^g                              // CH
	;                                    \
	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)                         // S1
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)       // S1
	RORXL   $22, a, y1;                  \ // y1 = a >> 22                          // S0A
	ANDL    e, y2;                       \ // y2 = (f^g)&e                          // CH
	ADDL    h, d;                        \ // d = k + w + h + d                     // --
	;                                    \
	VPSLLD  $(32-18), XTMP1, XTMP1;      \
	ANDL    b, y3;                       \ // y3 = (a|c)&b                          // MAJA
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                // S0
	;                                    \
	VPXOR   XTMP1, XTMP3, XTMP3;         \
	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)                         // S0
	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g                 // CH
	;                                    \
	VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)       // S0
	MOVL    a, T1;                       \ // T1 = a                                // MAJB
	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
	;                                    \
	VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
	VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
	;                                    \
	VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
	;                                    \
	VPSRLD  $10, XTMP2, XTMP4              // XTMP4 = W[-2] >> 10 {BBAA}

#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                    \ // #################### RND N + 2 ####################
	MOVL    a, y3;                       \ // y3 = a                                // MAJA
	RORXL   $25, e, y0;                  \ // y0 = e >> 25                          // S1A
	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
	;                                    \
	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
	RORXL   $11, e, y1;                  \ // y1 = e >> 11                          // S1B
	ORL     c, y3;                       \ // y3 = a|c                              // MAJA
	MOVL    f, y2;                       \ // y2 = f                                // CH
	XORL    g, y2;                       \ // y2 = f^g                              // CH
	;                                    \
	RORXL   $13, a, T1;                  \ // T1 = a >> 13                          // S0B
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                // S1
	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
	ANDL    e, y2;                       \ // y2 = (f^g)&e                          // CH
	;                                    \
	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)                         // S1
	VPXOR   XTMP3, XTMP2, XTMP2;         \
	ADDL    h, d;                        \ // d = k + w + h + d                     // --
	ANDL    b, y3;                       \ // y3 = (a|c)&b                          // MAJA
	;                                    \
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)       // S1
	RORXL   $22, a, y1;                  \ // y1 = a >> 22                          // S0A
	VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g                 // CH
	;                                    \
	VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4; \ // XTMP4 = s1 {00BA}
	;                                    \
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                // S0
	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)                         // S0
	VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
	;                                    \
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)       // S0
	MOVL    a, T1;                       \ // T1 = a                                // MAJB
	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
	VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
	;                                    \
	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	;                                    \
	ADDL    y3, h                          // h = t1 + S0 + MAJ                     // --

#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                    \ // #################### RND N + 3 ####################
	MOVL    a, y3;                       \ // y3 = a                                // MAJA
	RORXL   $25, e, y0;                  \ // y0 = e >> 25                          // S1A
	RORXL   $11, e, y1;                  \ // y1 = e >> 11                          // S1B
	ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
	ORL     c, y3;                       \ // y3 = a|c                              // MAJA
	;                                    \
	VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
	MOVL    f, y2;                       \ // y2 = f                                // CH
	RORXL   $13, a, T1;                  \ // T1 = a >> 13                          // S0B
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                // S1
	XORL    g, y2;                       \ // y2 = f^g                              // CH
	;                                    \
	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)                         // S1
	ANDL    e, y2;                       \ // y2 = (f^g)&e                          // CH
	ADDL    h, d;                        \ // d = k + w + h + d                     // --
	ANDL    b, y3;                       \ // y3 = (a|c)&b                          // MAJA
	;                                    \
	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)       // S1
	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g                 // CH
	;                                    \
	VPXOR   XTMP3, XTMP2, XTMP2;         \
	RORXL   $22, a, y1;                  \ // y1 = a >> 22                          // S0A
	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
	;                                    \
	VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                // S0
	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	;                                    \
	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)                         // S0
	;                                    \
	VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5; \ // XTMP5 = s1 {DC00}
	;                                    \
	VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)       // S0
	MOVL    a, T1;                       \ // T1 = a                                // MAJB
	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	;                                    \
	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL    y3, h                          // h = t1 + S0 + MAJ                     // --

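// Taken together, ROUND_AND_SCHED_N_0..3 perform four rounds and compute the
// next four message-schedule words; in Go terms (illustrative only):
//
//	for j := 0; j < 4; j++ {
//		w[t+j] = w[t+j-16] + sigma0(w[t+j-15]) + w[t+j-7] + sigma1(w[t+j-2])
//	}
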
#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // #################### RND N + 0 ####################
	MOVL  f, y2;                       \ // y2 = f                                // CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25                          // S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11                          // S1B
	XORL  g, y2;                       \ // y2 = f^g                              // CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                // S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)                         // S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e                          // CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)       // S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13                          // S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                 // CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22                          // S0A
	MOVL  a, y3;                       \ // y3 = a                                // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                // S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)                         // S0
	ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
	ORL   c, y3;                       \ // y3 = a|c                              // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)       // S0
	MOVL  a, T1;                       \ // T1 = a                                // MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b                          // MAJA
	ANDL  c, T1;                       \ // T1 = a&c                              // MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH                          // --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d                     // --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0                    // --
	ADDL  y2, d                          // d = k + w + h + d + S1 + CH = d + t1  // --

#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // #################### RND N + 1 ####################
	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL  f, y2;                       \ // y2 = f                                // CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25                          // S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11                          // S1B
	XORL  g, y2;                       \ // y2 = f^g                              // CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                // S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)                         // S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e                          // CH
	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                     // --
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)       // S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13                          // S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                 // CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22                          // S0A
	MOVL  a, y3;                       \ // y3 = a                                // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                // S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)                         // S0
	ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
	ORL   c, y3;                       \ // y3 = a|c                              // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)       // S0
	MOVL  a, T1;                       \ // T1 = a                                // MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b                          // MAJA
	ANDL  c, T1;                       \ // T1 = a&c                              // MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH                          // --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d                     // --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0                    // --
	;                                  \
	ADDL  y2, d                          // d = k + w + h + d + S1 + CH = d + t1  // --

#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // #################### RND N + 2 ####################
	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL  f, y2;                       \ // y2 = f                                // CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25                          // S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11                          // S1B
	XORL  g, y2;                       \ // y2 = f^g                              // CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                // S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)                         // S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e                          // CH
	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                     // --
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)       // S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13                          // S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                 // CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22                          // S0A
	MOVL  a, y3;                       \ // y3 = a                                // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                // S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)                         // S0
	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
	ORL   c, y3;                       \ // y3 = a|c                              // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)       // S0
	MOVL  a, T1;                       \ // T1 = a                                // MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b                          // MAJA
	ANDL  c, T1;                       \ // T1 = a&c                              // MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH                          // --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d                     // --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0                    // --
	;                                  \
	ADDL  y2, d                          // d = k + w + h + d + S1 + CH = d + t1  // --

#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // #################### RND N + 3 ####################
	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL  f, y2;                       \ // y2 = f                                // CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25                          // S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11                          // S1B
	XORL  g, y2;                       \ // y2 = f^g                              // CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                // S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)                         // S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e                          // CH
	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                     // --
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)       // S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13                          // S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                 // CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22                          // S0A
	MOVL  a, y3;                       \ // y3 = a                                // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                // S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)                         // S0
	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
	ORL   c, y3;                       \ // y3 = a|c                              // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)       // S0
	MOVL  a, T1;                       \ // T1 = a                                // MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b                          // MAJA
	ANDL  c, T1;                       \ // T1 = a&c                              // MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH                          // --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d                     // --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0                    // --
	;                                  \
	ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	;                                  \
	ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	;                                  \
	ADDL  y3, h                          // h = t1 + S0 + MAJ                     // --

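// Note the pipelining across DO_ROUND_N_* calls: N_1..N_3 begin by folding
// the previous round's S1+CH (y2) and MAJ (y3) terms into old_h, the
// register that held h one round earlier, overlapping the tail of one round
// with the head of the next. N_3 completes its own h in full.
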
TEXT ·block(SB), 0, $536-32
	CMPB ·useAVX2(SB), $1
	JE   avx2

	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX            // DX = len(p) rounded down to a multiple of 64

	LEAQ (SI)(DX*1), DI    // end-of-input pointer
	MOVQ DI, 256(SP)       // saved past the 256-byte W array at 0(SP)
	CMPQ SI, DI
	JEQ  end

	MOVQ dig+0(FP), BP
	MOVL (0*4)(BP), R8  // a = H0
	MOVL (1*4)(BP), R9  // b = H1
	MOVL (2*4)(BP), R10 // c = H2
	MOVL (3*4)(BP), R11 // d = H3
	MOVL (4*4)(BP), R12 // e = H4
	MOVL (5*4)(BP), R13 // f = H5
	MOVL (6*4)(BP), R14 // g = H6
	MOVL (7*4)(BP), R15 // h = H7

loop:
	MOVQ SP, BP

	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ dig+0(FP), BP
	ADDL (0*4)(BP), R8  // H0 = a + H0
	MOVL R8, (0*4)(BP)
	ADDL (1*4)(BP), R9  // H1 = b + H1
	MOVL R9, (1*4)(BP)
	ADDL (2*4)(BP), R10 // H2 = c + H2
	MOVL R10, (2*4)(BP)
	ADDL (3*4)(BP), R11 // H3 = d + H3
	MOVL R11, (3*4)(BP)
	ADDL (4*4)(BP), R12 // H4 = e + H4
	MOVL R12, (4*4)(BP)
	ADDL (5*4)(BP), R13 // H5 = f + H5
	MOVL R13, (5*4)(BP)
	ADDL (6*4)(BP), R14 // H6 = g + H6
	MOVL R14, (6*4)(BP)
	ADDL (7*4)(BP), R15 // H7 = h + H7
	MOVL R15, (7*4)(BP)

	ADDQ $64, SI
	CMPQ SI, 256(SP)
	JB   loop

end:
	RET

avx2:
	MOVQ dig+0(FP), CTX          // d.h[8]
	MOVQ p_base+8(FP), INP
	MOVQ p_len+16(FP), NUM_BYTES

	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ NUM_BYTES, _INP_END(SP)

	CMPQ NUM_BYTES, INP
	JE   avx2_only_one_block

	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

avx2_loop0: // each iteration reads two blocks (2 x 512 bits); the second is
	// processed later, in avx2_loop3, from the saved w+K values

	VMOVDQU (0*32)(INP), XTMP0
	VMOVDQU (1*32)(INP), XTMP1
	VMOVDQU (2*32)(INP), XTMP2
	VMOVDQU (3*32)(INP), XTMP3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
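	// (Annotation: after the transpose, the low 128-bit lane of each XDWORDn
	// holds four words of the first block and the high lane the matching
	// words of the second, so each 256-bit operation advances both message
	// schedules at once.)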

	MOVQ $K256<>(SB), TBL // load the address of the round-constant table

avx2_last_block_enter:
	ADDQ $64, INP
	MOVQ INP, _INP(SP)
	XORQ SRND, SRND

avx2_loop1: // for w0 - w47
	// Do 4 rounds and scheduling
	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $4*32, SRND
	CMPQ SRND, $3*4*32
	JB   avx2_loop1
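	// (SRND advances 4*32 = 128 bytes per iteration and the loop runs while
	// SRND < 384, i.e. three times, covering rounds 0-47.)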

avx2_loop2:
	// w48 - w63 processed with no scheduling (last 16 rounds)
	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)

	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND

	VMOVDQU XDWORD2, XDWORD0
	VMOVDQU XDWORD3, XDWORD1

	CMPQ SRND, $4*4*32
	JB   avx2_loop2

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP

	addm(  0(CTX), a)
	addm(  4(CTX), b)
	addm(  8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JB   done_hash

	XORQ SRND, SRND

avx2_loop3: // Do second block using previously scheduled results
	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)

	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND
	CMPQ SRND, $4*4*32
	JB   avx2_loop3
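	// (The "+ 16" displacement selects the upper half of each 32-byte XFER
	// slot, i.e. the second block's w+K values saved during the loops above.)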

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP
	ADDQ $64, INP

	addm(  0(CTX), a)
	addm(  4(CTX), b)
	addm(  8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JA   avx2_loop0
	JB   done_hash

avx2_do_last_block:

	VMOVDQU 0(INP), XWORD0
	VMOVDQU 16(INP), XWORD1
	VMOVDQU 32(INP), XWORD2
	VMOVDQU 48(INP), XWORD3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	MOVQ $K256<>(SB), TBL

	JMP avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32

// Round specific constants
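// Each group of four constants appears twice, once per 128-bit lane, so a
// single 256-bit VPADDD adds the same round constants to both interleaved
// blocks.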
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4

DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5

DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3

DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174

DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc

DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da

DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7

DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967

DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13

DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85

DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3

DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070

DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5

DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3

DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208

DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2

GLOBL K256<>(SB), (NOPTR + RODATA), $512
