
Text file src/crypto/sha512/sha512block_amd64.s

Documentation: crypto/sha512

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//	https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
//		T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//		T2 = BIGSIGMA0(a) + Maj(a,b,c)
//		h = g
//		g = f
//		f = e
//		e = d + T1
//		d = c
//		c = b
//		b = a
//		a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

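// In the scalar implementation below, the working variables a through h live
// in R8 through R15 (rotating through the round macros), SI points at the
// current message block, BP points at the 80-qword message schedule W on the
// stack, and AX, BX, CX, DX and DI are used as scratch registers.
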
// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVQ	(index*8)(SI), AX; \
	BSWAPQ	AX; \
	MOVQ	AX, (index*8)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//	 SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
//	 SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
#define MSGSCHEDULE1(index) \
	MOVQ	((index-2)*8)(BP), AX; \
	MOVQ	AX, CX; \
	RORQ	$19, AX; \
	MOVQ	CX, DX; \
	RORQ	$61, CX; \
	SHRQ	$6, DX; \
	MOVQ	((index-15)*8)(BP), BX; \
	XORQ	CX, AX; \
	MOVQ	BX, CX; \
	XORQ	DX, AX; \
	RORQ	$1, BX; \
	MOVQ	CX, DX; \
	SHRQ	$7, DX; \
	RORQ	$8, CX; \
	ADDQ	((index-7)*8)(BP), AX; \
	XORQ	CX, BX; \
	XORQ	DX, BX; \
	ADDQ	((index-16)*8)(BP), BX; \
	ADDQ	BX, AX; \
	MOVQ	AX, ((index)*8)(BP)

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//	 T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//		 BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
//		 Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA512T1(const, e, f, g, h) \
	MOVQ	$const, DX; \
	ADDQ	AX, h; \
	MOVQ	e, AX; \
	ADDQ	DX, h; \
	MOVQ	e, CX; \
	RORQ	$14, AX; \
	MOVQ	e, DX; \
	RORQ	$18, CX; \
	XORQ	CX, AX; \
	MOVQ	e, CX; \
	RORQ	$41, DX; \
	ANDQ	f, CX; \
	XORQ	AX, DX; \
	MOVQ	e, AX; \
	NOTQ	AX; \
	ADDQ	DX, h; \
	ANDQ	g, AX; \
	XORQ	CX, AX; \
	ADDQ	h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//	 T2 = BIGSIGMA0(a) + Maj(a, b, c)
//		 BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
//		 Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA512T2(a, b, c) \
	MOVQ	a, DI; \
	MOVQ	c, BX; \
	RORQ	$28, DI; \
	MOVQ	a, DX; \
	ANDQ	b, BX; \
	RORQ	$34, DX; \
	MOVQ	a, CX; \
	ANDQ	c, CX; \
	XORQ	DX, DI; \
	XORQ	CX, BX; \
	MOVQ	a, DX; \
	MOVQ	b, CX; \
	RORQ	$39, DX; \
	ANDQ	a, CX; \
	XORQ	CX, BX; \
	XORQ	DX, DI; \
	ADDQ	DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA512T1(const, e, f, g, h); \
	SHA512T2(a, b, c); \
	MOVQ	BX, h; \
	ADDQ	AX, d; \
	ADDQ	AX, h

#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

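// The 648-byte frame holds the 80-qword message schedule (640 bytes) followed
// by the end-of-data pointer at 640(SP).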
TEXT ·blockAMD64(SB),0,$648-32
	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
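	// Round the input length down to a multiple of the 128-byte block size.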
	SHRQ	$7, DX
	SHLQ	$7, DX

	LEAQ	(SI)(DX*1), DI
	MOVQ	DI, 640(SP)
	CMPQ	SI, DI
	JEQ	end

	MOVQ	dig+0(FP), BP
	MOVQ	(0*8)(BP), R8		// a = H0
	MOVQ	(1*8)(BP), R9		// b = H1
	MOVQ	(2*8)(BP), R10		// c = H2
	MOVQ	(3*8)(BP), R11		// d = H3
	MOVQ	(4*8)(BP), R12		// e = H4
	MOVQ	(5*8)(BP), R13		// f = H5
	MOVQ	(6*8)(BP), R14		// g = H6
	MOVQ	(7*8)(BP), R15		// h = H7

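// Each iteration of the loop compresses one 128-byte block into the digest.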
loop:
	MOVQ	SP, BP			// message schedule

	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ	dig+0(FP), BP
	ADDQ	(0*8)(BP), R8	// H0 = a + H0
	MOVQ	R8, (0*8)(BP)
	ADDQ	(1*8)(BP), R9	// H1 = b + H1
	MOVQ	R9, (1*8)(BP)
	ADDQ	(2*8)(BP), R10	// H2 = c + H2
	MOVQ	R10, (2*8)(BP)
	ADDQ	(3*8)(BP), R11	// H3 = d + H3
	MOVQ	R11, (3*8)(BP)
	ADDQ	(4*8)(BP), R12	// H4 = e + H4
	MOVQ	R12, (4*8)(BP)
	ADDQ	(5*8)(BP), R13	// H5 = f + H5
	MOVQ	R13, (5*8)(BP)
	ADDQ	(6*8)(BP), R14	// H6 = g + H6
	MOVQ	R14, (6*8)(BP)
	ADDQ	(7*8)(BP), R15	// H7 = h + H7
	MOVQ	R15, (7*8)(BP)

	ADDQ	$128, SI
	CMPQ	SI, 640(SP)
	JB	loop

end:
	RET

// The version below is based on the "Fast SHA512 Implementations on Intel
// Architecture Processors" white paper:
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
// AVX2 version by Intel, same algorithm in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S

// James Guilford <[email protected]>
// Kirk Yap <[email protected]>
// Tim Chen <[email protected]>
// David Cote <[email protected]>
// Aleksey Sidorov <[email protected]>

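// Stack frame layout for blockAVX2 (56 bytes): a 32-byte chunk of W+K values
// (YFER), the round-group counter (SRND), the current input pointer (INP) and
// the end-of-input pointer (INPEND).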
#define YFER_SIZE (4*8)
#define SRND_SIZE (1*8)
#define INP_SIZE (1*8)

#define frame_YFER (0)
#define frame_SRND (frame_YFER + YFER_SIZE)
#define frame_INP (frame_SRND + SRND_SIZE)
#define frame_INPEND (frame_INP + INP_SIZE)

#define addm(p1, p2) \
	ADDQ p1, p2; \
	MOVQ p2, p1

#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
	VMOVDQU p2, p1;		\
	VPSHUFB p3, p1, p1

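// MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL): YDST = low 256 bits of the 512-bit
// concatenation {YSRC1, YSRC2} shifted right by RVAL bytes (YSRC1 is the high
// half); used to read message-schedule windows that span two Y registers.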
#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
	VPALIGNR	 $RVAL, YSRC2, YDST, YDST

DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f

GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32

DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF

GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32

TEXT ·blockAVX2(SB), NOSPLIT, $56-32
	MOVQ dig+0(FP), SI
	MOVQ p_base+8(FP), DI
	MOVQ p_len+16(FP), DX

	SHRQ $7, DX
	SHLQ $7, DX

	JZ	 done_hash
	ADDQ DI, DX
	MOVQ DX, frame_INPEND(SP)

	MOVQ (0*8)(SI), AX	// a = H0
	MOVQ (1*8)(SI), BX	// b = H1
	MOVQ (2*8)(SI), CX	// c = H2
	MOVQ (3*8)(SI), R8	// d = H3
	MOVQ (4*8)(SI), DX	// e = H4
	MOVQ (5*8)(SI), R9	// f = H5
	MOVQ (6*8)(SI), R10	// g = H6
	MOVQ (7*8)(SI), R11	// h = H7

	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9

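// loop0: process one 128-byte message block per iteration.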
loop0:
	MOVQ $·_K+0(SB), BP

	// byte swap first 16 qwords
	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)

	MOVQ DI, frame_INP(SP)

	// schedule the remaining 64 message qwords, doing 4 iterations of 16 rounds each
	MOVQ $4, frame_SRND(SP)

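// loop1: 4 iterations of 16 rounds each (rounds 0-63); each iteration also
// computes the next 16 message schedule qwords in Y4-Y7.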
loop1:
	VPADDQ	(BP), Y4, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y7, Y6, 8)

	VPADDQ Y4, Y0, Y0

	MY_VPALIGNR(Y1, Y5, Y4, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR	 Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ	AX, DI
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	ADDQ	frame_YFER(SP), R11
	ORQ	 CX, DI
	MOVQ	R9, R15
	RORXQ $34, AX, R12

	XORQ	R14, R13
	XORQ	R10, R15
	RORXQ $14, DX, R14

	ANDQ	DX, R15
	XORQ	R14, R13
	RORXQ $39, AX, R14
	ADDQ	R11, R8

	ANDQ	BX, DI
	XORQ	R12, R14
	RORXQ $28, AX, R12

	XORQ R10, R15
	XORQ R12, R14
	MOVQ AX, R12
	ANDQ CX, R12

	ADDQ R13, R15
	ORQ	R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ R15, R11
	ADDQ DI, R11

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR	 Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y4

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y7, Y7, Y2
	VPSRLQ		 $6, Y2, Y8

	MOVQ	R11, DI
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	ADDQ	1*8+frame_YFER(SP), R10
	ORQ	 BX, DI

	MOVQ	DX, R15
	RORXQ $34, R11, R12
	XORQ	R14, R13
	XORQ	R9, R15

	RORXQ $14, R8, R14
	XORQ	R14, R13
	RORXQ $39, R11, R14
	ANDQ	R8, R15
	ADDQ	R10, CX

	ANDQ AX, DI
	XORQ R12, R14

	RORXQ $28, R11, R12
	XORQ	R9, R15

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ BX, R12
	ADDQ R13, R15

	ORQ	R12, DI
	ADDQ R14, R10

	ADDQ R15, CX
	ADDQ R15, R10
	ADDQ DI, R10

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ Y8, Y4, Y4

	VPSRLQ $6, Y4, Y8

	MOVQ	R10, DI
	RORXQ $41, CX, R13
	ADDQ	2*8+frame_YFER(SP), R9

	RORXQ $18, CX, R14
	ORQ	 AX, DI
	MOVQ	R8, R15
	XORQ	DX, R15

	RORXQ $34, R10, R12
	XORQ	R14, R13
	ANDQ	CX, R15

	RORXQ $14, CX, R14
	ADDQ	R9, BX
	ANDQ	R11, DI

	XORQ	R14, R13
	RORXQ $39, R10, R14
	XORQ	DX, R15

	XORQ	R12, R14
	RORXQ $28, R10, R12

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ AX, R12
	ADDQ R13, R15

	ORQ	R12, DI
	ADDQ R14, R9
	ADDQ R15, BX
	ADDQ R15, R9

	ADDQ DI, R9

	VPSRLQ $19, Y4, Y3
	VPSLLQ $(64-19), Y4, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ $61, Y4, Y3
	VPSLLQ $(64-61), Y4, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y4, Y4

	MOVQ	R9, DI
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	ADDQ	3*8+frame_YFER(SP), DX
	ORQ	 R11, DI

	MOVQ	CX, R15
	RORXQ $34, R9, R12
	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ $14, BX, R14
	ANDQ	BX, R15
	ADDQ	DX, AX
	ANDQ	R10, DI

	XORQ R14, R13
	XORQ R8, R15

	RORXQ $39, R9, R14
	ADDQ	R13, R15

	XORQ R12, R14
	ADDQ R15, AX

	RORXQ $28, R9, R12

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R11, R12
	ORQ	R12, DI

	ADDQ R14, DX
	ADDQ R15, DX
	ADDQ DI, DX

	VPADDQ	1*32(BP), Y5, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y4, Y7, 8)

	VPADDQ Y5, Y0, Y0

	MY_VPALIGNR(Y1, Y6, Y5, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR	 Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ	DX, DI
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	ADDQ	frame_YFER(SP), R8
	ORQ	 R10, DI
	MOVQ	BX, R15
	RORXQ $34, DX, R12

	XORQ	R14, R13
	XORQ	CX, R15
	RORXQ $14, AX, R14

	ANDQ	AX, R15
	XORQ	R14, R13
	RORXQ $39, DX, R14
	ADDQ	R8, R11

	ANDQ	R9, DI
	XORQ	R12, R14
	RORXQ $28, DX, R12

	XORQ CX, R15
	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R10, R12

	ADDQ R13, R15
	ORQ	R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ R15, R8
	ADDQ DI, R8

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR	 Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y5

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y4, Y4, Y2
	VPSRLQ		 $6, Y2, Y8

	MOVQ	R8, DI
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	ADDQ	1*8+frame_YFER(SP), CX
	ORQ	 R9, DI

	MOVQ	AX, R15
	RORXQ $34, R8, R12
	XORQ	R14, R13
	XORQ	BX, R15

	RORXQ $14, R11, R14
	XORQ	R14, R13
	RORXQ $39, R8, R14
	ANDQ	R11, R15
	ADDQ	CX, R10

	ANDQ DX, DI
	XORQ R12, R14

	RORXQ $28, R8, R12
	XORQ	BX, R15

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ R9, R12
	ADDQ R13, R15

	ORQ	R12, DI
	ADDQ R14, CX

	ADDQ R15, R10
	ADDQ R15, CX
	ADDQ DI, CX

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ Y8, Y5, Y5

	VPSRLQ $6, Y5, Y8

	MOVQ	CX, DI
	RORXQ $41, R10, R13
	ADDQ	2*8+frame_YFER(SP), BX

	RORXQ $18, R10, R14
	ORQ	 DX, DI
	MOVQ	R11, R15
	XORQ	AX, R15

	RORXQ $34, CX, R12
	XORQ	R14, R13
	ANDQ	R10, R15

	RORXQ $14, R10, R14
	ADDQ	BX, R9
	ANDQ	R8, DI

	XORQ	R14, R13
	RORXQ $39, CX, R14
	XORQ	AX, R15

	XORQ	R12, R14
	RORXQ $28, CX, R12

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ DX, R12
	ADDQ R13, R15

	ORQ	R12, DI
	ADDQ R14, BX
	ADDQ R15, R9
	ADDQ R15, BX

	ADDQ DI, BX

	VPSRLQ $19, Y5, Y3
	VPSLLQ $(64-19), Y5, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ $61, Y5, Y3
	VPSLLQ $(64-61), Y5, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y5, Y5

	MOVQ	BX, DI
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	ADDQ	3*8+frame_YFER(SP), AX
	ORQ	 R8, DI

	MOVQ	R10, R15
	RORXQ $34, BX, R12
	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ $14, R9, R14
	ANDQ	R9, R15
	ADDQ	AX, DX
	ANDQ	CX, DI

	XORQ R14, R13
	XORQ R11, R15

	RORXQ $39, BX, R14
	ADDQ	R13, R15

	XORQ R12, R14
	ADDQ R15, DX

	RORXQ $28, BX, R12

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ R8, R12
	ORQ	R12, DI

	ADDQ R14, AX
	ADDQ R15, AX
	ADDQ DI, AX

	VPADDQ	2*32(BP), Y6, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y5, Y4, 8)

	VPADDQ Y6, Y0, Y0

	MY_VPALIGNR(Y1, Y7, Y6, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR	 Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ	AX, DI
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	ADDQ	frame_YFER(SP), R11
	ORQ	 CX, DI
	MOVQ	R9, R15
	RORXQ $34, AX, R12

	XORQ	R14, R13
	XORQ	R10, R15
	RORXQ $14, DX, R14

	ANDQ	DX, R15
	XORQ	R14, R13
	RORXQ $39, AX, R14
	ADDQ	R11, R8

	ANDQ	BX, DI
	XORQ	R12, R14
	RORXQ $28, AX, R12

	XORQ R10, R15
	XORQ R12, R14
	MOVQ AX, R12
	ANDQ CX, R12

	ADDQ R13, R15
	ORQ	R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ R15, R11
	ADDQ DI, R11

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR	 Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y6

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y5, Y5, Y2
	VPSRLQ		 $6, Y2, Y8

	MOVQ	R11, DI
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	ADDQ	1*8+frame_YFER(SP), R10
	ORQ	 BX, DI

	MOVQ	DX, R15
	RORXQ $34, R11, R12
	XORQ	R14, R13
	XORQ	R9, R15

	RORXQ $14, R8, R14
	XORQ	R14, R13
	RORXQ $39, R11, R14
	ANDQ	R8, R15
	ADDQ	R10, CX

	ANDQ AX, DI
	XORQ R12, R14

	RORXQ $28, R11, R12
	XORQ	R9, R15

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ BX, R12
	ADDQ R13, R15

	ORQ	R12, DI
	ADDQ R14, R10

	ADDQ R15, CX
	ADDQ R15, R10
	ADDQ DI, R10

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ Y8, Y6, Y6

	VPSRLQ $6, Y6, Y8

	MOVQ	R10, DI
	RORXQ $41, CX, R13
	ADDQ	2*8+frame_YFER(SP), R9

	RORXQ $18, CX, R14
	ORQ	 AX, DI
	MOVQ	R8, R15
	XORQ	DX, R15

	RORXQ $34, R10, R12
	XORQ	R14, R13
	ANDQ	CX, R15

	RORXQ $14, CX, R14
	ADDQ	R9, BX
	ANDQ	R11, DI

	XORQ	R14, R13
	RORXQ $39, R10, R14
	XORQ	DX, R15

	XORQ	R12, R14
	RORXQ $28, R10, R12

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ AX, R12
	ADDQ R13, R15

	ORQ	R12, DI
	ADDQ R14, R9
	ADDQ R15, BX
	ADDQ R15, R9

	ADDQ DI, R9

	VPSRLQ $19, Y6, Y3
	VPSLLQ $(64-19), Y6, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ $61, Y6, Y3
	VPSLLQ $(64-61), Y6, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y6, Y6

	MOVQ	R9, DI
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	ADDQ	3*8+frame_YFER(SP), DX
	ORQ	 R11, DI

	MOVQ	CX, R15
	RORXQ $34, R9, R12
	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ $14, BX, R14
	ANDQ	BX, R15
	ADDQ	DX, AX
	ANDQ	R10, DI

	XORQ R14, R13
	XORQ R8, R15

	RORXQ $39, R9, R14
	ADDQ	R13, R15

	XORQ R12, R14
	ADDQ R15, AX

	RORXQ $28, R9, R12

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R11, R12
	ORQ	R12, DI

	ADDQ R14, DX
	ADDQ R15, DX
	ADDQ DI, DX

	VPADDQ	3*32(BP), Y7, Y0
	VMOVDQU Y0, frame_YFER(SP)
	ADDQ		$(4*32), BP

	MY_VPALIGNR(Y0, Y6, Y5, 8)

	VPADDQ Y7, Y0, Y0

	MY_VPALIGNR(Y1, Y4, Y7, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR	 Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ	DX, DI
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	ADDQ	frame_YFER(SP), R8
	ORQ	 R10, DI
	MOVQ	BX, R15
	RORXQ $34, DX, R12

	XORQ	R14, R13
	XORQ	CX, R15
	RORXQ $14, AX, R14

	ANDQ	AX, R15
	XORQ	R14, R13
	RORXQ $39, DX, R14
	ADDQ	R8, R11

	ANDQ	R9, DI
	XORQ	R12, R14
	RORXQ $28, DX, R12

	XORQ CX, R15
	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R10, R12

	ADDQ R13, R15
	ORQ	R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ R15, R8
	ADDQ DI, R8

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR	 Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y7

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y6, Y6, Y2
	VPSRLQ		 $6, Y2, Y8

	MOVQ	R8, DI
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	ADDQ	1*8+frame_YFER(SP), CX
	ORQ	 R9, DI

	MOVQ	AX, R15
	RORXQ $34, R8, R12
	XORQ	R14, R13
	XORQ	BX, R15

	RORXQ $14, R11, R14
	XORQ	R14, R13
	RORXQ $39, R8, R14
	ANDQ	R11, R15
	ADDQ	CX, R10

	ANDQ DX, DI
	XORQ R12, R14

	RORXQ $28, R8, R12
	XORQ	BX, R15

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ R9, R12
	ADDQ R13, R15

	ORQ	R12, DI
	ADDQ R14, CX

	ADDQ R15, R10
	ADDQ R15, CX
	ADDQ DI, CX

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ Y8, Y7, Y7

	VPSRLQ $6, Y7, Y8

	MOVQ	CX, DI
	RORXQ $41, R10, R13
	ADDQ	2*8+frame_YFER(SP), BX

	RORXQ $18, R10, R14
	ORQ	 DX, DI
	MOVQ	R11, R15
	XORQ	AX, R15

	RORXQ $34, CX, R12
	XORQ	R14, R13
	ANDQ	R10, R15

	RORXQ $14, R10, R14
	ADDQ	BX, R9
	ANDQ	R8, DI

	XORQ	R14, R13
	RORXQ $39, CX, R14
	XORQ	AX, R15

	XORQ	R12, R14
	RORXQ $28, CX, R12

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ DX, R12
	ADDQ R13, R15

	ORQ	R12, DI
	ADDQ R14, BX
	ADDQ R15, R9
	ADDQ R15, BX

	ADDQ DI, BX

	VPSRLQ $19, Y7, Y3
	VPSLLQ $(64-19), Y7, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ $61, Y7, Y3
	VPSLLQ $(64-61), Y7, Y1
	VPOR	 Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y7, Y7

	MOVQ	BX, DI
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	ADDQ	3*8+frame_YFER(SP), AX
	ORQ	 R8, DI

	MOVQ	R10, R15
	RORXQ $34, BX, R12
	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ $14, R9, R14
	ANDQ	R9, R15
	ADDQ	AX, DX
	ANDQ	CX, DI

	XORQ R14, R13
	XORQ R11, R15

	RORXQ $39, BX, R14
	ADDQ	R13, R15

	XORQ R12, R14
	ADDQ R15, DX

	RORXQ $28, BX, R12

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ R8, R12
	ORQ	R12, DI

	ADDQ R14, AX
	ADDQ R15, AX
	ADDQ DI, AX

	SUBQ $1, frame_SRND(SP)
	JNE	loop1

	MOVQ $2, frame_SRND(SP)

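// loop2: the final 16 rounds (64-79); no new message schedule qwords are
// needed, only the W+K additions remain.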
loop2:
	VPADDQ	(BP), Y4, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MOVQ	R9, R15
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	XORQ	R10, R15

	XORQ	R14, R13
	RORXQ $14, DX, R14
	ANDQ	DX, R15

	XORQ	R14, R13
	RORXQ $34, AX, R12
	XORQ	R10, R15
	RORXQ $39, AX, R14
	MOVQ	AX, DI

	XORQ	R12, R14
	RORXQ $28, AX, R12
	ADDQ	frame_YFER(SP), R11
	ORQ	 CX, DI

	XORQ R12, R14
	MOVQ AX, R12
	ANDQ BX, DI
	ANDQ CX, R12
	ADDQ R13, R15

	ADDQ R11, R8
	ORQ	R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ	R15, R11
	MOVQ	DX, R15
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	XORQ	R9, R15

	XORQ	R14, R13
	RORXQ $14, R8, R14
	ANDQ	R8, R15
	ADDQ	DI, R11

	XORQ	R14, R13
	RORXQ $34, R11, R12
	XORQ	R9, R15
	RORXQ $39, R11, R14
	MOVQ	R11, DI

	XORQ	R12, R14
	RORXQ $28, R11, R12
	ADDQ	8*1+frame_YFER(SP), R10
	ORQ	 BX, DI

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ AX, DI
	ANDQ BX, R12
	ADDQ R13, R15

	ADDQ R10, CX
	ORQ	R12, DI
	ADDQ R14, R10

	ADDQ R15, CX

	ADDQ	R15, R10
	MOVQ	R8, R15
	RORXQ $41, CX, R13
	RORXQ $18, CX, R14
	XORQ	DX, R15

	XORQ	R14, R13
	RORXQ $14, CX, R14
	ANDQ	CX, R15
	ADDQ	DI, R10

	XORQ	R14, R13
	RORXQ $34, R10, R12
	XORQ	DX, R15
	RORXQ $39, R10, R14
	MOVQ	R10, DI

	XORQ	R12, R14
	RORXQ $28, R10, R12
	ADDQ	8*2+frame_YFER(SP), R9
	ORQ	 AX, DI

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ R11, DI
	ANDQ AX, R12
	ADDQ R13, R15

	ADDQ R9, BX
	ORQ	R12, DI
	ADDQ R14, R9

	ADDQ R15, BX

	ADDQ	R15, R9
	MOVQ	CX, R15
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	XORQ	R8, R15

	XORQ	R14, R13
	RORXQ $14, BX, R14
	ANDQ	BX, R15
	ADDQ	DI, R9

	XORQ	R14, R13
	RORXQ $34, R9, R12
	XORQ	R8, R15
	RORXQ $39, R9, R14
	MOVQ	R9, DI

	XORQ	R12, R14
	RORXQ $28, R9, R12
	ADDQ	8*3+frame_YFER(SP), DX
	ORQ	 R11, DI

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R10, DI
	ANDQ R11, R12
	ADDQ R13, R15

	ADDQ DX, AX
	ORQ	R12, DI
	ADDQ R14, DX

	ADDQ R15, AX

	ADDQ R15, DX

	ADDQ DI, DX

	VPADDQ	1*32(BP), Y5, Y0
	VMOVDQU Y0, frame_YFER(SP)
	ADDQ		$(2*32), BP

	MOVQ	BX, R15
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	XORQ	CX, R15

	XORQ	R14, R13
	RORXQ $14, AX, R14
	ANDQ	AX, R15

	XORQ	R14, R13
	RORXQ $34, DX, R12
	XORQ	CX, R15
	RORXQ $39, DX, R14
	MOVQ	DX, DI

	XORQ	R12, R14
	RORXQ $28, DX, R12
	ADDQ	frame_YFER(SP), R8
	ORQ	 R10, DI

	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R9, DI
	ANDQ R10, R12
	ADDQ R13, R15

	ADDQ R8, R11
	ORQ	R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ	R15, R8
	MOVQ	AX, R15
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	XORQ	BX, R15

	XORQ	R14, R13
	RORXQ $14, R11, R14
	ANDQ	R11, R15
	ADDQ	DI, R8

	XORQ	R14, R13
	RORXQ $34, R8, R12
	XORQ	BX, R15
	RORXQ $39, R8, R14
	MOVQ	R8, DI

	XORQ	R12, R14
	RORXQ $28, R8, R12
	ADDQ	8*1+frame_YFER(SP), CX
	ORQ	 R9, DI

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ DX, DI
	ANDQ R9, R12
	ADDQ R13, R15

	ADDQ CX, R10
	ORQ	R12, DI
	ADDQ R14, CX

	ADDQ R15, R10

	ADDQ	R15, CX
	MOVQ	R11, R15
	RORXQ $41, R10, R13
	RORXQ $18, R10, R14
	XORQ	AX, R15

	XORQ	R14, R13
	RORXQ $14, R10, R14
	ANDQ	R10, R15
	ADDQ	DI, CX

	XORQ	R14, R13
	RORXQ $34, CX, R12
	XORQ	AX, R15
	RORXQ $39, CX, R14
	MOVQ	CX, DI

	XORQ	R12, R14
	RORXQ $28, CX, R12
	ADDQ	8*2+frame_YFER(SP), BX
	ORQ	 DX, DI

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ R8, DI
	ANDQ DX, R12
	ADDQ R13, R15

	ADDQ BX, R9
	ORQ	R12, DI
	ADDQ R14, BX

	ADDQ R15, R9

	ADDQ	R15, BX
	MOVQ	R10, R15
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	XORQ	R11, R15

	XORQ	R14, R13
	RORXQ $14, R9, R14
	ANDQ	R9, R15
	ADDQ	DI, BX

	XORQ	R14, R13
	RORXQ $34, BX, R12
	XORQ	R11, R15
	RORXQ $39, BX, R14
	MOVQ	BX, DI

	XORQ	R12, R14
	RORXQ $28, BX, R12
	ADDQ	8*3+frame_YFER(SP), AX
	ORQ	 R8, DI

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ CX, DI
	ANDQ R8, R12
	ADDQ R13, R15

	ADDQ AX, DX
	ORQ	R12, DI
	ADDQ R14, AX

	ADDQ R15, DX

	ADDQ R15, AX

	ADDQ DI, AX

	VMOVDQU Y6, Y4
	VMOVDQU Y7, Y5

	SUBQ $1, frame_SRND(SP)
	JNE	loop2

	addm(8*0(SI),AX)
	addm(8*1(SI),BX)
	addm(8*2(SI),CX)
	addm(8*3(SI),R8)
	addm(8*4(SI),DX)
	addm(8*5(SI),R9)
	addm(8*6(SI),R10)
	addm(8*7(SI),R11)

	MOVQ frame_INP(SP), DI
	ADDQ $128, DI
	CMPQ DI, frame_INPEND(SP)
	JNE	loop0

done_hash:
	VZEROUPPER
	RET
