
Text file src/hash/crc32/crc32_amd64.s

Documentation: hash/crc32

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
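//
// The routine consumes up to 7 leading bytes to reach 8-byte alignment,
// runs CRC32Q over aligned 8-byte chunks, then finishes the 0-7 byte
// tail with 4-, 2-, and 1-byte steps.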
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX	// CRC value
	MOVQ p+8(FP), SI	// data pointer
	MOVQ p_len+16(FP), CX	// len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	SUBQ $1, BX
	XORQ $7, BX
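	// For 1 <= BX <= 7, (BX-1)^7 flips the low three bits, giving
	// 7-(BX-1) = 8-BX. Example: BX=3 yields (3-1)^7 = 2^7 = 5 = 8-3.
	// Each set bit of this count then selects one of the steps below:
	// bit 0 -> 1 byte, bit 1 -> 2 bytes, bit 2 -> 4 bytes.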

	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	BTQ $1, BX
	JNC align_4

	CRC32W (SI), AX

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	CRC32L (SI), AX

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	BTQ $2, CX
	JNC less_than_4

	CRC32L (SI), AX
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	CRC32W (SI), AX
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET
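
// A minimal sketch of how the Go side can invoke this routine (the
// wrapper name is illustrative; the real dispatch lives in
// crc32_amd64.go). The portable code keeps the CRC state inverted,
// while this routine works on the raw value, so the wrapper inverts
// around the call:
//
//	func updateCastagnoli(crc uint32, p []byte) uint32 {
//		return ^castagnoliSSE42(^crc, p)
//	}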

// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// func castagnoliSSE42Triple(
//	crcA, crcB, crcC uint32,
//	a, b, c []byte,
//	rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
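//
// Three independent CRC chains are interleaved so the CRC32
// instruction's multi-cycle latency is hidden: each chain depends only
// on its own previous value, so one CRC32Q per chain can be in flight
// at once.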
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8	// data pointer
	MOVQ b+40(FP), R9	// data pointer
	MOVQ c+64(FP), R10	// data pointer

	MOVL rounds+88(FP), R11

loop:
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET
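
// The three results are CRCs of three disjoint buffers; the caller is
// responsible for combining them. CRCs of concatenated data cannot
// simply be XORed together, so the Go side recombines the partial
// results with precomputed shift tables (see crc32_amd64.go).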

// CRC32 polynomial data
//
// These constants are lifted from the Linux kernel, since they avoid
// the costly PSHUFB 16-byte reversal proposed in the original Intel
// paper.
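//
// In the paper's notation (bit-reflected, with the leading coefficient
// kept): r2r1 holds the 64-byte fold constants x^(4*128+32) mod P(x)
// and x^(4*128-32) mod P(x), r4r3 the 16-byte fold constants
// x^(128+32) mod P(x) and x^(128-32) mod P(x), r5 is x^64 mod P(x),
// and rupoly holds the reflected polynomial P' (low qword) and the
// Barrett constant u' (high qword) for the final reduction.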
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8

// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.

// func ieeeCLMUL(crc uint32, p []byte) uint32
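//
// Overview: the input is kept as four 16-byte lanes that are each
// folded forward 64 bytes per iteration with PCLMULQDQ; the four lanes
// are then folded into one, any remaining 16-byte blocks are folded
// in, and the final 128-bit remainder is reduced to 32 bits with a
// Barrett reduction.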
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL crc+0(FP), X0	// Initial CRC value
	MOVQ p+8(FP), SI	// data pointer
	MOVQ p_len+16(FP), CX	// len(p)

	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR X0, X1
	ADDQ $64, SI	// buf+=64
	SUBQ $64, CX	// len-=64
	CMPQ CX, $64	// Fewer than 64 bytes left?
	JB remain64

	MOVOA r2r1<>+0(SB), X0
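
	// Each iteration folds every lane 64 bytes forward:
	//   lane = (lane.lo x r2r1.lo) xor (lane.hi x r2r1.hi) xor next16,
	// using the $0 (low halves) and $0x11 (high halves) forms of
	// PCLMULQDQ.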
loopback64:
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	/* Load next early */
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI	// buf+=64
	SUBQ $64, CX	// len-=64
	CMPQ CX, $64	// Fewer than 64 bytes left?
	JGE loopback64

	/* Fold result into a single register (X1) */
remain64:
	MOVOA r4r3<>+0(SB), X0

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X2, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X3, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X4, X1

	/* If there are fewer than 16 bytes left, we are done */
	CMPQ CX, $16
	JB finish

	/* Fold in remaining 16-byte blocks */
remain16:
	MOVOU (SI), X10
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X10, X1
	SUBQ $16, CX
	ADDQ $16, SI
	CMPQ CX, $16
	JGE remain16

finish:
	/* Fold final result into 32 bits and return it */
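	// The low qword is folded by r4 and XORed into the shifted-down
	// high qword, leaving at most 96 bits; the low 32 bits are then
	// folded by r5 (x^64 mod P(x)); finally a Barrett reduction with
	// u' and P' from rupoly yields the 32-bit CRC in dword 1 of X1.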
	PCMPEQB X3, X3
	PCLMULQDQ $1, X1, X0
	PSRLDQ $8, X1
	PXOR X0, X1

	MOVOA X1, X2
	MOVQ r5<>+0(SB), X0

	/* Creates 32-bit mask. Note that we don't care about the upper half. */
	PSRLQ $32, X3

	PSRLDQ $4, X2
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	MOVOA rupoly<>+0(SB), X0

	MOVOA X1, X2
	PAND X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	PEXTRD $1, X1, AX
	MOVL AX, ret+32(FP)

	RET
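
// A hedged sketch of a Go-side wrapper honoring the preconditions
// above (names are illustrative; the real dispatch lives in
// crc32_amd64.go): the bulk of the buffer, rounded down to a multiple
// of 16 and only when at least 64 bytes are available, goes through
// ieeeCLMUL with the CRC inverted around the call, and any tail falls
// back to table-driven code:
//
//	func updateIEEE(crc uint32, p []byte) uint32 {
//		if len(p) >= 64 {
//			left := len(p) & 15
//			do := len(p) - left
//			crc = ^ieeeCLMUL(^crc, p[:do])
//			p = p[do:]
//		}
//		// updateIEEETable is a hypothetical table fallback
//		// for the remaining 0-15 bytes.
//		return updateIEEETable(crc, p)
//	}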
