You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			184 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			ArmAsm
		
	
			
		
		
	
	
			184 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			ArmAsm
		
	
//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Registers:
//
// Symbolic names for the general-purpose registers used by the macros and
// TEXT blocks in this file.
#define digest	R1 // digest pointer (loaded from d+0(FP) in writeBlocks)
#define h	R2 // return value
#define p	R3 // input pointer
#define n	R4 // input length
#define nblocks	R5 // n / 32
#define prime1	R7 // ·primes+0(SB)
#define prime2	R8 // ·primes+8(SB)
#define prime3	R9 // ·primes+16(SB)
#define prime4	R10 // ·primes+24(SB)
#define prime5	R11 // ·primes+32(SB)
#define v1	R12 // accumulator 1
#define v2	R13 // accumulator 2
#define v3	R14 // accumulator 3
#define v4	R15 // accumulator 4
#define x1	R20 // input word / scratch
#define x2	R21 // input word / scratch
#define x3	R22 // input word / scratch
#define x4	R23 // input word / scratch

// round mixes the 8-byte input word x into the accumulator acc:
//
//	acc = rotl(acc + x*prime2, 31) * prime1
//
// The rotate-left by 31 is written as a rotate-right by 64-31.
// Only acc is modified; x, prime1 and prime2 are read.
#define round(acc, x) \
	MADD prime2, acc, x, acc \
	ROR  $64-31, acc         \
	MUL  prime1, acc

// round0 performs the operation x = round(0, x), i.e.
//
//	x = rotl(x*prime2, 31) * prime1
//
// With a zero accumulator the multiply-add of round reduces to a plain
// multiply. x is overwritten in place.
#define round0(x) \
	MUL prime2, x \
	ROR $64-31, x \
	MUL prime1, x

// mergeRound folds the accumulator x into the running hash acc:
//
//	x   = round(0, x)
//	acc = (acc ^ x)*prime1 + prime4
//
// Both acc and x are overwritten.
#define mergeRound(acc, x) \
	round0(x)                     \
	EOR  x, acc                   \
	MADD acc, prime4, prime1, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that n >= 32.
//
// In:  p = input pointer, n = input length, v1-v4 = accumulators,
//      prime1 and prime2 loaded.
// Out: v1-v4 updated; p advanced by 32 bytes per block (post-increment
//      loads); n is left unchanged.
// Clobbers: nblocks, x1-x4.
//
// The loop body runs before the counter is tested, so entering with
// n < 32 (nblocks == 0) would wrap the counter instead of exiting; callers
// must uphold the n >= 32 precondition. The expansion defines the label
// "loop", whose head is aligned to 16 bytes by PCALIGN.
#define blockLoop() \
	LSR     $5, n, nblocks  \
	PCALIGN $16             \
	loop:                   \
	LDP.P   16(p), (x1, x2) \
	LDP.P   16(p), (x3, x4) \
	round(v1, x1)           \
	round(v2, x2)           \
	round(v3, x3)           \
	round(v4, x4)           \
	SUB     $1, nblocks     \
	CBNZ    nblocks, loop

// func Sum64(b []byte) uint64
//
// Computes the 64-bit hash of b in a single pass.
//
// Frame: no locals; 32 bytes of arguments and results
// (slice header at 0(FP)..23(FP), result at ret+24(FP)).
// Clobbers: h, p, n, nblocks, prime1-prime5, v1-v4, x1-x4 and the
// condition flags.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// p = data pointer of b, n = len(b): the first two words of the
	// slice header.
	LDP b_base+0(FP), (p, n)

	// Load the five multiplier constants from the ·primes table.
	LDP  ·primes+0(SB), (prime1, prime2)
	LDP  ·primes+16(SB), (prime3, prime4)
	MOVD ·primes+32(SB), prime5

	// Inputs shorter than one 32-byte block skip the block loop entirely.
	CMP  $32, n
	CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
	BLT  afterLoop

	// Initial accumulators:
	// v1 = prime1+prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	ADD  prime1, prime2, v1
	MOVD prime2, v2
	MOVD $0, v3
	NEG  prime1, v4

	blockLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	// (each rotate-left by k is written as a rotate-right by 64-k).
	ROR $64-1, v1, x1
	ROR $64-7, v2, x2
	ADD x1, x2
	ROR $64-12, v3, x3
	ROR $64-18, v4, x4
	ADD x3, x4
	ADD x2, x4, h

	// Fold each accumulator into h; v1-v4 are destroyed by this.
	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

afterLoop:
	// Mix in the total input length.
	ADD n, h

	// The remaining tail is n mod 32 bytes. Each bit of n from bit 4 down
	// to bit 0 says whether a 16-, 8-, 4-, 2- or 1-byte piece is present;
	// p is advanced past each piece as it is consumed.

	// Bit 4: two 8-byte words.
	TBZ   $4, n, try8
	LDP.P 16(p), (x1, x2)

	round0(x1)

	// NOTE: here and below, sequencing the EOR after the ROR (using a
	// rotated register) is worth a small but measurable speedup for small
	// inputs.
	//
	// Rotating h and the operand by the same amount and then XOR-ing is
	// the same as rotating (h ^ operand), so each ROR/EOR pair computes
	// h = rotl(h ^ x, k).
	ROR  $64-27, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h // h = h*prime1 + prime4

	round0(x2)
	ROR  $64-27, h
	EOR  x2 @> 64-27, h, h
	MADD h, prime4, prime1, h // h = h*prime1 + prime4

try8:
	// Bit 3: one 8-byte word, mixed exactly like the words above.
	TBZ    $3, n, try4
	MOVD.P 8(p), x1

	round0(x1)
	ROR  $64-27, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h // h = h*prime1 + prime4

try4:
	// Bit 2: one zero-extended 4-byte word:
	// h = rotl(h ^ (x*prime1), 23)*prime2 + prime3.
	TBZ     $2, n, try2
	MOVWU.P 4(p), x2

	MUL  prime1, x2
	ROR  $64-23, h
	EOR  x2 @> 64-23, h, h
	MADD h, prime3, prime2, h // h = h*prime2 + prime3

try2:
	// Bit 1: two single bytes fetched with one halfword load. The low
	// byte (x1) is mixed first, then the high byte (x2). For each byte:
	// h = rotl(h ^ (byte*prime5), 11) * prime1.
	TBZ     $1, n, try1
	MOVHU.P 2(p), x3
	AND     $255, x3, x1
	LSR     $8, x3, x2

	MUL prime5, x1
	ROR $64-11, h
	EOR x1 @> 64-11, h, h
	MUL prime1, h

	MUL prime5, x2
	ROR $64-11, h
	EOR x2 @> 64-11, h, h
	MUL prime1, h

try1:
	// Bit 0: one final byte, mixed like the bytes above.
	TBZ   $0, n, finalize
	MOVBU (p), x4

	MUL prime5, x4
	ROR $64-11, h
	EOR x4 @> 64-11, h, h
	MUL prime1, h

finalize:
	// Final mixing:
	// h ^= h>>33; h *= prime2; h ^= h>>29; h *= prime3; h ^= h>>32.
	EOR h >> 33, h
	MUL prime2, h
	EOR h >> 29, h
	MUL prime3, h
	EOR h >> 32, h

	MOVD h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
//
// Runs the block loop over b using the four accumulators stored at the
// start of *d, writes the updated accumulators back, and returns len(b)
// rounded down to a multiple of 32 (the number of bytes consumed).
//
// The loop comes from blockLoop, which assumes len(b) >= 32; no check is
// made here, so the caller has to guarantee it.
//
// Frame: no locals; 40 bytes of arguments and results
// (d at 0(FP), slice header at 8(FP)..31(FP), result at ret+32(FP)).
// Clobbers: digest, p, n, nblocks, prime1, prime2, v1-v4, x1-x4.
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Only prime1 and prime2 are needed by the round macro.
	LDP ·primes+0(SB), (prime1, prime2)

	// Load state. Assume v[1-4] are stored contiguously, starting at
	// offset 0 of *d.
	MOVD d+0(FP), digest
	LDP  0(digest), (v1, v2)
	LDP  16(digest), (v3, v4)

	// p = data pointer of b, n = len(b).
	LDP b_base+8(FP), (p, n)

	blockLoop()

	// Store updated state.
	STP (v1, v2), 0(digest)
	STP (v3, v4), 16(digest)

	// Clear the low five bits of n: the byte count covered by whole
	// 32-byte blocks.
	BIC  $31, n
	MOVD n, ret+32(FP)
	RET