You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			210 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			ArmAsm
		
	
			
		
		
	
	
			210 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			ArmAsm
		
	
//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Registers:
//
// Symbolic names for the amd64 registers used by the macros and functions in
// this file. h and d both name AX: Sum64 uses AX as the running hash value,
// while writeBlocks uses AX as the pointer to its first argument, so the two
// names are never live in the same function.
#define h      AX // hash accumulator (Sum64)
#define d      AX // pointer to the state being updated (writeBlocks)
#define p      SI // pointer to advance through b
#define n      DX // len(b)
#define end    BX // loop end
#define v1     R8  // lane accumulator 1
#define v2     R9  // lane accumulator 2
#define v3     R10 // lane accumulator 3
#define v4     R11 // lane accumulator 4
#define x      R12 // scratch register holding the current input word
#define prime1 R13 // loaded from ·primes+0(SB)
#define prime2 R14 // loaded from ·primes+8(SB)
#define prime4 DI  // loaded from ·primes+24(SB)

// round mixes the input word x into the accumulator acc:
//
//	x   *= prime2
//	acc += x
//	acc  = rotl(acc, 31)
//	acc *= prime1
//
// It assumes that prime1 and prime2 have been loaded. x is clobbered.
#define round(acc, x) \
	IMULQ prime2, x   \
	ADDQ  x, acc      \
	ROLQ  $31, acc    \
	IMULQ prime1, acc

// round0 performs the operation x = round(0, x), i.e.
//
//	x *= prime2
//	x  = rotl(x, 31)
//	x *= prime1
//
// The result is left in x itself. It assumes that prime1 and prime2 have
// been loaded.
#define round0(x) \
	IMULQ prime2, x \
	ROLQ  $31, x    \
	IMULQ prime1, x

// mergeRound applies a merge round on the two registers acc and x:
//
//	x    = round0(x)
//	acc ^= x
//	acc  = acc*prime1 + prime4
//
// It assumes that prime1, prime2, and prime4 have been loaded.
// x is clobbered (it holds round0 of its old value afterwards).
#define mergeRound(acc, x) \
	round0(x)         \
	XORQ  x, acc      \
	IMULQ prime1, acc \
	ADDQ  prime4, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
//
// Each iteration reads four 8-byte words at p+0, p+8, p+16 and p+24, feeds
// them through round into v1..v4 respectively, and advances p by 32. The
// loop repeats while p <= end, so the caller must set end to the address of
// the last position at which a full block may start (base + len - 32).
//
// Requires prime1 and prime2 to be loaded. Clobbers x and the flags, and
// leaves p pointing just past the last block consumed. The expansion
// defines a label named "loop", so the enclosing function must not define
// another label with that name.
#define blockLoop() \
loop:  \
	MOVQ +0(p), x  \
	round(v1, x)   \
	MOVQ +8(p), x  \
	round(v2, x)   \
	MOVQ +16(p), x \
	round(v3, x)   \
	MOVQ +24(p), x \
	round(v4, x)   \
	ADDQ $32, p    \
	CMPQ p, end    \
	JLE  loop

// func Sum64(b []byte) uint64
//
// Sum64 hashes the whole slice b in one call and returns the 64-bit result.
//
// Frame layout ($0-32, no local stack):
//
//	b_base+0(FP)  pointer to the first byte of b
//	b_len+8(FP)   len(b)
//	ret+24(FP)    result
//
// Phases:
//
//  1. If len(b) >= 32, consume 32-byte blocks with blockLoop and fold the
//     four lane accumulators into h; otherwise seed h from ·primes+32(SB).
//  2. Add len(b) to h.
//  3. Consume the remaining input 8 bytes at a time, then at most one
//     4-byte word, then single bytes.
//  4. Apply the final shift/xor/multiply mixing and store the result.
//
// Throughout, end is a moving limit: it starts at base+len-32 and is bumped
// to base+len-8, base+len-4 and finally base+len as the chunk size shrinks.
//
// ·primes+16(SB) and ·primes+32(SB) are used directly as memory operands
// (presumably the third and fifth primes; the table itself is defined
// outside this file).
//
// NOTE(review): the pointer comparisons below use signed conditions
// (JLE/JG/JGE/JL), which relies on the addresses involved being
// non-negative when interpreted as signed 64-bit values.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// Load fixed primes.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2
	MOVQ ·primes+24(SB), prime4

	// Load slice.
	MOVQ b_base+0(FP), p
	MOVQ b_len+8(FP), n
	LEAQ (p)(n*1), end // end = base + len(b)

	// The first loop limit will be len(b)-32.
	SUBQ $32, end

	// Check whether we have at least one block.
	CMPQ n, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	// v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	MOVQ prime1, v1
	ADDQ prime2, v1
	MOVQ prime2, v2
	XORQ v3, v3
	XORQ v4, v4
	SUBQ prime1, v4

	blockLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
	// v2..v4 are rotated in the scratch register so that their original
	// values are still available for the merge rounds below.
	MOVQ v1, h
	ROLQ $1, h
	MOVQ v2, x
	ROLQ $7, x
	ADDQ x, h
	MOVQ v3, x
	ROLQ $12, x
	ADDQ x, h
	MOVQ v4, x
	ROLQ $18, x
	ADDQ x, h

	// Fold each lane accumulator into h. This clobbers v1..v4.
	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes of input: start from the constant at ·primes+32.
	MOVQ ·primes+32(SB), h

afterBlocks:
	// Mix in the total input length.
	ADDQ n, h

	// end = base + len(b) - 8: the last address where an 8-byte word fits.
	ADDQ $24, end
	CMPQ p, end
	JG   try4

loop8:
	// h = rotl(h ^ round0(word), 27)*prime1 + prime4, for each 8-byte word.
	MOVQ  (p), x
	ADDQ  $8, p
	round0(x)
	XORQ  x, h
	ROLQ  $27, h
	IMULQ prime1, h
	ADDQ  prime4, h

	CMPQ p, end
	JLE  loop8

try4:
	// end = base + len(b) - 4: the last address where a 4-byte word fits.
	ADDQ $4, end
	CMPQ p, end
	JG   try1

	// At most one 4-byte word remains. MOVL zero-extends it into x.
	// h = rotl(h ^ (word*prime1), 23)*prime2 + primes[+16].
	MOVL  (p), x
	ADDQ  $4, p
	IMULQ prime1, x
	XORQ  x, h

	ROLQ  $23, h
	IMULQ prime2, h
	ADDQ  ·primes+16(SB), h

try1:
	// end = base + len(b): one past the last input byte.
	ADDQ $4, end
	CMPQ p, end
	JGE  finalize

loop1:
	// h = rotl(h ^ (byte*primes[+32]), 11)*prime1, for each trailing byte.
	MOVBQZX (p), x
	ADDQ    $1, p
	IMULQ   ·primes+32(SB), x
	XORQ    x, h
	ROLQ    $11, h
	IMULQ   prime1, h

	CMPQ p, end
	JL   loop1

finalize:
	// Final mixing:
	//	h ^= h >> 33; h *= prime2
	//	h ^= h >> 29; h *= primes[+16]
	//	h ^= h >> 32
	MOVQ  h, x
	SHRQ  $33, x
	XORQ  x, h
	IMULQ prime2, h
	MOVQ  h, x
	SHRQ  $29, x
	XORQ  x, h
	IMULQ ·primes+16(SB), h
	MOVQ  h, x
	SHRQ  $32, x
	XORQ  x, h

	MOVQ h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs blockLoop over b, starting from the four 8-byte
// accumulators stored at offsets 0, 8, 16 and 24 of *d, writes the updated
// accumulators back to the same offsets, and returns the number of bytes of
// b that were consumed (a multiple of 32).
//
// Frame layout ($0-40, no local stack):
//
//	+0(FP)        d
//	b_base+8(FP)  pointer to the first byte of b
//	b_len+16(FP)  len(b)
//	ret+32(FP)    result
//
// The caller must pass at least 32 bytes; the loop condition is not checked
// before the first iteration.
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Load fixed primes needed for round.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2

	// Load slice.
	MOVQ b_base+8(FP), p
	MOVQ b_len+16(FP), n
	LEAQ (p)(n*1), end
	SUBQ $32, end // end = base + len(b) - 32, the limit blockLoop expects

	// Load vN from d.
	// The argument is spelled s+0(FP) rather than d+0(FP): d is #defined to
	// AX in this file, so writing d+0(FP) would be macro-expanded.
	MOVQ s+0(FP), d
	MOVQ 0(d), v1
	MOVQ 8(d), v2
	MOVQ 16(d), v3
	MOVQ 24(d), v4

	// We don't need to check the loop condition here; this function is
	// always called with at least one block of data to process.
	blockLoop()

	// Copy vN back to d.
	MOVQ v1, 0(d)
	MOVQ v2, 8(d)
	MOVQ v3, 16(d)
	MOVQ v4, 24(d)

	// The number of bytes written is p minus the old base pointer.
	SUBQ b_base+8(FP), p
	MOVQ p, ret+32(FP)

	RET