|
- //go:build !appengine && gc && !purego
- // +build !appengine
- // +build gc
- // +build !purego
-
- #include "textflag.h"
-
- // Registers (h and d both alias AX; h is referenced only by Sum64 and d only by writeBlocks):
- #define h AX // running hash value in Sum64
- #define d AX // pointer argument whose first 32 bytes hold v1..v4 (writeBlocks)
- #define p SI // pointer to advance through b
- #define n DX // length of b in bytes
- #define end BX // loop end
- #define v1 R8 // accumulator 1 (stored at offset 0 of d in writeBlocks)
- #define v2 R9 // accumulator 2 (offset 8)
- #define v3 R10 // accumulator 3 (offset 16)
- #define v4 R11 // accumulator 4 (offset 24)
- #define x R12 // scratch register: current input word or temporary
- #define prime1 R13 // loaded from ·primes+0(SB) by both functions
- #define prime2 R14 // loaded from ·primes+8(SB) by both functions
- #define prime4 DI // loaded from ·primes+24(SB); only Sum64 loads it
-
- #define round(acc, x) \
- IMULQ prime2, x \
- ADDQ x, acc \
- ROLQ $31, acc \
- IMULQ prime1, acc // net effect: acc = rotl64(acc + x*prime2, 31) * prime1; x is left holding x*prime2; needs prime1 and prime2 loaded
-
- // round0 performs the operation x = round(0, x) in place; it assumes that prime1 and prime2 have been loaded.
- #define round0(x) \
- IMULQ prime2, x \
- ROLQ $31, x \
- IMULQ prime1, x // net effect: x = rotl64(x*prime2, 31) * prime1
-
- // mergeRound applies a merge round on the two registers acc and x: acc = (acc ^ round0(x)) * prime1 + prime4.
- // x is overwritten with round0(x). It assumes that prime1, prime2, and prime4 have been loaded.
- #define mergeRound(acc, x) \
- round0(x) \
- XORQ x, acc \
- IMULQ prime1, acc \
- ADDQ prime4, acc
-
- // blockLoop processes as many 32-byte blocks as possible, feeding the four 8-byte words of each block
- // into v1, v2, v3, and v4 via round. It assumes that there is at least one block to process (the body runs
- // before the first check), that end = base+len-32, and that prime1 and prime2 are loaded. x is clobbered; p ends just past the last block.
- #define blockLoop() \
- loop: \
- MOVQ +0(p), x \
- round(v1, x) \
- MOVQ +8(p), x \
- round(v2, x) \
- MOVQ +16(p), x \
- round(v3, x) \
- MOVQ +24(p), x \
- round(v4, x) \
- ADDQ $32, p \
- CMPQ p, end \
- JLE loop // signed test: repeat while p <= end, i.e. while a full 32-byte block remains
-
- // func Sum64(b []byte) uint64 -- hashes b in one pass: 32-byte blocks, then 8-byte words, one 4-byte word, single bytes, then a final mix.
- TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 // no local frame; b's base and length are read at 0 and 8(FP), result is stored at 24(FP)
- // Load fixed primes.
- MOVQ ·primes+0(SB), prime1
- MOVQ ·primes+8(SB), prime2
- MOVQ ·primes+24(SB), prime4
-
- // Load slice.
- MOVQ b_base+0(FP), p
- MOVQ b_len+8(FP), n
- LEAQ (p)(n*1), end // end = base + len
-
- // The first loop limit will be len(b)-32.
- SUBQ $32, end // end = base+len-32 (below p when len < 32)
-
- // Check whether we have at least one block.
- CMPQ n, $32
- JLT noBlocks
-
- // Set up initial state (v1, v2, v3, v4).
- MOVQ prime1, v1
- ADDQ prime2, v1 // v1 = prime1 + prime2
- MOVQ prime2, v2 // v2 = prime2
- XORQ v3, v3 // v3 = 0
- XORQ v4, v4
- SUBQ prime1, v4 // v4 = 0 - prime1
-
- blockLoop() // consumes every full 32-byte block; p is left at the first unprocessed byte
-
- MOVQ v1, h // combine the accumulators: h = rotl(v1,1) + rotl(v2,7) + rotl(v3,12) + rotl(v4,18)
- ROLQ $1, h
- MOVQ v2, x
- ROLQ $7, x
- ADDQ x, h
- MOVQ v3, x
- ROLQ $12, x
- ADDQ x, h
- MOVQ v4, x
- ROLQ $18, x
- ADDQ x, h
-
- mergeRound(h, v1) // fold each accumulator into h; mergeRound overwrites vN, which is not read again
- mergeRound(h, v2)
- mergeRound(h, v3)
- mergeRound(h, v4)
-
- JMP afterBlocks
-
- noBlocks:
- MOVQ ·primes+32(SB), h // fewer than 32 bytes: h starts from the constant at ·primes+32(SB)
-
- afterBlocks:
- ADDQ n, h // add the input length to h
-
- ADDQ $24, end // end = base+len-8: last position where a full 8-byte word starts
- CMPQ p, end
- JG try4 // fewer than 8 bytes left; NOTE(review): pointer compares here are signed -- end can sit below p for short inputs; confirm before changing to unsigned
-
- loop8:
- MOVQ (p), x
- ADDQ $8, p
- round0(x)
- XORQ x, h
- ROLQ $27, h
- IMULQ prime1, h
- ADDQ prime4, h // h = rotl(h ^ round0(word), 27) * prime1 + prime4
-
- CMPQ p, end
- JLE loop8 // repeat while another full 8-byte word remains
-
- try4:
- ADDQ $4, end // end = base+len-4: last position where a full 4-byte word starts
- CMPQ p, end
- JG try1 // fewer than 4 bytes left
-
- MOVL (p), x // 32-bit load
- ADDQ $4, p
- IMULQ prime1, x
- XORQ x, h
-
- ROLQ $23, h
- IMULQ prime2, h
- ADDQ ·primes+16(SB), h // h = rotl(h ^ word*prime1, 23) * prime2 + constant at ·primes+16(SB)
-
- try1:
- ADDQ $4, end // end = base+len: one past the last byte of b
- CMPQ p, end
- JGE finalize // no bytes left
-
- loop1:
- MOVBQZX (p), x // zero-extending byte load
- ADDQ $1, p
- IMULQ ·primes+32(SB), x
- XORQ x, h
- ROLQ $11, h
- IMULQ prime1, h // h = rotl(h ^ byte*(constant at ·primes+32(SB)), 11) * prime1
-
- CMPQ p, end
- JL loop1 // repeat until p reaches the end of b
-
- finalize:
- MOVQ h, x // final mix: h ^= h>>33; h *= prime2; h ^= h>>29; h *= constant at ·primes+16(SB); h ^= h>>32
- SHRQ $33, x
- XORQ x, h
- IMULQ prime2, h
- MOVQ h, x
- SHRQ $29, x
- XORQ x, h
- IMULQ ·primes+16(SB), h
- MOVQ h, x
- SHRQ $32, x
- XORQ x, h
-
- MOVQ h, ret+24(FP) // store the result
- RET
-
- // func writeBlocks(d *Digest, b []byte) int -- runs blockLoop over b on the four accumulators stored at d, writes them back, and returns the bytes consumed (a multiple of 32).
- TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 // no local frame; pointer is read at 0(FP), b's base and length at 8 and 16(FP), result is stored at 32(FP)
- // Load fixed primes needed for round.
- MOVQ ·primes+0(SB), prime1
- MOVQ ·primes+8(SB), prime2
-
- // Load slice.
- MOVQ b_base+8(FP), p
- MOVQ b_len+16(FP), n
- LEAQ (p)(n*1), end
- SUBQ $32, end // end = base+len-32: last position where a full block starts
-
- // Load vN from d.
- MOVQ s+0(FP), d // first argument; spelled s here, presumably because d is #defined to AX in this file -- TODO confirm against the Go declaration
- MOVQ 0(d), v1
- MOVQ 8(d), v2
- MOVQ 16(d), v3
- MOVQ 24(d), v4
-
- // We don't need to check the loop condition here; this function is
- // always called with at least one block of data to process.
- blockLoop() // advances p by 32 per block and updates v1..v4; clobbers x
-
- // Copy vN back to d.
- MOVQ v1, 0(d)
- MOVQ v2, 8(d)
- MOVQ v3, 16(d)
- MOVQ v4, 24(d)
-
- // The number of bytes written is p minus the old base pointer.
- SUBQ b_base+8(FP), p // the base is re-read from the argument slot; no register kept it
- MOVQ p, ret+32(FP)
-
- RET
|