@@ -3,8 +3,7 @@ | |||||
[![Go Reference](https://pkg.go.dev/badge/github.com/cespare/xxhash/v2.svg)](https://pkg.go.dev/github.com/cespare/xxhash/v2) | [![Go Reference](https://pkg.go.dev/badge/github.com/cespare/xxhash/v2.svg)](https://pkg.go.dev/github.com/cespare/xxhash/v2) | ||||
[![Test](https://github.com/cespare/xxhash/actions/workflows/test.yml/badge.svg)](https://github.com/cespare/xxhash/actions/workflows/test.yml) | [![Test](https://github.com/cespare/xxhash/actions/workflows/test.yml/badge.svg)](https://github.com/cespare/xxhash/actions/workflows/test.yml) | ||||
xxhash is a Go implementation of the 64-bit | |||||
[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a | |||||
xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a | |||||
high-quality hashing algorithm that is much faster than anything in the Go | high-quality hashing algorithm that is much faster than anything in the Go | ||||
standard library. | standard library. | ||||
@@ -25,8 +24,11 @@ func (*Digest) WriteString(string) (int, error) | |||||
func (*Digest) Sum64() uint64 | func (*Digest) Sum64() uint64 | ||||
``` | ``` | ||||
This implementation provides a fast pure-Go implementation and an even faster | |||||
assembly implementation for amd64. | |||||
The package is written in optimized pure Go and also contains even faster | |||||
assembly implementations for amd64 and arm64. If desired, the `purego` build tag | |||||
opts into using the Go code even on those architectures. | |||||
[xxHash]: http://cyan4973.github.io/xxHash/ | |||||
## Compatibility | ## Compatibility | ||||
@@ -45,19 +47,20 @@ I recommend using the latest release of Go. | |||||
Here are some quick benchmarks comparing the pure-Go and assembly | Here are some quick benchmarks comparing the pure-Go and assembly | ||||
implementations of Sum64. | implementations of Sum64. | ||||
| input size | purego | asm | | |||||
| --- | --- | --- | | |||||
| 5 B | 979.66 MB/s | 1291.17 MB/s | | |||||
| 100 B | 7475.26 MB/s | 7973.40 MB/s | | |||||
| 4 KB | 17573.46 MB/s | 17602.65 MB/s | | |||||
| 10 MB | 17131.46 MB/s | 17142.16 MB/s | | |||||
| input size | purego | asm | | |||||
| ---------- | --------- | --------- | | |||||
| 4 B | 1.3 GB/s | 1.2 GB/s | | |||||
| 16 B | 2.9 GB/s | 3.5 GB/s | | |||||
| 100 B | 6.9 GB/s | 8.1 GB/s | | |||||
| 4 KB | 11.7 GB/s | 16.7 GB/s | | |||||
| 10 MB | 12.0 GB/s | 17.3 GB/s | | |||||
These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using | |||||
the following commands under Go 1.11.2: | |||||
These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C | |||||
CPU using the following commands under Go 1.19.2: | |||||
``` | ``` | ||||
$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes' | |||||
$ go test -benchtime 10s -bench '/xxhash,direct,bytes' | |||||
benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$') | |||||
benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$') | |||||
``` | ``` | ||||
## Projects using this package | ## Projects using this package | ||||
@@ -0,0 +1,10 @@ | |||||
#!/bin/bash | |||||
set -eu -o pipefail | |||||
# Small convenience script for running the tests with various combinations of | |||||
# arch/tags. This assumes we're running on amd64 and have qemu available to run | |||||
# the cross-compiled arm64 test binaries. | |||||
go test ./... | |||||
go test -tags purego ./... | |||||
GOARCH=arm64 go test | |||||
GOARCH=arm64 go test -tags purego |
@@ -16,19 +16,11 @@ const ( | |||||
prime5 uint64 = 2870177450012600261 | prime5 uint64 = 2870177450012600261 | ||||
) | ) | ||||
// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where | |||||
// possible in the Go code is worth a small (but measurable) performance boost | |||||
// by avoiding some MOVQs. Vars are needed for the asm and also are useful for | |||||
// convenience in the Go code in a few places where we need to intentionally | |||||
// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the | |||||
// result overflows a uint64). | |||||
var ( | |||||
prime1v = prime1 | |||||
prime2v = prime2 | |||||
prime3v = prime3 | |||||
prime4v = prime4 | |||||
prime5v = prime5 | |||||
) | |||||
// Store the primes in an array as well. | |||||
// | |||||
// The consts are used when possible in Go code to avoid MOVs, but we need a | |||||
// contiguous array for the assembly code. | |||||
var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5} | |||||
// Digest implements hash.Hash64. | // Digest implements hash.Hash64. | ||||
type Digest struct { | type Digest struct { | ||||
@@ -50,10 +42,10 @@ func New() *Digest { | |||||
// Reset clears the Digest's state so that it can be reused. | // Reset clears the Digest's state so that it can be reused. | ||||
func (d *Digest) Reset() { | func (d *Digest) Reset() { | ||||
d.v1 = prime1v + prime2 | |||||
d.v1 = primes[0] + prime2 | |||||
d.v2 = prime2 | d.v2 = prime2 | ||||
d.v3 = 0 | d.v3 = 0 | ||||
d.v4 = -prime1v | |||||
d.v4 = -primes[0] | |||||
d.total = 0 | d.total = 0 | ||||
d.n = 0 | d.n = 0 | ||||
} | } | ||||
@@ -69,21 +61,23 @@ func (d *Digest) Write(b []byte) (n int, err error) { | |||||
n = len(b) | n = len(b) | ||||
d.total += uint64(n) | d.total += uint64(n) | ||||
memleft := d.mem[d.n&(len(d.mem)-1):] | |||||
if d.n+n < 32 { | if d.n+n < 32 { | ||||
// This new data doesn't even fill the current block. | // This new data doesn't even fill the current block. | ||||
copy(d.mem[d.n:], b) | |||||
copy(memleft, b) | |||||
d.n += n | d.n += n | ||||
return | return | ||||
} | } | ||||
if d.n > 0 { | if d.n > 0 { | ||||
// Finish off the partial block. | // Finish off the partial block. | ||||
copy(d.mem[d.n:], b) | |||||
c := copy(memleft, b) | |||||
d.v1 = round(d.v1, u64(d.mem[0:8])) | d.v1 = round(d.v1, u64(d.mem[0:8])) | ||||
d.v2 = round(d.v2, u64(d.mem[8:16])) | d.v2 = round(d.v2, u64(d.mem[8:16])) | ||||
d.v3 = round(d.v3, u64(d.mem[16:24])) | d.v3 = round(d.v3, u64(d.mem[16:24])) | ||||
d.v4 = round(d.v4, u64(d.mem[24:32])) | d.v4 = round(d.v4, u64(d.mem[24:32])) | ||||
b = b[32-d.n:] | |||||
b = b[c:] | |||||
d.n = 0 | d.n = 0 | ||||
} | } | ||||
@@ -133,21 +127,20 @@ func (d *Digest) Sum64() uint64 { | |||||
h += d.total | h += d.total | ||||
i, end := 0, d.n | |||||
for ; i+8 <= end; i += 8 { | |||||
k1 := round(0, u64(d.mem[i:i+8])) | |||||
b := d.mem[:d.n&(len(d.mem)-1)] | |||||
for ; len(b) >= 8; b = b[8:] { | |||||
k1 := round(0, u64(b[:8])) | |||||
h ^= k1 | h ^= k1 | ||||
h = rol27(h)*prime1 + prime4 | h = rol27(h)*prime1 + prime4 | ||||
} | } | ||||
if i+4 <= end { | |||||
h ^= uint64(u32(d.mem[i:i+4])) * prime1 | |||||
if len(b) >= 4 { | |||||
h ^= uint64(u32(b[:4])) * prime1 | |||||
h = rol23(h)*prime2 + prime3 | h = rol23(h)*prime2 + prime3 | ||||
i += 4 | |||||
b = b[4:] | |||||
} | } | ||||
for i < end { | |||||
h ^= uint64(d.mem[i]) * prime5 | |||||
for ; len(b) > 0; b = b[1:] { | |||||
h ^= uint64(b[0]) * prime5 | |||||
h = rol11(h) * prime1 | h = rol11(h) * prime1 | ||||
i++ | |||||
} | } | ||||
h ^= h >> 33 | h ^= h >> 33 | ||||
@@ -1,215 +1,209 @@ | |||||
//go:build !appengine && gc && !purego | |||||
// +build !appengine | // +build !appengine | ||||
// +build gc | // +build gc | ||||
// +build !purego | // +build !purego | ||||
#include "textflag.h" | #include "textflag.h" | ||||
// Register allocation: | |||||
// AX h | |||||
// SI pointer to advance through b | |||||
// DX n | |||||
// BX loop end | |||||
// R8 v1, k1 | |||||
// R9 v2 | |||||
// R10 v3 | |||||
// R11 v4 | |||||
// R12 tmp | |||||
// R13 prime1v | |||||
// R14 prime2v | |||||
// DI prime4v | |||||
// round reads from and advances the buffer pointer in SI. | |||||
// It assumes that R13 has prime1v and R14 has prime2v. | |||||
#define round(r) \ | |||||
MOVQ (SI), R12 \ | |||||
ADDQ $8, SI \ | |||||
IMULQ R14, R12 \ | |||||
ADDQ R12, r \ | |||||
ROLQ $31, r \ | |||||
IMULQ R13, r | |||||
// mergeRound applies a merge round on the two registers acc and val. | |||||
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v. | |||||
#define mergeRound(acc, val) \ | |||||
IMULQ R14, val \ | |||||
ROLQ $31, val \ | |||||
IMULQ R13, val \ | |||||
XORQ val, acc \ | |||||
IMULQ R13, acc \ | |||||
ADDQ DI, acc | |||||
// Registers: | |||||
#define h AX | |||||
#define d AX | |||||
#define p SI // pointer to advance through b | |||||
#define n DX | |||||
#define end BX // loop end | |||||
#define v1 R8 | |||||
#define v2 R9 | |||||
#define v3 R10 | |||||
#define v4 R11 | |||||
#define x R12 | |||||
#define prime1 R13 | |||||
#define prime2 R14 | |||||
#define prime4 DI | |||||
#define round(acc, x) \ | |||||
IMULQ prime2, x \ | |||||
ADDQ x, acc \ | |||||
ROLQ $31, acc \ | |||||
IMULQ prime1, acc | |||||
// round0 performs the operation x = round(0, x). | |||||
#define round0(x) \ | |||||
IMULQ prime2, x \ | |||||
ROLQ $31, x \ | |||||
IMULQ prime1, x | |||||
// mergeRound applies a merge round on the two registers acc and x. | |||||
// It assumes that prime1, prime2, and prime4 have been loaded. | |||||
#define mergeRound(acc, x) \ | |||||
round0(x) \ | |||||
XORQ x, acc \ | |||||
IMULQ prime1, acc \ | |||||
ADDQ prime4, acc | |||||
// blockLoop processes as many 32-byte blocks as possible, | |||||
// updating v1, v2, v3, and v4. It assumes that there is at least one block | |||||
// to process. | |||||
#define blockLoop() \ | |||||
loop: \ | |||||
MOVQ +0(p), x \ | |||||
round(v1, x) \ | |||||
MOVQ +8(p), x \ | |||||
round(v2, x) \ | |||||
MOVQ +16(p), x \ | |||||
round(v3, x) \ | |||||
MOVQ +24(p), x \ | |||||
round(v4, x) \ | |||||
ADDQ $32, p \ | |||||
CMPQ p, end \ | |||||
JLE loop | |||||
// func Sum64(b []byte) uint64 | // func Sum64(b []byte) uint64 | ||||
TEXT ·Sum64(SB), NOSPLIT, $0-32 | |||||
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 | |||||
// Load fixed primes. | // Load fixed primes. | ||||
MOVQ ·prime1v(SB), R13 | |||||
MOVQ ·prime2v(SB), R14 | |||||
MOVQ ·prime4v(SB), DI | |||||
MOVQ ·primes+0(SB), prime1 | |||||
MOVQ ·primes+8(SB), prime2 | |||||
MOVQ ·primes+24(SB), prime4 | |||||
// Load slice. | // Load slice. | ||||
MOVQ b_base+0(FP), SI | |||||
MOVQ b_len+8(FP), DX | |||||
LEAQ (SI)(DX*1), BX | |||||
MOVQ b_base+0(FP), p | |||||
MOVQ b_len+8(FP), n | |||||
LEAQ (p)(n*1), end | |||||
// The first loop limit will be len(b)-32. | // The first loop limit will be len(b)-32. | ||||
SUBQ $32, BX | |||||
SUBQ $32, end | |||||
// Check whether we have at least one block. | // Check whether we have at least one block. | ||||
CMPQ DX, $32 | |||||
CMPQ n, $32 | |||||
JLT noBlocks | JLT noBlocks | ||||
// Set up initial state (v1, v2, v3, v4). | // Set up initial state (v1, v2, v3, v4). | ||||
MOVQ R13, R8 | |||||
ADDQ R14, R8 | |||||
MOVQ R14, R9 | |||||
XORQ R10, R10 | |||||
XORQ R11, R11 | |||||
SUBQ R13, R11 | |||||
// Loop until SI > BX. | |||||
blockLoop: | |||||
round(R8) | |||||
round(R9) | |||||
round(R10) | |||||
round(R11) | |||||
CMPQ SI, BX | |||||
JLE blockLoop | |||||
MOVQ R8, AX | |||||
ROLQ $1, AX | |||||
MOVQ R9, R12 | |||||
ROLQ $7, R12 | |||||
ADDQ R12, AX | |||||
MOVQ R10, R12 | |||||
ROLQ $12, R12 | |||||
ADDQ R12, AX | |||||
MOVQ R11, R12 | |||||
ROLQ $18, R12 | |||||
ADDQ R12, AX | |||||
mergeRound(AX, R8) | |||||
mergeRound(AX, R9) | |||||
mergeRound(AX, R10) | |||||
mergeRound(AX, R11) | |||||
MOVQ prime1, v1 | |||||
ADDQ prime2, v1 | |||||
MOVQ prime2, v2 | |||||
XORQ v3, v3 | |||||
XORQ v4, v4 | |||||
SUBQ prime1, v4 | |||||
blockLoop() | |||||
MOVQ v1, h | |||||
ROLQ $1, h | |||||
MOVQ v2, x | |||||
ROLQ $7, x | |||||
ADDQ x, h | |||||
MOVQ v3, x | |||||
ROLQ $12, x | |||||
ADDQ x, h | |||||
MOVQ v4, x | |||||
ROLQ $18, x | |||||
ADDQ x, h | |||||
mergeRound(h, v1) | |||||
mergeRound(h, v2) | |||||
mergeRound(h, v3) | |||||
mergeRound(h, v4) | |||||
JMP afterBlocks | JMP afterBlocks | ||||
noBlocks: | noBlocks: | ||||
MOVQ ·prime5v(SB), AX | |||||
MOVQ ·primes+32(SB), h | |||||
afterBlocks: | afterBlocks: | ||||
ADDQ DX, AX | |||||
// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8. | |||||
ADDQ $24, BX | |||||
CMPQ SI, BX | |||||
JG fourByte | |||||
wordLoop: | |||||
// Calculate k1. | |||||
MOVQ (SI), R8 | |||||
ADDQ $8, SI | |||||
IMULQ R14, R8 | |||||
ROLQ $31, R8 | |||||
IMULQ R13, R8 | |||||
XORQ R8, AX | |||||
ROLQ $27, AX | |||||
IMULQ R13, AX | |||||
ADDQ DI, AX | |||||
CMPQ SI, BX | |||||
JLE wordLoop | |||||
fourByte: | |||||
ADDQ $4, BX | |||||
CMPQ SI, BX | |||||
JG singles | |||||
MOVL (SI), R8 | |||||
ADDQ $4, SI | |||||
IMULQ R13, R8 | |||||
XORQ R8, AX | |||||
ROLQ $23, AX | |||||
IMULQ R14, AX | |||||
ADDQ ·prime3v(SB), AX | |||||
singles: | |||||
ADDQ $4, BX | |||||
CMPQ SI, BX | |||||
ADDQ n, h | |||||
ADDQ $24, end | |||||
CMPQ p, end | |||||
JG try4 | |||||
loop8: | |||||
MOVQ (p), x | |||||
ADDQ $8, p | |||||
round0(x) | |||||
XORQ x, h | |||||
ROLQ $27, h | |||||
IMULQ prime1, h | |||||
ADDQ prime4, h | |||||
CMPQ p, end | |||||
JLE loop8 | |||||
try4: | |||||
ADDQ $4, end | |||||
CMPQ p, end | |||||
JG try1 | |||||
MOVL (p), x | |||||
ADDQ $4, p | |||||
IMULQ prime1, x | |||||
XORQ x, h | |||||
ROLQ $23, h | |||||
IMULQ prime2, h | |||||
ADDQ ·primes+16(SB), h | |||||
try1: | |||||
ADDQ $4, end | |||||
CMPQ p, end | |||||
JGE finalize | JGE finalize | ||||
singlesLoop: | |||||
MOVBQZX (SI), R12 | |||||
ADDQ $1, SI | |||||
IMULQ ·prime5v(SB), R12 | |||||
XORQ R12, AX | |||||
loop1: | |||||
MOVBQZX (p), x | |||||
ADDQ $1, p | |||||
IMULQ ·primes+32(SB), x | |||||
XORQ x, h | |||||
ROLQ $11, h | |||||
IMULQ prime1, h | |||||
ROLQ $11, AX | |||||
IMULQ R13, AX | |||||
CMPQ SI, BX | |||||
JL singlesLoop | |||||
CMPQ p, end | |||||
JL loop1 | |||||
finalize: | finalize: | ||||
MOVQ AX, R12 | |||||
SHRQ $33, R12 | |||||
XORQ R12, AX | |||||
IMULQ R14, AX | |||||
MOVQ AX, R12 | |||||
SHRQ $29, R12 | |||||
XORQ R12, AX | |||||
IMULQ ·prime3v(SB), AX | |||||
MOVQ AX, R12 | |||||
SHRQ $32, R12 | |||||
XORQ R12, AX | |||||
MOVQ AX, ret+24(FP) | |||||
MOVQ h, x | |||||
SHRQ $33, x | |||||
XORQ x, h | |||||
IMULQ prime2, h | |||||
MOVQ h, x | |||||
SHRQ $29, x | |||||
XORQ x, h | |||||
IMULQ ·primes+16(SB), h | |||||
MOVQ h, x | |||||
SHRQ $32, x | |||||
XORQ x, h | |||||
MOVQ h, ret+24(FP) | |||||
RET | RET | ||||
// writeBlocks uses the same registers as above except that it uses AX to store | |||||
// the d pointer. | |||||
// func writeBlocks(d *Digest, b []byte) int | // func writeBlocks(d *Digest, b []byte) int | ||||
TEXT ·writeBlocks(SB), NOSPLIT, $0-40 | |||||
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 | |||||
// Load fixed primes needed for round. | // Load fixed primes needed for round. | ||||
MOVQ ·prime1v(SB), R13 | |||||
MOVQ ·prime2v(SB), R14 | |||||
MOVQ ·primes+0(SB), prime1 | |||||
MOVQ ·primes+8(SB), prime2 | |||||
// Load slice. | // Load slice. | ||||
MOVQ b_base+8(FP), SI | |||||
MOVQ b_len+16(FP), DX | |||||
LEAQ (SI)(DX*1), BX | |||||
SUBQ $32, BX | |||||
MOVQ b_base+8(FP), p | |||||
MOVQ b_len+16(FP), n | |||||
LEAQ (p)(n*1), end | |||||
SUBQ $32, end | |||||
// Load vN from d. | // Load vN from d. | ||||
MOVQ d+0(FP), AX | |||||
MOVQ 0(AX), R8 // v1 | |||||
MOVQ 8(AX), R9 // v2 | |||||
MOVQ 16(AX), R10 // v3 | |||||
MOVQ 24(AX), R11 // v4 | |||||
MOVQ s+0(FP), d | |||||
MOVQ 0(d), v1 | |||||
MOVQ 8(d), v2 | |||||
MOVQ 16(d), v3 | |||||
MOVQ 24(d), v4 | |||||
// We don't need to check the loop condition here; this function is | // We don't need to check the loop condition here; this function is | ||||
// always called with at least one block of data to process. | // always called with at least one block of data to process. | ||||
blockLoop: | |||||
round(R8) | |||||
round(R9) | |||||
round(R10) | |||||
round(R11) | |||||
CMPQ SI, BX | |||||
JLE blockLoop | |||||
blockLoop() | |||||
// Copy vN back to d. | // Copy vN back to d. | ||||
MOVQ R8, 0(AX) | |||||
MOVQ R9, 8(AX) | |||||
MOVQ R10, 16(AX) | |||||
MOVQ R11, 24(AX) | |||||
// The number of bytes written is SI minus the old base pointer. | |||||
SUBQ b_base+8(FP), SI | |||||
MOVQ SI, ret+32(FP) | |||||
MOVQ v1, 0(d) | |||||
MOVQ v2, 8(d) | |||||
MOVQ v3, 16(d) | |||||
MOVQ v4, 24(d) | |||||
// The number of bytes written is p minus the old base pointer. | |||||
SUBQ b_base+8(FP), p | |||||
MOVQ p, ret+32(FP) | |||||
RET | RET |
@@ -0,0 +1,183 @@ | |||||
//go:build !appengine && gc && !purego | |||||
// +build !appengine | |||||
// +build gc | |||||
// +build !purego | |||||
#include "textflag.h" | |||||
// Registers: | |||||
#define digest R1 | |||||
#define h R2 // return value | |||||
#define p R3 // input pointer | |||||
#define n R4 // input length | |||||
#define nblocks R5 // n / 32 | |||||
#define prime1 R7 | |||||
#define prime2 R8 | |||||
#define prime3 R9 | |||||
#define prime4 R10 | |||||
#define prime5 R11 | |||||
#define v1 R12 | |||||
#define v2 R13 | |||||
#define v3 R14 | |||||
#define v4 R15 | |||||
#define x1 R20 | |||||
#define x2 R21 | |||||
#define x3 R22 | |||||
#define x4 R23 | |||||
#define round(acc, x) \ | |||||
MADD prime2, acc, x, acc \ | |||||
ROR $64-31, acc \ | |||||
MUL prime1, acc | |||||
// round0 performs the operation x = round(0, x). | |||||
#define round0(x) \ | |||||
MUL prime2, x \ | |||||
ROR $64-31, x \ | |||||
MUL prime1, x | |||||
#define mergeRound(acc, x) \ | |||||
round0(x) \ | |||||
EOR x, acc \ | |||||
MADD acc, prime4, prime1, acc | |||||
// blockLoop processes as many 32-byte blocks as possible, | |||||
// updating v1, v2, v3, and v4. It assumes that n >= 32. | |||||
#define blockLoop() \ | |||||
LSR $5, n, nblocks \ | |||||
PCALIGN $16 \ | |||||
loop: \ | |||||
LDP.P 16(p), (x1, x2) \ | |||||
LDP.P 16(p), (x3, x4) \ | |||||
round(v1, x1) \ | |||||
round(v2, x2) \ | |||||
round(v3, x3) \ | |||||
round(v4, x4) \ | |||||
SUB $1, nblocks \ | |||||
CBNZ nblocks, loop | |||||
// func Sum64(b []byte) uint64 | |||||
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 | |||||
LDP b_base+0(FP), (p, n) | |||||
LDP ·primes+0(SB), (prime1, prime2) | |||||
LDP ·primes+16(SB), (prime3, prime4) | |||||
MOVD ·primes+32(SB), prime5 | |||||
CMP $32, n | |||||
CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 } | |||||
BLT afterLoop | |||||
ADD prime1, prime2, v1 | |||||
MOVD prime2, v2 | |||||
MOVD $0, v3 | |||||
NEG prime1, v4 | |||||
blockLoop() | |||||
ROR $64-1, v1, x1 | |||||
ROR $64-7, v2, x2 | |||||
ADD x1, x2 | |||||
ROR $64-12, v3, x3 | |||||
ROR $64-18, v4, x4 | |||||
ADD x3, x4 | |||||
ADD x2, x4, h | |||||
mergeRound(h, v1) | |||||
mergeRound(h, v2) | |||||
mergeRound(h, v3) | |||||
mergeRound(h, v4) | |||||
afterLoop: | |||||
ADD n, h | |||||
TBZ $4, n, try8 | |||||
LDP.P 16(p), (x1, x2) | |||||
round0(x1) | |||||
// NOTE: here and below, sequencing the EOR after the ROR (using a | |||||
// rotated register) is worth a small but measurable speedup for small | |||||
// inputs. | |||||
ROR $64-27, h | |||||
EOR x1 @> 64-27, h, h | |||||
MADD h, prime4, prime1, h | |||||
round0(x2) | |||||
ROR $64-27, h | |||||
EOR x2 @> 64-27, h, h | |||||
MADD h, prime4, prime1, h | |||||
try8: | |||||
TBZ $3, n, try4 | |||||
MOVD.P 8(p), x1 | |||||
round0(x1) | |||||
ROR $64-27, h | |||||
EOR x1 @> 64-27, h, h | |||||
MADD h, prime4, prime1, h | |||||
try4: | |||||
TBZ $2, n, try2 | |||||
MOVWU.P 4(p), x2 | |||||
MUL prime1, x2 | |||||
ROR $64-23, h | |||||
EOR x2 @> 64-23, h, h | |||||
MADD h, prime3, prime2, h | |||||
try2: | |||||
TBZ $1, n, try1 | |||||
MOVHU.P 2(p), x3 | |||||
AND $255, x3, x1 | |||||
LSR $8, x3, x2 | |||||
MUL prime5, x1 | |||||
ROR $64-11, h | |||||
EOR x1 @> 64-11, h, h | |||||
MUL prime1, h | |||||
MUL prime5, x2 | |||||
ROR $64-11, h | |||||
EOR x2 @> 64-11, h, h | |||||
MUL prime1, h | |||||
try1: | |||||
TBZ $0, n, finalize | |||||
MOVBU (p), x4 | |||||
MUL prime5, x4 | |||||
ROR $64-11, h | |||||
EOR x4 @> 64-11, h, h | |||||
MUL prime1, h | |||||
finalize: | |||||
EOR h >> 33, h | |||||
MUL prime2, h | |||||
EOR h >> 29, h | |||||
MUL prime3, h | |||||
EOR h >> 32, h | |||||
MOVD h, ret+24(FP) | |||||
RET | |||||
// func writeBlocks(d *Digest, b []byte) int | |||||
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 | |||||
LDP ·primes+0(SB), (prime1, prime2) | |||||
// Load state. This assumes v1-v4 are stored contiguously at offset 0 of the Digest. | |||||
MOVD d+0(FP), digest | |||||
LDP 0(digest), (v1, v2) | |||||
LDP 16(digest), (v3, v4) | |||||
LDP b_base+8(FP), (p, n) | |||||
blockLoop() | |||||
// Store updated state. | |||||
STP (v1, v2), 0(digest) | |||||
STP (v3, v4), 16(digest) | |||||
BIC $31, n | |||||
MOVD n, ret+32(FP) | |||||
RET |
@@ -1,3 +1,5 @@ | |||||
//go:build (amd64 || arm64) && !appengine && gc && !purego | |||||
// +build amd64 arm64 | |||||
// +build !appengine | // +build !appengine | ||||
// +build gc | // +build gc | ||||
// +build !purego | // +build !purego |
@@ -1,4 +1,5 @@ | |||||
// +build !amd64 appengine !gc purego | |||||
//go:build (!amd64 && !arm64) || appengine || !gc || purego | |||||
// +build !amd64,!arm64 appengine !gc purego | |||||
package xxhash | package xxhash | ||||
@@ -14,10 +15,10 @@ func Sum64(b []byte) uint64 { | |||||
var h uint64 | var h uint64 | ||||
if n >= 32 { | if n >= 32 { | ||||
v1 := prime1v + prime2 | |||||
v1 := primes[0] + prime2 | |||||
v2 := prime2 | v2 := prime2 | ||||
v3 := uint64(0) | v3 := uint64(0) | ||||
v4 := -prime1v | |||||
v4 := -primes[0] | |||||
for len(b) >= 32 { | for len(b) >= 32 { | ||||
v1 = round(v1, u64(b[0:8:len(b)])) | v1 = round(v1, u64(b[0:8:len(b)])) | ||||
v2 = round(v2, u64(b[8:16:len(b)])) | v2 = round(v2, u64(b[8:16:len(b)])) | ||||
@@ -36,19 +37,18 @@ func Sum64(b []byte) uint64 { | |||||
h += uint64(n) | h += uint64(n) | ||||
i, end := 0, len(b) | |||||
for ; i+8 <= end; i += 8 { | |||||
k1 := round(0, u64(b[i:i+8:len(b)])) | |||||
for ; len(b) >= 8; b = b[8:] { | |||||
k1 := round(0, u64(b[:8])) | |||||
h ^= k1 | h ^= k1 | ||||
h = rol27(h)*prime1 + prime4 | h = rol27(h)*prime1 + prime4 | ||||
} | } | ||||
if i+4 <= end { | |||||
h ^= uint64(u32(b[i:i+4:len(b)])) * prime1 | |||||
if len(b) >= 4 { | |||||
h ^= uint64(u32(b[:4])) * prime1 | |||||
h = rol23(h)*prime2 + prime3 | h = rol23(h)*prime2 + prime3 | ||||
i += 4 | |||||
b = b[4:] | |||||
} | } | ||||
for ; i < end; i++ { | |||||
h ^= uint64(b[i]) * prime5 | |||||
for ; len(b) > 0; b = b[1:] { | |||||
h ^= uint64(b[0]) * prime5 | |||||
h = rol11(h) * prime1 | h = rol11(h) * prime1 | ||||
} | } | ||||
@@ -1,3 +1,4 @@ | |||||
//go:build appengine | |||||
// +build appengine | // +build appengine | ||||
// This file contains the safe implementations of otherwise unsafe-using code. | // This file contains the safe implementations of otherwise unsafe-using code. | ||||
@@ -1,3 +1,4 @@ | |||||
//go:build !appengine | |||||
// +build !appengine | // +build !appengine | ||||
// This file encapsulates usage of unsafe. | // This file encapsulates usage of unsafe. | ||||
@@ -11,7 +12,7 @@ import ( | |||||
// In the future it's possible that compiler optimizations will make these | // In the future it's possible that compiler optimizations will make these | ||||
// XxxString functions unnecessary by realizing that calls such as | // XxxString functions unnecessary by realizing that calls such as | ||||
// Sum64([]byte(s)) don't need to copy s. See https://golang.org/issue/2205. | |||||
// Sum64([]byte(s)) don't need to copy s. See https://go.dev/issue/2205. | |||||
// If that happens, even if we keep these functions they can be replaced with | // If that happens, even if we keep these functions they can be replaced with | ||||
// the trivial safe code. | // the trivial safe code. | ||||
@@ -1,4 +1,4 @@ | |||||
# github.com/cespare/xxhash/v2 v2.1.2 | |||||
# github.com/cespare/xxhash/v2 v2.2.0 | |||||
## explicit; go 1.11 | ## explicit; go 1.11 | ||||
github.com/cespare/xxhash/v2 | github.com/cespare/xxhash/v2 | ||||
# github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f | # github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f | ||||