@@ -3,8 +3,7 @@ | |||
[![Go Reference](https://pkg.go.dev/badge/github.com/cespare/xxhash/v2.svg)](https://pkg.go.dev/github.com/cespare/xxhash/v2) | |||
[![Test](https://github.com/cespare/xxhash/actions/workflows/test.yml/badge.svg)](https://github.com/cespare/xxhash/actions/workflows/test.yml) | |||
xxhash is a Go implementation of the 64-bit | |||
[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a | |||
xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a | |||
high-quality hashing algorithm that is much faster than anything in the Go | |||
standard library. | |||
@@ -25,8 +24,11 @@ func (*Digest) WriteString(string) (int, error) | |||
func (*Digest) Sum64() uint64 | |||
``` | |||
This implementation provides a fast pure-Go implementation and an even faster | |||
assembly implementation for amd64. | |||
The package is written with optimized pure Go and also contains even faster | |||
assembly implementations for amd64 and arm64. If desired, the `purego` build tag | |||
opts into using the Go code even on those architectures. | |||
[xxHash]: http://cyan4973.github.io/xxHash/ | |||
## Compatibility | |||
@@ -45,19 +47,20 @@ I recommend using the latest release of Go. | |||
Here are some quick benchmarks comparing the pure-Go and assembly | |||
implementations of Sum64. | |||
| input size | purego | asm | | |||
| --- | --- | --- | | |||
| 5 B | 979.66 MB/s | 1291.17 MB/s | | |||
| 100 B | 7475.26 MB/s | 7973.40 MB/s | | |||
| 4 KB | 17573.46 MB/s | 17602.65 MB/s | | |||
| 10 MB | 17131.46 MB/s | 17142.16 MB/s | | |||
| input size | purego | asm | | |||
| ---------- | --------- | --------- | | |||
| 4 B | 1.3 GB/s | 1.2 GB/s | | |||
| 16 B | 2.9 GB/s | 3.5 GB/s | | |||
| 100 B | 6.9 GB/s | 8.1 GB/s | | |||
| 4 KB | 11.7 GB/s | 16.7 GB/s | | |||
| 10 MB | 12.0 GB/s | 17.3 GB/s | | |||
These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using | |||
the following commands under Go 1.11.2: | |||
These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C | |||
CPU using the following commands under Go 1.19.2: | |||
``` | |||
$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes' | |||
$ go test -benchtime 10s -bench '/xxhash,direct,bytes' | |||
benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$') | |||
benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$') | |||
``` | |||
## Projects using this package | |||
@@ -0,0 +1,10 @@ | |||
#!/bin/bash | |||
set -eu -o pipefail | |||
# Small convenience script for running the tests with various combinations of | |||
# arch/tags. This assumes we're running on amd64 and have qemu available. | |||
go test ./... | |||
go test -tags purego ./... | |||
GOARCH=arm64 go test | |||
GOARCH=arm64 go test -tags purego |
@@ -16,19 +16,11 @@ const ( | |||
prime5 uint64 = 2870177450012600261 | |||
) | |||
// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where | |||
// possible in the Go code is worth a small (but measurable) performance boost | |||
// by avoiding some MOVQs. Vars are needed for the asm and also are useful for | |||
// convenience in the Go code in a few places where we need to intentionally | |||
// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the | |||
// result overflows a uint64). | |||
var ( | |||
prime1v = prime1 | |||
prime2v = prime2 | |||
prime3v = prime3 | |||
prime4v = prime4 | |||
prime5v = prime5 | |||
) | |||
// Store the primes in an array as well. | |||
// | |||
// The consts are used when possible in Go code to avoid MOVs but we need a | |||
// contiguous array for the assembly code. | |||
var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5} | |||
// Digest implements hash.Hash64. | |||
type Digest struct { | |||
@@ -50,10 +42,10 @@ func New() *Digest { | |||
// Reset clears the Digest's state so that it can be reused. | |||
func (d *Digest) Reset() { | |||
d.v1 = prime1v + prime2 | |||
d.v1 = primes[0] + prime2 | |||
d.v2 = prime2 | |||
d.v3 = 0 | |||
d.v4 = -prime1v | |||
d.v4 = -primes[0] | |||
d.total = 0 | |||
d.n = 0 | |||
} | |||
@@ -69,21 +61,23 @@ func (d *Digest) Write(b []byte) (n int, err error) { | |||
n = len(b) | |||
d.total += uint64(n) | |||
memleft := d.mem[d.n&(len(d.mem)-1):] | |||
if d.n+n < 32 { | |||
// This new data doesn't even fill the current block. | |||
copy(d.mem[d.n:], b) | |||
copy(memleft, b) | |||
d.n += n | |||
return | |||
} | |||
if d.n > 0 { | |||
// Finish off the partial block. | |||
copy(d.mem[d.n:], b) | |||
c := copy(memleft, b) | |||
d.v1 = round(d.v1, u64(d.mem[0:8])) | |||
d.v2 = round(d.v2, u64(d.mem[8:16])) | |||
d.v3 = round(d.v3, u64(d.mem[16:24])) | |||
d.v4 = round(d.v4, u64(d.mem[24:32])) | |||
b = b[32-d.n:] | |||
b = b[c:] | |||
d.n = 0 | |||
} | |||
@@ -133,21 +127,20 @@ func (d *Digest) Sum64() uint64 { | |||
h += d.total | |||
i, end := 0, d.n | |||
for ; i+8 <= end; i += 8 { | |||
k1 := round(0, u64(d.mem[i:i+8])) | |||
b := d.mem[:d.n&(len(d.mem)-1)] | |||
for ; len(b) >= 8; b = b[8:] { | |||
k1 := round(0, u64(b[:8])) | |||
h ^= k1 | |||
h = rol27(h)*prime1 + prime4 | |||
} | |||
if i+4 <= end { | |||
h ^= uint64(u32(d.mem[i:i+4])) * prime1 | |||
if len(b) >= 4 { | |||
h ^= uint64(u32(b[:4])) * prime1 | |||
h = rol23(h)*prime2 + prime3 | |||
i += 4 | |||
b = b[4:] | |||
} | |||
for i < end { | |||
h ^= uint64(d.mem[i]) * prime5 | |||
for ; len(b) > 0; b = b[1:] { | |||
h ^= uint64(b[0]) * prime5 | |||
h = rol11(h) * prime1 | |||
i++ | |||
} | |||
h ^= h >> 33 | |||
@@ -1,215 +1,209 @@ | |||
//go:build !appengine && gc && !purego | |||
// +build !appengine | |||
// +build gc | |||
// +build !purego | |||
#include "textflag.h" | |||
// Register allocation: | |||
// AX h | |||
// SI pointer to advance through b | |||
// DX n | |||
// BX loop end | |||
// R8 v1, k1 | |||
// R9 v2 | |||
// R10 v3 | |||
// R11 v4 | |||
// R12 tmp | |||
// R13 prime1v | |||
// R14 prime2v | |||
// DI prime4v | |||
// round reads from and advances the buffer pointer in SI. | |||
// It assumes that R13 has prime1v and R14 has prime2v. | |||
#define round(r) \ | |||
MOVQ (SI), R12 \ | |||
ADDQ $8, SI \ | |||
IMULQ R14, R12 \ | |||
ADDQ R12, r \ | |||
ROLQ $31, r \ | |||
IMULQ R13, r | |||
// mergeRound applies a merge round on the two registers acc and val. | |||
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v. | |||
#define mergeRound(acc, val) \ | |||
IMULQ R14, val \ | |||
ROLQ $31, val \ | |||
IMULQ R13, val \ | |||
XORQ val, acc \ | |||
IMULQ R13, acc \ | |||
ADDQ DI, acc | |||
// Registers: | |||
#define h AX | |||
#define d AX | |||
#define p SI // pointer to advance through b | |||
#define n DX | |||
#define end BX // loop end | |||
#define v1 R8 | |||
#define v2 R9 | |||
#define v3 R10 | |||
#define v4 R11 | |||
#define x R12 | |||
#define prime1 R13 | |||
#define prime2 R14 | |||
#define prime4 DI | |||
#define round(acc, x) \ | |||
IMULQ prime2, x \ | |||
ADDQ x, acc \ | |||
ROLQ $31, acc \ | |||
IMULQ prime1, acc | |||
// round0 performs the operation x = round(0, x). | |||
#define round0(x) \ | |||
IMULQ prime2, x \ | |||
ROLQ $31, x \ | |||
IMULQ prime1, x | |||
// mergeRound applies a merge round on the two registers acc and x. | |||
// It assumes that prime1, prime2, and prime4 have been loaded. | |||
#define mergeRound(acc, x) \ | |||
round0(x) \ | |||
XORQ x, acc \ | |||
IMULQ prime1, acc \ | |||
ADDQ prime4, acc | |||
// blockLoop processes as many 32-byte blocks as possible, | |||
// updating v1, v2, v3, and v4. It assumes that there is at least one block | |||
// to process. | |||
#define blockLoop() \ | |||
loop: \ | |||
MOVQ +0(p), x \ | |||
round(v1, x) \ | |||
MOVQ +8(p), x \ | |||
round(v2, x) \ | |||
MOVQ +16(p), x \ | |||
round(v3, x) \ | |||
MOVQ +24(p), x \ | |||
round(v4, x) \ | |||
ADDQ $32, p \ | |||
CMPQ p, end \ | |||
JLE loop | |||
// func Sum64(b []byte) uint64 | |||
TEXT ·Sum64(SB), NOSPLIT, $0-32 | |||
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 | |||
// Load fixed primes. | |||
MOVQ ·prime1v(SB), R13 | |||
MOVQ ·prime2v(SB), R14 | |||
MOVQ ·prime4v(SB), DI | |||
MOVQ ·primes+0(SB), prime1 | |||
MOVQ ·primes+8(SB), prime2 | |||
MOVQ ·primes+24(SB), prime4 | |||
// Load slice. | |||
MOVQ b_base+0(FP), SI | |||
MOVQ b_len+8(FP), DX | |||
LEAQ (SI)(DX*1), BX | |||
MOVQ b_base+0(FP), p | |||
MOVQ b_len+8(FP), n | |||
LEAQ (p)(n*1), end | |||
// The first loop limit will be len(b)-32. | |||
SUBQ $32, BX | |||
SUBQ $32, end | |||
// Check whether we have at least one block. | |||
CMPQ DX, $32 | |||
CMPQ n, $32 | |||
JLT noBlocks | |||
// Set up initial state (v1, v2, v3, v4). | |||
MOVQ R13, R8 | |||
ADDQ R14, R8 | |||
MOVQ R14, R9 | |||
XORQ R10, R10 | |||
XORQ R11, R11 | |||
SUBQ R13, R11 | |||
// Loop until SI > BX. | |||
blockLoop: | |||
round(R8) | |||
round(R9) | |||
round(R10) | |||
round(R11) | |||
CMPQ SI, BX | |||
JLE blockLoop | |||
MOVQ R8, AX | |||
ROLQ $1, AX | |||
MOVQ R9, R12 | |||
ROLQ $7, R12 | |||
ADDQ R12, AX | |||
MOVQ R10, R12 | |||
ROLQ $12, R12 | |||
ADDQ R12, AX | |||
MOVQ R11, R12 | |||
ROLQ $18, R12 | |||
ADDQ R12, AX | |||
mergeRound(AX, R8) | |||
mergeRound(AX, R9) | |||
mergeRound(AX, R10) | |||
mergeRound(AX, R11) | |||
MOVQ prime1, v1 | |||
ADDQ prime2, v1 | |||
MOVQ prime2, v2 | |||
XORQ v3, v3 | |||
XORQ v4, v4 | |||
SUBQ prime1, v4 | |||
blockLoop() | |||
MOVQ v1, h | |||
ROLQ $1, h | |||
MOVQ v2, x | |||
ROLQ $7, x | |||
ADDQ x, h | |||
MOVQ v3, x | |||
ROLQ $12, x | |||
ADDQ x, h | |||
MOVQ v4, x | |||
ROLQ $18, x | |||
ADDQ x, h | |||
mergeRound(h, v1) | |||
mergeRound(h, v2) | |||
mergeRound(h, v3) | |||
mergeRound(h, v4) | |||
JMP afterBlocks | |||
noBlocks: | |||
MOVQ ·prime5v(SB), AX | |||
MOVQ ·primes+32(SB), h | |||
afterBlocks: | |||
ADDQ DX, AX | |||
// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8. | |||
ADDQ $24, BX | |||
CMPQ SI, BX | |||
JG fourByte | |||
wordLoop: | |||
// Calculate k1. | |||
MOVQ (SI), R8 | |||
ADDQ $8, SI | |||
IMULQ R14, R8 | |||
ROLQ $31, R8 | |||
IMULQ R13, R8 | |||
XORQ R8, AX | |||
ROLQ $27, AX | |||
IMULQ R13, AX | |||
ADDQ DI, AX | |||
CMPQ SI, BX | |||
JLE wordLoop | |||
fourByte: | |||
ADDQ $4, BX | |||
CMPQ SI, BX | |||
JG singles | |||
MOVL (SI), R8 | |||
ADDQ $4, SI | |||
IMULQ R13, R8 | |||
XORQ R8, AX | |||
ROLQ $23, AX | |||
IMULQ R14, AX | |||
ADDQ ·prime3v(SB), AX | |||
singles: | |||
ADDQ $4, BX | |||
CMPQ SI, BX | |||
ADDQ n, h | |||
ADDQ $24, end | |||
CMPQ p, end | |||
JG try4 | |||
loop8: | |||
MOVQ (p), x | |||
ADDQ $8, p | |||
round0(x) | |||
XORQ x, h | |||
ROLQ $27, h | |||
IMULQ prime1, h | |||
ADDQ prime4, h | |||
CMPQ p, end | |||
JLE loop8 | |||
try4: | |||
ADDQ $4, end | |||
CMPQ p, end | |||
JG try1 | |||
MOVL (p), x | |||
ADDQ $4, p | |||
IMULQ prime1, x | |||
XORQ x, h | |||
ROLQ $23, h | |||
IMULQ prime2, h | |||
ADDQ ·primes+16(SB), h | |||
try1: | |||
ADDQ $4, end | |||
CMPQ p, end | |||
JGE finalize | |||
singlesLoop: | |||
MOVBQZX (SI), R12 | |||
ADDQ $1, SI | |||
IMULQ ·prime5v(SB), R12 | |||
XORQ R12, AX | |||
loop1: | |||
MOVBQZX (p), x | |||
ADDQ $1, p | |||
IMULQ ·primes+32(SB), x | |||
XORQ x, h | |||
ROLQ $11, h | |||
IMULQ prime1, h | |||
ROLQ $11, AX | |||
IMULQ R13, AX | |||
CMPQ SI, BX | |||
JL singlesLoop | |||
CMPQ p, end | |||
JL loop1 | |||
finalize: | |||
MOVQ AX, R12 | |||
SHRQ $33, R12 | |||
XORQ R12, AX | |||
IMULQ R14, AX | |||
MOVQ AX, R12 | |||
SHRQ $29, R12 | |||
XORQ R12, AX | |||
IMULQ ·prime3v(SB), AX | |||
MOVQ AX, R12 | |||
SHRQ $32, R12 | |||
XORQ R12, AX | |||
MOVQ AX, ret+24(FP) | |||
MOVQ h, x | |||
SHRQ $33, x | |||
XORQ x, h | |||
IMULQ prime2, h | |||
MOVQ h, x | |||
SHRQ $29, x | |||
XORQ x, h | |||
IMULQ ·primes+16(SB), h | |||
MOVQ h, x | |||
SHRQ $32, x | |||
XORQ x, h | |||
MOVQ h, ret+24(FP) | |||
RET | |||
// writeBlocks uses the same registers as above except that it uses AX to store | |||
// the d pointer. | |||
// func writeBlocks(d *Digest, b []byte) int | |||
TEXT ·writeBlocks(SB), NOSPLIT, $0-40 | |||
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 | |||
// Load fixed primes needed for round. | |||
MOVQ ·prime1v(SB), R13 | |||
MOVQ ·prime2v(SB), R14 | |||
MOVQ ·primes+0(SB), prime1 | |||
MOVQ ·primes+8(SB), prime2 | |||
// Load slice. | |||
MOVQ b_base+8(FP), SI | |||
MOVQ b_len+16(FP), DX | |||
LEAQ (SI)(DX*1), BX | |||
SUBQ $32, BX | |||
MOVQ b_base+8(FP), p | |||
MOVQ b_len+16(FP), n | |||
LEAQ (p)(n*1), end | |||
SUBQ $32, end | |||
// Load vN from d. | |||
MOVQ d+0(FP), AX | |||
MOVQ 0(AX), R8 // v1 | |||
MOVQ 8(AX), R9 // v2 | |||
MOVQ 16(AX), R10 // v3 | |||
MOVQ 24(AX), R11 // v4 | |||
MOVQ s+0(FP), d | |||
MOVQ 0(d), v1 | |||
MOVQ 8(d), v2 | |||
MOVQ 16(d), v3 | |||
MOVQ 24(d), v4 | |||
// We don't need to check the loop condition here; this function is | |||
// always called with at least one block of data to process. | |||
blockLoop: | |||
round(R8) | |||
round(R9) | |||
round(R10) | |||
round(R11) | |||
CMPQ SI, BX | |||
JLE blockLoop | |||
blockLoop() | |||
// Copy vN back to d. | |||
MOVQ R8, 0(AX) | |||
MOVQ R9, 8(AX) | |||
MOVQ R10, 16(AX) | |||
MOVQ R11, 24(AX) | |||
// The number of bytes written is SI minus the old base pointer. | |||
SUBQ b_base+8(FP), SI | |||
MOVQ SI, ret+32(FP) | |||
MOVQ v1, 0(d) | |||
MOVQ v2, 8(d) | |||
MOVQ v3, 16(d) | |||
MOVQ v4, 24(d) | |||
// The number of bytes written is p minus the old base pointer. | |||
SUBQ b_base+8(FP), p | |||
MOVQ p, ret+32(FP) | |||
RET |
@@ -0,0 +1,183 @@ | |||
//go:build !appengine && gc && !purego | |||
// +build !appengine | |||
// +build gc | |||
// +build !purego | |||
#include "textflag.h" | |||
// Registers: | |||
#define digest R1 | |||
#define h R2 // return value | |||
#define p R3 // input pointer | |||
#define n R4 // input length | |||
#define nblocks R5 // n / 32 | |||
#define prime1 R7 | |||
#define prime2 R8 | |||
#define prime3 R9 | |||
#define prime4 R10 | |||
#define prime5 R11 | |||
#define v1 R12 | |||
#define v2 R13 | |||
#define v3 R14 | |||
#define v4 R15 | |||
#define x1 R20 | |||
#define x2 R21 | |||
#define x3 R22 | |||
#define x4 R23 | |||
#define round(acc, x) \ | |||
MADD prime2, acc, x, acc \ | |||
ROR $64-31, acc \ | |||
MUL prime1, acc | |||
// round0 performs the operation x = round(0, x). | |||
#define round0(x) \ | |||
MUL prime2, x \ | |||
ROR $64-31, x \ | |||
MUL prime1, x | |||
#define mergeRound(acc, x) \ | |||
round0(x) \ | |||
EOR x, acc \ | |||
MADD acc, prime4, prime1, acc | |||
// blockLoop processes as many 32-byte blocks as possible, | |||
// updating v1, v2, v3, and v4. It assumes that n >= 32. | |||
#define blockLoop() \ | |||
LSR $5, n, nblocks \ | |||
PCALIGN $16 \ | |||
loop: \ | |||
LDP.P 16(p), (x1, x2) \ | |||
LDP.P 16(p), (x3, x4) \ | |||
round(v1, x1) \ | |||
round(v2, x2) \ | |||
round(v3, x3) \ | |||
round(v4, x4) \ | |||
SUB $1, nblocks \ | |||
CBNZ nblocks, loop | |||
// func Sum64(b []byte) uint64 | |||
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 | |||
LDP b_base+0(FP), (p, n) | |||
LDP ·primes+0(SB), (prime1, prime2) | |||
LDP ·primes+16(SB), (prime3, prime4) | |||
MOVD ·primes+32(SB), prime5 | |||
CMP $32, n | |||
CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 } | |||
BLT afterLoop | |||
ADD prime1, prime2, v1 | |||
MOVD prime2, v2 | |||
MOVD $0, v3 | |||
NEG prime1, v4 | |||
blockLoop() | |||
ROR $64-1, v1, x1 | |||
ROR $64-7, v2, x2 | |||
ADD x1, x2 | |||
ROR $64-12, v3, x3 | |||
ROR $64-18, v4, x4 | |||
ADD x3, x4 | |||
ADD x2, x4, h | |||
mergeRound(h, v1) | |||
mergeRound(h, v2) | |||
mergeRound(h, v3) | |||
mergeRound(h, v4) | |||
afterLoop: | |||
ADD n, h | |||
TBZ $4, n, try8 | |||
LDP.P 16(p), (x1, x2) | |||
round0(x1) | |||
// NOTE: here and below, sequencing the EOR after the ROR (using a | |||
// rotated register) is worth a small but measurable speedup for small | |||
// inputs. | |||
ROR $64-27, h | |||
EOR x1 @> 64-27, h, h | |||
MADD h, prime4, prime1, h | |||
round0(x2) | |||
ROR $64-27, h | |||
EOR x2 @> 64-27, h, h | |||
MADD h, prime4, prime1, h | |||
try8: | |||
TBZ $3, n, try4 | |||
MOVD.P 8(p), x1 | |||
round0(x1) | |||
ROR $64-27, h | |||
EOR x1 @> 64-27, h, h | |||
MADD h, prime4, prime1, h | |||
try4: | |||
TBZ $2, n, try2 | |||
MOVWU.P 4(p), x2 | |||
MUL prime1, x2 | |||
ROR $64-23, h | |||
EOR x2 @> 64-23, h, h | |||
MADD h, prime3, prime2, h | |||
try2: | |||
TBZ $1, n, try1 | |||
MOVHU.P 2(p), x3 | |||
AND $255, x3, x1 | |||
LSR $8, x3, x2 | |||
MUL prime5, x1 | |||
ROR $64-11, h | |||
EOR x1 @> 64-11, h, h | |||
MUL prime1, h | |||
MUL prime5, x2 | |||
ROR $64-11, h | |||
EOR x2 @> 64-11, h, h | |||
MUL prime1, h | |||
try1: | |||
TBZ $0, n, finalize | |||
MOVBU (p), x4 | |||
MUL prime5, x4 | |||
ROR $64-11, h | |||
EOR x4 @> 64-11, h, h | |||
MUL prime1, h | |||
finalize: | |||
EOR h >> 33, h | |||
MUL prime2, h | |||
EOR h >> 29, h | |||
MUL prime3, h | |||
EOR h >> 32, h | |||
MOVD h, ret+24(FP) | |||
RET | |||
// func writeBlocks(d *Digest, b []byte) int | |||
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 | |||
LDP ·primes+0(SB), (prime1, prime2) | |||
// Load state. Assume v[1-4] are stored contiguously. | |||
MOVD d+0(FP), digest | |||
LDP 0(digest), (v1, v2) | |||
LDP 16(digest), (v3, v4) | |||
LDP b_base+8(FP), (p, n) | |||
blockLoop() | |||
// Store updated state. | |||
STP (v1, v2), 0(digest) | |||
STP (v3, v4), 16(digest) | |||
BIC $31, n | |||
MOVD n, ret+32(FP) | |||
RET |
@@ -1,3 +1,5 @@ | |||
//go:build (amd64 || arm64) && !appengine && gc && !purego | |||
// +build amd64 arm64 | |||
// +build !appengine | |||
// +build gc | |||
// +build !purego |
@@ -1,4 +1,5 @@ | |||
// +build !amd64 appengine !gc purego | |||
//go:build (!amd64 && !arm64) || appengine || !gc || purego | |||
// +build !amd64,!arm64 appengine !gc purego | |||
package xxhash | |||
@@ -14,10 +15,10 @@ func Sum64(b []byte) uint64 { | |||
var h uint64 | |||
if n >= 32 { | |||
v1 := prime1v + prime2 | |||
v1 := primes[0] + prime2 | |||
v2 := prime2 | |||
v3 := uint64(0) | |||
v4 := -prime1v | |||
v4 := -primes[0] | |||
for len(b) >= 32 { | |||
v1 = round(v1, u64(b[0:8:len(b)])) | |||
v2 = round(v2, u64(b[8:16:len(b)])) | |||
@@ -36,19 +37,18 @@ func Sum64(b []byte) uint64 { | |||
h += uint64(n) | |||
i, end := 0, len(b) | |||
for ; i+8 <= end; i += 8 { | |||
k1 := round(0, u64(b[i:i+8:len(b)])) | |||
for ; len(b) >= 8; b = b[8:] { | |||
k1 := round(0, u64(b[:8])) | |||
h ^= k1 | |||
h = rol27(h)*prime1 + prime4 | |||
} | |||
if i+4 <= end { | |||
h ^= uint64(u32(b[i:i+4:len(b)])) * prime1 | |||
if len(b) >= 4 { | |||
h ^= uint64(u32(b[:4])) * prime1 | |||
h = rol23(h)*prime2 + prime3 | |||
i += 4 | |||
b = b[4:] | |||
} | |||
for ; i < end; i++ { | |||
h ^= uint64(b[i]) * prime5 | |||
for ; len(b) > 0; b = b[1:] { | |||
h ^= uint64(b[0]) * prime5 | |||
h = rol11(h) * prime1 | |||
} | |||
@@ -1,3 +1,4 @@ | |||
//go:build appengine | |||
// +build appengine | |||
// This file contains the safe implementations of otherwise unsafe-using code. | |||
@@ -1,3 +1,4 @@ | |||
//go:build !appengine | |||
// +build !appengine | |||
// This file encapsulates usage of unsafe. | |||
@@ -11,7 +12,7 @@ import ( | |||
// In the future it's possible that compiler optimizations will make these | |||
// XxxString functions unnecessary by realizing that calls such as | |||
// Sum64([]byte(s)) don't need to copy s. See https://golang.org/issue/2205. | |||
// Sum64([]byte(s)) don't need to copy s. See https://go.dev/issue/2205. | |||
// If that happens, even if we keep these functions they can be replaced with | |||
// the trivial safe code. | |||
@@ -1,4 +1,4 @@ | |||
# github.com/cespare/xxhash/v2 v2.1.2 | |||
# github.com/cespare/xxhash/v2 v2.2.0 | |||
## explicit; go 1.11 | |||
github.com/cespare/xxhash/v2 | |||
# github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f | |||