  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
  5. // +build go1.7,amd64,!gccgo,!appengine
  6. #include "textflag.h"
  7. // General register allocation
  8. #define oup DI
  9. #define inp SI
  10. #define inl BX
  11. #define adp CX // free to reuse, after we hash the additional data
  12. #define keyp R8 // free to reuse, when we copy the key to stack
  13. #define itr2 R9 // general iterator
  14. #define itr1 CX // general iterator
  15. #define acc0 R10
  16. #define acc1 R11
  17. #define acc2 R12
  18. #define t0 R13
  19. #define t1 R14
  20. #define t2 R15
  21. #define t3 R8
  22. // Register and stack allocation for the SSE code
  23. #define rStore (0*16)(BP)
  24. #define sStore (1*16)(BP)
  25. #define state1Store (2*16)(BP)
  26. #define state2Store (3*16)(BP)
  27. #define tmpStore (4*16)(BP)
  28. #define ctr0Store (5*16)(BP)
  29. #define ctr1Store (6*16)(BP)
  30. #define ctr2Store (7*16)(BP)
  31. #define ctr3Store (8*16)(BP)
  32. #define A0 X0
  33. #define A1 X1
  34. #define A2 X2
  35. #define B0 X3
  36. #define B1 X4
  37. #define B2 X5
  38. #define C0 X6
  39. #define C1 X7
  40. #define C2 X8
  41. #define D0 X9
  42. #define D1 X10
  43. #define D2 X11
  44. #define T0 X12
  45. #define T1 X13
  46. #define T2 X14
  47. #define T3 X15
  48. #define A3 T0
  49. #define B3 T1
  50. #define C3 T2
  51. #define D3 T3
  52. // Register and stack allocation for the AVX2 code
  53. #define rsStoreAVX2 (0*32)(BP)
  54. #define state1StoreAVX2 (1*32)(BP)
  55. #define state2StoreAVX2 (2*32)(BP)
  56. #define ctr0StoreAVX2 (3*32)(BP)
  57. #define ctr1StoreAVX2 (4*32)(BP)
  58. #define ctr2StoreAVX2 (5*32)(BP)
  59. #define ctr3StoreAVX2 (6*32)(BP)
  60. #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
  61. #define AA0 Y0
  62. #define AA1 Y5
  63. #define AA2 Y6
  64. #define AA3 Y7
  65. #define BB0 Y14
  66. #define BB1 Y9
  67. #define BB2 Y10
  68. #define BB3 Y11
  69. #define CC0 Y12
  70. #define CC1 Y13
  71. #define CC2 Y8
  72. #define CC3 Y15
  73. #define DD0 Y4
  74. #define DD1 Y1
  75. #define DD2 Y2
  76. #define DD3 Y3
  77. #define TT0 DD3
  78. #define TT1 AA3
  79. #define TT2 BB3
  80. #define TT3 CC3
  81. // ChaCha20 constants
  82. DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
  83. DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
  84. DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
  85. DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
  86. DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
  87. DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
  88. DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
  89. DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
  90. // <<< 16 with PSHUFB
  91. DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
  92. DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
  93. DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
  94. DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
  95. // <<< 8 with PSHUFB
  96. DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
  97. DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
  98. DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
  99. DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
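// The rol16/rol8 tables are PSHUFB shuffle masks: rotating each 32-bit lane left by 16 or 8 bits
// is a pure byte permutation, so a single shuffle replaces the shift/shift/xor sequence still
// needed for the 12- and 7-bit rotates.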
  100. DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
  101. DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
  102. DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
  103. DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
  104. DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
  105. DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
  106. DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
  107. DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
  108. // Poly1305 key clamp
  109. DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
  110. DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
  111. DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
  112. DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
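// ANDing a 32-byte Poly1305 key with polyClampMask clamps r (the first 16 bytes) as the AEAD
// construction requires and leaves s (the last 16 bytes) unchanged.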
  113. DATA ·sseIncMask<>+0x00(SB)/8, $0x1
  114. DATA ·sseIncMask<>+0x08(SB)/8, $0x0
  115. // To load/store the last < 16 bytes in a buffer
  116. DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
  117. DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
  118. DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
  119. DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
  120. DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
  121. DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
  122. DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
  123. DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
  124. DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
  125. DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
  126. DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
  127. DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
  128. DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
  129. DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
  130. DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
  131. DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
  132. DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
  133. DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
  134. DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
  135. DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
  136. DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
  137. DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
  138. DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
  139. DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
  140. DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
  141. DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
  142. DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
  143. DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
  144. DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
  145. DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
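// Entry n (counting from 0) of andMask keeps the lowest n+1 bytes of a 16-byte block; the
// partial-block code indexes the table with (length*16 - 16) to zero the bytes past the message.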
  146. GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
  147. GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
  148. GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
  149. GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
  150. GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
  151. GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
  152. GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
  153. GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
  154. // No PALIGNR in Go ASM yet (but VPALIGNR is present).
  155. #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
  156. #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
  157. #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
  158. #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
  159. #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
  160. #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
  161. #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
  162. #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
  163. #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
  164. #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
  165. #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
  166. #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
  167. #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
  168. #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
  169. #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
  170. #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
  171. #define shiftC0Right shiftC0Left
  172. #define shiftC1Right shiftC1Left
  173. #define shiftC2Right shiftC2Left
  174. #define shiftC3Right shiftC3Left
  175. #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
  176. #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
  177. #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
  178. #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
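// With source and destination equal, PALIGNR $n rotates the 128-bit register by n bytes. These
// shifts move the B, C and D rows by one, two and three words, switching the state between the
// column-round and diagonal-round layouts of ChaCha20.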
  179. // Some macros
  180. #define chachaQR(A, B, C, D, T) \
  181. PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
  182. PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
  183. PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
  184. PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
  185. #define chachaQR_AVX2(A, B, C, D, T) \
  186. VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
  187. VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
  188. VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
  189. VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
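// chachaQR/chachaQR_AVX2 perform one ChaCha quarter round: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12;
// a+=b; d^=a; d<<<=8; c+=d; b^=c; b<<<=7. The 16- and 8-bit rotates use the PSHUFB masks above.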
  190. #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
  191. #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
  192. #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
  193. #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
  194. #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
  195. #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
  196. #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
  197. #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
  198. #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
  199. #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
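// polyMul computes acc = (acc * r) mod 2^130 - 5, with r loaded from 0(BP)/8(BP) (rStore). The
// reduce stage relies on 2^130 ≡ 5: the product bits above bit 130, call them c, are folded back
// into the accumulator as 4*c + c.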
  200. // ----------------------------------------------------------------------------
  201. TEXT polyHashADInternal<>(SB), NOSPLIT, $0
  202. // adp points to beginning of additional data
  203. // itr2 holds ad length
  204. XORQ acc0, acc0
  205. XORQ acc1, acc1
  206. XORQ acc2, acc2
  207. CMPQ itr2, $13
  208. JNE hashADLoop
  209. openFastTLSAD:
  210. // Special treatment for the TLS case of 13 bytes
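// The 13-byte AAD is zero-padded to one full 16-byte Poly1305 block, so it can be read as two
// overlapping 8-byte loads; SHRQ drops the 3 duplicated bytes and acc2 = 1 is the usual 2^128 pad bit.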
  211. MOVQ (adp), acc0
  212. MOVQ 5(adp), acc1
  213. SHRQ $24, acc1
  214. MOVQ $1, acc2
  215. polyMul
  216. RET
  217. hashADLoop:
  218. // Hash in 16 byte chunks
  219. CMPQ itr2, $16
  220. JB hashADTail
  221. polyAdd(0(adp))
  222. LEAQ (1*16)(adp), adp
  223. SUBQ $16, itr2
  224. polyMul
  225. JMP hashADLoop
  226. hashADTail:
  227. CMPQ itr2, $0
  228. JE hashADDone
  229. // Hash last < 16 byte tail
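// Assemble the last block in t1:t0 by reading the tail backwards one byte at a time, shifting the
// bytes already read up by 8 bits; the unread high bytes stay zero (the AAD is zero-padded to 16 bytes).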
  230. XORQ t0, t0
  231. XORQ t1, t1
  232. XORQ t2, t2
  233. ADDQ itr2, adp
  234. hashADTailLoop:
  235. SHLQ $8, t1:t0
  236. SHLQ $8, t0
  237. MOVB -1(adp), t2
  238. XORQ t2, t0
  239. DECQ adp
  240. DECQ itr2
  241. JNE hashADTailLoop
  242. hashADTailFinish:
  243. ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  244. polyMul
  245. // Finished AD
  246. hashADDone:
  247. RET
  248. // ----------------------------------------------------------------------------
  249. // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
  250. TEXT ·chacha20Poly1305Open(SB), 0, $288-97
  251. // For aligned stack access
  252. MOVQ SP, BP
  253. ADDQ $32, BP
  254. ANDQ $-32, BP
  255. MOVQ dst+0(FP), oup
  256. MOVQ key+24(FP), keyp
  257. MOVQ src+48(FP), inp
  258. MOVQ src_len+56(FP), inl
  259. MOVQ ad+72(FP), adp
  260. // Check for AVX2 support
  261. CMPB runtime·support_avx2(SB), $0
  262. JE noavx2bmi2Open
  263. // Check BMI2 bit for MULXQ.
  264. // runtime·cpuid_ebx7 is always available here
  265. // because the AVX2 check passed
  266. TESTL $(1<<8), runtime·cpuid_ebx7(SB)
  267. JNE chacha20Poly1305Open_AVX2
  268. noavx2bmi2Open:
  269. // Special optimization, for very short buffers
  270. CMPQ inl, $128
  271. JBE openSSE128 // About 16% faster
  272. // For long buffers, prepare the poly key first
  273. MOVOU ·chacha20Constants<>(SB), A0
  274. MOVOU (1*16)(keyp), B0
  275. MOVOU (2*16)(keyp), C0
  276. MOVOU (3*16)(keyp), D0
  277. MOVO D0, T1
  278. // Store state on stack for future use
  279. MOVO B0, state1Store
  280. MOVO C0, state2Store
  281. MOVO D0, ctr3Store
  282. MOVQ $10, itr2
  283. openSSEPreparePolyKey:
  284. chachaQR(A0, B0, C0, D0, T0)
  285. shiftB0Left; shiftC0Left; shiftD0Left
  286. chachaQR(A0, B0, C0, D0, T0)
  287. shiftB0Right; shiftC0Right; shiftD0Right
  288. DECQ itr2
  289. JNE openSSEPreparePolyKey
  290. // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  291. PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
  292. // Clamp and store the key
  293. PAND ·polyClampMask<>(SB), A0
  294. MOVO A0, rStore; MOVO B0, sStore
  295. // Hash AAD
  296. MOVQ ad_len+80(FP), itr2
  297. CALL polyHashADInternal<>(SB)
  298. openSSEMainLoop:
  299. CMPQ inl, $256
  300. JB openSSEMainLoopDone
  301. // Load state, increment counter blocks
  302. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  303. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  304. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  305. MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  306. // Store counters
  307. MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  308. // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
  309. MOVQ $4, itr1
  310. MOVQ inp, itr2
  311. openSSEInternalLoop:
  312. MOVO C3, tmpStore
  313. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  314. MOVO tmpStore, C3
  315. MOVO C1, tmpStore
  316. chachaQR(A3, B3, C3, D3, C1)
  317. MOVO tmpStore, C1
  318. polyAdd(0(itr2))
  319. shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  320. shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  321. shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  322. polyMulStage1
  323. polyMulStage2
  324. LEAQ (2*8)(itr2), itr2
  325. MOVO C3, tmpStore
  326. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  327. MOVO tmpStore, C3
  328. MOVO C1, tmpStore
  329. polyMulStage3
  330. chachaQR(A3, B3, C3, D3, C1)
  331. MOVO tmpStore, C1
  332. polyMulReduceStage
  333. shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  334. shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  335. shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  336. DECQ itr1
  337. JGE openSSEInternalLoop
  338. polyAdd(0(itr2))
  339. polyMul
  340. LEAQ (2*8)(itr2), itr2
  341. CMPQ itr1, $-6
  342. JG openSSEInternalLoop
  343. // Add in the state
  344. PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  345. PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  346. PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  347. PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  348. // Load - xor - store
  349. MOVO D3, tmpStore
  350. MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
  351. MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
  352. MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
  353. MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
  354. MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
  355. MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
  356. MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
  357. MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
  358. MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
  359. MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
  360. MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
  361. MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
  362. MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
  363. MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
  364. MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
  365. MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
  366. LEAQ 256(inp), inp
  367. LEAQ 256(oup), oup
  368. SUBQ $256, inl
  369. JMP openSSEMainLoop
  370. openSSEMainLoopDone:
  371. // Handle the various tail sizes efficiently
  372. TESTQ inl, inl
  373. JE openSSEFinalize
  374. CMPQ inl, $64
  375. JBE openSSETail64
  376. CMPQ inl, $128
  377. JBE openSSETail128
  378. CMPQ inl, $192
  379. JBE openSSETail192
  380. JMP openSSETail256
  381. openSSEFinalize:
  382. // Hash in the PT, AAD lengths
  383. ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
  384. polyMul
  385. // Final reduce
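// Compute acc - p for p = 2^130 - 5; if the subtraction borrows (acc < p), the CMOVs below restore
// the original value, giving the fully reduced tag.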
  386. MOVQ acc0, t0
  387. MOVQ acc1, t1
  388. MOVQ acc2, t2
  389. SUBQ $-5, acc0
  390. SBBQ $-1, acc1
  391. SBBQ $3, acc2
  392. CMOVQCS t0, acc0
  393. CMOVQCS t1, acc1
  394. CMOVQCS t2, acc2
  395. // Add in the "s" part of the key
  396. ADDQ 0+sStore, acc0
  397. ADCQ 8+sStore, acc1
  398. // Finally, constant time compare to the tag at the end of the message
  399. XORQ AX, AX
  400. MOVQ $1, DX
  401. XORQ (0*8)(inp), acc0
  402. XORQ (1*8)(inp), acc1
  403. ORQ acc1, acc0
  404. CMOVQEQ DX, AX
  405. // Return true iff tags are equal
  406. MOVB AX, ret+96(FP)
  407. RET
  408. // ----------------------------------------------------------------------------
  409. // Special optimization for buffers smaller than 129 bytes
  410. openSSE128:
  411. // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
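// Block 0 supplies the Poly1305 key (its first 32 bytes of key stream); blocks 1 and 2 supply key
// stream for up to 128 bytes of ciphertext.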
  412. MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
  413. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  414. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  415. MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
  416. MOVQ $10, itr2
  417. openSSE128InnerCipherLoop:
  418. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  419. shiftB0Left; shiftB1Left; shiftB2Left
  420. shiftC0Left; shiftC1Left; shiftC2Left
  421. shiftD0Left; shiftD1Left; shiftD2Left
  422. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  423. shiftB0Right; shiftB1Right; shiftB2Right
  424. shiftC0Right; shiftC1Right; shiftC2Right
  425. shiftD0Right; shiftD1Right; shiftD2Right
  426. DECQ itr2
  427. JNE openSSE128InnerCipherLoop
  428. // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  429. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  430. PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
  431. PADDL T2, C1; PADDL T2, C2
  432. PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
  433. // Clamp and store the key
  434. PAND ·polyClampMask<>(SB), A0
  435. MOVOU A0, rStore; MOVOU B0, sStore
  436. // Hash
  437. MOVQ ad_len+80(FP), itr2
  438. CALL polyHashADInternal<>(SB)
  439. openSSE128Open:
  440. CMPQ inl, $16
  441. JB openSSETail16
  442. SUBQ $16, inl
  443. // Load for hashing
  444. polyAdd(0(inp))
  445. // Load for decryption
  446. MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
  447. LEAQ (1*16)(inp), inp
  448. LEAQ (1*16)(oup), oup
  449. polyMul
  450. // Shift the stream "left"
  451. MOVO B1, A1
  452. MOVO C1, B1
  453. MOVO D1, C1
  454. MOVO A2, D1
  455. MOVO B2, A2
  456. MOVO C2, B2
  457. MOVO D2, C2
  458. JMP openSSE128Open
  459. openSSETail16:
  460. TESTQ inl, inl
  461. JE openSSEFinalize
  462. // We can safely load the CT from the end, because it is padded with the MAC
  463. MOVQ inl, itr2
  464. SHLQ $4, itr2
  465. LEAQ ·andMask<>(SB), t0
  466. MOVOU (inp), T0
  467. ADDQ inl, inp
  468. PAND -16(t0)(itr2*1), T0
  469. MOVO T0, 0+tmpStore
  470. MOVQ T0, t0
  471. MOVQ 8+tmpStore, t1
  472. PXOR A1, T0
  473. // We can only store one byte at a time, since plaintext can be shorter than 16 bytes
  474. openSSETail16Store:
  475. MOVQ T0, t3
  476. MOVB t3, (oup)
  477. PSRLDQ $1, T0
  478. INCQ oup
  479. DECQ inl
  480. JNE openSSETail16Store
  481. ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  482. polyMul
  483. JMP openSSEFinalize
  484. // ----------------------------------------------------------------------------
  485. // Special optimization for the last 64 bytes of ciphertext
  486. openSSETail64:
  487. // Need to decrypt up to 64 bytes - prepare single block
  488. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  489. XORQ itr2, itr2
  490. MOVQ inl, itr1
  491. CMPQ itr1, $16
  492. JB openSSETail64LoopB
  493. openSSETail64LoopA:
  494. // Perform ChaCha rounds, while hashing the remaining input
  495. polyAdd(0(inp)(itr2*1))
  496. polyMul
  497. SUBQ $16, itr1
  498. openSSETail64LoopB:
  499. ADDQ $16, itr2
  500. chachaQR(A0, B0, C0, D0, T0)
  501. shiftB0Left; shiftC0Left; shiftD0Left
  502. chachaQR(A0, B0, C0, D0, T0)
  503. shiftB0Right; shiftC0Right; shiftD0Right
  504. CMPQ itr1, $16
  505. JAE openSSETail64LoopA
  506. CMPQ itr2, $160
  507. JNE openSSETail64LoopB
  508. PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
  509. openSSETail64DecLoop:
  510. CMPQ inl, $16
  511. JB openSSETail64DecLoopDone
  512. SUBQ $16, inl
  513. MOVOU (inp), T0
  514. PXOR T0, A0
  515. MOVOU A0, (oup)
  516. LEAQ 16(inp), inp
  517. LEAQ 16(oup), oup
  518. MOVO B0, A0
  519. MOVO C0, B0
  520. MOVO D0, C0
  521. JMP openSSETail64DecLoop
  522. openSSETail64DecLoopDone:
  523. MOVO A0, A1
  524. JMP openSSETail16
  525. // ----------------------------------------------------------------------------
  526. // Special optimization for the last 128 bytes of ciphertext
  527. openSSETail128:
  528. // Need to decrypt up to 128 bytes - prepare two blocks
  529. MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
  530. MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
  531. XORQ itr2, itr2
  532. MOVQ inl, itr1
  533. ANDQ $-16, itr1
  534. openSSETail128LoopA:
  535. // Perform ChaCha rounds, while hashing the remaining input
  536. polyAdd(0(inp)(itr2*1))
  537. polyMul
  538. openSSETail128LoopB:
  539. ADDQ $16, itr2
  540. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  541. shiftB0Left; shiftC0Left; shiftD0Left
  542. shiftB1Left; shiftC1Left; shiftD1Left
  543. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  544. shiftB0Right; shiftC0Right; shiftD0Right
  545. shiftB1Right; shiftC1Right; shiftD1Right
  546. CMPQ itr2, itr1
  547. JB openSSETail128LoopA
  548. CMPQ itr2, $160
  549. JNE openSSETail128LoopB
  550. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
  551. PADDL state1Store, B0; PADDL state1Store, B1
  552. PADDL state2Store, C0; PADDL state2Store, C1
  553. PADDL ctr1Store, D0; PADDL ctr0Store, D1
  554. MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  555. PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  556. MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
  557. SUBQ $64, inl
  558. LEAQ 64(inp), inp
  559. LEAQ 64(oup), oup
  560. JMP openSSETail64DecLoop
  561. // ----------------------------------------------------------------------------
  562. // Special optimization for the last 192 bytes of ciphertext
  563. openSSETail192:
  564. // Need to decrypt up to 192 bytes - prepare three blocks
  565. MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
  566. MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  567. MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
  568. MOVQ inl, itr1
  569. MOVQ $160, itr2
  570. CMPQ itr1, $160
  571. CMOVQGT itr2, itr1
  572. ANDQ $-16, itr1
  573. XORQ itr2, itr2
  574. openSSLTail192LoopA:
  575. // Perform ChaCha rounds, while hashing the remaining input
  576. polyAdd(0(inp)(itr2*1))
  577. polyMul
  578. openSSLTail192LoopB:
  579. ADDQ $16, itr2
  580. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  581. shiftB0Left; shiftC0Left; shiftD0Left
  582. shiftB1Left; shiftC1Left; shiftD1Left
  583. shiftB2Left; shiftC2Left; shiftD2Left
  584. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  585. shiftB0Right; shiftC0Right; shiftD0Right
  586. shiftB1Right; shiftC1Right; shiftD1Right
  587. shiftB2Right; shiftC2Right; shiftD2Right
  588. CMPQ itr2, itr1
  589. JB openSSLTail192LoopA
  590. CMPQ itr2, $160
  591. JNE openSSLTail192LoopB
  592. CMPQ inl, $176
  593. JB openSSLTail192Store
  594. polyAdd(160(inp))
  595. polyMul
  596. CMPQ inl, $192
  597. JB openSSLTail192Store
  598. polyAdd(176(inp))
  599. polyMul
  600. openSSLTail192Store:
  601. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  602. PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
  603. PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
  604. PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
  605. MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  606. PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
  607. MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
  608. MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
  609. PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  610. MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  611. SUBQ $128, inl
  612. LEAQ 128(inp), inp
  613. LEAQ 128(oup), oup
  614. JMP openSSETail64DecLoop
  615. // ----------------------------------------------------------------------------
  616. // Special optimization for the last 256 bytes of ciphertext
  617. openSSETail256:
  618. // Need to decrypt up to 256 bytes - prepare four blocks
  619. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  620. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  621. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  622. MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  623. // Store counters
  624. MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  625. XORQ itr2, itr2
  626. openSSETail256Loop:
  627. // This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
  628. polyAdd(0(inp)(itr2*1))
  629. MOVO C3, tmpStore
  630. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  631. MOVO tmpStore, C3
  632. MOVO C1, tmpStore
  633. chachaQR(A3, B3, C3, D3, C1)
  634. MOVO tmpStore, C1
  635. shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  636. shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  637. shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  638. polyMulStage1
  639. polyMulStage2
  640. MOVO C3, tmpStore
  641. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  642. MOVO tmpStore, C3
  643. MOVO C1, tmpStore
  644. chachaQR(A3, B3, C3, D3, C1)
  645. MOVO tmpStore, C1
  646. polyMulStage3
  647. polyMulReduceStage
  648. shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  649. shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  650. shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  651. ADDQ $2*8, itr2
  652. CMPQ itr2, $160
  653. JB openSSETail256Loop
  654. MOVQ inl, itr1
  655. ANDQ $-16, itr1
  656. openSSETail256HashLoop:
  657. polyAdd(0(inp)(itr2*1))
  658. polyMul
  659. ADDQ $2*8, itr2
  660. CMPQ itr2, itr1
  661. JB openSSETail256HashLoop
  662. // Add in the state
  663. PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  664. PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  665. PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  666. PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  667. MOVO D3, tmpStore
  668. // Load - xor - store
  669. MOVOU (0*16)(inp), D3; PXOR D3, A0
  670. MOVOU (1*16)(inp), D3; PXOR D3, B0
  671. MOVOU (2*16)(inp), D3; PXOR D3, C0
  672. MOVOU (3*16)(inp), D3; PXOR D3, D0
  673. MOVOU A0, (0*16)(oup)
  674. MOVOU B0, (1*16)(oup)
  675. MOVOU C0, (2*16)(oup)
  676. MOVOU D0, (3*16)(oup)
  677. MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  678. PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  679. MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  680. MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
  681. PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  682. MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
  683. LEAQ 192(inp), inp
  684. LEAQ 192(oup), oup
  685. SUBQ $192, inl
  686. MOVO A3, A0
  687. MOVO B3, B0
  688. MOVO C3, C0
  689. MOVO tmpStore, D0
  690. JMP openSSETail64DecLoop
  691. // ----------------------------------------------------------------------------
  692. // ------------------------- AVX2 Code ----------------------------------------
  693. chacha20Poly1305Open_AVX2:
  694. VZEROUPPER
  695. VMOVDQU ·chacha20Constants<>(SB), AA0
  696. BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
  697. BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
  698. BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
  699. VPADDD ·avx2InitMask<>(SB), DD0, DD0
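// Each YMM register holds one state row for two consecutive blocks: the BYTE sequences are
// VBROADCASTI128 loads that copy the key/nonce rows into both 128-bit lanes, and avx2InitMask
// bumps the block counter in the upper lane by one.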
  700. // Special optimization, for very short buffers
  701. CMPQ inl, $192
  702. JBE openAVX2192
  703. CMPQ inl, $320
  704. JBE openAVX2320
  705. // For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
  706. VMOVDQA BB0, state1StoreAVX2
  707. VMOVDQA CC0, state2StoreAVX2
  708. VMOVDQA DD0, ctr3StoreAVX2
  709. MOVQ $10, itr2
  710. openAVX2PreparePolyKey:
  711. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  712. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  713. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  714. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  715. DECQ itr2
  716. JNE openAVX2PreparePolyKey
  717. VPADDD ·chacha20Constants<>(SB), AA0, AA0
  718. VPADDD state1StoreAVX2, BB0, BB0
  719. VPADDD state2StoreAVX2, CC0, CC0
  720. VPADDD ctr3StoreAVX2, DD0, DD0
  721. VPERM2I128 $0x02, AA0, BB0, TT0
  722. // Clamp and store poly key
  723. VPAND ·polyClampMask<>(SB), TT0, TT0
  724. VMOVDQA TT0, rsStoreAVX2
  725. // Stream for the first 64 bytes
  726. VPERM2I128 $0x13, AA0, BB0, AA0
  727. VPERM2I128 $0x13, CC0, DD0, BB0
  728. // Hash AD + first 64 bytes
  729. MOVQ ad_len+80(FP), itr2
  730. CALL polyHashADInternal<>(SB)
  731. XORQ itr1, itr1
  732. openAVX2InitialHash64:
  733. polyAdd(0(inp)(itr1*1))
  734. polyMulAVX2
  735. ADDQ $16, itr1
  736. CMPQ itr1, $64
  737. JNE openAVX2InitialHash64
  738. // Decrypt the first 64 bytes
  739. VPXOR (0*32)(inp), AA0, AA0
  740. VPXOR (1*32)(inp), BB0, BB0
  741. VMOVDQU AA0, (0*32)(oup)
  742. VMOVDQU BB0, (1*32)(oup)
  743. LEAQ (2*32)(inp), inp
  744. LEAQ (2*32)(oup), oup
  745. SUBQ $64, inl
  746. openAVX2MainLoop:
  747. CMPQ inl, $512
  748. JB openAVX2MainLoopDone
  749. // Load state, increment counter blocks, store the incremented counters
  750. VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  751. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  752. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  753. VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  754. VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  755. XORQ itr1, itr1
  756. openAVX2InternalLoop:
  757. // Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
  758. // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
  759. polyAdd(0*8(inp)(itr1*1))
  760. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  761. polyMulStage1_AVX2
  762. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  763. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  764. polyMulStage2_AVX2
  765. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  766. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  767. polyMulStage3_AVX2
  768. VMOVDQA CC3, tmpStoreAVX2
  769. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  770. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  771. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  772. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  773. VMOVDQA tmpStoreAVX2, CC3
  774. polyMulReduceStage
  775. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  776. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  777. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  778. polyAdd(2*8(inp)(itr1*1))
  779. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  780. polyMulStage1_AVX2
  781. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  782. VMOVDQA CC3, tmpStoreAVX2
  783. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  784. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  785. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  786. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  787. VMOVDQA tmpStoreAVX2, CC3
  788. polyMulStage2_AVX2
  789. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  790. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  791. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  792. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  793. polyMulStage3_AVX2
  794. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  795. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  796. polyMulReduceStage
  797. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  798. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  799. polyAdd(4*8(inp)(itr1*1))
  800. LEAQ (6*8)(itr1), itr1
  801. VMOVDQA CC3, tmpStoreAVX2
  802. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  803. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  804. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  805. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  806. VMOVDQA tmpStoreAVX2, CC3
  807. polyMulStage1_AVX2
  808. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  809. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  810. polyMulStage2_AVX2
  811. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  812. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  813. polyMulStage3_AVX2
  814. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  815. VMOVDQA CC3, tmpStoreAVX2
  816. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  817. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  818. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  819. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  820. VMOVDQA tmpStoreAVX2, CC3
  821. polyMulReduceStage
  822. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  823. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  824. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  825. CMPQ itr1, $480
  826. JNE openAVX2InternalLoop
  827. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  828. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  829. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  830. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  831. VMOVDQA CC3, tmpStoreAVX2
  832. // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  833. polyAdd(480(inp))
  834. polyMulAVX2
  835. VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  836. VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  837. VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  838. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  839. VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  840. VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  841. // and here
  842. polyAdd(496(inp))
  843. polyMulAVX2
  844. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  845. VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  846. VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  847. VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  848. VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
  849. VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
  850. LEAQ (32*16)(inp), inp
  851. LEAQ (32*16)(oup), oup
  852. SUBQ $(32*16), inl
  853. JMP openAVX2MainLoop
  854. openAVX2MainLoopDone:
  855. // Handle the various tail sizes efficiently
  856. TESTQ inl, inl
  857. JE openSSEFinalize
  858. CMPQ inl, $128
  859. JBE openAVX2Tail128
  860. CMPQ inl, $256
  861. JBE openAVX2Tail256
  862. CMPQ inl, $384
  863. JBE openAVX2Tail384
  864. JMP openAVX2Tail512
  865. // ----------------------------------------------------------------------------
  866. // Special optimization for buffers smaller than 193 bytes
  867. openAVX2192:
  868. // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
  869. VMOVDQA AA0, AA1
  870. VMOVDQA BB0, BB1
  871. VMOVDQA CC0, CC1
  872. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  873. VMOVDQA AA0, AA2
  874. VMOVDQA BB0, BB2
  875. VMOVDQA CC0, CC2
  876. VMOVDQA DD0, DD2
  877. VMOVDQA DD1, TT3
  878. MOVQ $10, itr2
  879. openAVX2192InnerCipherLoop:
  880. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  881. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  882. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  883. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  884. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  885. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  886. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  887. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  888. DECQ itr2
  889. JNE openAVX2192InnerCipherLoop
  890. VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  891. VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  892. VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  893. VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  894. VPERM2I128 $0x02, AA0, BB0, TT0
  895. // Clamp and store poly key
  896. VPAND ·polyClampMask<>(SB), TT0, TT0
  897. VMOVDQA TT0, rsStoreAVX2
  898. // Stream for up to 192 bytes
  899. VPERM2I128 $0x13, AA0, BB0, AA0
  900. VPERM2I128 $0x13, CC0, DD0, BB0
  901. VPERM2I128 $0x02, AA1, BB1, CC0
  902. VPERM2I128 $0x02, CC1, DD1, DD0
  903. VPERM2I128 $0x13, AA1, BB1, AA1
  904. VPERM2I128 $0x13, CC1, DD1, BB1
  905. openAVX2ShortOpen:
  906. // Hash
  907. MOVQ ad_len+80(FP), itr2
  908. CALL polyHashADInternal<>(SB)
  909. openAVX2ShortOpenLoop:
  910. CMPQ inl, $32
  911. JB openAVX2ShortTail32
  912. SUBQ $32, inl
  913. // Load for hashing
  914. polyAdd(0*8(inp))
  915. polyMulAVX2
  916. polyAdd(2*8(inp))
  917. polyMulAVX2
  918. // Load for decryption
  919. VPXOR (inp), AA0, AA0
  920. VMOVDQU AA0, (oup)
  921. LEAQ (1*32)(inp), inp
  922. LEAQ (1*32)(oup), oup
  923. // Shift stream left
  924. VMOVDQA BB0, AA0
  925. VMOVDQA CC0, BB0
  926. VMOVDQA DD0, CC0
  927. VMOVDQA AA1, DD0
  928. VMOVDQA BB1, AA1
  929. VMOVDQA CC1, BB1
  930. VMOVDQA DD1, CC1
  931. VMOVDQA AA2, DD1
  932. VMOVDQA BB2, AA2
  933. JMP openAVX2ShortOpenLoop
  934. openAVX2ShortTail32:
  935. CMPQ inl, $16
  936. VMOVDQA A0, A1
  937. JB openAVX2ShortDone
  938. SUBQ $16, inl
  939. // Load for hashing
  940. polyAdd(0*8(inp))
  941. polyMulAVX2
  942. // Load for decryption
  943. VPXOR (inp), A0, T0
  944. VMOVDQU T0, (oup)
  945. LEAQ (1*16)(inp), inp
  946. LEAQ (1*16)(oup), oup
  947. VPERM2I128 $0x11, AA0, AA0, AA0
  948. VMOVDQA A0, A1
  949. openAVX2ShortDone:
  950. VZEROUPPER
  951. JMP openSSETail16
  952. // ----------------------------------------------------------------------------
  953. // Special optimization for buffers smaller than 321 bytes
  954. openAVX2320:
  955. // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  956. VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  957. VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  958. VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  959. MOVQ $10, itr2
  960. openAVX2320InnerCipherLoop:
  961. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  962. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  963. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  964. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  965. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  966. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  967. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  968. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  969. DECQ itr2
  970. JNE openAVX2320InnerCipherLoop
  971. VMOVDQA ·chacha20Constants<>(SB), TT0
  972. VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  973. VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  974. VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  975. VMOVDQA ·avx2IncMask<>(SB), TT0
  976. VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  977. VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  978. VPADDD TT3, DD2, DD2
  979. // Clamp and store poly key
  980. VPERM2I128 $0x02, AA0, BB0, TT0
  981. VPAND ·polyClampMask<>(SB), TT0, TT0
  982. VMOVDQA TT0, rsStoreAVX2
  983. // Stream for up to 320 bytes
  984. VPERM2I128 $0x13, AA0, BB0, AA0
  985. VPERM2I128 $0x13, CC0, DD0, BB0
  986. VPERM2I128 $0x02, AA1, BB1, CC0
  987. VPERM2I128 $0x02, CC1, DD1, DD0
  988. VPERM2I128 $0x13, AA1, BB1, AA1
  989. VPERM2I128 $0x13, CC1, DD1, BB1
  990. VPERM2I128 $0x02, AA2, BB2, CC1
  991. VPERM2I128 $0x02, CC2, DD2, DD1
  992. VPERM2I128 $0x13, AA2, BB2, AA2
  993. VPERM2I128 $0x13, CC2, DD2, BB2
  994. JMP openAVX2ShortOpen
  995. // ----------------------------------------------------------------------------
  996. // Special optimization for the last 128 bytes of ciphertext
  997. openAVX2Tail128:
  998. // Need to decrypt up to 128 bytes - prepare two blocks
  999. VMOVDQA ·chacha20Constants<>(SB), AA1
  1000. VMOVDQA state1StoreAVX2, BB1
  1001. VMOVDQA state2StoreAVX2, CC1
  1002. VMOVDQA ctr3StoreAVX2, DD1
  1003. VPADDD ·avx2IncMask<>(SB), DD1, DD1
  1004. VMOVDQA DD1, DD0
  1005. XORQ itr2, itr2
  1006. MOVQ inl, itr1
  1007. ANDQ $-16, itr1
  1008. TESTQ itr1, itr1
  1009. JE openAVX2Tail128LoopB
  1010. openAVX2Tail128LoopA:
  1011. // Perform ChaCha rounds, while hashing the remaining input
  1012. polyAdd(0(inp)(itr2*1))
  1013. polyMulAVX2
  1014. openAVX2Tail128LoopB:
  1015. ADDQ $16, itr2
  1016. chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1017. VPALIGNR $4, BB1, BB1, BB1
  1018. VPALIGNR $8, CC1, CC1, CC1
  1019. VPALIGNR $12, DD1, DD1, DD1
  1020. chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1021. VPALIGNR $12, BB1, BB1, BB1
  1022. VPALIGNR $8, CC1, CC1, CC1
  1023. VPALIGNR $4, DD1, DD1, DD1
  1024. CMPQ itr2, itr1
  1025. JB openAVX2Tail128LoopA
  1026. CMPQ itr2, $160
  1027. JNE openAVX2Tail128LoopB
  1028. VPADDD ·chacha20Constants<>(SB), AA1, AA1
  1029. VPADDD state1StoreAVX2, BB1, BB1
  1030. VPADDD state2StoreAVX2, CC1, CC1
  1031. VPADDD DD0, DD1, DD1
  1032. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1033. openAVX2TailLoop:
  1034. CMPQ inl, $32
  1035. JB openAVX2Tail
  1036. SUBQ $32, inl
  1037. // Load for decryption
  1038. VPXOR (inp), AA0, AA0
  1039. VMOVDQU AA0, (oup)
  1040. LEAQ (1*32)(inp), inp
  1041. LEAQ (1*32)(oup), oup
  1042. VMOVDQA BB0, AA0
  1043. VMOVDQA CC0, BB0
  1044. VMOVDQA DD0, CC0
  1045. JMP openAVX2TailLoop
  1046. openAVX2Tail:
  1047. CMPQ inl, $16
  1048. VMOVDQA A0, A1
  1049. JB openAVX2TailDone
  1050. SUBQ $16, inl
  1051. // Load for decryption
  1052. VPXOR (inp), A0, T0
  1053. VMOVDQU T0, (oup)
  1054. LEAQ (1*16)(inp), inp
  1055. LEAQ (1*16)(oup), oup
  1056. VPERM2I128 $0x11, AA0, AA0, AA0
  1057. VMOVDQA A0, A1
  1058. openAVX2TailDone:
  1059. VZEROUPPER
  1060. JMP openSSETail16
  1061. // ----------------------------------------------------------------------------
  1062. // Special optimization for the last 256 bytes of ciphertext
  1063. openAVX2Tail256:
  1064. // Need to decrypt up to 256 bytes - prepare four blocks
  1065. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
  1066. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
  1067. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
  1068. VMOVDQA ctr3StoreAVX2, DD0
  1069. VPADDD ·avx2IncMask<>(SB), DD0, DD0
  1070. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1071. VMOVDQA DD0, TT1
  1072. VMOVDQA DD1, TT2
  1073. // Compute the number of iterations that will hash data
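// itr1 = min((inl-128)/16, 10): one 16-byte block is hashed per double round for the first itr1 of
// the 10 double rounds; whatever data is left is hashed afterwards in openAVX2Tail256Hash.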
  1074. MOVQ inl, tmpStoreAVX2
  1075. MOVQ inl, itr1
  1076. SUBQ $128, itr1
  1077. SHRQ $4, itr1
  1078. MOVQ $10, itr2
  1079. CMPQ itr1, $10
  1080. CMOVQGT itr2, itr1
  1081. MOVQ inp, inl
  1082. XORQ itr2, itr2
  1083. openAVX2Tail256LoopA:
  1084. polyAdd(0(inl))
  1085. polyMulAVX2
  1086. LEAQ 16(inl), inl
  1087. // Perform ChaCha rounds, while hashing the remaining input
  1088. openAVX2Tail256LoopB:
  1089. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1090. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  1091. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1092. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  1093. INCQ itr2
  1094. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1095. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  1096. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1097. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1098. CMPQ itr2, itr1
  1099. JB openAVX2Tail256LoopA
  1100. CMPQ itr2, $10
  1101. JNE openAVX2Tail256LoopB
  1102. MOVQ inl, itr2
  1103. SUBQ inp, inl
  1104. MOVQ inl, itr1
  1105. MOVQ tmpStoreAVX2, inl
  1106. // Hash the remainder of data (if any)
  1107. openAVX2Tail256Hash:
  1108. ADDQ $16, itr1
  1109. CMPQ itr1, inl
  1110. JGT openAVX2Tail256HashEnd
  1111. polyAdd (0(itr2))
  1112. polyMulAVX2
  1113. LEAQ 16(itr2), itr2
  1114. JMP openAVX2Tail256Hash
  1115. // Store 128 bytes safely, then go to store loop
  1116. openAVX2Tail256HashEnd:
  1117. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  1118. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  1119. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  1120. VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  1121. VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
  1122. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1123. VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
  1124. VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
  1125. LEAQ (4*32)(inp), inp
  1126. LEAQ (4*32)(oup), oup
  1127. SUBQ $4*32, inl
  1128. JMP openAVX2TailLoop
  1129. // ----------------------------------------------------------------------------
  1130. // Special optimization for the last 384 bytes of ciphertext
  1131. openAVX2Tail384:
  1132. // Need to decrypt up to 384 bytes - prepare six blocks
  1133. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  1134. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  1135. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  1136. VMOVDQA ctr3StoreAVX2, DD0
  1137. VPADDD ·avx2IncMask<>(SB), DD0, DD0
  1138. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1139. VPADDD ·avx2IncMask<>(SB), DD1, DD2
  1140. VMOVDQA DD0, ctr0StoreAVX2
  1141. VMOVDQA DD1, ctr1StoreAVX2
  1142. VMOVDQA DD2, ctr2StoreAVX2
  1143. // Compute the number of iterations that will hash two blocks of data
  1144. MOVQ inl, tmpStoreAVX2
  1145. MOVQ inl, itr1
  1146. SUBQ $256, itr1
  1147. SHRQ $4, itr1
  1148. ADDQ $6, itr1
  1149. MOVQ $10, itr2
  1150. CMPQ itr1, $10
  1151. CMOVQGT itr2, itr1
  1152. MOVQ inp, inl
  1153. XORQ itr2, itr2
  1154. // Perform ChaCha rounds, while hashing the remaining input
  1155. openAVX2Tail384LoopB:
  1156. polyAdd(0(inl))
  1157. polyMulAVX2
  1158. LEAQ 16(inl), inl
  1159. openAVX2Tail384LoopA:
  1160. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1161. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1162. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1163. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1164. polyAdd(0(inl))
  1165. polyMulAVX2
  1166. LEAQ 16(inl), inl
  1167. INCQ itr2
  1168. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1169. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1170. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1171. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1172. CMPQ itr2, itr1
  1173. JB openAVX2Tail384LoopB
  1174. CMPQ itr2, $10
  1175. JNE openAVX2Tail384LoopA
  1176. MOVQ inl, itr2
  1177. SUBQ inp, inl
  1178. MOVQ inl, itr1
  1179. MOVQ tmpStoreAVX2, inl
  1180. openAVX2Tail384Hash:
  1181. ADDQ $16, itr1
  1182. CMPQ itr1, inl
  1183. JGT openAVX2Tail384HashEnd
  1184. polyAdd(0(itr2))
  1185. polyMulAVX2
  1186. LEAQ 16(itr2), itr2
  1187. JMP openAVX2Tail384Hash
  1188. // Store 256 bytes safely, then go to store loop
  1189. openAVX2Tail384HashEnd:
  1190. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  1191. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  1192. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  1193. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
  1194. VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
  1195. VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  1196. VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  1197. VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
  1198. VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  1199. VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  1200. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1201. LEAQ (8*32)(inp), inp
  1202. LEAQ (8*32)(oup), oup
  1203. SUBQ $8*32, inl
  1204. JMP openAVX2TailLoop
  1205. // ----------------------------------------------------------------------------
  1206. // Special optimization for the last 512 bytes of ciphertext
  1207. openAVX2Tail512:
  1208. VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1209. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1210. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1211. VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1212. VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1213. XORQ itr1, itr1
  1214. MOVQ inp, itr2
  1215. openAVX2Tail512LoopB:
  1216. polyAdd(0(itr2))
  1217. polyMulAVX2
  1218. LEAQ (2*8)(itr2), itr2
  1219. openAVX2Tail512LoopA:
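// In this fully inlined double-round the rotations by 16 and 8 are whole-byte
// rotates done with VPSHUFB against the ·rol16<>/·rol8<> shuffle masks, while
// the rotations by 12 and 7 have no byte-aligned form and are built from a
// VPSLLD/VPSRLD pair whose results are XORed together; CC3 is spilled to
// tmpStoreAVX2 so it can act as the scratch register for those shifts.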
  1220. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1221. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1222. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1223. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1224. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1225. VMOVDQA CC3, tmpStoreAVX2
  1226. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1227. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1228. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1229. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1230. VMOVDQA tmpStoreAVX2, CC3
  1231. polyAdd(0*8(itr2))
  1232. polyMulAVX2
  1233. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1234. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1235. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1236. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1237. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1238. VMOVDQA CC3, tmpStoreAVX2
  1239. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1240. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1241. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1242. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1243. VMOVDQA tmpStoreAVX2, CC3
  1244. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  1245. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1246. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  1247. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1248. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1249. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1250. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1251. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1252. polyAdd(2*8(itr2))
  1253. polyMulAVX2
  1254. LEAQ (4*8)(itr2), itr2
  1255. VMOVDQA CC3, tmpStoreAVX2
  1256. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1257. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1258. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1259. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1260. VMOVDQA tmpStoreAVX2, CC3
  1261. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1262. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1263. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1264. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1265. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1266. VMOVDQA CC3, tmpStoreAVX2
  1267. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1268. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1269. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1270. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1271. VMOVDQA tmpStoreAVX2, CC3
  1272. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  1273. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1274. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  1275. INCQ itr1
  1276. CMPQ itr1, $4
  1277. JLT openAVX2Tail512LoopB
  1278. CMPQ itr1, $10
  1279. JNE openAVX2Tail512LoopA
  1280. MOVQ inl, itr1
  1281. SUBQ $384, itr1
  1282. ANDQ $-16, itr1
  1283. openAVX2Tail512HashLoop:
  1284. TESTQ itr1, itr1
  1285. JE openAVX2Tail512HashEnd
  1286. polyAdd(0(itr2))
  1287. polyMulAVX2
  1288. LEAQ 16(itr2), itr2
  1289. SUBQ $16, itr1
  1290. JMP openAVX2Tail512HashLoop
  1291. openAVX2Tail512HashEnd:
  1292. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1293. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1294. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1295. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1296. VMOVDQA CC3, tmpStoreAVX2
  1297. VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  1298. VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  1299. VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  1300. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1301. VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  1302. VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  1303. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1304. VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  1305. VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  1306. VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  1307. LEAQ (12*32)(inp), inp
  1308. LEAQ (12*32)(oup), oup
  1309. SUBQ $12*32, inl
  1310. JMP openAVX2TailLoop
  1311. // ----------------------------------------------------------------------------
  1312. // ----------------------------------------------------------------------------
  1313. // func chacha20Poly1305Seal(dst, key, src, ad []byte)
  1314. TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
  1315. // For aligned stack access
  1316. MOVQ SP, BP
  1317. ADDQ $32, BP
  1318. ANDQ $-32, BP
  1319. MOVQ dst+0(FP), oup
  1320. MOVQ key+24(FP), keyp
  1321. MOVQ src+48(FP), inp
  1322. MOVQ src_len+56(FP), inl
  1323. MOVQ ad+72(FP), adp
  1324. // Check for AVX2 support
  1325. CMPB runtime·support_avx2(SB), $0
  1326. JE noavx2bmi2Seal
  1327. // Check BMI2 bit for MULXQ.
  1328. // runtime·cpuid_ebx7 is always available here
  1329. // because the AVX2 check above has already passed
  1330. TESTL $(1<<8), runtime·cpuid_ebx7(SB)
  1331. JNE chacha20Poly1305Seal_AVX2
  1332. noavx2bmi2Seal:
  1333. // Special optimization for very short buffers
  1334. CMPQ inl, $128
  1335. JBE sealSSE128 // About 15% faster
  1336. // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
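// (As per RFC 7539: the Poly1305 one-time key r||s is the first 32 bytes of
// the ChaCha20 block generated with the initial counter value - the rest of
// that block is discarded - and the next three counter values provide the
// first 192 bytes of keystream for the plaintext.)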
  1337. MOVOU ·chacha20Constants<>(SB), A0
  1338. MOVOU (1*16)(keyp), B0
  1339. MOVOU (2*16)(keyp), C0
  1340. MOVOU (3*16)(keyp), D0
  1341. // Store state on stack for future use
  1342. MOVO B0, state1Store
  1343. MOVO C0, state2Store
  1344. // Load state, increment counter blocks
  1345. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1346. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1347. MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1348. // Store counters
  1349. MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1350. MOVQ $10, itr2
  1351. sealSSEIntroLoop:
  1352. MOVO C3, tmpStore
  1353. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1354. MOVO tmpStore, C3
  1355. MOVO C1, tmpStore
  1356. chachaQR(A3, B3, C3, D3, C1)
  1357. MOVO tmpStore, C1
  1358. shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  1359. shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  1360. shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  1361. MOVO C3, tmpStore
  1362. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1363. MOVO tmpStore, C3
  1364. MOVO C1, tmpStore
  1365. chachaQR(A3, B3, C3, D3, C1)
  1366. MOVO tmpStore, C1
  1367. shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1368. shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1369. shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1370. DECQ itr2
  1371. JNE sealSSEIntroLoop
  1372. // Add in the state
  1373. PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1374. PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1375. PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1376. PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1377. // Clamp and store the key
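// (·polyClampMask<> is assumed to hold the standard Poly1305 clamp
// 0x0ffffffc0ffffffc0ffffffc0fffffff, i.e. it clears the top four bits of
// r[3], r[7], r[11], r[15] and the bottom two bits of r[4], r[8], r[12].)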
  1378. PAND ·polyClampMask<>(SB), A0
  1379. MOVO A0, rStore
  1380. MOVO B0, sStore
  1381. // Hash AAD
  1382. MOVQ ad_len+80(FP), itr2
  1383. CALL polyHashADInternal<>(SB)
  1384. MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1385. PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1386. MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
  1387. MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1388. PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1389. MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
  1390. MOVQ $128, itr1
  1391. SUBQ $128, inl
  1392. LEAQ 128(inp), inp
  1393. MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
  1394. CMPQ inl, $64
  1395. JBE sealSSE128SealHash
  1396. MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1397. PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1398. MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
  1399. ADDQ $64, itr1
  1400. SUBQ $64, inl
  1401. LEAQ 64(inp), inp
  1402. MOVQ $2, itr1
  1403. MOVQ $8, itr2
  1404. CMPQ inl, $64
  1405. JBE sealSSETail64
  1406. CMPQ inl, $128
  1407. JBE sealSSETail128
  1408. CMPQ inl, $192
  1409. JBE sealSSETail192
  1410. sealSSEMainLoop:
  1411. // Load state, increment counter blocks
  1412. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  1413. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1414. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1415. MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1416. // Store counters
  1417. MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1418. sealSSEInnerLoop:
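// In this loop the Poly1305 multiplication is split into its stages
// (polyMulStage1/2/3 plus polyMulReduceStage) and interleaved with the ChaCha
// quarter-rounds so the scalar multiply latency hides behind the vector work;
// each pass absorbs 16 bytes of previously written ciphertext.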
  1419. MOVO C3, tmpStore
  1420. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1421. MOVO tmpStore, C3
  1422. MOVO C1, tmpStore
  1423. chachaQR(A3, B3, C3, D3, C1)
  1424. MOVO tmpStore, C1
  1425. polyAdd(0(oup))
  1426. shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  1427. shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  1428. shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  1429. polyMulStage1
  1430. polyMulStage2
  1431. LEAQ (2*8)(oup), oup
  1432. MOVO C3, tmpStore
  1433. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1434. MOVO tmpStore, C3
  1435. MOVO C1, tmpStore
  1436. polyMulStage3
  1437. chachaQR(A3, B3, C3, D3, C1)
  1438. MOVO tmpStore, C1
  1439. polyMulReduceStage
  1440. shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1441. shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1442. shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1443. DECQ itr2
  1444. JGE sealSSEInnerLoop
  1445. polyAdd(0(oup))
  1446. polyMul
  1447. LEAQ (2*8)(oup), oup
  1448. DECQ itr1
  1449. JG sealSSEInnerLoop
  1450. // Add in the state
  1451. PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1452. PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1453. PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1454. PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1455. MOVO D3, tmpStore
  1456. // Load - xor - store
  1457. MOVOU (0*16)(inp), D3; PXOR D3, A0
  1458. MOVOU (1*16)(inp), D3; PXOR D3, B0
  1459. MOVOU (2*16)(inp), D3; PXOR D3, C0
  1460. MOVOU (3*16)(inp), D3; PXOR D3, D0
  1461. MOVOU A0, (0*16)(oup)
  1462. MOVOU B0, (1*16)(oup)
  1463. MOVOU C0, (2*16)(oup)
  1464. MOVOU D0, (3*16)(oup)
  1465. MOVO tmpStore, D3
  1466. MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1467. PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1468. MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1469. MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
  1470. PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1471. MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
  1472. ADDQ $192, inp
  1473. MOVQ $192, itr1
  1474. SUBQ $192, inl
  1475. MOVO A3, A1
  1476. MOVO B3, B1
  1477. MOVO C3, C1
  1478. MOVO D3, D1
  1479. CMPQ inl, $64
  1480. JBE sealSSE128SealHash
  1481. MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1482. PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1483. MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
  1484. LEAQ 64(inp), inp
  1485. SUBQ $64, inl
  1486. MOVQ $6, itr1
  1487. MOVQ $4, itr2
  1488. CMPQ inl, $192
  1489. JG sealSSEMainLoop
  1490. MOVQ inl, itr1
  1491. TESTQ inl, inl
  1492. JE sealSSE128SealHash
  1493. MOVQ $6, itr1
  1494. CMPQ inl, $64
  1495. JBE sealSSETail64
  1496. CMPQ inl, $128
  1497. JBE sealSSETail128
  1498. JMP sealSSETail192
  1499. // ----------------------------------------------------------------------------
  1500. // Special optimization for the last 64 bytes of plaintext
  1501. sealSSETail64:
  1502. // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
  1503. MOVO ·chacha20Constants<>(SB), A1
  1504. MOVO state1Store, B1
  1505. MOVO state2Store, C1
  1506. MOVO ctr3Store, D1
  1507. PADDL ·sseIncMask<>(SB), D1
  1508. MOVO D1, ctr0Store
  1509. sealSSETail64LoopA:
  1510. // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1511. polyAdd(0(oup))
  1512. polyMul
  1513. LEAQ 16(oup), oup
  1514. sealSSETail64LoopB:
  1515. chachaQR(A1, B1, C1, D1, T1)
  1516. shiftB1Left; shiftC1Left; shiftD1Left
  1517. chachaQR(A1, B1, C1, D1, T1)
  1518. shiftB1Right; shiftC1Right; shiftD1Right
  1519. polyAdd(0(oup))
  1520. polyMul
  1521. LEAQ 16(oup), oup
  1522. DECQ itr1
  1523. JG sealSSETail64LoopA
  1524. DECQ itr2
  1525. JGE sealSSETail64LoopB
  1526. PADDL ·chacha20Constants<>(SB), A1
  1527. PADDL state1Store, B1
  1528. PADDL state2Store, C1
  1529. PADDL ctr0Store, D1
  1530. JMP sealSSE128Seal
  1531. // ----------------------------------------------------------------------------
  1532. // Special optimization for the last 128 bytes of plaintext
  1533. sealSSETail128:
  1534. // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
  1535. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1536. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1537. sealSSETail128LoopA:
  1538. // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1539. polyAdd(0(oup))
  1540. polyMul
  1541. LEAQ 16(oup), oup
  1542. sealSSETail128LoopB:
  1543. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1544. shiftB0Left; shiftC0Left; shiftD0Left
  1545. shiftB1Left; shiftC1Left; shiftD1Left
  1546. polyAdd(0(oup))
  1547. polyMul
  1548. LEAQ 16(oup), oup
  1549. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1550. shiftB0Right; shiftC0Right; shiftD0Right
  1551. shiftB1Right; shiftC1Right; shiftD1Right
  1552. DECQ itr1
  1553. JG sealSSETail128LoopA
  1554. DECQ itr2
  1555. JGE sealSSETail128LoopB
  1556. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
  1557. PADDL state1Store, B0; PADDL state1Store, B1
  1558. PADDL state2Store, C0; PADDL state2Store, C1
  1559. PADDL ctr0Store, D0; PADDL ctr1Store, D1
  1560. MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1561. PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1562. MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1563. MOVQ $64, itr1
  1564. LEAQ 64(inp), inp
  1565. SUBQ $64, inl
  1566. JMP sealSSE128SealHash
  1567. // ----------------------------------------------------------------------------
  1568. // Special optimization for the last 192 bytes of plaintext
  1569. sealSSETail192:
  1570. // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
  1571. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1572. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1573. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
  1574. sealSSETail192LoopA:
  1575. // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1576. polyAdd(0(oup))
  1577. polyMul
  1578. LEAQ 16(oup), oup
  1579. sealSSETail192LoopB:
  1580. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1581. shiftB0Left; shiftC0Left; shiftD0Left
  1582. shiftB1Left; shiftC1Left; shiftD1Left
  1583. shiftB2Left; shiftC2Left; shiftD2Left
  1584. polyAdd(0(oup))
  1585. polyMul
  1586. LEAQ 16(oup), oup
  1587. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1588. shiftB0Right; shiftC0Right; shiftD0Right
  1589. shiftB1Right; shiftC1Right; shiftD1Right
  1590. shiftB2Right; shiftC2Right; shiftD2Right
  1591. DECQ itr1
  1592. JG sealSSETail192LoopA
  1593. DECQ itr2
  1594. JGE sealSSETail192LoopB
  1595. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1596. PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
  1597. PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
  1598. PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
  1599. MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1600. PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1601. MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1602. MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
  1603. PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  1604. MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1605. MOVO A2, A1
  1606. MOVO B2, B1
  1607. MOVO C2, C1
  1608. MOVO D2, D1
  1609. MOVQ $128, itr1
  1610. LEAQ 128(inp), inp
  1611. SUBQ $128, inl
  1612. JMP sealSSE128SealHash
  1613. // ----------------------------------------------------------------------------
  1614. // Special seal optimization for buffers smaller than 129 bytes
  1615. sealSSE128:
  1616. // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
  1617. MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
  1618. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1619. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1620. MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
  1621. MOVQ $10, itr2
  1622. sealSSE128InnerCipherLoop:
  1623. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1624. shiftB0Left; shiftB1Left; shiftB2Left
  1625. shiftC0Left; shiftC1Left; shiftC2Left
  1626. shiftD0Left; shiftD1Left; shiftD2Left
  1627. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1628. shiftB0Right; shiftB1Right; shiftB2Right
  1629. shiftC0Right; shiftC1Right; shiftC2Right
  1630. shiftD0Right; shiftD1Right; shiftD2Right
  1631. DECQ itr2
  1632. JNE sealSSE128InnerCipherLoop
  1633. // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  1634. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1635. PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
  1636. PADDL T2, C1; PADDL T2, C2
  1637. PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
  1638. PAND ·polyClampMask<>(SB), A0
  1639. MOVOU A0, rStore
  1640. MOVOU B0, sStore
  1641. // Hash
  1642. MOVQ ad_len+80(FP), itr2
  1643. CALL polyHashADInternal<>(SB)
  1644. XORQ itr1, itr1
  1645. sealSSE128SealHash:
  1646. // itr1 holds the number of bytes encrypted but not yet hashed
  1647. CMPQ itr1, $16
  1648. JB sealSSE128Seal
  1649. polyAdd(0(oup))
  1650. polyMul
  1651. SUBQ $16, itr1
  1652. ADDQ $16, oup
  1653. JMP sealSSE128SealHash
  1654. sealSSE128Seal:
  1655. CMPQ inl, $16
  1656. JB sealSSETail
  1657. SUBQ $16, inl
  1658. // Load the plaintext for encryption
  1659. MOVOU (inp), T0
  1660. PXOR T0, A1
  1661. MOVOU A1, (oup)
  1662. LEAQ (1*16)(inp), inp
  1663. LEAQ (1*16)(oup), oup
  1664. // Extract for hashing
  1665. MOVQ A1, t0
  1666. PSRLDQ $8, A1
  1667. MOVQ A1, t1
  1668. ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1669. polyMul
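// The MOVQ/PSRLDQ/MOVQ/ADDQ/ADCQ sequence above is polyAdd done directly from
// an XMM register: the two 64-bit halves of the ciphertext block are added
// into the accumulator, with the final ADCQ $1 supplying the 2^128 pad bit
// Poly1305 appends to every full 16-byte block; polyMul then multiplies the
// accumulator by r modulo 2^130 - 5.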
  1670. // Shift the stream "left"
  1671. MOVO B1, A1
  1672. MOVO C1, B1
  1673. MOVO D1, C1
  1674. MOVO A2, D1
  1675. MOVO B2, A2
  1676. MOVO C2, B2
  1677. MOVO D2, C2
  1678. JMP sealSSE128Seal
  1679. sealSSETail:
  1680. TESTQ inl, inl
  1681. JE sealSSEFinalize
  1682. // We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
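// In rough Go terms (illustrative only): walking backwards from the last byte,
// t3:t2 ends up holding the little-endian value of the remaining inl bytes,
// zero-padded to a full 16-byte block. That block is XORed with the keystream
// in A1 and stored - writing a full 16 bytes is fine here, since the 16-byte
// tag is written right after the ciphertext and overwrites the excess - then
// masked back down to inl bytes with ·andMask<> before being hashed.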
  1683. MOVQ inl, itr2
  1684. SHLQ $4, itr2
  1685. LEAQ ·andMask<>(SB), t0
  1686. MOVQ inl, itr1
  1687. LEAQ -1(inp)(inl*1), inp
  1688. XORQ t2, t2
  1689. XORQ t3, t3
  1690. XORQ AX, AX
  1691. sealSSETailLoadLoop:
  1692. SHLQ $8, t2, t3
  1693. SHLQ $8, t2
  1694. MOVB (inp), AX
  1695. XORQ AX, t2
  1696. LEAQ -1(inp), inp
  1697. DECQ itr1
  1698. JNE sealSSETailLoadLoop
  1699. MOVQ t2, 0+tmpStore
  1700. MOVQ t3, 8+tmpStore
  1701. PXOR 0+tmpStore, A1
  1702. MOVOU A1, (oup)
  1703. MOVOU -16(t0)(itr2*1), T0
  1704. PAND T0, A1
  1705. MOVQ A1, t0
  1706. PSRLDQ $8, A1
  1707. MOVQ A1, t1
  1708. ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1709. polyMul
  1710. ADDQ inl, oup
  1711. sealSSEFinalize:
  1712. // Hash in the buffer lengths
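// The finalization below is, roughly (Go-style sketch, illustrative only):
//     block = uint128(ad_len) | uint128(ct_len)<<64
//     h += block + 1<<128      // ADDQ / ADCQ / ADCQ $1
//     h  = (h * r) % p         // polyMul, with p = 2^130 - 5
//     if h >= p { h -= p }     // the SUBQ/SBBQ/SBBQ + CMOVQCS sequence
//     tag = (h + s) mod 2^128  // low 16 bytes, written after the ciphertext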
  1713. ADDQ ad_len+80(FP), acc0
  1714. ADCQ src_len+56(FP), acc1
  1715. ADCQ $1, acc2
  1716. polyMul
  1717. // Final reduce
  1718. MOVQ acc0, t0
  1719. MOVQ acc1, t1
  1720. MOVQ acc2, t2
  1721. SUBQ $-5, acc0
  1722. SBBQ $-1, acc1
  1723. SBBQ $3, acc2
  1724. CMOVQCS t0, acc0
  1725. CMOVQCS t1, acc1
  1726. CMOVQCS t2, acc2
  1727. // Add in the "s" part of the key
  1728. ADDQ 0+sStore, acc0
  1729. ADCQ 8+sStore, acc1
  1730. // Finally store the tag at the end of the message
  1731. MOVQ acc0, (0*8)(oup)
  1732. MOVQ acc1, (1*8)(oup)
  1733. RET
  1734. // ----------------------------------------------------------------------------
  1735. // ------------------------- AVX2 Code ----------------------------------------
  1736. chacha20Poly1305Seal_AVX2:
  1737. VZEROUPPER
  1738. VMOVDQU ·chacha20Constants<>(SB), AA0
  1739. BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
  1740. BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
  1741. BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
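// (The three BYTE sequences above are hand-encoded VBROADCASTI128 instructions
// - presumably predating assembler support for the mnemonic - each loading 16
// bytes of the key/nonce block at 16(r8), 32(r8) and 48(r8) and copying them
// into both 128-bit lanes of the destination register.)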
  1742. VPADDD ·avx2InitMask<>(SB), DD0, DD0
  1743. // Special optimizations for very short buffers
  1744. CMPQ inl, $192
  1745. JBE seal192AVX2 // 33% faster
  1746. CMPQ inl, $320
  1747. JBE seal320AVX2 // 17% faster
  1748. // For the general case, prepare the Poly1305 key first - as a byproduct we get 64 bytes of cipher stream
  1749. VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1750. VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
  1751. VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
  1752. VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
  1753. VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
  1754. VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
  1755. VMOVDQA DD3, ctr3StoreAVX2
  1756. MOVQ $10, itr2
  1757. sealAVX2IntroLoop:
  1758. VMOVDQA CC3, tmpStoreAVX2
  1759. chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1760. VMOVDQA tmpStoreAVX2, CC3
  1761. VMOVDQA CC1, tmpStoreAVX2
  1762. chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1763. VMOVDQA tmpStoreAVX2, CC1
  1764. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  1765. VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  1766. VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  1767. VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  1768. VMOVDQA CC3, tmpStoreAVX2
  1769. chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1770. VMOVDQA tmpStoreAVX2, CC3
  1771. VMOVDQA CC1, tmpStoreAVX2
  1772. chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1773. VMOVDQA tmpStoreAVX2, CC1
  1774. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  1775. VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  1776. VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  1777. VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  1778. DECQ itr2
  1779. JNE sealAVX2IntroLoop
  1780. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1781. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1782. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1783. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1784. VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
  1785. VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
  1786. VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
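// Throughout this file VPERM2I128 regroups rows into contiguous 64-byte
// blocks: with this operand order, selector $0x02 concatenates the low
// 128-bit lanes of the two sources (the first ChaCha20 block of the pair)
// and $0x13 concatenates their high lanes (the second block of the pair).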
  1787. // Clamp and store poly key
  1788. VPAND ·polyClampMask<>(SB), DD0, DD0
  1789. VMOVDQA DD0, rsStoreAVX2
  1790. // Hash AD
  1791. MOVQ ad_len+80(FP), itr2
  1792. CALL polyHashADInternal<>(SB)
  1793. // Can store at least 320 bytes
  1794. VPXOR (0*32)(inp), AA0, AA0
  1795. VPXOR (1*32)(inp), CC0, CC0
  1796. VMOVDQU AA0, (0*32)(oup)
  1797. VMOVDQU CC0, (1*32)(oup)
  1798. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1799. VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
  1800. VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
  1801. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1802. VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
  1803. VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
  1804. MOVQ $320, itr1
  1805. SUBQ $320, inl
  1806. LEAQ 320(inp), inp
  1807. VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
  1808. CMPQ inl, $128
  1809. JBE sealAVX2SealHash
  1810. VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
  1811. VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
  1812. SUBQ $128, inl
  1813. LEAQ 128(inp), inp
  1814. MOVQ $8, itr1
  1815. MOVQ $2, itr2
  1816. CMPQ inl, $128
  1817. JBE sealAVX2Tail128
  1818. CMPQ inl, $256
  1819. JBE sealAVX2Tail256
  1820. CMPQ inl, $384
  1821. JBE sealAVX2Tail384
  1822. CMPQ inl, $512
  1823. JBE sealAVX2Tail512
  1824. // We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
  1825. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1826. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1827. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1828. VMOVDQA ctr3StoreAVX2, DD0
  1829. VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1830. VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1831. VMOVDQA CC3, tmpStoreAVX2
  1832. chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1833. VMOVDQA tmpStoreAVX2, CC3
  1834. VMOVDQA CC1, tmpStoreAVX2
  1835. chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1836. VMOVDQA tmpStoreAVX2, CC1
  1837. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  1838. VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  1839. VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  1840. VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  1841. VMOVDQA CC3, tmpStoreAVX2
  1842. chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1843. VMOVDQA tmpStoreAVX2, CC3
  1844. VMOVDQA CC1, tmpStoreAVX2
  1845. chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1846. VMOVDQA tmpStoreAVX2, CC1
  1847. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  1848. VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  1849. VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  1850. VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  1851. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1852. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1853. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1854. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1855. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1856. VMOVDQA CC3, tmpStoreAVX2
  1857. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1858. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1859. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1860. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1861. VMOVDQA tmpStoreAVX2, CC3
  1862. SUBQ $16, oup // Adjust the pointer
  1863. MOVQ $9, itr1
  1864. JMP sealAVX2InternalLoopStart
  1865. sealAVX2MainLoop:
  1866. // Load state, increment counter blocks, store the incremented counters
  1867. VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1868. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1869. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1870. VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1871. VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1872. MOVQ $10, itr1
  1873. sealAVX2InternalLoop:
  1874. polyAdd(0*8(oup))
  1875. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1876. polyMulStage1_AVX2
  1877. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1878. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1879. polyMulStage2_AVX2
  1880. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1881. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1882. polyMulStage3_AVX2
  1883. VMOVDQA CC3, tmpStoreAVX2
  1884. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1885. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1886. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1887. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1888. VMOVDQA tmpStoreAVX2, CC3
  1889. polyMulReduceStage
  1890. sealAVX2InternalLoopStart:
  1891. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1892. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1893. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1894. polyAdd(2*8(oup))
  1895. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1896. polyMulStage1_AVX2
  1897. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1898. VMOVDQA CC3, tmpStoreAVX2
  1899. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1900. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1901. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1902. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1903. VMOVDQA tmpStoreAVX2, CC3
  1904. polyMulStage2_AVX2
  1905. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  1906. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1907. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  1908. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1909. polyMulStage3_AVX2
  1910. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1911. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1912. polyMulReduceStage
  1913. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1914. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1915. polyAdd(4*8(oup))
  1916. LEAQ (6*8)(oup), oup
  1917. VMOVDQA CC3, tmpStoreAVX2
  1918. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1919. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1920. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1921. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1922. VMOVDQA tmpStoreAVX2, CC3
  1923. polyMulStage1_AVX2
  1924. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1925. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1926. polyMulStage2_AVX2
  1927. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1928. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1929. polyMulStage3_AVX2
  1930. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1931. VMOVDQA CC3, tmpStoreAVX2
  1932. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1933. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1934. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1935. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1936. VMOVDQA tmpStoreAVX2, CC3
  1937. polyMulReduceStage
  1938. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  1939. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1940. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  1941. DECQ itr1
  1942. JNE sealAVX2InternalLoop
  1943. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1944. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1945. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1946. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1947. VMOVDQA CC3, tmpStoreAVX2
  1948. // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
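// (Each of the 10 passes through sealAVX2InternalLoop absorbs 3 blocks of 16
// bytes = 48 bytes, i.e. 480 bytes per 512-byte chunk; one of the two leftover
// 16-byte blocks is absorbed here and the other right after the next stores.)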
  1949. polyAdd(0*8(oup))
  1950. polyMulAVX2
  1951. LEAQ (4*8)(oup), oup
  1952. VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  1953. VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  1954. VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  1955. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1956. VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  1957. VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  1958. // and here
  1959. polyAdd(-2*8(oup))
  1960. polyMulAVX2
  1961. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1962. VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  1963. VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  1964. VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  1965. VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
  1966. VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
  1967. LEAQ (32*16)(inp), inp
  1968. SUBQ $(32*16), inl
  1969. CMPQ inl, $512
  1970. JG sealAVX2MainLoop
  1971. // The tail can only hash 480 bytes, so hash the remaining 32 bytes here
  1972. polyAdd(0*8(oup))
  1973. polyMulAVX2
  1974. polyAdd(2*8(oup))
  1975. polyMulAVX2
  1976. LEAQ 32(oup), oup
  1977. MOVQ $10, itr1
  1978. MOVQ $0, itr2
  1979. CMPQ inl, $128
  1980. JBE sealAVX2Tail128
  1981. CMPQ inl, $256
  1982. JBE sealAVX2Tail256
  1983. CMPQ inl, $384
  1984. JBE sealAVX2Tail384
  1985. JMP sealAVX2Tail512
  1986. // ----------------------------------------------------------------------------
  1987. // Special optimization for buffers smaller than 193 bytes
  1988. seal192AVX2:
  1989. // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
  1990. VMOVDQA AA0, AA1
  1991. VMOVDQA BB0, BB1
  1992. VMOVDQA CC0, CC1
  1993. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1994. VMOVDQA AA0, AA2
  1995. VMOVDQA BB0, BB2
  1996. VMOVDQA CC0, CC2
  1997. VMOVDQA DD0, DD2
  1998. VMOVDQA DD1, TT3
  1999. MOVQ $10, itr2
  2000. sealAVX2192InnerCipherLoop:
  2001. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2002. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2003. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2004. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2005. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2006. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2007. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2008. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2009. DECQ itr2
  2010. JNE sealAVX2192InnerCipherLoop
  2011. VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  2012. VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  2013. VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  2014. VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  2015. VPERM2I128 $0x02, AA0, BB0, TT0
  2016. // Clamp and store poly key
  2017. VPAND ·polyClampMask<>(SB), TT0, TT0
  2018. VMOVDQA TT0, rsStoreAVX2
  2019. // Stream for up to 192 bytes
  2020. VPERM2I128 $0x13, AA0, BB0, AA0
  2021. VPERM2I128 $0x13, CC0, DD0, BB0
  2022. VPERM2I128 $0x02, AA1, BB1, CC0
  2023. VPERM2I128 $0x02, CC1, DD1, DD0
  2024. VPERM2I128 $0x13, AA1, BB1, AA1
  2025. VPERM2I128 $0x13, CC1, DD1, BB1
  2026. sealAVX2ShortSeal:
  2027. // Hash aad
  2028. MOVQ ad_len+80(FP), itr2
  2029. CALL polyHashADInternal<>(SB)
  2030. XORQ itr1, itr1
  2031. sealAVX2SealHash:
  2032. // itr1 holds the number of bytes encrypted but not yet hashed
  2033. CMPQ itr1, $16
  2034. JB sealAVX2ShortSealLoop
  2035. polyAdd(0(oup))
  2036. polyMul
  2037. SUBQ $16, itr1
  2038. ADDQ $16, oup
  2039. JMP sealAVX2SealHash
  2040. sealAVX2ShortSealLoop:
  2041. CMPQ inl, $32
  2042. JB sealAVX2ShortTail32
  2043. SUBQ $32, inl
  2044. // Load for encryption
  2045. VPXOR (inp), AA0, AA0
  2046. VMOVDQU AA0, (oup)
  2047. LEAQ (1*32)(inp), inp
  2048. // Now we can hash the 32 bytes just written
  2049. polyAdd(0*8(oup))
  2050. polyMulAVX2
  2051. polyAdd(2*8(oup))
  2052. polyMulAVX2
  2053. LEAQ (1*32)(oup), oup
  2054. // Shift stream left
  2055. VMOVDQA BB0, AA0
  2056. VMOVDQA CC0, BB0
  2057. VMOVDQA DD0, CC0
  2058. VMOVDQA AA1, DD0
  2059. VMOVDQA BB1, AA1
  2060. VMOVDQA CC1, BB1
  2061. VMOVDQA DD1, CC1
  2062. VMOVDQA AA2, DD1
  2063. VMOVDQA BB2, AA2
  2064. JMP sealAVX2ShortSealLoop
  2065. sealAVX2ShortTail32:
  2066. CMPQ inl, $16
  2067. VMOVDQA A0, A1
  2068. JB sealAVX2ShortDone
  2069. SUBQ $16, inl
  2070. // Load for encryption
  2071. VPXOR (inp), A0, T0
  2072. VMOVDQU T0, (oup)
  2073. LEAQ (1*16)(inp), inp
  2074. // Hash
  2075. polyAdd(0*8(oup))
  2076. polyMulAVX2
  2077. LEAQ (1*16)(oup), oup
  2078. VPERM2I128 $0x11, AA0, AA0, AA0
  2079. VMOVDQA A0, A1
  2080. sealAVX2ShortDone:
  2081. VZEROUPPER
  2082. JMP sealSSETail
  2083. // ----------------------------------------------------------------------------
  2084. // Special optimization for buffers smaller than 321 bytes
  2085. seal320AVX2:
  2086. // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  2087. VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  2088. VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2089. VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  2090. MOVQ $10, itr2
  2091. sealAVX2320InnerCipherLoop:
  2092. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2093. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2094. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2095. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2096. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2097. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2098. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2099. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2100. DECQ itr2
  2101. JNE sealAVX2320InnerCipherLoop
  2102. VMOVDQA ·chacha20Constants<>(SB), TT0
  2103. VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  2104. VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  2105. VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  2106. VMOVDQA ·avx2IncMask<>(SB), TT0
  2107. VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  2108. VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  2109. VPADDD TT3, DD2, DD2
  2110. // Clamp and store poly key
  2111. VPERM2I128 $0x02, AA0, BB0, TT0
  2112. VPAND ·polyClampMask<>(SB), TT0, TT0
  2113. VMOVDQA TT0, rsStoreAVX2
  2114. // Stream for up to 320 bytes
  2115. VPERM2I128 $0x13, AA0, BB0, AA0
  2116. VPERM2I128 $0x13, CC0, DD0, BB0
  2117. VPERM2I128 $0x02, AA1, BB1, CC0
  2118. VPERM2I128 $0x02, CC1, DD1, DD0
  2119. VPERM2I128 $0x13, AA1, BB1, AA1
  2120. VPERM2I128 $0x13, CC1, DD1, BB1
  2121. VPERM2I128 $0x02, AA2, BB2, CC1
  2122. VPERM2I128 $0x02, CC2, DD2, DD1
  2123. VPERM2I128 $0x13, AA2, BB2, AA2
  2124. VPERM2I128 $0x13, CC2, DD2, BB2
  2125. JMP sealAVX2ShortSeal
  2126. // ----------------------------------------------------------------------------
  2127. // Special optimization for the last 128 bytes of ciphertext
  2128. sealAVX2Tail128:
  2129. // Need to encrypt up to 128 bytes - prepare two blocks
  2130. // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2131. // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2132. VMOVDQA ·chacha20Constants<>(SB), AA0
  2133. VMOVDQA state1StoreAVX2, BB0
  2134. VMOVDQA state2StoreAVX2, CC0
  2135. VMOVDQA ctr3StoreAVX2, DD0
  2136. VPADDD ·avx2IncMask<>(SB), DD0, DD0
  2137. VMOVDQA DD0, DD1
  2138. sealAVX2Tail128LoopA:
  2139. polyAdd(0(oup))
  2140. polyMul
  2141. LEAQ 16(oup), oup
  2142. sealAVX2Tail128LoopB:
  2143. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2144. polyAdd(0(oup))
  2145. polyMul
  2146. VPALIGNR $4, BB0, BB0, BB0
  2147. VPALIGNR $8, CC0, CC0, CC0
  2148. VPALIGNR $12, DD0, DD0, DD0
  2149. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2150. polyAdd(16(oup))
  2151. polyMul
  2152. LEAQ 32(oup), oup
  2153. VPALIGNR $12, BB0, BB0, BB0
  2154. VPALIGNR $8, CC0, CC0, CC0
  2155. VPALIGNR $4, DD0, DD0, DD0
  2156. DECQ itr1
  2157. JG sealAVX2Tail128LoopA
  2158. DECQ itr2
  2159. JGE sealAVX2Tail128LoopB
  2160. VPADDD ·chacha20Constants<>(SB), AA0, AA1
  2161. VPADDD state1StoreAVX2, BB0, BB1
  2162. VPADDD state2StoreAVX2, CC0, CC1
  2163. VPADDD DD1, DD0, DD1
  2164. VPERM2I128 $0x02, AA1, BB1, AA0
  2165. VPERM2I128 $0x02, CC1, DD1, BB0
  2166. VPERM2I128 $0x13, AA1, BB1, CC0
  2167. VPERM2I128 $0x13, CC1, DD1, DD0
  2168. JMP sealAVX2ShortSealLoop
  2169. // ----------------------------------------------------------------------------
  2170. // Special optimization for the last 256 bytes of ciphertext
  2171. sealAVX2Tail256:
  2172. // Need to encrypt up to 256 bytes - prepare four blocks
  2173. // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2174. // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2175. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
  2176. VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
  2177. VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
  2178. VMOVDQA ctr3StoreAVX2, DD0
  2179. VPADDD ·avx2IncMask<>(SB), DD0, DD0
  2180. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  2181. VMOVDQA DD0, TT1
  2182. VMOVDQA DD1, TT2
  2183. sealAVX2Tail256LoopA:
  2184. polyAdd(0(oup))
  2185. polyMul
  2186. LEAQ 16(oup), oup
  2187. sealAVX2Tail256LoopB:
  2188. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2189. polyAdd(0(oup))
  2190. polyMul
  2191. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2192. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2193. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2194. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2195. polyAdd(16(oup))
  2196. polyMul
  2197. LEAQ 32(oup), oup
  2198. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2199. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2200. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2201. DECQ itr1
  2202. JG sealAVX2Tail256LoopA
  2203. DECQ itr2
  2204. JGE sealAVX2Tail256LoopB
  2205. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  2206. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  2207. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  2208. VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  2209. VPERM2I128 $0x02, AA0, BB0, TT0
  2210. VPERM2I128 $0x02, CC0, DD0, TT1
  2211. VPERM2I128 $0x13, AA0, BB0, TT2
  2212. VPERM2I128 $0x13, CC0, DD0, TT3
  2213. VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2214. VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2215. MOVQ $128, itr1
  2216. LEAQ 128(inp), inp
  2217. SUBQ $128, inl
  2218. VPERM2I128 $0x02, AA1, BB1, AA0
  2219. VPERM2I128 $0x02, CC1, DD1, BB0
  2220. VPERM2I128 $0x13, AA1, BB1, CC0
  2221. VPERM2I128 $0x13, CC1, DD1, DD0
  2222. JMP sealAVX2SealHash
  2223. // ----------------------------------------------------------------------------
  2224. // Special optimization for the last 384 bytes of ciphertext
  2225. sealAVX2Tail384:
  2226. // Need to encrypt up to 384 bytes - prepare six blocks
  2227. // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2228. // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2229. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  2230. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  2231. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  2232. VMOVDQA ctr3StoreAVX2, DD0
  2233. VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2234. VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
  2235. sealAVX2Tail384LoopA:
  2236. polyAdd(0(oup))
  2237. polyMul
  2238. LEAQ 16(oup), oup
  2239. sealAVX2Tail384LoopB:
  2240. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2241. polyAdd(0(oup))
  2242. polyMul
  2243. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2244. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2245. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2246. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2247. polyAdd(16(oup))
  2248. polyMul
  2249. LEAQ 32(oup), oup
  2250. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2251. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2252. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2253. DECQ itr1
  2254. JG sealAVX2Tail384LoopA
  2255. DECQ itr2
  2256. JGE sealAVX2Tail384LoopB
  2257. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  2258. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  2259. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  2260. VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
  2261. VPERM2I128 $0x02, AA0, BB0, TT0
  2262. VPERM2I128 $0x02, CC0, DD0, TT1
  2263. VPERM2I128 $0x13, AA0, BB0, TT2
  2264. VPERM2I128 $0x13, CC0, DD0, TT3
  2265. VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2266. VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2267. VPERM2I128 $0x02, AA1, BB1, TT0
  2268. VPERM2I128 $0x02, CC1, DD1, TT1
  2269. VPERM2I128 $0x13, AA1, BB1, TT2
  2270. VPERM2I128 $0x13, CC1, DD1, TT3
  2271. VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  2272. VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  2273. MOVQ $256, itr1
  2274. LEAQ 256(inp), inp
  2275. SUBQ $256, inl
  2276. VPERM2I128 $0x02, AA2, BB2, AA0
  2277. VPERM2I128 $0x02, CC2, DD2, BB0
  2278. VPERM2I128 $0x13, AA2, BB2, CC0
  2279. VPERM2I128 $0x13, CC2, DD2, DD0
  2280. JMP sealAVX2SealHash
  2281. // ----------------------------------------------------------------------------
  2282. // Special optimization for the last 512 bytes of ciphertext
  2283. sealAVX2Tail512:
  2284. // Need to encrypt up to 512 bytes - prepare eight blocks
  2285. // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2286. // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
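// As in the 384-byte tail, each sealAVX2Tail512LoopA pass absorbs an extra 16 bytes of
// pending ciphertext before falling into LoopB, and each sealAVX2Tail512LoopB pass runs
// one ChaCha20 double round over all four 2-block states (written out in full so the
// Poly1305 updates can be spread through it), absorbing 32 bytes per pass; itr1 and itr2
// are set before jumping here.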
sealAVX2Tail512LoopA:
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup
sealAVX2Tail512LoopB:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
polyAdd(0*8(oup))
polyMulAVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
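// Second half of the double round: the same quarter-round sequence applied to the
// diagonals, with another 16-byte Poly1305 update folded in; the rows are rotated back
// into column order at the end.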
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(oup))
polyMulAVX2
LEAQ (4*8)(oup), oup
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
DECQ itr1
JG sealAVX2Tail512LoopA
DECQ itr2
JGE sealAVX2Tail512LoopB
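// Rounds complete: restore the initial state, then regroup the lanes and XOR the
// keystream with the input. CC3 is spilled to the stack so it can serve as scratch for
// the first block pair; the spilled copy is picked up again for the last pair below.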
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
VMOVDQA CC3, tmpStoreAVX2
VPERM2I128 $0x02, AA0, BB0, CC3
VPXOR (0*32)(inp), CC3, CC3
VMOVDQU CC3, (0*32)(oup)
VPERM2I128 $0x02, CC0, DD0, CC3
VPXOR (1*32)(inp), CC3, CC3
VMOVDQU CC3, (1*32)(oup)
VPERM2I128 $0x13, AA0, BB0, CC3
VPXOR (2*32)(inp), CC3, CC3
VMOVDQU CC3, (2*32)(oup)
VPERM2I128 $0x13, CC0, DD0, CC3
VPXOR (3*32)(inp), CC3, CC3
VMOVDQU CC3, (3*32)(oup)
VPERM2I128 $0x02, AA1, BB1, AA0
VPERM2I128 $0x02, CC1, DD1, BB0
VPERM2I128 $0x13, AA1, BB1, CC0
VPERM2I128 $0x13, CC1, DD1, DD0
VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
VPERM2I128 $0x02, AA2, BB2, AA0
VPERM2I128 $0x02, CC2, DD2, BB0
VPERM2I128 $0x13, AA2, BB2, CC0
VPERM2I128 $0x13, CC2, DD2, DD0
VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
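// 384 bytes of ciphertext are now written; itr1 = 384 tells sealAVX2SealHash how much of
// it is still unhashed, and the keystream for the final (at most 128) bytes is regrouped
// into AA0, BB0, CC0, DD0 (using the CC3 copy spilled earlier) before jumping there.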
MOVQ $384, itr1
LEAQ 384(inp), inp
SUBQ $384, inl
VPERM2I128 $0x02, AA3, BB3, AA0
VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
VPERM2I128 $0x13, AA3, BB3, CC0
VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
JMP sealAVX2SealHash
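// Runtime CPUID probe: reports whether the CPU supports SSSE3 (CPUID leaf 1, ECX bit 9),
// so the Go code can fall back to a generic implementation when it does not.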
// func haveSSSE3() bool
TEXT ·haveSSSE3(SB), NOSPLIT, $0
XORQ AX, AX
INCL AX // EAX = 1: request the standard feature-flags leaf
CPUID
SHRQ $9, CX // SSSE3 support is reported in ECX bit 9
ANDQ $1, CX
MOVB CX, ret+0(FP) // return the isolated bit as the bool result
RET