  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
  5. // +build go1.7,amd64,!gccgo,!appengine
  6. #include "textflag.h"
  7. // General register allocation
  8. #define oup DI
  9. #define inp SI
  10. #define inl BX
  11. #define adp CX // free to reuse, after we hash the additional data
  12. #define keyp R8 // free to reuse, when we copy the key to stack
  13. #define itr2 R9 // general iterator
  14. #define itr1 CX // general iterator
  15. #define acc0 R10
  16. #define acc1 R11
  17. #define acc2 R12
  18. #define t0 R13
  19. #define t1 R14
  20. #define t2 R15
  21. #define t3 R8
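// Note: t3 shares R8 with keyp, so it is only usable once the key material has been read.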
  22. // Register and stack allocation for the SSE code
  23. #define rStore (0*16)(BP)
  24. #define sStore (1*16)(BP)
  25. #define state1Store (2*16)(BP)
  26. #define state2Store (3*16)(BP)
  27. #define tmpStore (4*16)(BP)
  28. #define ctr0Store (5*16)(BP)
  29. #define ctr1Store (6*16)(BP)
  30. #define ctr2Store (7*16)(BP)
  31. #define ctr3Store (8*16)(BP)
  32. #define A0 X0
  33. #define A1 X1
  34. #define A2 X2
  35. #define B0 X3
  36. #define B1 X4
  37. #define B2 X5
  38. #define C0 X6
  39. #define C1 X7
  40. #define C2 X8
  41. #define D0 X9
  42. #define D1 X10
  43. #define D2 X11
  44. #define T0 X12
  45. #define T1 X13
  46. #define T2 X14
  47. #define T3 X15
  48. #define A3 T0
  49. #define B3 T1
  50. #define C3 T2
  51. #define D3 T3
  52. // Register and stack allocation for the AVX2 code
  53. #define rsStoreAVX2 (0*32)(BP)
  54. #define state1StoreAVX2 (1*32)(BP)
  55. #define state2StoreAVX2 (2*32)(BP)
  56. #define ctr0StoreAVX2 (3*32)(BP)
  57. #define ctr1StoreAVX2 (4*32)(BP)
  58. #define ctr2StoreAVX2 (5*32)(BP)
  59. #define ctr3StoreAVX2 (6*32)(BP)
  60. #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
  61. #define AA0 Y0
  62. #define AA1 Y5
  63. #define AA2 Y6
  64. #define AA3 Y7
  65. #define BB0 Y14
  66. #define BB1 Y9
  67. #define BB2 Y10
  68. #define BB3 Y11
  69. #define CC0 Y12
  70. #define CC1 Y13
  71. #define CC2 Y8
  72. #define CC3 Y15
  73. #define DD0 Y4
  74. #define DD1 Y1
  75. #define DD2 Y2
  76. #define DD3 Y3
  77. #define TT0 DD3
  78. #define TT1 AA3
  79. #define TT2 BB3
  80. #define TT3 CC3
  81. // ChaCha20 constants
  82. DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
  83. DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
  84. DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
  85. DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
  86. DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
  87. DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
  88. DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
  89. DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
  90. // <<< 16 with PSHUFB
  91. DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
  92. DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
  93. DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
  94. DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
  95. // <<< 8 with PSHUFB
  96. DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
  97. DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
  98. DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
  99. DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
  100. DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
  101. DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
  102. DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
  103. DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
  104. DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
  105. DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
  106. DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
  107. DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
  108. // Poly1305 key clamp
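// The low 16 bytes clamp r as required by Poly1305 (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff);
// the high 16 bytes are all ones so that s passes through unchanged in the 256-bit AVX2 path.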
  109. DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
  110. DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
  111. DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
  112. DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
  113. DATA ·sseIncMask<>+0x00(SB)/8, $0x1
  114. DATA ·sseIncMask<>+0x08(SB)/8, $0x0
  115. // To load/store the last < 16 bytes in a buffer
  116. DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
  117. DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
  118. DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
  119. DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
  120. DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
  121. DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
  122. DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
  123. DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
  124. DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
  125. DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
  126. DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
  127. DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
  128. DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
  129. DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
  130. DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
  131. DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
  132. DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
  133. DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
  134. DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
  135. DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
  136. DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
  137. DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
  138. DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
  139. DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
  140. DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
  141. DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
  142. DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
  143. DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
  144. DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
  145. DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
  146. GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
  147. GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
  148. GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
  149. GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
  150. GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
  151. GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
  152. GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
  153. GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
  154. // No PALIGNR in Go ASM yet (but VPALIGNR is present).
  155. #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
  156. #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
  157. #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
  158. #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
  159. #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
  160. #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
  161. #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
  162. #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
  163. #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
  164. #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
  165. #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
  166. #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
  167. #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
  168. #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
  169. #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
  170. #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
  171. #define shiftC0Right shiftC0Left
  172. #define shiftC1Right shiftC1Left
  173. #define shiftC2Right shiftC2Left
  174. #define shiftC3Right shiftC3Left
  175. #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
  176. #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
  177. #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
  178. #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
  179. // Some macros
  180. #define chachaQR(A, B, C, D, T) \
  181. PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
  182. PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
  183. PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
  184. PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
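// chachaQR is one ChaCha quarter round: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12; a+=b; d^=a; d<<<=8; c+=d; b^=c; b<<<=7.
// The 16- and 8-bit rotates are done with PSHUFB; the 12- and 7-bit rotates use a shift pair combined with PXOR through T.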
  185. #define chachaQR_AVX2(A, B, C, D, T) \
  186. VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
  187. VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
  188. VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
  189. VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
  190. #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
  191. #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
  192. #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
  193. #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
  194. #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
  195. #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
  196. #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
  197. #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
  198. #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
  199. #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
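// Poly1305 state: acc0..acc2 hold the 130-bit accumulator (two 64-bit limbs plus carry bits); the clamped key lives
// in rStore/sStore at 0(BP)/16(BP). polyAdd folds in a 16-byte block plus the 2^128 pad bit; polyMul multiplies by r
// and reduces mod 2^130-5 - the reduce stage folds the high limbs back in as high*4 + high, using 2^130 == 5 (mod p).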
  200. // ----------------------------------------------------------------------------
  201. TEXT polyHashADInternal<>(SB), NOSPLIT, $0
  202. // adp points to beginning of additional data
  203. // itr2 holds ad length
  204. XORQ acc0, acc0
  205. XORQ acc1, acc1
  206. XORQ acc2, acc2
  207. CMPQ itr2, $13
  208. JNE hashADLoop
  209. openFastTLSAD:
  210. // Special treatment for the TLS case of 13 bytes
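// The AEAD construction zero-pads the AD to a 16-byte boundary, so the 13 bytes are hashed as one full block:
// bytes 0-7 go to acc0, bytes 8-12 to acc1, and acc2 gets the 2^128 pad bit.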
  211. MOVQ (adp), acc0
  212. MOVQ 5(adp), acc1
  213. SHRQ $24, acc1
  214. MOVQ $1, acc2
  215. polyMul
  216. RET
  217. hashADLoop:
  218. // Hash in 16 byte chunks
  219. CMPQ itr2, $16
  220. JB hashADTail
  221. polyAdd(0(adp))
  222. LEAQ (1*16)(adp), adp
  223. SUBQ $16, itr2
  224. polyMul
  225. JMP hashADLoop
  226. hashADTail:
  227. CMPQ itr2, $0
  228. JE hashADDone
  229. // Hash last < 16 byte tail
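// Build the zero-padded final block in t1:t0 by reading the AD bytes back to front, shifting the pair left one byte
// per iteration; hashADTailFinish then adds it with the 2^128 pad bit (ADCQ $1, acc2).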
  230. XORQ t0, t0
  231. XORQ t1, t1
  232. XORQ t2, t2
  233. ADDQ itr2, adp
  234. hashADTailLoop:
  235. SHLQ $8, t1:t0
  236. SHLQ $8, t0
  237. MOVB -1(adp), t2
  238. XORQ t2, t0
  239. DECQ adp
  240. DECQ itr2
  241. JNE hashADTailLoop
  242. hashADTailFinish:
  243. ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  244. polyMul
  245. // Finished AD
  246. hashADDone:
  247. RET
  248. // ----------------------------------------------------------------------------
  249. // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
  250. TEXT ·chacha20Poly1305Open(SB), 0, $288-97
  251. // For aligned stack access
  252. MOVQ SP, BP
  253. ADDQ $32, BP
  254. ANDQ $-32, BP
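// BP now points to a 32-byte-aligned scratch area inside the frame, so the aligned MOVO/VMOVDQA stores
// into the *Store slots below are valid.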
  255. MOVQ dst+0(FP), oup
  256. MOVQ key+24(FP), keyp
  257. MOVQ src+48(FP), inp
  258. MOVQ src_len+56(FP), inl
  259. MOVQ ad+72(FP), adp
  260. // Check for AVX2 support
  261. CMPB ·useAVX2(SB), $1
  262. JE chacha20Poly1305Open_AVX2
263. // Special optimization for very short buffers
  264. CMPQ inl, $128
  265. JBE openSSE128 // About 16% faster
  266. // For long buffers, prepare the poly key first
  267. MOVOU ·chacha20Constants<>(SB), A0
  268. MOVOU (1*16)(keyp), B0
  269. MOVOU (2*16)(keyp), C0
  270. MOVOU (3*16)(keyp), D0
  271. MOVO D0, T1
  272. // Store state on stack for future use
  273. MOVO B0, state1Store
  274. MOVO C0, state2Store
  275. MOVO D0, ctr3Store
  276. MOVQ $10, itr2
  277. openSSEPreparePolyKey:
  278. chachaQR(A0, B0, C0, D0, T0)
  279. shiftB0Left; shiftC0Left; shiftD0Left
  280. chachaQR(A0, B0, C0, D0, T0)
  281. shiftB0Right; shiftC0Right; shiftD0Right
  282. DECQ itr2
  283. JNE openSSEPreparePolyKey
  284. // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  285. PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
  286. // Clamp and store the key
  287. PAND ·polyClampMask<>(SB), A0
  288. MOVO A0, rStore; MOVO B0, sStore
  289. // Hash AAD
  290. MOVQ ad_len+80(FP), itr2
  291. CALL polyHashADInternal<>(SB)
  292. openSSEMainLoop:
  293. CMPQ inl, $256
  294. JB openSSEMainLoopDone
  295. // Load state, increment counter blocks
  296. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  297. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  298. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  299. MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  300. // Store counters
  301. MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  302. // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
  303. MOVQ $4, itr1
  304. MOVQ inp, itr2
  305. openSSEInternalLoop:
  306. MOVO C3, tmpStore
  307. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  308. MOVO tmpStore, C3
  309. MOVO C1, tmpStore
  310. chachaQR(A3, B3, C3, D3, C1)
  311. MOVO tmpStore, C1
  312. polyAdd(0(itr2))
  313. shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  314. shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  315. shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  316. polyMulStage1
  317. polyMulStage2
  318. LEAQ (2*8)(itr2), itr2
  319. MOVO C3, tmpStore
  320. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  321. MOVO tmpStore, C3
  322. MOVO C1, tmpStore
  323. polyMulStage3
  324. chachaQR(A3, B3, C3, D3, C1)
  325. MOVO tmpStore, C1
  326. polyMulReduceStage
  327. shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  328. shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  329. shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  330. DECQ itr1
  331. JGE openSSEInternalLoop
  332. polyAdd(0(itr2))
  333. polyMul
  334. LEAQ (2*8)(itr2), itr2
  335. CMPQ itr1, $-6
  336. JG openSSEInternalLoop
  337. // Add in the state
  338. PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  339. PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  340. PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  341. PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  342. // Load - xor - store
  343. MOVO D3, tmpStore
  344. MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
  345. MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
  346. MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
  347. MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
  348. MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
  349. MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
  350. MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
  351. MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
  352. MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
  353. MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
  354. MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
  355. MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
  356. MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
  357. MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
  358. MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
  359. MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
  360. LEAQ 256(inp), inp
  361. LEAQ 256(oup), oup
  362. SUBQ $256, inl
  363. JMP openSSEMainLoop
  364. openSSEMainLoopDone:
  365. // Handle the various tail sizes efficiently
  366. TESTQ inl, inl
  367. JE openSSEFinalize
  368. CMPQ inl, $64
  369. JBE openSSETail64
  370. CMPQ inl, $128
  371. JBE openSSETail128
  372. CMPQ inl, $192
  373. JBE openSSETail192
  374. JMP openSSETail256
  375. openSSEFinalize:
  376. // Hash in the PT, AAD lengths
  377. ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
  378. polyMul
  379. // Final reduce
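// The constant (-5, -1, 3) is 2^130 - 5 as a 192-bit value, so this computes acc - p; if the subtraction
// borrows (acc < p), the CMOVQCS instructions restore the original accumulator.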
  380. MOVQ acc0, t0
  381. MOVQ acc1, t1
  382. MOVQ acc2, t2
  383. SUBQ $-5, acc0
  384. SBBQ $-1, acc1
  385. SBBQ $3, acc2
  386. CMOVQCS t0, acc0
  387. CMOVQCS t1, acc1
  388. CMOVQCS t2, acc2
  389. // Add in the "s" part of the key
  390. ADDQ 0+sStore, acc0
  391. ADCQ 8+sStore, acc1
  392. // Finally, constant time compare to the tag at the end of the message
  393. XORQ AX, AX
  394. MOVQ $1, DX
  395. XORQ (0*8)(inp), acc0
  396. XORQ (1*8)(inp), acc1
  397. ORQ acc1, acc0
  398. CMOVQEQ DX, AX
  399. // Return true iff tags are equal
  400. MOVB AX, ret+96(FP)
  401. RET
  402. // ----------------------------------------------------------------------------
  403. // Special optimization for buffers smaller than 129 bytes
  404. openSSE128:
405. // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
  406. MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
  407. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  408. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  409. MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
  410. MOVQ $10, itr2
  411. openSSE128InnerCipherLoop:
  412. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  413. shiftB0Left; shiftB1Left; shiftB2Left
  414. shiftC0Left; shiftC1Left; shiftC2Left
  415. shiftD0Left; shiftD1Left; shiftD2Left
  416. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  417. shiftB0Right; shiftB1Right; shiftB2Right
  418. shiftC0Right; shiftC1Right; shiftC2Right
  419. shiftD0Right; shiftD1Right; shiftD2Right
  420. DECQ itr2
  421. JNE openSSE128InnerCipherLoop
  422. // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  423. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  424. PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
  425. PADDL T2, C1; PADDL T2, C2
  426. PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
  427. // Clamp and store the key
  428. PAND ·polyClampMask<>(SB), A0
  429. MOVOU A0, rStore; MOVOU B0, sStore
  430. // Hash
  431. MOVQ ad_len+80(FP), itr2
  432. CALL polyHashADInternal<>(SB)
  433. openSSE128Open:
  434. CMPQ inl, $16
  435. JB openSSETail16
  436. SUBQ $16, inl
  437. // Load for hashing
  438. polyAdd(0(inp))
  439. // Load for decryption
  440. MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
  441. LEAQ (1*16)(inp), inp
  442. LEAQ (1*16)(oup), oup
  443. polyMul
  444. // Shift the stream "left"
  445. MOVO B1, A1
  446. MOVO C1, B1
  447. MOVO D1, C1
  448. MOVO A2, D1
  449. MOVO B2, A2
  450. MOVO C2, B2
  451. MOVO D2, C2
  452. JMP openSSE128Open
  453. openSSETail16:
  454. TESTQ inl, inl
  455. JE openSSEFinalize
  456. // We can safely load the CT from the end, because it is padded with the MAC
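// andMask<> holds masks that keep 1..15 low bytes (16 bytes per entry); with itr2 = inl*16, the operand
// -16(t0)(itr2*1) selects the mask that keeps exactly inl bytes of the over-read.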
  457. MOVQ inl, itr2
  458. SHLQ $4, itr2
  459. LEAQ ·andMask<>(SB), t0
  460. MOVOU (inp), T0
  461. ADDQ inl, inp
  462. PAND -16(t0)(itr2*1), T0
  463. MOVO T0, 0+tmpStore
  464. MOVQ T0, t0
  465. MOVQ 8+tmpStore, t1
  466. PXOR A1, T0
  467. // We can only store one byte at a time, since plaintext can be shorter than 16 bytes
  468. openSSETail16Store:
  469. MOVQ T0, t3
  470. MOVB t3, (oup)
  471. PSRLDQ $1, T0
  472. INCQ oup
  473. DECQ inl
  474. JNE openSSETail16Store
  475. ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  476. polyMul
  477. JMP openSSEFinalize
  478. // ----------------------------------------------------------------------------
  479. // Special optimization for the last 64 bytes of ciphertext
  480. openSSETail64:
481. // Need to decrypt up to 64 bytes - prepare a single block
  482. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  483. XORQ itr2, itr2
  484. MOVQ inl, itr1
  485. CMPQ itr1, $16
  486. JB openSSETail64LoopB
  487. openSSETail64LoopA:
  488. // Perform ChaCha rounds, while hashing the remaining input
  489. polyAdd(0(inp)(itr2*1))
  490. polyMul
  491. SUBQ $16, itr1
  492. openSSETail64LoopB:
  493. ADDQ $16, itr2
  494. chachaQR(A0, B0, C0, D0, T0)
  495. shiftB0Left; shiftC0Left; shiftD0Left
  496. chachaQR(A0, B0, C0, D0, T0)
  497. shiftB0Right; shiftC0Right; shiftD0Right
  498. CMPQ itr1, $16
  499. JAE openSSETail64LoopA
  500. CMPQ itr2, $160
  501. JNE openSSETail64LoopB
  502. PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
  503. openSSETail64DecLoop:
  504. CMPQ inl, $16
  505. JB openSSETail64DecLoopDone
  506. SUBQ $16, inl
  507. MOVOU (inp), T0
  508. PXOR T0, A0
  509. MOVOU A0, (oup)
  510. LEAQ 16(inp), inp
  511. LEAQ 16(oup), oup
  512. MOVO B0, A0
  513. MOVO C0, B0
  514. MOVO D0, C0
  515. JMP openSSETail64DecLoop
  516. openSSETail64DecLoopDone:
  517. MOVO A0, A1
  518. JMP openSSETail16
  519. // ----------------------------------------------------------------------------
  520. // Special optimization for the last 128 bytes of ciphertext
  521. openSSETail128:
  522. // Need to decrypt up to 128 bytes - prepare two blocks
  523. MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
  524. MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
  525. XORQ itr2, itr2
  526. MOVQ inl, itr1
  527. ANDQ $-16, itr1
  528. openSSETail128LoopA:
  529. // Perform ChaCha rounds, while hashing the remaining input
  530. polyAdd(0(inp)(itr2*1))
  531. polyMul
  532. openSSETail128LoopB:
  533. ADDQ $16, itr2
  534. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  535. shiftB0Left; shiftC0Left; shiftD0Left
  536. shiftB1Left; shiftC1Left; shiftD1Left
  537. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  538. shiftB0Right; shiftC0Right; shiftD0Right
  539. shiftB1Right; shiftC1Right; shiftD1Right
  540. CMPQ itr2, itr1
  541. JB openSSETail128LoopA
  542. CMPQ itr2, $160
  543. JNE openSSETail128LoopB
  544. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
  545. PADDL state1Store, B0; PADDL state1Store, B1
  546. PADDL state2Store, C0; PADDL state2Store, C1
  547. PADDL ctr1Store, D0; PADDL ctr0Store, D1
  548. MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  549. PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  550. MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
  551. SUBQ $64, inl
  552. LEAQ 64(inp), inp
  553. LEAQ 64(oup), oup
  554. JMP openSSETail64DecLoop
  555. // ----------------------------------------------------------------------------
  556. // Special optimization for the last 192 bytes of ciphertext
  557. openSSETail192:
  558. // Need to decrypt up to 192 bytes - prepare three blocks
  559. MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
  560. MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  561. MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
  562. MOVQ inl, itr1
  563. MOVQ $160, itr2
  564. CMPQ itr1, $160
  565. CMOVQGT itr2, itr1
  566. ANDQ $-16, itr1
  567. XORQ itr2, itr2
568. openSSETail192LoopA:
  569. // Perform ChaCha rounds, while hashing the remaining input
  570. polyAdd(0(inp)(itr2*1))
  571. polyMul
572. openSSETail192LoopB:
  573. ADDQ $16, itr2
  574. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  575. shiftB0Left; shiftC0Left; shiftD0Left
  576. shiftB1Left; shiftC1Left; shiftD1Left
  577. shiftB2Left; shiftC2Left; shiftD2Left
  578. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  579. shiftB0Right; shiftC0Right; shiftD0Right
  580. shiftB1Right; shiftC1Right; shiftD1Right
  581. shiftB2Right; shiftC2Right; shiftD2Right
  582. CMPQ itr2, itr1
583. JB openSSETail192LoopA
  584. CMPQ itr2, $160
585. JNE openSSETail192LoopB
  586. CMPQ inl, $176
587. JB openSSETail192Store
  588. polyAdd(160(inp))
  589. polyMul
  590. CMPQ inl, $192
591. JB openSSETail192Store
  592. polyAdd(176(inp))
  593. polyMul
594. openSSETail192Store:
  595. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  596. PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
  597. PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
  598. PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
  599. MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  600. PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
  601. MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
  602. MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
  603. PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  604. MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  605. SUBQ $128, inl
  606. LEAQ 128(inp), inp
  607. LEAQ 128(oup), oup
  608. JMP openSSETail64DecLoop
  609. // ----------------------------------------------------------------------------
  610. // Special optimization for the last 256 bytes of ciphertext
  611. openSSETail256:
  612. // Need to decrypt up to 256 bytes - prepare four blocks
  613. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  614. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  615. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  616. MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  617. // Store counters
  618. MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  619. XORQ itr2, itr2
  620. openSSETail256Loop:
621. // This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
  622. polyAdd(0(inp)(itr2*1))
  623. MOVO C3, tmpStore
  624. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  625. MOVO tmpStore, C3
  626. MOVO C1, tmpStore
  627. chachaQR(A3, B3, C3, D3, C1)
  628. MOVO tmpStore, C1
  629. shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  630. shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  631. shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  632. polyMulStage1
  633. polyMulStage2
  634. MOVO C3, tmpStore
  635. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  636. MOVO tmpStore, C3
  637. MOVO C1, tmpStore
  638. chachaQR(A3, B3, C3, D3, C1)
  639. MOVO tmpStore, C1
  640. polyMulStage3
  641. polyMulReduceStage
  642. shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  643. shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  644. shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  645. ADDQ $2*8, itr2
  646. CMPQ itr2, $160
  647. JB openSSETail256Loop
  648. MOVQ inl, itr1
  649. ANDQ $-16, itr1
  650. openSSETail256HashLoop:
  651. polyAdd(0(inp)(itr2*1))
  652. polyMul
  653. ADDQ $2*8, itr2
  654. CMPQ itr2, itr1
  655. JB openSSETail256HashLoop
  656. // Add in the state
  657. PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  658. PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  659. PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  660. PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  661. MOVO D3, tmpStore
  662. // Load - xor - store
  663. MOVOU (0*16)(inp), D3; PXOR D3, A0
  664. MOVOU (1*16)(inp), D3; PXOR D3, B0
  665. MOVOU (2*16)(inp), D3; PXOR D3, C0
  666. MOVOU (3*16)(inp), D3; PXOR D3, D0
  667. MOVOU A0, (0*16)(oup)
  668. MOVOU B0, (1*16)(oup)
  669. MOVOU C0, (2*16)(oup)
  670. MOVOU D0, (3*16)(oup)
  671. MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  672. PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  673. MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  674. MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
  675. PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  676. MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
  677. LEAQ 192(inp), inp
  678. LEAQ 192(oup), oup
  679. SUBQ $192, inl
  680. MOVO A3, A0
  681. MOVO B3, B0
  682. MOVO C3, C0
  683. MOVO tmpStore, D0
  684. JMP openSSETail64DecLoop
  685. // ----------------------------------------------------------------------------
  686. // ------------------------- AVX2 Code ----------------------------------------
  687. chacha20Poly1305Open_AVX2:
  688. VZEROUPPER
  689. VMOVDQU ·chacha20Constants<>(SB), AA0
  690. BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
  691. BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
  692. BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
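// VBROADCASTI128 is emitted as raw bytes above (presumably not yet supported by the assembler);
// it replicates the three 16-byte state rows (key and counter/nonce) into both 128-bit lanes.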
  693. VPADDD ·avx2InitMask<>(SB), DD0, DD0
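// avx2InitMask leaves the low lane's counter at n and bumps the high lane's to n+1, so DD0 now
// holds the counters for two consecutive blocks.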
694. // Special optimization for very short buffers
  695. CMPQ inl, $192
  696. JBE openAVX2192
  697. CMPQ inl, $320
  698. JBE openAVX2320
699. // For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
  700. VMOVDQA BB0, state1StoreAVX2
  701. VMOVDQA CC0, state2StoreAVX2
  702. VMOVDQA DD0, ctr3StoreAVX2
  703. MOVQ $10, itr2
  704. openAVX2PreparePolyKey:
  705. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  706. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  707. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  708. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  709. DECQ itr2
  710. JNE openAVX2PreparePolyKey
  711. VPADDD ·chacha20Constants<>(SB), AA0, AA0
  712. VPADDD state1StoreAVX2, BB0, BB0
  713. VPADDD state2StoreAVX2, CC0, CC0
  714. VPADDD ctr3StoreAVX2, DD0, DD0
  715. VPERM2I128 $0x02, AA0, BB0, TT0
  716. // Clamp and store poly key
  717. VPAND ·polyClampMask<>(SB), TT0, TT0
  718. VMOVDQA TT0, rsStoreAVX2
  719. // Stream for the first 64 bytes
  720. VPERM2I128 $0x13, AA0, BB0, AA0
  721. VPERM2I128 $0x13, CC0, DD0, BB0
  722. // Hash AD + first 64 bytes
  723. MOVQ ad_len+80(FP), itr2
  724. CALL polyHashADInternal<>(SB)
  725. XORQ itr1, itr1
  726. openAVX2InitialHash64:
  727. polyAdd(0(inp)(itr1*1))
  728. polyMulAVX2
  729. ADDQ $16, itr1
  730. CMPQ itr1, $64
  731. JNE openAVX2InitialHash64
  732. // Decrypt the first 64 bytes
  733. VPXOR (0*32)(inp), AA0, AA0
  734. VPXOR (1*32)(inp), BB0, BB0
  735. VMOVDQU AA0, (0*32)(oup)
  736. VMOVDQU BB0, (1*32)(oup)
  737. LEAQ (2*32)(inp), inp
  738. LEAQ (2*32)(oup), oup
  739. SUBQ $64, inl
  740. openAVX2MainLoop:
  741. CMPQ inl, $512
  742. JB openAVX2MainLoopDone
  743. // Load state, increment counter blocks, store the incremented counters
  744. VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  745. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  746. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  747. VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  748. VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  749. XORQ itr1, itr1
  750. openAVX2InternalLoop:
751. // Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
  752. // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
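// Each (AAi, BBi, CCi, DDi) quartet holds two ChaCha20 blocks, one per 128-bit lane; the four quartets
// yield 512 bytes of keystream per pass, reassembled into per-block order later with VPERM2I128.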
  753. polyAdd(0*8(inp)(itr1*1))
  754. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  755. polyMulStage1_AVX2
  756. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  757. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  758. polyMulStage2_AVX2
  759. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  760. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  761. polyMulStage3_AVX2
  762. VMOVDQA CC3, tmpStoreAVX2
  763. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  764. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  765. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  766. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  767. VMOVDQA tmpStoreAVX2, CC3
  768. polyMulReduceStage
  769. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  770. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  771. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  772. polyAdd(2*8(inp)(itr1*1))
  773. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  774. polyMulStage1_AVX2
  775. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  776. VMOVDQA CC3, tmpStoreAVX2
  777. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  778. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  779. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  780. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  781. VMOVDQA tmpStoreAVX2, CC3
  782. polyMulStage2_AVX2
  783. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  784. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  785. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  786. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  787. polyMulStage3_AVX2
  788. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  789. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  790. polyMulReduceStage
  791. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  792. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  793. polyAdd(4*8(inp)(itr1*1))
  794. LEAQ (6*8)(itr1), itr1
  795. VMOVDQA CC3, tmpStoreAVX2
  796. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  797. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  798. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  799. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  800. VMOVDQA tmpStoreAVX2, CC3
  801. polyMulStage1_AVX2
  802. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  803. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  804. polyMulStage2_AVX2
  805. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  806. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  807. polyMulStage3_AVX2
  808. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  809. VMOVDQA CC3, tmpStoreAVX2
  810. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  811. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  812. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  813. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  814. VMOVDQA tmpStoreAVX2, CC3
  815. polyMulReduceStage
  816. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  817. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  818. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  819. CMPQ itr1, $480
  820. JNE openAVX2InternalLoop
  821. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  822. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  823. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  824. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  825. VMOVDQA CC3, tmpStoreAVX2
  826. // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  827. polyAdd(480(inp))
  828. polyMulAVX2
  829. VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  830. VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  831. VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  832. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  833. VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  834. VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  835. // and here
  836. polyAdd(496(inp))
  837. polyMulAVX2
  838. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  839. VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  840. VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  841. VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  842. VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
  843. VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
  844. LEAQ (32*16)(inp), inp
  845. LEAQ (32*16)(oup), oup
  846. SUBQ $(32*16), inl
  847. JMP openAVX2MainLoop
  848. openAVX2MainLoopDone:
  849. // Handle the various tail sizes efficiently
  850. TESTQ inl, inl
  851. JE openSSEFinalize
  852. CMPQ inl, $128
  853. JBE openAVX2Tail128
  854. CMPQ inl, $256
  855. JBE openAVX2Tail256
  856. CMPQ inl, $384
  857. JBE openAVX2Tail384
  858. JMP openAVX2Tail512
  859. // ----------------------------------------------------------------------------
  860. // Special optimization for buffers smaller than 193 bytes
  861. openAVX2192:
  862. // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
  863. VMOVDQA AA0, AA1
  864. VMOVDQA BB0, BB1
  865. VMOVDQA CC0, CC1
  866. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  867. VMOVDQA AA0, AA2
  868. VMOVDQA BB0, BB2
  869. VMOVDQA CC0, CC2
  870. VMOVDQA DD0, DD2
  871. VMOVDQA DD1, TT3
  872. MOVQ $10, itr2
  873. openAVX2192InnerCipherLoop:
  874. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  875. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  876. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  877. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  878. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  879. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  880. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  881. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  882. DECQ itr2
  883. JNE openAVX2192InnerCipherLoop
  884. VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  885. VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  886. VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  887. VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  888. VPERM2I128 $0x02, AA0, BB0, TT0
  889. // Clamp and store poly key
  890. VPAND ·polyClampMask<>(SB), TT0, TT0
  891. VMOVDQA TT0, rsStoreAVX2
  892. // Stream for up to 192 bytes
  893. VPERM2I128 $0x13, AA0, BB0, AA0
  894. VPERM2I128 $0x13, CC0, DD0, BB0
  895. VPERM2I128 $0x02, AA1, BB1, CC0
  896. VPERM2I128 $0x02, CC1, DD1, DD0
  897. VPERM2I128 $0x13, AA1, BB1, AA1
  898. VPERM2I128 $0x13, CC1, DD1, BB1
  899. openAVX2ShortOpen:
  900. // Hash
  901. MOVQ ad_len+80(FP), itr2
  902. CALL polyHashADInternal<>(SB)
  903. openAVX2ShortOpenLoop:
  904. CMPQ inl, $32
  905. JB openAVX2ShortTail32
  906. SUBQ $32, inl
  907. // Load for hashing
  908. polyAdd(0*8(inp))
  909. polyMulAVX2
  910. polyAdd(2*8(inp))
  911. polyMulAVX2
  912. // Load for decryption
  913. VPXOR (inp), AA0, AA0
  914. VMOVDQU AA0, (oup)
  915. LEAQ (1*32)(inp), inp
  916. LEAQ (1*32)(oup), oup
  917. // Shift stream left
  918. VMOVDQA BB0, AA0
  919. VMOVDQA CC0, BB0
  920. VMOVDQA DD0, CC0
  921. VMOVDQA AA1, DD0
  922. VMOVDQA BB1, AA1
  923. VMOVDQA CC1, BB1
  924. VMOVDQA DD1, CC1
  925. VMOVDQA AA2, DD1
  926. VMOVDQA BB2, AA2
  927. JMP openAVX2ShortOpenLoop
  928. openAVX2ShortTail32:
  929. CMPQ inl, $16
  930. VMOVDQA A0, A1
  931. JB openAVX2ShortDone
  932. SUBQ $16, inl
  933. // Load for hashing
  934. polyAdd(0*8(inp))
  935. polyMulAVX2
  936. // Load for decryption
  937. VPXOR (inp), A0, T0
  938. VMOVDQU T0, (oup)
  939. LEAQ (1*16)(inp), inp
  940. LEAQ (1*16)(oup), oup
  941. VPERM2I128 $0x11, AA0, AA0, AA0
  942. VMOVDQA A0, A1
  943. openAVX2ShortDone:
  944. VZEROUPPER
  945. JMP openSSETail16
  946. // ----------------------------------------------------------------------------
  947. // Special optimization for buffers smaller than 321 bytes
  948. openAVX2320:
  949. // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  950. VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  951. VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  952. VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  953. MOVQ $10, itr2
  954. openAVX2320InnerCipherLoop:
  955. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  956. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  957. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  958. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  959. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  960. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  961. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  962. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  963. DECQ itr2
  964. JNE openAVX2320InnerCipherLoop
  965. VMOVDQA ·chacha20Constants<>(SB), TT0
  966. VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  967. VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  968. VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  969. VMOVDQA ·avx2IncMask<>(SB), TT0
  970. VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  971. VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  972. VPADDD TT3, DD2, DD2
  973. // Clamp and store poly key
  974. VPERM2I128 $0x02, AA0, BB0, TT0
  975. VPAND ·polyClampMask<>(SB), TT0, TT0
  976. VMOVDQA TT0, rsStoreAVX2
  977. // Stream for up to 320 bytes
  978. VPERM2I128 $0x13, AA0, BB0, AA0
  979. VPERM2I128 $0x13, CC0, DD0, BB0
  980. VPERM2I128 $0x02, AA1, BB1, CC0
  981. VPERM2I128 $0x02, CC1, DD1, DD0
  982. VPERM2I128 $0x13, AA1, BB1, AA1
  983. VPERM2I128 $0x13, CC1, DD1, BB1
  984. VPERM2I128 $0x02, AA2, BB2, CC1
  985. VPERM2I128 $0x02, CC2, DD2, DD1
  986. VPERM2I128 $0x13, AA2, BB2, AA2
  987. VPERM2I128 $0x13, CC2, DD2, BB2
  988. JMP openAVX2ShortOpen
  989. // ----------------------------------------------------------------------------
  990. // Special optimization for the last 128 bytes of ciphertext
  991. openAVX2Tail128:
  992. // Need to decrypt up to 128 bytes - prepare two blocks
  993. VMOVDQA ·chacha20Constants<>(SB), AA1
  994. VMOVDQA state1StoreAVX2, BB1
  995. VMOVDQA state2StoreAVX2, CC1
  996. VMOVDQA ctr3StoreAVX2, DD1
  997. VPADDD ·avx2IncMask<>(SB), DD1, DD1
  998. VMOVDQA DD1, DD0
  999. XORQ itr2, itr2
  1000. MOVQ inl, itr1
  1001. ANDQ $-16, itr1
  1002. TESTQ itr1, itr1
  1003. JE openAVX2Tail128LoopB
  1004. openAVX2Tail128LoopA:
  1005. // Perform ChaCha rounds, while hashing the remaining input
  1006. polyAdd(0(inp)(itr2*1))
  1007. polyMulAVX2
  1008. openAVX2Tail128LoopB:
  1009. ADDQ $16, itr2
  1010. chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1011. VPALIGNR $4, BB1, BB1, BB1
  1012. VPALIGNR $8, CC1, CC1, CC1
  1013. VPALIGNR $12, DD1, DD1, DD1
  1014. chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1015. VPALIGNR $12, BB1, BB1, BB1
  1016. VPALIGNR $8, CC1, CC1, CC1
  1017. VPALIGNR $4, DD1, DD1, DD1
  1018. CMPQ itr2, itr1
  1019. JB openAVX2Tail128LoopA
  1020. CMPQ itr2, $160
  1021. JNE openAVX2Tail128LoopB
  1022. VPADDD ·chacha20Constants<>(SB), AA1, AA1
  1023. VPADDD state1StoreAVX2, BB1, BB1
  1024. VPADDD state2StoreAVX2, CC1, CC1
  1025. VPADDD DD0, DD1, DD1
  1026. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1027. openAVX2TailLoop:
  1028. CMPQ inl, $32
  1029. JB openAVX2Tail
  1030. SUBQ $32, inl
  1031. // Load for decryption
  1032. VPXOR (inp), AA0, AA0
  1033. VMOVDQU AA0, (oup)
  1034. LEAQ (1*32)(inp), inp
  1035. LEAQ (1*32)(oup), oup
  1036. VMOVDQA BB0, AA0
  1037. VMOVDQA CC0, BB0
  1038. VMOVDQA DD0, CC0
  1039. JMP openAVX2TailLoop
  1040. openAVX2Tail:
  1041. CMPQ inl, $16
  1042. VMOVDQA A0, A1
  1043. JB openAVX2TailDone
  1044. SUBQ $16, inl
  1045. // Load for decryption
  1046. VPXOR (inp), A0, T0
  1047. VMOVDQU T0, (oup)
  1048. LEAQ (1*16)(inp), inp
  1049. LEAQ (1*16)(oup), oup
  1050. VPERM2I128 $0x11, AA0, AA0, AA0
  1051. VMOVDQA A0, A1
  1052. openAVX2TailDone:
  1053. VZEROUPPER
  1054. JMP openSSETail16
  1055. // ----------------------------------------------------------------------------
  1056. // Special optimization for the last 256 bytes of ciphertext
  1057. openAVX2Tail256:
  1058. // Need to decrypt up to 256 bytes - prepare four blocks
  1059. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
  1060. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
  1061. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
  1062. VMOVDQA ctr3StoreAVX2, DD0
  1063. VPADDD ·avx2IncMask<>(SB), DD0, DD0
  1064. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1065. VMOVDQA DD0, TT1
  1066. VMOVDQA DD1, TT2
  1067. // Compute the number of iterations that will hash data
  1068. MOVQ inl, tmpStoreAVX2
  1069. MOVQ inl, itr1
  1070. SUBQ $128, itr1
  1071. SHRQ $4, itr1
  1072. MOVQ $10, itr2
  1073. CMPQ itr1, $10
  1074. CMOVQGT itr2, itr1
  1075. MOVQ inp, inl
  1076. XORQ itr2, itr2
  1077. openAVX2Tail256LoopA:
  1078. polyAdd(0(inl))
  1079. polyMulAVX2
  1080. LEAQ 16(inl), inl
  1081. // Perform ChaCha rounds, while hashing the remaining input
  1082. openAVX2Tail256LoopB:
  1083. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1084. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  1085. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1086. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  1087. INCQ itr2
  1088. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1089. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  1090. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1091. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1092. CMPQ itr2, itr1
  1093. JB openAVX2Tail256LoopA
  1094. CMPQ itr2, $10
  1095. JNE openAVX2Tail256LoopB
  1096. MOVQ inl, itr2
  1097. SUBQ inp, inl
  1098. MOVQ inl, itr1
  1099. MOVQ tmpStoreAVX2, inl
  1100. // Hash the remainder of data (if any)
  1101. openAVX2Tail256Hash:
  1102. ADDQ $16, itr1
  1103. CMPQ itr1, inl
  1104. JGT openAVX2Tail256HashEnd
1105. polyAdd(0(itr2))
  1106. polyMulAVX2
  1107. LEAQ 16(itr2), itr2
  1108. JMP openAVX2Tail256Hash
  1109. // Store 128 bytes safely, then go to store loop
  1110. openAVX2Tail256HashEnd:
  1111. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  1112. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  1113. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  1114. VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  1115. VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
  1116. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1117. VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
  1118. VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
  1119. LEAQ (4*32)(inp), inp
  1120. LEAQ (4*32)(oup), oup
  1121. SUBQ $4*32, inl
  1122. JMP openAVX2TailLoop
  1123. // ----------------------------------------------------------------------------
  1124. // Special optimization for the last 384 bytes of ciphertext
  1125. openAVX2Tail384:
  1126. // Need to decrypt up to 384 bytes - prepare six blocks
  1127. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  1128. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  1129. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  1130. VMOVDQA ctr3StoreAVX2, DD0
  1131. VPADDD ·avx2IncMask<>(SB), DD0, DD0
  1132. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1133. VPADDD ·avx2IncMask<>(SB), DD1, DD2
  1134. VMOVDQA DD0, ctr0StoreAVX2
  1135. VMOVDQA DD1, ctr1StoreAVX2
  1136. VMOVDQA DD2, ctr2StoreAVX2
  1137. // Compute the number of iterations that will hash two blocks of data
  1138. MOVQ inl, tmpStoreAVX2
  1139. MOVQ inl, itr1
  1140. SUBQ $256, itr1
  1141. SHRQ $4, itr1
  1142. ADDQ $6, itr1
  1143. MOVQ $10, itr2
  1144. CMPQ itr1, $10
  1145. CMOVQGT itr2, itr1
  1146. MOVQ inp, inl
  1147. XORQ itr2, itr2
  1148. // Perform ChaCha rounds, while hashing the remaining input
  1149. openAVX2Tail384LoopB:
  1150. polyAdd(0(inl))
  1151. polyMulAVX2
  1152. LEAQ 16(inl), inl
  1153. openAVX2Tail384LoopA:
  1154. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1155. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1156. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1157. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1158. polyAdd(0(inl))
  1159. polyMulAVX2
  1160. LEAQ 16(inl), inl
  1161. INCQ itr2
  1162. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1163. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1164. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1165. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1166. CMPQ itr2, itr1
  1167. JB openAVX2Tail384LoopB
  1168. CMPQ itr2, $10
  1169. JNE openAVX2Tail384LoopA
  1170. MOVQ inl, itr2
  1171. SUBQ inp, inl
  1172. MOVQ inl, itr1
  1173. MOVQ tmpStoreAVX2, inl
  1174. openAVX2Tail384Hash:
  1175. ADDQ $16, itr1
  1176. CMPQ itr1, inl
  1177. JGT openAVX2Tail384HashEnd
  1178. polyAdd(0(itr2))
  1179. polyMulAVX2
  1180. LEAQ 16(itr2), itr2
  1181. JMP openAVX2Tail384Hash
  1182. // Store 256 bytes safely, then go to store loop
  1183. openAVX2Tail384HashEnd:
  1184. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  1185. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  1186. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  1187. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
  1188. VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
  1189. VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  1190. VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  1191. VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
  1192. VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  1193. VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  1194. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1195. LEAQ (8*32)(inp), inp
  1196. LEAQ (8*32)(oup), oup
  1197. SUBQ $8*32, inl
  1198. JMP openAVX2TailLoop
  1199. // ----------------------------------------------------------------------------
  1200. // Special optimization for the last 512 bytes of ciphertext
  1201. openAVX2Tail512:
  1202. VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1203. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1204. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1205. VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1206. VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1207. XORQ itr1, itr1
  1208. MOVQ inp, itr2
  1209. openAVX2Tail512LoopB:
  1210. polyAdd(0(itr2))
  1211. polyMulAVX2
  1212. LEAQ (2*8)(itr2), itr2
  1213. openAVX2Tail512LoopA:
  1214. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1215. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1216. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1217. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1218. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1219. VMOVDQA CC3, tmpStoreAVX2
  1220. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1221. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1222. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1223. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1224. VMOVDQA tmpStoreAVX2, CC3
  1225. polyAdd(0*8(itr2))
  1226. polyMulAVX2
  1227. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1228. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1229. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1230. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1231. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1232. VMOVDQA CC3, tmpStoreAVX2
  1233. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1234. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1235. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1236. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1237. VMOVDQA tmpStoreAVX2, CC3
  1238. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  1239. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1240. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  1241. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1242. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1243. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1244. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1245. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1246. polyAdd(2*8(itr2))
  1247. polyMulAVX2
  1248. LEAQ (4*8)(itr2), itr2
  1249. VMOVDQA CC3, tmpStoreAVX2
  1250. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1251. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1252. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1253. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1254. VMOVDQA tmpStoreAVX2, CC3
  1255. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1256. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1257. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1258. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1259. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1260. VMOVDQA CC3, tmpStoreAVX2
  1261. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1262. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1263. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1264. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1265. VMOVDQA tmpStoreAVX2, CC3
  1266. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  1267. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1268. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  1269. INCQ itr1
  1270. CMPQ itr1, $4
  1271. JLT openAVX2Tail512LoopB
  1272. CMPQ itr1, $10
  1273. JNE openAVX2Tail512LoopA
  1274. MOVQ inl, itr1
  1275. SUBQ $384, itr1
  1276. ANDQ $-16, itr1
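// The round loop above already absorbed the first 384 bytes of this tail; itr1 now
// holds the remaining length rounded down to a 16-byte multiple, and the loop below
// hashes that remainder before the keystream is XORed into the input.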
  1277. openAVX2Tail512HashLoop:
  1278. TESTQ itr1, itr1
  1279. JE openAVX2Tail512HashEnd
  1280. polyAdd(0(itr2))
  1281. polyMulAVX2
  1282. LEAQ 16(itr2), itr2
  1283. SUBQ $16, itr1
  1284. JMP openAVX2Tail512HashLoop
  1285. openAVX2Tail512HashEnd:
  1286. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1287. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1288. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1289. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1290. VMOVDQA CC3, tmpStoreAVX2
  1291. VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  1292. VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  1293. VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  1294. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1295. VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  1296. VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  1297. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1298. VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  1299. VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  1300. VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  1301. LEAQ (12*32)(inp), inp
  1302. LEAQ (12*32)(oup), oup
  1303. SUBQ $12*32, inl
  1304. JMP openAVX2TailLoop
  1305. // ----------------------------------------------------------------------------
  1306. // ----------------------------------------------------------------------------
  1307. // func chacha20Poly1305Seal(dst, key, src, ad []byte)
  1308. TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
  1309. // For aligned stack access
  1310. MOVQ SP, BP
  1311. ADDQ $32, BP
  1312. ANDQ $-32, BP
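// Rounding BP up to the next 32-byte boundary keeps the (N*32)(BP) spill slots used
// by the AVX2 code 32-byte aligned (and the 16-byte SSE slots aligned as well), so
// the aligned MOVO/VMOVDQA stores throughout this function are legal.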
  1313. MOVQ dst+0(FP), oup
  1314. MOVQ key+24(FP), keyp
  1315. MOVQ src+48(FP), inp
  1316. MOVQ src_len+56(FP), inl
  1317. MOVQ ad+72(FP), adp
  1318. CMPB ·useAVX2(SB), $1
  1319. JE chacha20Poly1305Seal_AVX2
1320. // Special optimization for very short buffers
  1321. CMPQ inl, $128
  1322. JBE sealSSE128 // About 15% faster
  1323. // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
  1324. MOVOU ·chacha20Constants<>(SB), A0
  1325. MOVOU (1*16)(keyp), B0
  1326. MOVOU (2*16)(keyp), C0
  1327. MOVOU (3*16)(keyp), D0
  1328. // Store state on stack for future use
  1329. MOVO B0, state1Store
  1330. MOVO C0, state2Store
  1331. // Load state, increment counter blocks
  1332. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1333. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1334. MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1335. // Store counters
  1336. MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1337. MOVQ $10, itr2
  1338. sealSSEIntroLoop:
  1339. MOVO C3, tmpStore
  1340. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1341. MOVO tmpStore, C3
  1342. MOVO C1, tmpStore
  1343. chachaQR(A3, B3, C3, D3, C1)
  1344. MOVO tmpStore, C1
  1345. shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  1346. shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  1347. shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  1348. MOVO C3, tmpStore
  1349. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1350. MOVO tmpStore, C3
  1351. MOVO C1, tmpStore
  1352. chachaQR(A3, B3, C3, D3, C1)
  1353. MOVO tmpStore, C1
  1354. shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1355. shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1356. shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1357. DECQ itr2
  1358. JNE sealSSEIntroLoop
  1359. // Add in the state
  1360. PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1361. PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1362. PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1363. PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1364. // Clamp and store the key
  1365. PAND ·polyClampMask<>(SB), A0
  1366. MOVO A0, rStore
  1367. MOVO B0, sStore
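// A0 now holds the first 16 bytes of keystream block 0, i.e. the Poly1305 "r" value,
// clamped as Poly1305 requires (top four bits of bytes 3, 7, 11, 15 and bottom two
// bits of bytes 4, 8, 12 cleared); B0 holds "s". Together they form the one-time MAC
// key kept in rStore/sStore.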
  1368. // Hash AAD
  1369. MOVQ ad_len+80(FP), itr2
  1370. CALL polyHashADInternal<>(SB)
  1371. MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1372. PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1373. MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
  1374. MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1375. PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1376. MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
  1377. MOVQ $128, itr1
  1378. SUBQ $128, inl
  1379. LEAQ 128(inp), inp
  1380. MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
  1381. CMPQ inl, $64
  1382. JBE sealSSE128SealHash
  1383. MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1384. PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1385. MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
  1386. ADDQ $64, itr1
  1387. SUBQ $64, inl
  1388. LEAQ 64(inp), inp
  1389. MOVQ $2, itr1
  1390. MOVQ $8, itr2
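// itr1 and itr2 control how Poly1305 is interleaved with the next pass of
// sealSSEMainLoop: the inner loop runs itr2+1 hashing double rounds, and each outer
// iteration adds one more double round plus an extra 16-byte absorption. itr1+itr2
// is always 10, so the full 20 ChaCha rounds are performed, while 16*(itr2 + 2*itr1)
// bytes of already-produced ciphertext are hashed per pass: 192 bytes before the
// first pass here, 256 bytes per pass once itr1/itr2 are reset to 6/4 below.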
  1391. CMPQ inl, $64
  1392. JBE sealSSETail64
  1393. CMPQ inl, $128
  1394. JBE sealSSETail128
  1395. CMPQ inl, $192
  1396. JBE sealSSETail192
  1397. sealSSEMainLoop:
  1398. // Load state, increment counter blocks
  1399. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  1400. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1401. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1402. MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1403. // Store counters
  1404. MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1405. sealSSEInnerLoop:
  1406. MOVO C3, tmpStore
  1407. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1408. MOVO tmpStore, C3
  1409. MOVO C1, tmpStore
  1410. chachaQR(A3, B3, C3, D3, C1)
  1411. MOVO tmpStore, C1
  1412. polyAdd(0(oup))
  1413. shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  1414. shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  1415. shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  1416. polyMulStage1
  1417. polyMulStage2
  1418. LEAQ (2*8)(oup), oup
  1419. MOVO C3, tmpStore
  1420. chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1421. MOVO tmpStore, C3
  1422. MOVO C1, tmpStore
  1423. polyMulStage3
  1424. chachaQR(A3, B3, C3, D3, C1)
  1425. MOVO tmpStore, C1
  1426. polyMulReduceStage
  1427. shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1428. shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1429. shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1430. DECQ itr2
  1431. JGE sealSSEInnerLoop
  1432. polyAdd(0(oup))
  1433. polyMul
  1434. LEAQ (2*8)(oup), oup
  1435. DECQ itr1
  1436. JG sealSSEInnerLoop
  1437. // Add in the state
  1438. PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1439. PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1440. PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1441. PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1442. MOVO D3, tmpStore
  1443. // Load - xor - store
  1444. MOVOU (0*16)(inp), D3; PXOR D3, A0
  1445. MOVOU (1*16)(inp), D3; PXOR D3, B0
  1446. MOVOU (2*16)(inp), D3; PXOR D3, C0
  1447. MOVOU (3*16)(inp), D3; PXOR D3, D0
  1448. MOVOU A0, (0*16)(oup)
  1449. MOVOU B0, (1*16)(oup)
  1450. MOVOU C0, (2*16)(oup)
  1451. MOVOU D0, (3*16)(oup)
  1452. MOVO tmpStore, D3
  1453. MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1454. PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1455. MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1456. MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
  1457. PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1458. MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
  1459. ADDQ $192, inp
  1460. MOVQ $192, itr1
  1461. SUBQ $192, inl
  1462. MOVO A3, A1
  1463. MOVO B3, B1
  1464. MOVO C3, C1
  1465. MOVO D3, D1
  1466. CMPQ inl, $64
  1467. JBE sealSSE128SealHash
  1468. MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1469. PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1470. MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
  1471. LEAQ 64(inp), inp
  1472. SUBQ $64, inl
  1473. MOVQ $6, itr1
  1474. MOVQ $4, itr2
  1475. CMPQ inl, $192
  1476. JG sealSSEMainLoop
  1477. MOVQ inl, itr1
  1478. TESTQ inl, inl
  1479. JE sealSSE128SealHash
  1480. MOVQ $6, itr1
  1481. CMPQ inl, $64
  1482. JBE sealSSETail64
  1483. CMPQ inl, $128
  1484. JBE sealSSETail128
  1485. JMP sealSSETail192
  1486. // ----------------------------------------------------------------------------
  1487. // Special optimization for the last 64 bytes of plaintext
  1488. sealSSETail64:
  1489. // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
  1490. MOVO ·chacha20Constants<>(SB), A1
  1491. MOVO state1Store, B1
  1492. MOVO state2Store, C1
  1493. MOVO ctr3Store, D1
  1494. PADDL ·sseIncMask<>(SB), D1
  1495. MOVO D1, ctr0Store
  1496. sealSSETail64LoopA:
  1497. // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1498. polyAdd(0(oup))
  1499. polyMul
  1500. LEAQ 16(oup), oup
  1501. sealSSETail64LoopB:
  1502. chachaQR(A1, B1, C1, D1, T1)
  1503. shiftB1Left; shiftC1Left; shiftD1Left
  1504. chachaQR(A1, B1, C1, D1, T1)
  1505. shiftB1Right; shiftC1Right; shiftD1Right
  1506. polyAdd(0(oup))
  1507. polyMul
  1508. LEAQ 16(oup), oup
  1509. DECQ itr1
  1510. JG sealSSETail64LoopA
  1511. DECQ itr2
  1512. JGE sealSSETail64LoopB
  1513. PADDL ·chacha20Constants<>(SB), A1
  1514. PADDL state1Store, B1
  1515. PADDL state2Store, C1
  1516. PADDL ctr0Store, D1
  1517. JMP sealSSE128Seal
  1518. // ----------------------------------------------------------------------------
  1519. // Special optimization for the last 128 bytes of plaintext
  1520. sealSSETail128:
  1521. // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
  1522. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1523. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1524. sealSSETail128LoopA:
  1525. // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1526. polyAdd(0(oup))
  1527. polyMul
  1528. LEAQ 16(oup), oup
  1529. sealSSETail128LoopB:
  1530. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1531. shiftB0Left; shiftC0Left; shiftD0Left
  1532. shiftB1Left; shiftC1Left; shiftD1Left
  1533. polyAdd(0(oup))
  1534. polyMul
  1535. LEAQ 16(oup), oup
  1536. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1537. shiftB0Right; shiftC0Right; shiftD0Right
  1538. shiftB1Right; shiftC1Right; shiftD1Right
  1539. DECQ itr1
  1540. JG sealSSETail128LoopA
  1541. DECQ itr2
  1542. JGE sealSSETail128LoopB
  1543. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
  1544. PADDL state1Store, B0; PADDL state1Store, B1
  1545. PADDL state2Store, C0; PADDL state2Store, C1
  1546. PADDL ctr0Store, D0; PADDL ctr1Store, D1
  1547. MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1548. PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1549. MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1550. MOVQ $64, itr1
  1551. LEAQ 64(inp), inp
  1552. SUBQ $64, inl
  1553. JMP sealSSE128SealHash
  1554. // ----------------------------------------------------------------------------
  1555. // Special optimization for the last 192 bytes of plaintext
  1556. sealSSETail192:
  1557. // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
  1558. MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1559. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1560. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
  1561. sealSSETail192LoopA:
  1562. // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1563. polyAdd(0(oup))
  1564. polyMul
  1565. LEAQ 16(oup), oup
  1566. sealSSETail192LoopB:
  1567. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1568. shiftB0Left; shiftC0Left; shiftD0Left
  1569. shiftB1Left; shiftC1Left; shiftD1Left
  1570. shiftB2Left; shiftC2Left; shiftD2Left
  1571. polyAdd(0(oup))
  1572. polyMul
  1573. LEAQ 16(oup), oup
  1574. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1575. shiftB0Right; shiftC0Right; shiftD0Right
  1576. shiftB1Right; shiftC1Right; shiftD1Right
  1577. shiftB2Right; shiftC2Right; shiftD2Right
  1578. DECQ itr1
  1579. JG sealSSETail192LoopA
  1580. DECQ itr2
  1581. JGE sealSSETail192LoopB
  1582. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1583. PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
  1584. PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
  1585. PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
  1586. MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1587. PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1588. MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1589. MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
  1590. PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  1591. MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1592. MOVO A2, A1
  1593. MOVO B2, B1
  1594. MOVO C2, C1
  1595. MOVO D2, D1
  1596. MOVQ $128, itr1
  1597. LEAQ 128(inp), inp
  1598. SUBQ $128, inl
  1599. JMP sealSSE128SealHash
  1600. // ----------------------------------------------------------------------------
  1601. // Special seal optimization for buffers smaller than 129 bytes
  1602. sealSSE128:
1603. // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
  1604. MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
  1605. MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1606. MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1607. MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
  1608. MOVQ $10, itr2
  1609. sealSSE128InnerCipherLoop:
  1610. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1611. shiftB0Left; shiftB1Left; shiftB2Left
  1612. shiftC0Left; shiftC1Left; shiftC2Left
  1613. shiftD0Left; shiftD1Left; shiftD2Left
  1614. chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1615. shiftB0Right; shiftB1Right; shiftB2Right
  1616. shiftC0Right; shiftC1Right; shiftC2Right
  1617. shiftD0Right; shiftD1Right; shiftD2Right
  1618. DECQ itr2
  1619. JNE sealSSE128InnerCipherLoop
  1620. // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  1621. PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1622. PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
  1623. PADDL T2, C1; PADDL T2, C2
  1624. PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
  1625. PAND ·polyClampMask<>(SB), A0
  1626. MOVOU A0, rStore
  1627. MOVOU B0, sStore
  1628. // Hash
  1629. MOVQ ad_len+80(FP), itr2
  1630. CALL polyHashADInternal<>(SB)
  1631. XORQ itr1, itr1
  1632. sealSSE128SealHash:
  1633. // itr1 holds the number of bytes encrypted but not yet hashed
  1634. CMPQ itr1, $16
  1635. JB sealSSE128Seal
  1636. polyAdd(0(oup))
  1637. polyMul
  1638. SUBQ $16, itr1
  1639. ADDQ $16, oup
  1640. JMP sealSSE128SealHash
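// sealSSE128SealHash above is the generic "catch up" loop: it absorbs, 16 bytes at a
// time, ciphertext that has already been written to oup but not hashed yet. A Go-like
// sketch of the same logic (illustrative only, not part of the build):
//
//	for itr1 >= 16 {
//		polyAdd(oup[:16]) // h += block
//		polyMul()         // h *= r (mod 2^130 - 5)
//		itr1 -= 16
//		oup = oup[16:]
//	}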
  1641. sealSSE128Seal:
  1642. CMPQ inl, $16
  1643. JB sealSSETail
  1644. SUBQ $16, inl
1645. // Load for encryption
  1646. MOVOU (inp), T0
  1647. PXOR T0, A1
  1648. MOVOU A1, (oup)
  1649. LEAQ (1*16)(inp), inp
  1650. LEAQ (1*16)(oup), oup
  1651. // Extract for hashing
  1652. MOVQ A1, t0
  1653. PSRLDQ $8, A1
  1654. MOVQ A1, t1
  1655. ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1656. polyMul
  1657. // Shift the stream "left"
  1658. MOVO B1, A1
  1659. MOVO C1, B1
  1660. MOVO D1, C1
  1661. MOVO A2, D1
  1662. MOVO B2, A2
  1663. MOVO C2, B2
  1664. MOVO D2, C2
  1665. JMP sealSSE128Seal
  1666. sealSSETail:
  1667. TESTQ inl, inl
  1668. JE sealSSEFinalize
1669. // We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
  1670. MOVQ inl, itr2
  1671. SHLQ $4, itr2
  1672. LEAQ ·andMask<>(SB), t0
  1673. MOVQ inl, itr1
  1674. LEAQ -1(inp)(inl*1), inp
  1675. XORQ t2, t2
  1676. XORQ t3, t3
  1677. XORQ AX, AX
  1678. sealSSETailLoadLoop:
  1679. SHLQ $8, t2, t3
  1680. SHLQ $8, t2
  1681. MOVB (inp), AX
  1682. XORQ AX, t2
  1683. LEAQ -1(inp), inp
  1684. DECQ itr1
  1685. JNE sealSSETailLoadLoop
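// t3:t2 now hold the final 1-15 plaintext bytes assembled into a little-endian
// 128-bit value. Below it is spilled to the stack, XORed with the last keystream
// block and written out in full (the room reserved for the 16-byte tag gives the
// store slack), then masked with the matching ·andMask<> entry so that only the inl
// valid ciphertext bytes are absorbed into Poly1305.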
  1686. MOVQ t2, 0+tmpStore
  1687. MOVQ t3, 8+tmpStore
  1688. PXOR 0+tmpStore, A1
  1689. MOVOU A1, (oup)
  1690. MOVOU -16(t0)(itr2*1), T0
  1691. PAND T0, A1
  1692. MOVQ A1, t0
  1693. PSRLDQ $8, A1
  1694. MOVQ A1, t1
  1695. ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1696. polyMul
  1697. ADDQ inl, oup
  1698. sealSSEFinalize:
  1699. // Hash in the buffer lengths
  1700. ADDQ ad_len+80(FP), acc0
  1701. ADCQ src_len+56(FP), acc1
  1702. ADCQ $1, acc2
  1703. polyMul
  1704. // Final reduce
  1705. MOVQ acc0, t0
  1706. MOVQ acc1, t1
  1707. MOVQ acc2, t2
  1708. SUBQ $-5, acc0
  1709. SBBQ $-1, acc1
  1710. SBBQ $3, acc2
  1711. CMOVQCS t0, acc0
  1712. CMOVQCS t1, acc1
  1713. CMOVQCS t2, acc2
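// The SUBQ/SBBQ sequence above subtracts 2^130 - 5 expressed as the three limbs
// 0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3; if the subtraction borrows, the
// accumulator was already below the modulus and the CMOVQCS instructions restore the
// saved value. The result is h reduced mod 2^130 - 5, ready for the tag
// computation tag = (h + s) mod 2^128 below.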
  1714. // Add in the "s" part of the key
  1715. ADDQ 0+sStore, acc0
  1716. ADCQ 8+sStore, acc1
  1717. // Finally store the tag at the end of the message
  1718. MOVQ acc0, (0*8)(oup)
  1719. MOVQ acc1, (1*8)(oup)
  1720. RET
  1721. // ----------------------------------------------------------------------------
  1722. // ------------------------- AVX2 Code ----------------------------------------
  1723. chacha20Poly1305Seal_AVX2:
  1724. VZEROUPPER
  1725. VMOVDQU ·chacha20Constants<>(SB), AA0
  1726. BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
  1727. BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
  1728. BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
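// The raw BYTE sequences above are hand-encoded VBROADCASTI128 loads of the key and
// nonce halves into YMM registers (as the trailing comments note), presumably because
// the assembler did not accept the mnemonic when this file was written.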
  1729. VPADDD ·avx2InitMask<>(SB), DD0, DD0
1730. // Special optimizations for very short buffers
  1731. CMPQ inl, $192
  1732. JBE seal192AVX2 // 33% faster
  1733. CMPQ inl, $320
  1734. JBE seal320AVX2 // 17% faster
1735. // For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
  1736. VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1737. VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
  1738. VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
  1739. VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
  1740. VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
  1741. VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
  1742. VMOVDQA DD3, ctr3StoreAVX2
  1743. MOVQ $10, itr2
  1744. sealAVX2IntroLoop:
  1745. VMOVDQA CC3, tmpStoreAVX2
  1746. chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1747. VMOVDQA tmpStoreAVX2, CC3
  1748. VMOVDQA CC1, tmpStoreAVX2
  1749. chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1750. VMOVDQA tmpStoreAVX2, CC1
  1751. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  1752. VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  1753. VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  1754. VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  1755. VMOVDQA CC3, tmpStoreAVX2
  1756. chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1757. VMOVDQA tmpStoreAVX2, CC3
  1758. VMOVDQA CC1, tmpStoreAVX2
  1759. chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1760. VMOVDQA tmpStoreAVX2, CC1
  1761. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  1762. VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  1763. VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  1764. VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  1765. DECQ itr2
  1766. JNE sealAVX2IntroLoop
  1767. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1768. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1769. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1770. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1771. VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
  1772. VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
  1773. VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
  1774. // Clamp and store poly key
  1775. VPAND ·polyClampMask<>(SB), DD0, DD0
  1776. VMOVDQA DD0, rsStoreAVX2
  1777. // Hash AD
  1778. MOVQ ad_len+80(FP), itr2
  1779. CALL polyHashADInternal<>(SB)
  1780. // Can store at least 320 bytes
  1781. VPXOR (0*32)(inp), AA0, AA0
  1782. VPXOR (1*32)(inp), CC0, CC0
  1783. VMOVDQU AA0, (0*32)(oup)
  1784. VMOVDQU CC0, (1*32)(oup)
  1785. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1786. VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
  1787. VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
  1788. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1789. VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
  1790. VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
  1791. MOVQ $320, itr1
  1792. SUBQ $320, inl
  1793. LEAQ 320(inp), inp
  1794. VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
  1795. CMPQ inl, $128
  1796. JBE sealAVX2SealHash
  1797. VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
  1798. VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
  1799. SUBQ $128, inl
  1800. LEAQ 128(inp), inp
  1801. MOVQ $8, itr1
  1802. MOVQ $2, itr2
  1803. CMPQ inl, $128
  1804. JBE sealAVX2Tail128
  1805. CMPQ inl, $256
  1806. JBE sealAVX2Tail256
  1807. CMPQ inl, $384
  1808. JBE sealAVX2Tail384
  1809. CMPQ inl, $512
  1810. JBE sealAVX2Tail512
1811. // We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
  1812. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1813. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1814. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1815. VMOVDQA ctr3StoreAVX2, DD0
  1816. VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1817. VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1818. VMOVDQA CC3, tmpStoreAVX2
  1819. chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1820. VMOVDQA tmpStoreAVX2, CC3
  1821. VMOVDQA CC1, tmpStoreAVX2
  1822. chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1823. VMOVDQA tmpStoreAVX2, CC1
  1824. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  1825. VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  1826. VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  1827. VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  1828. VMOVDQA CC3, tmpStoreAVX2
  1829. chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1830. VMOVDQA tmpStoreAVX2, CC3
  1831. VMOVDQA CC1, tmpStoreAVX2
  1832. chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1833. VMOVDQA tmpStoreAVX2, CC1
  1834. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  1835. VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  1836. VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  1837. VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  1838. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1839. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1840. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1841. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1842. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1843. VMOVDQA CC3, tmpStoreAVX2
  1844. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1845. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1846. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1847. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1848. VMOVDQA tmpStoreAVX2, CC3
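// The first double round (and the first half of the second) has already been
// performed above, so the interleaved loop is entered at sealAVX2InternalLoopStart
// with only 9 further passes needed to complete the 10 double rounds; oup is moved
// back 16 bytes so the polyAdd offsets inside the loop line up with the 448 bytes of
// ciphertext that still need to be hashed.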
  1849. SUBQ $16, oup // Adjust the pointer
  1850. MOVQ $9, itr1
  1851. JMP sealAVX2InternalLoopStart
  1852. sealAVX2MainLoop:
  1853. // Load state, increment counter blocks, store the incremented counters
  1854. VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1855. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1856. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1857. VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1858. VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1859. MOVQ $10, itr1
  1860. sealAVX2InternalLoop:
  1861. polyAdd(0*8(oup))
  1862. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1863. polyMulStage1_AVX2
  1864. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1865. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1866. polyMulStage2_AVX2
  1867. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1868. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1869. polyMulStage3_AVX2
  1870. VMOVDQA CC3, tmpStoreAVX2
  1871. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1872. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1873. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1874. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1875. VMOVDQA tmpStoreAVX2, CC3
  1876. polyMulReduceStage
  1877. sealAVX2InternalLoopStart:
  1878. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1879. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1880. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1881. polyAdd(2*8(oup))
  1882. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1883. polyMulStage1_AVX2
  1884. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1885. VMOVDQA CC3, tmpStoreAVX2
  1886. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1887. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1888. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1889. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1890. VMOVDQA tmpStoreAVX2, CC3
  1891. polyMulStage2_AVX2
  1892. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  1893. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1894. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  1895. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1896. polyMulStage3_AVX2
  1897. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1898. VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1899. polyMulReduceStage
  1900. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1901. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1902. polyAdd(4*8(oup))
  1903. LEAQ (6*8)(oup), oup
  1904. VMOVDQA CC3, tmpStoreAVX2
  1905. VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1906. VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1907. VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1908. VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1909. VMOVDQA tmpStoreAVX2, CC3
  1910. polyMulStage1_AVX2
  1911. VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1912. VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1913. polyMulStage2_AVX2
  1914. VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1915. VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1916. polyMulStage3_AVX2
  1917. VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1918. VMOVDQA CC3, tmpStoreAVX2
  1919. VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1920. VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1921. VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1922. VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1923. VMOVDQA tmpStoreAVX2, CC3
  1924. polyMulReduceStage
  1925. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  1926. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1927. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  1928. DECQ itr1
  1929. JNE sealAVX2InternalLoop
  1930. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1931. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1932. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1933. VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1934. VMOVDQA CC3, tmpStoreAVX2
  1935. // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  1936. polyAdd(0*8(oup))
  1937. polyMulAVX2
  1938. LEAQ (4*8)(oup), oup
  1939. VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  1940. VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  1941. VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  1942. VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1943. VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  1944. VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  1945. // and here
  1946. polyAdd(-2*8(oup))
  1947. polyMulAVX2
  1948. VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1949. VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  1950. VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  1951. VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  1952. VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
  1953. VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
  1954. LEAQ (32*16)(inp), inp
  1955. SUBQ $(32*16), inl
  1956. CMPQ inl, $512
  1957. JG sealAVX2MainLoop
  1958. // Tail can only hash 480 bytes
  1959. polyAdd(0*8(oup))
  1960. polyMulAVX2
  1961. polyAdd(2*8(oup))
  1962. polyMulAVX2
  1963. LEAQ 32(oup), oup
  1964. MOVQ $10, itr1
  1965. MOVQ $0, itr2
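// itr1 and itr2 tell the seal tails how much catching up Poly1305 has to do while the
// final ChaCha rounds run: each tail absorbs 48*itr1 + 32*itr2 bytes over itr1+itr2 = 10
// double rounds - 480 bytes when arriving from here (after the main loop), 448 bytes
// when the tails are reached before the main loop with itr1=8, itr2=2.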
  1966. CMPQ inl, $128
  1967. JBE sealAVX2Tail128
  1968. CMPQ inl, $256
  1969. JBE sealAVX2Tail256
  1970. CMPQ inl, $384
  1971. JBE sealAVX2Tail384
  1972. JMP sealAVX2Tail512
  1973. // ----------------------------------------------------------------------------
  1974. // Special optimization for buffers smaller than 193 bytes
  1975. seal192AVX2:
  1976. // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
  1977. VMOVDQA AA0, AA1
  1978. VMOVDQA BB0, BB1
  1979. VMOVDQA CC0, CC1
  1980. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1981. VMOVDQA AA0, AA2
  1982. VMOVDQA BB0, BB2
  1983. VMOVDQA CC0, CC2
  1984. VMOVDQA DD0, DD2
  1985. VMOVDQA DD1, TT3
  1986. MOVQ $10, itr2
  1987. sealAVX2192InnerCipherLoop:
  1988. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1989. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  1990. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1991. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  1992. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1993. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  1994. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1995. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1996. DECQ itr2
  1997. JNE sealAVX2192InnerCipherLoop
  1998. VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  1999. VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  2000. VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  2001. VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  2002. VPERM2I128 $0x02, AA0, BB0, TT0
  2003. // Clamp and store poly key
  2004. VPAND ·polyClampMask<>(SB), TT0, TT0
  2005. VMOVDQA TT0, rsStoreAVX2
  2006. // Stream for up to 192 bytes
  2007. VPERM2I128 $0x13, AA0, BB0, AA0
  2008. VPERM2I128 $0x13, CC0, DD0, BB0
  2009. VPERM2I128 $0x02, AA1, BB1, CC0
  2010. VPERM2I128 $0x02, CC1, DD1, DD0
  2011. VPERM2I128 $0x13, AA1, BB1, AA1
  2012. VPERM2I128 $0x13, CC1, DD1, BB1
  2013. sealAVX2ShortSeal:
  2014. // Hash aad
  2015. MOVQ ad_len+80(FP), itr2
  2016. CALL polyHashADInternal<>(SB)
  2017. XORQ itr1, itr1
  2018. sealAVX2SealHash:
  2019. // itr1 holds the number of bytes encrypted but not yet hashed
  2020. CMPQ itr1, $16
  2021. JB sealAVX2ShortSealLoop
  2022. polyAdd(0(oup))
  2023. polyMul
  2024. SUBQ $16, itr1
  2025. ADDQ $16, oup
  2026. JMP sealAVX2SealHash
  2027. sealAVX2ShortSealLoop:
  2028. CMPQ inl, $32
  2029. JB sealAVX2ShortTail32
  2030. SUBQ $32, inl
  2031. // Load for encryption
  2032. VPXOR (inp), AA0, AA0
  2033. VMOVDQU AA0, (oup)
  2034. LEAQ (1*32)(inp), inp
2035. // Now we can hash
  2036. polyAdd(0*8(oup))
  2037. polyMulAVX2
  2038. polyAdd(2*8(oup))
  2039. polyMulAVX2
  2040. LEAQ (1*32)(oup), oup
  2041. // Shift stream left
  2042. VMOVDQA BB0, AA0
  2043. VMOVDQA CC0, BB0
  2044. VMOVDQA DD0, CC0
  2045. VMOVDQA AA1, DD0
  2046. VMOVDQA BB1, AA1
  2047. VMOVDQA CC1, BB1
  2048. VMOVDQA DD1, CC1
  2049. VMOVDQA AA2, DD1
  2050. VMOVDQA BB2, AA2
  2051. JMP sealAVX2ShortSealLoop
  2052. sealAVX2ShortTail32:
  2053. CMPQ inl, $16
  2054. VMOVDQA A0, A1
  2055. JB sealAVX2ShortDone
  2056. SUBQ $16, inl
  2057. // Load for encryption
  2058. VPXOR (inp), A0, T0
  2059. VMOVDQU T0, (oup)
  2060. LEAQ (1*16)(inp), inp
  2061. // Hash
  2062. polyAdd(0*8(oup))
  2063. polyMulAVX2
  2064. LEAQ (1*16)(oup), oup
  2065. VPERM2I128 $0x11, AA0, AA0, AA0
  2066. VMOVDQA A0, A1
  2067. sealAVX2ShortDone:
  2068. VZEROUPPER
  2069. JMP sealSSETail
  2070. // ----------------------------------------------------------------------------
  2071. // Special optimization for buffers smaller than 321 bytes
  2072. seal320AVX2:
  2073. // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  2074. VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  2075. VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2076. VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  2077. MOVQ $10, itr2
  2078. sealAVX2320InnerCipherLoop:
  2079. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2080. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2081. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2082. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2083. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2084. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2085. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2086. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2087. DECQ itr2
  2088. JNE sealAVX2320InnerCipherLoop
  2089. VMOVDQA ·chacha20Constants<>(SB), TT0
  2090. VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  2091. VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  2092. VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  2093. VMOVDQA ·avx2IncMask<>(SB), TT0
  2094. VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  2095. VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  2096. VPADDD TT3, DD2, DD2
  2097. // Clamp and store poly key
  2098. VPERM2I128 $0x02, AA0, BB0, TT0
  2099. VPAND ·polyClampMask<>(SB), TT0, TT0
  2100. VMOVDQA TT0, rsStoreAVX2
  2101. // Stream for up to 320 bytes
  2102. VPERM2I128 $0x13, AA0, BB0, AA0
  2103. VPERM2I128 $0x13, CC0, DD0, BB0
  2104. VPERM2I128 $0x02, AA1, BB1, CC0
  2105. VPERM2I128 $0x02, CC1, DD1, DD0
  2106. VPERM2I128 $0x13, AA1, BB1, AA1
  2107. VPERM2I128 $0x13, CC1, DD1, BB1
  2108. VPERM2I128 $0x02, AA2, BB2, CC1
  2109. VPERM2I128 $0x02, CC2, DD2, DD1
  2110. VPERM2I128 $0x13, AA2, BB2, AA2
  2111. VPERM2I128 $0x13, CC2, DD2, BB2
  2112. JMP sealAVX2ShortSeal
  2113. // ----------------------------------------------------------------------------
2114. // Special optimization for the last 128 bytes of plaintext
2115. sealAVX2Tail128:
2116. // Need to encrypt up to 128 bytes - prepare two blocks
2117. // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2118. // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2119. VMOVDQA ·chacha20Constants<>(SB), AA0
  2120. VMOVDQA state1StoreAVX2, BB0
  2121. VMOVDQA state2StoreAVX2, CC0
  2122. VMOVDQA ctr3StoreAVX2, DD0
  2123. VPADDD ·avx2IncMask<>(SB), DD0, DD0
  2124. VMOVDQA DD0, DD1
  2125. sealAVX2Tail128LoopA:
  2126. polyAdd(0(oup))
  2127. polyMul
  2128. LEAQ 16(oup), oup
  2129. sealAVX2Tail128LoopB:
  2130. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2131. polyAdd(0(oup))
  2132. polyMul
  2133. VPALIGNR $4, BB0, BB0, BB0
  2134. VPALIGNR $8, CC0, CC0, CC0
  2135. VPALIGNR $12, DD0, DD0, DD0
  2136. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2137. polyAdd(16(oup))
  2138. polyMul
  2139. LEAQ 32(oup), oup
  2140. VPALIGNR $12, BB0, BB0, BB0
  2141. VPALIGNR $8, CC0, CC0, CC0
  2142. VPALIGNR $4, DD0, DD0, DD0
  2143. DECQ itr1
  2144. JG sealAVX2Tail128LoopA
  2145. DECQ itr2
  2146. JGE sealAVX2Tail128LoopB
  2147. VPADDD ·chacha20Constants<>(SB), AA0, AA1
  2148. VPADDD state1StoreAVX2, BB0, BB1
  2149. VPADDD state2StoreAVX2, CC0, CC1
  2150. VPADDD DD1, DD0, DD1
  2151. VPERM2I128 $0x02, AA1, BB1, AA0
  2152. VPERM2I128 $0x02, CC1, DD1, BB0
  2153. VPERM2I128 $0x13, AA1, BB1, CC0
  2154. VPERM2I128 $0x13, CC1, DD1, DD0
  2155. JMP sealAVX2ShortSealLoop
  2156. // ----------------------------------------------------------------------------
2157. // Special optimization for the last 256 bytes of plaintext
2158. sealAVX2Tail256:
2159. // Need to encrypt up to 256 bytes - prepare four blocks
2160. // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2161. // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2162. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
  2163. VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
  2164. VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
  2165. VMOVDQA ctr3StoreAVX2, DD0
  2166. VPADDD ·avx2IncMask<>(SB), DD0, DD0
  2167. VPADDD ·avx2IncMask<>(SB), DD0, DD1
  2168. VMOVDQA DD0, TT1
  2169. VMOVDQA DD1, TT2
  2170. sealAVX2Tail256LoopA:
  2171. polyAdd(0(oup))
  2172. polyMul
  2173. LEAQ 16(oup), oup
  2174. sealAVX2Tail256LoopB:
  2175. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2176. polyAdd(0(oup))
  2177. polyMul
  2178. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2179. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2180. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2181. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2182. polyAdd(16(oup))
  2183. polyMul
  2184. LEAQ 32(oup), oup
  2185. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2186. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2187. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2188. DECQ itr1
  2189. JG sealAVX2Tail256LoopA
  2190. DECQ itr2
  2191. JGE sealAVX2Tail256LoopB
  2192. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  2193. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  2194. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  2195. VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  2196. VPERM2I128 $0x02, AA0, BB0, TT0
  2197. VPERM2I128 $0x02, CC0, DD0, TT1
  2198. VPERM2I128 $0x13, AA0, BB0, TT2
  2199. VPERM2I128 $0x13, CC0, DD0, TT3
  2200. VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2201. VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2202. MOVQ $128, itr1
  2203. LEAQ 128(inp), inp
  2204. SUBQ $128, inl
  2205. VPERM2I128 $0x02, AA1, BB1, AA0
  2206. VPERM2I128 $0x02, CC1, DD1, BB0
  2207. VPERM2I128 $0x13, AA1, BB1, CC0
  2208. VPERM2I128 $0x13, CC1, DD1, DD0
  2209. JMP sealAVX2SealHash
  2210. // ----------------------------------------------------------------------------
2211. // Special optimization for the last 384 bytes of plaintext
2212. sealAVX2Tail384:
2213. // Need to encrypt up to 384 bytes - prepare six blocks
2214. // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2215. // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2216. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  2217. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  2218. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  2219. VMOVDQA ctr3StoreAVX2, DD0
  2220. VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2221. VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
  2222. sealAVX2Tail384LoopA:
  2223. polyAdd(0(oup))
  2224. polyMul
  2225. LEAQ 16(oup), oup
  2226. sealAVX2Tail384LoopB:
  2227. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2228. polyAdd(0(oup))
  2229. polyMul
  2230. VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2231. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2232. VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2233. chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2234. polyAdd(16(oup))
  2235. polyMul
  2236. LEAQ 32(oup), oup
  2237. VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2238. VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2239. VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2240. DECQ itr1
  2241. JG sealAVX2Tail384LoopA
  2242. DECQ itr2
  2243. JGE sealAVX2Tail384LoopB
  2244. VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  2245. VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  2246. VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  2247. VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
  2248. VPERM2I128 $0x02, AA0, BB0, TT0
  2249. VPERM2I128 $0x02, CC0, DD0, TT1
  2250. VPERM2I128 $0x13, AA0, BB0, TT2
  2251. VPERM2I128 $0x13, CC0, DD0, TT3
  2252. VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2253. VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2254. VPERM2I128 $0x02, AA1, BB1, TT0
  2255. VPERM2I128 $0x02, CC1, DD1, TT1
  2256. VPERM2I128 $0x13, AA1, BB1, TT2
  2257. VPERM2I128 $0x13, CC1, DD1, TT3
  2258. VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  2259. VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  2260. MOVQ $256, itr1
  2261. LEAQ 256(inp), inp
  2262. SUBQ $256, inl
  2263. VPERM2I128 $0x02, AA2, BB2, AA0
  2264. VPERM2I128 $0x02, CC2, DD2, BB0
  2265. VPERM2I128 $0x13, AA2, BB2, CC0
  2266. VPERM2I128 $0x13, CC2, DD2, DD0
  2267. JMP sealAVX2SealHash
  2268. // ----------------------------------------------------------------------------
2269. // Special optimization for the last 512 bytes of plaintext
2270. sealAVX2Tail512:
2271. // Need to encrypt up to 512 bytes - prepare eight blocks
2272. // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2273. // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2274. VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2275. VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2276. VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2277. VMOVDQA ctr3StoreAVX2, DD0
  2278. VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2279. VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
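// Reference quarter round applied by the loop below (each line of four AVX2
// instructions advances all eight blocks in parallel, two per YMM register):
//   a += b; d ^= a; d = rol32(d, 16)
//   c += d; b ^= c; b = rol32(b, 12)
//   a += b; d ^= a; d = rol32(d, 8)
//   c += d; b ^= c; b = rol32(b, 7)
// As in the 384-byte tail, sealAVX2Tail512LoopA hashes one extra block of
// already-written ciphertext per pass and sealAVX2Tail512LoopB interleaves one
// double round with two more Poly1305 blocks.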
sealAVX2Tail512LoopA:
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup
sealAVX2Tail512LoopB:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
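// AVX2 has no 32-bit rotate instruction, so rol32(b, 12) is emulated below
// with a left shift, a right shift and an XOR; CC3 is spilled to the stack so
// it can serve as the shift scratch register.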
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
polyAdd(0*8(oup))
polyMulAVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(oup))
polyMulAVX2
LEAQ (4*8)(oup), oup
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
DECQ itr1
JG sealAVX2Tail512LoopA
DECQ itr2
JGE sealAVX2Tail512LoopB
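// All rounds are done; add the initial state back to obtain the keystream for
// the eight blocks (512 bytes), using the counters saved in
// ctr0StoreAVX2..ctr3StoreAVX2.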
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
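// CC3 is spilled once more so it can be used as scratch while the first 128
// bytes are deinterleaved, XORed with the input and written out; its saved
// value is read directly from tmpStoreAVX2 when the fourth register set is
// emitted below.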
VMOVDQA CC3, tmpStoreAVX2
VPERM2I128 $0x02, AA0, BB0, CC3
VPXOR (0*32)(inp), CC3, CC3
VMOVDQU CC3, (0*32)(oup)
VPERM2I128 $0x02, CC0, DD0, CC3
VPXOR (1*32)(inp), CC3, CC3
VMOVDQU CC3, (1*32)(oup)
VPERM2I128 $0x13, AA0, BB0, CC3
VPXOR (2*32)(inp), CC3, CC3
VMOVDQU CC3, (2*32)(oup)
VPERM2I128 $0x13, CC0, DD0, CC3
VPXOR (3*32)(inp), CC3, CC3
VMOVDQU CC3, (3*32)(oup)
VPERM2I128 $0x02, AA1, BB1, AA0
VPERM2I128 $0x02, CC1, DD1, BB0
VPERM2I128 $0x13, AA1, BB1, CC0
VPERM2I128 $0x13, CC1, DD1, DD0
VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
VPERM2I128 $0x02, AA2, BB2, AA0
VPERM2I128 $0x02, CC2, DD2, BB0
VPERM2I128 $0x13, AA2, BB2, CC0
VPERM2I128 $0x13, CC2, DD2, DD0
VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
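// 384 bytes of ciphertext are now written. Hand them to sealAVX2SealHash for
// hashing (itr1 = 384) and leave the keystream for the final 128 bytes or less
// in AA0, BB0, CC0, DD0.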
MOVQ $384, itr1
LEAQ 384(inp), inp
SUBQ $384, inl
VPERM2I128 $0x02, AA3, BB3, AA0
VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
VPERM2I128 $0x13, AA3, BB3, CC0
VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
JMP sealAVX2SealHash