From 29dea9ae1f54e8eae695edbbd4dbc9982c32c489 Mon Sep 17 00:00:00 2001 From: brent s Date: Tue, 10 Jan 2023 07:24:12 -0500 Subject: [PATCH] initial release. tests pass at least. --- .gitignore | 2 +- README.adoc | 21 +- _patches/chacha20_nonce.patch | 13 + _patches/chacha20poly1305_nonce.patch | 13 + algos_test.go | 34 + consts.go | 21 +- consts_test.go | 91 +- errs.go | 11 + funcs.go | 33 + funcs_chacha20poly1305_ssh.go | 100 + funcs_test.go | 53 + types.go | 11 + .../chacha20poly1305/chacha20poly1305.go | 100 + .../chacha20poly1305_amd64.go | 87 + .../chacha20poly1305/chacha20poly1305_amd64.s | 2696 +++++++++++++++++ .../chacha20poly1305_generic.go | 81 + .../chacha20poly1305_noasm.go | 16 + .../chacha20poly1305/xchacha20poly1305.go | 86 + vendor/modules.txt | 1 + 19 files changed, 3443 insertions(+), 27 deletions(-) create mode 100644 _patches/chacha20_nonce.patch create mode 100644 _patches/chacha20poly1305_nonce.patch create mode 100644 algos_test.go create mode 100644 errs.go create mode 100644 funcs.go create mode 100644 funcs_chacha20poly1305_ssh.go create mode 100644 funcs_test.go create mode 100644 types.go create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/xchacha20poly1305.go diff --git a/.gitignore b/.gitignore index 7756a87..61e5e34 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,4 @@ .idea/ # Dependency directories (remove the comment below to include it) -# vendor/ +!/vendor/ diff --git a/README.adoc b/README.adoc index 9ea7be3..d6df2c7 100644 --- a/README.adoc +++ b/README.adoc @@ -18,33 +18,42 @@ Last updated {localdatetime} A Golang library variant of ChaCha20-Poly1305 that OpenSSH uses (`chacha20-poly1305@openssh.com`). -Note that this module *only* supports the OpenSSH variant, and should only be used for key generation/parsing/modification/manipulation, not actual connection/stream encryption. +WARNING: Note that this module *only* supports the OpenSSH variant, and should only be used for key generation/parsing/modification/manipulation, not actual connection/stream encryption. -== Why is this necessary? +== Usage +The usage of this library is *very* limited in scope; it's only intended for low-level OpenSSH key operations. + +Primarily, for use with https://git.r00t2.io/r00t2/go_sshkeys/src/branch/master[go_sshkeys^]. + +== FAQ + +=== Why is this necessary? Because Golang.org/x/crypto https://github.com/golang/go/issues/36646[removes functionality^] (even for https://github.com/golang/go/issues/44226[very common tech^]) and thinks OpenSSH is a "weird" use case. That's a direct reference; they called it "weird". I *really, really* hope this library is https://github.com/golang/go/issues/57699[no longer necessary^] by the time I'm done writing it but based on my past experiences with core Golang devs, my expectations are extremely low. +_Narrator: It was still necessary ._ + They have no decent support for OpenSSH keys or lower-level operations. And guess what -- sometimes you *need* lower-level functionality. Who knew? -So now because I'm just a single individual, bug fixes will probably lag behind upstream. All because Golang devs decided the OpenSSH variant was "too weird". +So now because I'm just a single individual, bug fixes will probably lag behind upstream if I have to re-vendor because I'm not maintaining an entire fork of golang.org/x/crypto. All because Golang devs decided the OpenSSH variant was "too weird". But, of course, not "weird" enough to https://github.com/golang/crypto/blob/3d872d042823aed41f28af3b13beb27c0c9b1e35/ssh/cipher.go#L652[not support the *wire* protocol^] for SSH. Just the key encryption. Because of course. And not publicly exposed either. Because *of course*. Assholes. -== Why is the name so ugly? +=== Why is the name so ugly? I couldn't think of a better one and I wanted something notably distinct from the stdlib-x naming. And module names can't include the `@` symbol. -== Why don't you expose the rest of ChaCha20/Poly1305/ChaCha20-Poly1305? +=== Why don't you expose the rest of low-level ChaCha20/Poly1305/ChaCha20-Poly1305? * To keep code changes from upstream light (and thus easier to debug, audit, etc.) * Because otherwise the module name is inaccurate ** Because OpenSSH has their own specific variant ** Which means we can handle SSH-specific functionality if needed -* Because golang/x/crypto has made it painfully clear that if you want something that deviates from what they *think* is "best practice", you need to do it yourself +* Because golang.org/x/crypto has made it painfully clear that if you want something that deviates from what they *think* is "best practice", you need to do it yourself ** Which ironically is something they also brand an "anti-pattern" which is just \*chef's kiss* diff --git a/_patches/chacha20_nonce.patch b/_patches/chacha20_nonce.patch new file mode 100644 index 0000000..19aa19d --- /dev/null +++ b/_patches/chacha20_nonce.patch @@ -0,0 +1,13 @@ +--- a/vendor/golang.org/x/crypto/chacha20/chacha_generic.go 2023-01-09 05:16:30.912219212 -0500 ++++ b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go 2023-01-10 02:20:55.150612278 -0500 +@@ -24,7 +24,9 @@ + // + // Note that this is too short to be safely generated at random if the same + // key is reused more than 2³² times. +- NonceSize = 12 ++ //NonceSize = 12 ++ // BITE ME, GOLANG DEVS. ++ NonceSize = 16 + + // NonceSizeX is the size of the nonce used with the XChaCha20 variant of + // this cipher, in bytes. diff --git a/_patches/chacha20poly1305_nonce.patch b/_patches/chacha20poly1305_nonce.patch new file mode 100644 index 0000000..20940e1 --- /dev/null +++ b/_patches/chacha20poly1305_nonce.patch @@ -0,0 +1,13 @@ +--- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go 2023-01-09 05:16:30.912219212 -0500 ++++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go 2023-01-10 02:58:31.006536303 -0500 +@@ -21,7 +21,9 @@ + // + // Note that this is too short to be safely generated at random if the same + // key is reused more than 2³² times. +- NonceSize = 12 ++ // NonceSize = 12 ++ // BITE ME, GOLANG DEVS. ++ NonceSize = 16 + + // NonceSizeX is the size of the nonce used with the XChaCha20-Poly1305 + // variant of this AEAD, in bytes. diff --git a/algos_test.go b/algos_test.go new file mode 100644 index 0000000..4c5a151 --- /dev/null +++ b/algos_test.go @@ -0,0 +1,34 @@ +package cc20p1305ssh + +import ( + `testing` + + `golang.org/x/crypto/chacha20` + `golang.org/x/crypto/chacha20poly1305` +) + +func TestChaCha20FixedNonceSize(t *testing.T) { + + if NonceSize != chacha20.NonceSize { + t.Fatal( + "Mismatched ChaCha20 nonce sizes;\n" + + "\tgolang.org/x/crypto/chacha20 only supports 12 or 24 bytes nonce size,\n\t" + + "the chacha20poly1305@openssh.com variant only supports a nonce of 16 bytes.\n\t" + + "vendor/golang.org/x/crypto/chacha20/chacha_generic.go:NonceSize must be modified.", + ) + } + t.Log("The nonce size modification for chacha20 is correct.") +} + +func TestChaCha20Poly1305FixedNonceSize(t *testing.T) { + + if NonceSize != chacha20poly1305.NonceSize { + t.Fatal( + "Mismatched ChaCha20Poly1305 nonce sizes;\n" + + "\tgolang.org/x/crypto/chacha20poly1305 only supports 12 or 24 bytes nonce size,\n\t" + + "the chacha20poly1305@openssh.com variant only supports a nonce of 16 bytes.\n\t" + + "vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go:NonceSize must be modified.", + ) + } + t.Log("The nonce size modification for chacha20poly1305 is correct.") +} diff --git a/consts.go b/consts.go index 76188e2..832c927 100644 --- a/consts.go +++ b/consts.go @@ -35,14 +35,27 @@ const ( */ NonceSize int = 16 - // TagLen is the length of the Poly1305 tag. - TagLen int = poly1305.TagSize + // PolyKeySize is the amount of the cipher result of chacha20. + PolyKeySize int = 32 - // DefaultRounds specifies the number of default rounds to use if using the provided KDF derivation and the specified rounds are 0 or negative. - DefaultRounds int = 16 + // TagSize is the length of the Poly1305 tag. + TagSize int = poly1305.TagSize ) var ( + // initBlock is used at counter 0 in chacha20 to get the poly1305 key. + initBlock []byte = []byte{ + // 64 bytes + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + } + // iv is the constant fixed IV. iv []byte = []byte{ 0x0, 0x0, 0x0, 0x0, diff --git a/consts_test.go b/consts_test.go index 5b7012b..cf14a7f 100644 --- a/consts_test.go +++ b/consts_test.go @@ -1,20 +1,79 @@ package cc20p1305ssh -import ( - `testing` +var ( + /* + testKdfKey is the key that would be returned by bcrypt_pbkdf. - `golang.org/x/crypto/chacha20` -) - -func TestFixedNonceSize(t *testing.T) { - - if NonceSize != chacha20.NonceSize { - t.Error( - "Mismatched ChaCha20 nonce sizes;\n" + - "\tgolang.org/x/crypto/chacha20 only supports 12 or 24 bytes nonce size,\n\t" + - "the chacha20poly1305@openssh.com variant only supports a nonce of 16 bytes.", - ) - t.Fail() + It would be created using the passphrase (seed) "test", with 16 rounds, + and 64 bytes keysize. + */ + testKdfKey []byte = []byte{ + // 64 bytes + 0xd7, 0xe4, 0x7e, 0xf3, 0x7c, 0xfd, 0x80, 0x6d, + 0x48, 0xfd, 0x8b, 0x46, 0xec, 0x53, 0x51, 0xef, + 0x3f, 0x15, 0xf8, 0xb0, 0xa2, 0x30, 0xcf, 0x60, + 0x2c, 0x09, 0xdd, 0x44, 0x91, 0xe9, 0x54, 0xc8, + 0x72, 0x03, 0x15, 0xe4, 0x83, 0x54, 0x30, 0xca, + 0x8a, 0xe1, 0x9b, 0x15, 0x3f, 0xbc, 0xbe, 0x45, + 0x17, 0xc5, 0x03, 0xe0, 0xc9, 0xda, 0x77, 0xff, + 0xe5, 0x95, 0xa3, 0x33, 0x17, 0xad, 0x4d, 0xe7, } - t.Log("The nonce size modification is correct.") -} + + // testPoly1305Tag is the poly1305 tag. + testPoly1305Tag []byte = []byte{ + // 16 bytes + 0x7d, 0x98, 0x49, 0xd7, + 0x60, 0x2b, 0x49, 0xf9, + 0xcf, 0xa0, 0x2d, 0x9f, + 0x39, 0x66, 0x2c, 0x66, + } + + // testEncBlock is the encrypted private key of an example ssh-ed25519 private key, encrypted with the passphrase "test" (block "4.0.1" in go_sshkeys). + // We skip the KDF for tests, though (see testKdfKey). + testEncBlock []byte = []byte{ + // 152 bytes + 0x9b, 0x24, 0x63, 0x64, 0xe7, 0xbe, 0xf7, 0xc1, + 0xc7, 0x20, 0xe1, 0x0e, 0xce, 0x79, 0x3b, 0x7c, + 0xe0, 0x61, 0xa4, 0xa8, 0xf3, 0x2c, 0x18, 0x36, + 0x3b, 0xb3, 0x85, 0x82, 0xfd, 0xed, 0x5d, 0xb6, + 0xd3, 0xb5, 0x93, 0x16, 0x3c, 0x5d, 0x06, 0xaa, + 0xb4, 0x9f, 0xc4, 0x6a, 0x87, 0xa1, 0xcb, 0x29, + 0xa0, 0x8f, 0x53, 0x99, 0x2b, 0x86, 0x56, 0xc5, + 0xf5, 0xaf, 0xeb, 0x1b, 0x86, 0x5c, 0x0e, 0x21, + 0x52, 0x4f, 0xf4, 0x76, 0xb8, 0x1d, 0x8a, 0x8b, + 0xaa, 0xda, 0x2a, 0xed, 0x08, 0xc1, 0xcb, 0x21, + 0x8a, 0x6a, 0x2e, 0xa5, 0x2d, 0x9d, 0x01, 0xe7, + 0x4c, 0x7b, 0x2e, 0x42, 0x30, 0x51, 0x64, 0x5f, + 0x9f, 0x46, 0xa6, 0x2b, 0x15, 0x73, 0xd2, 0xfa, + 0xa4, 0xf4, 0xdc, 0xe9, 0xfd, 0xf8, 0x91, 0xa3, + 0x11, 0xb5, 0xce, 0x18, 0xd7, 0x4a, 0xff, 0xa9, + 0xea, 0xd3, 0x7d, 0x44, 0xd0, 0xba, 0xec, 0x16, + 0xc3, 0xc6, 0xcd, 0x1d, 0xa2, 0x2d, 0x25, 0x17, + 0x45, 0x98, 0x9b, 0x7d, 0x06, 0xf2, 0x77, 0x17, + 0x54, 0xd2, 0xee, 0xc6, 0xa6, 0x64, 0xbc, 0x9a, + } + + // testPlainBlock should match the decrypted block of testEncBlock. + testPlainBlock []byte = []byte{ + // 152 bytes + 0x09, 0xf4, 0xb5, 0x8f, 0x09, 0xf4, 0xb5, 0x8f, + 0x00, 0x00, 0x00, 0x0b, 0x73, 0x73, 0x68, 0x2d, + 0x65, 0x64, 0x32, 0x35, 0x35, 0x31, 0x39, 0x00, + 0x00, 0x00, 0x20, 0xb8, 0xd3, 0x93, 0x30, 0x65, + 0x72, 0xa6, 0xe5, 0x7e, 0x22, 0xa1, 0x64, 0x5b, + 0x97, 0x9b, 0x01, 0x60, 0x7b, 0xdb, 0x1d, 0xcd, + 0xcf, 0x92, 0x01, 0xd4, 0x80, 0xc3, 0xa9, 0x0a, + 0xbd, 0xad, 0x20, 0x00, 0x00, 0x00, 0x40, 0x2d, + 0xad, 0x50, 0x3f, 0x38, 0x88, 0xe2, 0xc1, 0x57, + 0x0b, 0x1c, 0xa4, 0xcf, 0xbc, 0x7c, 0x39, 0xac, + 0x19, 0xf2, 0x7b, 0x02, 0xac, 0x91, 0x1b, 0xd6, + 0x05, 0x06, 0x00, 0xff, 0xe9, 0xb6, 0x83, 0xb8, + 0xd3, 0x93, 0x30, 0x65, 0x72, 0xa6, 0xe5, 0x7e, + 0x22, 0xa1, 0x64, 0x5b, 0x97, 0x9b, 0x01, 0x60, + 0x7b, 0xdb, 0x1d, 0xcd, 0xcf, 0x92, 0x01, 0xd4, + 0x80, 0xc3, 0xa9, 0x0a, 0xbd, 0xad, 0x20, 0x00, + 0x00, 0x00, 0x14, 0x62, 0x74, 0x73, 0x40, 0x64, + 0x61, 0x77, 0x69, 0x64, 0x2e, 0x72, 0x30, 0x30, + 0x74, 0x2e, 0x73, 0x70, 0x61, 0x63, 0x65, 0x01, + } +) diff --git a/errs.go b/errs.go new file mode 100644 index 0000000..0097fe3 --- /dev/null +++ b/errs.go @@ -0,0 +1,11 @@ +package cc20p1305ssh + +import ( + `errors` +) + +var ( + ErrInvalidKeySize error = errors.New("a key of invalid size was provided; it must be at least 32 bytes") + ErrInvalidTag = errors.New("the provided tag does not match the ciphertext") + ErrInvalidTagSize = errors.New("a tag of invalid size was provided; it must be at least 16 bytes") +) diff --git a/funcs.go b/funcs.go new file mode 100644 index 0000000..673413c --- /dev/null +++ b/funcs.go @@ -0,0 +1,33 @@ +package cc20p1305ssh + +/* + New returns a cipher.AEAD from KDF-derived key. + + Currently, key should be KDFKeySize bytes and returned by bcrypt_pbkdf as it's currently the + only OpenSSH-supported KDF. It is up to the caller to perform the appropriate KDF. + + Per the chacha20polycom1305@openssh.com specification, only the first KeySize bytes of key + is used for encrypting the private key. The second half (the canonical key is 64 bytes) + would be used for traffic purposes, but since this is a static blob it is not used. + + If key is nil or (1<<38)-64 { + panic("chacha20poly1305: plaintext too large") + } + */ + + // We need the crypter. + if cc20, err = chacha20.NewUnauthenticatedCipher(c.realKey[:], iv); err != nil { + return + } + + // First we need the poly1305 key. This also sets the counter to 1. + firstBlock = make([]byte, len(initBlock)) + cc20.XORKeyStream(firstBlock, initBlock) + copy(polyKey[:], firstBlock[:PolyKeySize]) + // We explicitly set the counter to 1, just in case. + cc20.SetCounter(1) + encrypted = make([]byte, len(plaintext)) + cc20.XORKeyStream(encrypted, plaintext) + + poly = poly1305.New(&polyKey) + poly.Write(encrypted) + tagTmp = poly.Sum(nil) + tag = make([]byte, TagSize) + copy(tag, tagTmp[:TagSize]) + + return +} diff --git a/funcs_test.go b/funcs_test.go new file mode 100644 index 0000000..282db75 --- /dev/null +++ b/funcs_test.go @@ -0,0 +1,53 @@ +package cc20p1305ssh + +import ( + `bytes` + `encoding/hex` + `testing` +) + +func TestNewCipher(t *testing.T) { + + var err error + var c *ChaCha20Poly1305OpenSSH + var enc []byte + var plain []byte + var tag []byte + + if c, err = New(testKdfKey); err != nil { + t.Fatal(err) + } + + t.Logf("ChaCha20Poly1305OpenSSH:\n%#v", c) + + // Decrypt... + if plain, err = c.Decrypt(testEncBlock, testPoly1305Tag); err != nil { + t.Log("Failed during decrypt!") + t.Fatal(err) + } + t.Logf("plain:\n%v", hex.EncodeToString(plain)) + + if !bytes.Equal(plain, testPlainBlock) { + t.Fatal("decrypted does not match!") + } + + // And encrypt... + if enc, tag, err = c.Encrypt(plain); err != nil { + t.Log("Failed during encrypt!") + t.Fatal(err) + } + t.Logf("crypted:\n%v", hex.EncodeToString(enc)) + + if !bytes.Equal(enc, testEncBlock) { + t.Fatal("encrypted does not match!") + } + + // And check tags. + if !bytes.Equal(tag, testPoly1305Tag) { + t.Logf("real tag: %v", hex.EncodeToString(testPoly1305Tag)) + t.Logf("generated tag: %v", hex.EncodeToString(tag)) + t.Fatal("tags do not match!") + } + + t.Log("The cipher is functioning correctly.") +} diff --git a/types.go b/types.go new file mode 100644 index 0000000..1f55a63 --- /dev/null +++ b/types.go @@ -0,0 +1,11 @@ +package cc20p1305ssh + +/* + ChaCha20Poly1305OpenSSH is an implementation of the chacha20poly1305@openssh.com private key cipher. + + Use New to return a usable version. +*/ +type ChaCha20Poly1305OpenSSH struct { + kdfKey [KDFKeySize]byte + realKey [KeySize]byte +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go new file mode 100644 index 0000000..012c0f7 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go @@ -0,0 +1,100 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD and its +// extended nonce variant XChaCha20-Poly1305, as specified in RFC 8439 and +// draft-irtf-cfrg-xchacha-01. +package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305" + +import ( + "crypto/cipher" + "errors" +) + +const ( + // KeySize is the size of the key used by this AEAD, in bytes. + KeySize = 32 + + // NonceSize is the size of the nonce used with the standard variant of this + // AEAD, in bytes. + // + // Note that this is too short to be safely generated at random if the same + // key is reused more than 2³² times. + // NonceSize = 12 + // BITE ME, GOLANG DEVS. + NonceSize = 16 + + // NonceSizeX is the size of the nonce used with the XChaCha20-Poly1305 + // variant of this AEAD, in bytes. + NonceSizeX = 24 + + // Overhead is the size of the Poly1305 authentication tag, and the + // difference between a ciphertext length and its plaintext. + Overhead = 16 +) + +type chacha20poly1305 struct { + key [KeySize]byte +} + +// New returns a ChaCha20-Poly1305 AEAD that uses the given 256-bit key. +func New(key []byte) (cipher.AEAD, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20poly1305: bad key length") + } + ret := new(chacha20poly1305) + copy(ret.key[:], key) + return ret, nil +} + +func (c *chacha20poly1305) NonceSize() int { + return NonceSize +} + +func (c *chacha20poly1305) Overhead() int { + return Overhead +} + +func (c *chacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte { + if len(nonce) != NonceSize { + panic("chacha20poly1305: bad nonce length passed to Seal") + } + + if uint64(len(plaintext)) > (1<<38)-64 { + panic("chacha20poly1305: plaintext too large") + } + + return c.seal(dst, nonce, plaintext, additionalData) +} + +var errOpen = errors.New("chacha20poly1305: message authentication failed") + +func (c *chacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + if len(nonce) != NonceSize { + panic("chacha20poly1305: bad nonce length passed to Open") + } + if len(ciphertext) < 16 { + return nil, errOpen + } + if uint64(len(ciphertext)) > (1<<38)-48 { + panic("chacha20poly1305: ciphertext too large") + } + + return c.open(dst, nonce, ciphertext, additionalData) +} + +// sliceForAppend takes a slice and a requested number of bytes. It returns a +// slice with the contents of the given slice followed by that many bytes and a +// second slice that aliases into it and contains only the extra bytes. If the +// original slice has sufficient capacity then no allocation is performed. +func sliceForAppend(in []byte, n int) (head, tail []byte) { + if total := len(in) + n; cap(in) >= total { + head = in[:total] + } else { + head = make([]byte, total) + copy(head, in) + } + tail = head[len(in):] + return +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go new file mode 100644 index 0000000..0c408c5 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go @@ -0,0 +1,87 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +package chacha20poly1305 + +import ( + "encoding/binary" + + "golang.org/x/crypto/internal/alias" + "golang.org/x/sys/cpu" +) + +//go:noescape +func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool + +//go:noescape +func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) + +var ( + useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2 +) + +// setupState writes a ChaCha20 input matrix to state. See +// https://tools.ietf.org/html/rfc7539#section-2.3. +func setupState(state *[16]uint32, key *[32]byte, nonce []byte) { + state[0] = 0x61707865 + state[1] = 0x3320646e + state[2] = 0x79622d32 + state[3] = 0x6b206574 + + state[4] = binary.LittleEndian.Uint32(key[0:4]) + state[5] = binary.LittleEndian.Uint32(key[4:8]) + state[6] = binary.LittleEndian.Uint32(key[8:12]) + state[7] = binary.LittleEndian.Uint32(key[12:16]) + state[8] = binary.LittleEndian.Uint32(key[16:20]) + state[9] = binary.LittleEndian.Uint32(key[20:24]) + state[10] = binary.LittleEndian.Uint32(key[24:28]) + state[11] = binary.LittleEndian.Uint32(key[28:32]) + + state[12] = 0 + state[13] = binary.LittleEndian.Uint32(nonce[0:4]) + state[14] = binary.LittleEndian.Uint32(nonce[4:8]) + state[15] = binary.LittleEndian.Uint32(nonce[8:12]) +} + +func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { + if !cpu.X86.HasSSSE3 { + return c.sealGeneric(dst, nonce, plaintext, additionalData) + } + + var state [16]uint32 + setupState(&state, &c.key, nonce) + + ret, out := sliceForAppend(dst, len(plaintext)+16) + if alias.InexactOverlap(out, plaintext) { + panic("chacha20poly1305: invalid buffer overlap") + } + chacha20Poly1305Seal(out[:], state[:], plaintext, additionalData) + return ret +} + +func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + if !cpu.X86.HasSSSE3 { + return c.openGeneric(dst, nonce, ciphertext, additionalData) + } + + var state [16]uint32 + setupState(&state, &c.key, nonce) + + ciphertext = ciphertext[:len(ciphertext)-16] + ret, out := sliceForAppend(dst, len(ciphertext)) + if alias.InexactOverlap(out, ciphertext) { + panic("chacha20poly1305: invalid buffer overlap") + } + if !chacha20Poly1305Open(out, state[:], ciphertext, additionalData) { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + return ret, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s new file mode 100644 index 0000000..867c181 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s @@ -0,0 +1,2696 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. + +//go:build gc && !purego +// +build gc,!purego + +#include "textflag.h" +// General register allocation +#define oup DI +#define inp SI +#define inl BX +#define adp CX // free to reuse, after we hash the additional data +#define keyp R8 // free to reuse, when we copy the key to stack +#define itr2 R9 // general iterator +#define itr1 CX // general iterator +#define acc0 R10 +#define acc1 R11 +#define acc2 R12 +#define t0 R13 +#define t1 R14 +#define t2 R15 +#define t3 R8 +// Register and stack allocation for the SSE code +#define rStore (0*16)(BP) +#define sStore (1*16)(BP) +#define state1Store (2*16)(BP) +#define state2Store (3*16)(BP) +#define tmpStore (4*16)(BP) +#define ctr0Store (5*16)(BP) +#define ctr1Store (6*16)(BP) +#define ctr2Store (7*16)(BP) +#define ctr3Store (8*16)(BP) +#define A0 X0 +#define A1 X1 +#define A2 X2 +#define B0 X3 +#define B1 X4 +#define B2 X5 +#define C0 X6 +#define C1 X7 +#define C2 X8 +#define D0 X9 +#define D1 X10 +#define D2 X11 +#define T0 X12 +#define T1 X13 +#define T2 X14 +#define T3 X15 +#define A3 T0 +#define B3 T1 +#define C3 T2 +#define D3 T3 +// Register and stack allocation for the AVX2 code +#define rsStoreAVX2 (0*32)(BP) +#define state1StoreAVX2 (1*32)(BP) +#define state2StoreAVX2 (2*32)(BP) +#define ctr0StoreAVX2 (3*32)(BP) +#define ctr1StoreAVX2 (4*32)(BP) +#define ctr2StoreAVX2 (5*32)(BP) +#define ctr3StoreAVX2 (6*32)(BP) +#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack +#define AA0 Y0 +#define AA1 Y5 +#define AA2 Y6 +#define AA3 Y7 +#define BB0 Y14 +#define BB1 Y9 +#define BB2 Y10 +#define BB3 Y11 +#define CC0 Y12 +#define CC1 Y13 +#define CC2 Y8 +#define CC3 Y15 +#define DD0 Y4 +#define DD1 Y1 +#define DD2 Y2 +#define DD3 Y3 +#define TT0 DD3 +#define TT1 AA3 +#define TT2 BB3 +#define TT3 CC3 +// ChaCha20 constants +DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 +// <<< 16 with PSHUFB +DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A +// <<< 8 with PSHUFB +DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B + +DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 +DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 + +DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 +DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 +// Poly1305 key clamp +DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF +DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC +DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF +DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF + +DATA ·sseIncMask<>+0x00(SB)/8, $0x1 +DATA ·sseIncMask<>+0x08(SB)/8, $0x0 +// To load/store the last < 16 bytes in a buffer +DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff + +GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 +GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 +GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 +// No PALIGNR in Go ASM yet (but VPALIGNR is present). +#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 +#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 +#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 +#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 +#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 +#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 +#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 +#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 +#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 +#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 +#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 +#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 +#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 +#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 +#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 +#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 +#define shiftC0Right shiftC0Left +#define shiftC1Right shiftC1Left +#define shiftC2Right shiftC2Left +#define shiftC3Right shiftC3Left +#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 +#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 +#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 +#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 +// Some macros +#define chachaQR(A, B, C, D, T) \ + PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ + PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B + +#define chachaQR_AVX2(A, B, C, D, T) \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ + VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ + VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B + +#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 +#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 +#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX +#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 +#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 + +#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 +#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 +#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 + +#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage +#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage +// ---------------------------------------------------------------------------- +TEXT polyHashADInternal<>(SB), NOSPLIT, $0 + // adp points to beginning of additional data + // itr2 holds ad length + XORQ acc0, acc0 + XORQ acc1, acc1 + XORQ acc2, acc2 + CMPQ itr2, $13 + JNE hashADLoop + +openFastTLSAD: + // Special treatment for the TLS case of 13 bytes + MOVQ (adp), acc0 + MOVQ 5(adp), acc1 + SHRQ $24, acc1 + MOVQ $1, acc2 + polyMul + RET + +hashADLoop: + // Hash in 16 byte chunks + CMPQ itr2, $16 + JB hashADTail + polyAdd(0(adp)) + LEAQ (1*16)(adp), adp + SUBQ $16, itr2 + polyMul + JMP hashADLoop + +hashADTail: + CMPQ itr2, $0 + JE hashADDone + + // Hash last < 16 byte tail + XORQ t0, t0 + XORQ t1, t1 + XORQ t2, t2 + ADDQ itr2, adp + +hashADTailLoop: + SHLQ $8, t0, t1 + SHLQ $8, t0 + MOVB -1(adp), t2 + XORQ t2, t0 + DECQ adp + DECQ itr2 + JNE hashADTailLoop + +hashADTailFinish: + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + // Finished AD +hashADDone: + RET + +// ---------------------------------------------------------------------------- +// func chacha20Poly1305Open(dst, key, src, ad []byte) bool +TEXT ·chacha20Poly1305Open(SB), 0, $288-97 + // For aligned stack access + MOVQ SP, BP + ADDQ $32, BP + ANDQ $-32, BP + MOVQ dst+0(FP), oup + MOVQ key+24(FP), keyp + MOVQ src+48(FP), inp + MOVQ src_len+56(FP), inl + MOVQ ad+72(FP), adp + + // Check for AVX2 support + CMPB ·useAVX2(SB), $1 + JE chacha20Poly1305Open_AVX2 + + // Special optimization, for very short buffers + CMPQ inl, $128 + JBE openSSE128 // About 16% faster + + // For long buffers, prepare the poly key first + MOVOU ·chacha20Constants<>(SB), A0 + MOVOU (1*16)(keyp), B0 + MOVOU (2*16)(keyp), C0 + MOVOU (3*16)(keyp), D0 + MOVO D0, T1 + + // Store state on stack for future use + MOVO B0, state1Store + MOVO C0, state2Store + MOVO D0, ctr3Store + MOVQ $10, itr2 + +openSSEPreparePolyKey: + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left; shiftC0Left; shiftD0Left + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right; shiftC0Right; shiftD0Right + DECQ itr2 + JNE openSSEPreparePolyKey + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVO A0, rStore; MOVO B0, sStore + + // Hash AAD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openSSEMainLoop: + CMPQ inl, $256 + JB openSSEMainLoopDone + + // Load state, increment counter blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $4, itr1 + MOVQ inp, itr2 + +openSSEInternalLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyAdd(0(itr2)) + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + LEAQ (2*8)(itr2), itr2 + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + polyMulStage3 + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr1 + JGE openSSEInternalLoop + + polyAdd(0(itr2)) + polyMul + LEAQ (2*8)(itr2), itr2 + + CMPQ itr1, $-6 + JG openSSEInternalLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + + // Load - xor - store + MOVO D3, tmpStore + MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) + MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) + MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) + MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) + MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) + MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) + MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) + MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) + MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) + MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) + MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) + MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) + MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) + MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) + LEAQ 256(inp), inp + LEAQ 256(oup), oup + SUBQ $256, inl + JMP openSSEMainLoop + +openSSEMainLoopDone: + // Handle the various tail sizes efficiently + TESTQ inl, inl + JE openSSEFinalize + CMPQ inl, $64 + JBE openSSETail64 + CMPQ inl, $128 + JBE openSSETail128 + CMPQ inl, $192 + JBE openSSETail192 + JMP openSSETail256 + +openSSEFinalize: + // Hash in the PT, AAD lengths + ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 + polyMul + + // Final reduce + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + SUBQ $-5, acc0 + SBBQ $-1, acc1 + SBBQ $3, acc2 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + CMOVQCS t2, acc2 + + // Add in the "s" part of the key + ADDQ 0+sStore, acc0 + ADCQ 8+sStore, acc1 + + // Finally, constant time compare to the tag at the end of the message + XORQ AX, AX + MOVQ $1, DX + XORQ (0*8)(inp), acc0 + XORQ (1*8)(inp), acc1 + ORQ acc1, acc0 + CMOVQEQ DX, AX + + // Return true iff tags are equal + MOVB AX, ret+96(FP) + RET + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 129 bytes +openSSE128: + // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks + MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 + MOVQ $10, itr2 + +openSSE128InnerCipherLoop: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftB1Left; shiftB2Left + shiftC0Left; shiftC1Left; shiftC2Left + shiftD0Left; shiftD1Left; shiftD2Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftB1Right; shiftB2Right + shiftC0Right; shiftC1Right; shiftC2Right + shiftD0Right; shiftD1Right; shiftD2Right + DECQ itr2 + JNE openSSE128InnerCipherLoop + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 + PADDL T2, C1; PADDL T2, C2 + PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVOU A0, rStore; MOVOU B0, sStore + + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openSSE128Open: + CMPQ inl, $16 + JB openSSETail16 + SUBQ $16, inl + + // Load for hashing + polyAdd(0(inp)) + + // Load for decryption + MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + polyMul + + // Shift the stream "left" + MOVO B1, A1 + MOVO C1, B1 + MOVO D1, C1 + MOVO A2, D1 + MOVO B2, A2 + MOVO C2, B2 + MOVO D2, C2 + JMP openSSE128Open + +openSSETail16: + TESTQ inl, inl + JE openSSEFinalize + + // We can safely load the CT from the end, because it is padded with the MAC + MOVQ inl, itr2 + SHLQ $4, itr2 + LEAQ ·andMask<>(SB), t0 + MOVOU (inp), T0 + ADDQ inl, inp + PAND -16(t0)(itr2*1), T0 + MOVO T0, 0+tmpStore + MOVQ T0, t0 + MOVQ 8+tmpStore, t1 + PXOR A1, T0 + + // We can only store one byte at a time, since plaintext can be shorter than 16 bytes +openSSETail16Store: + MOVQ T0, t3 + MOVB t3, (oup) + PSRLDQ $1, T0 + INCQ oup + DECQ inl + JNE openSSETail16Store + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + JMP openSSEFinalize + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of ciphertext +openSSETail64: + // Need to decrypt up to 64 bytes - prepare single block + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + XORQ itr2, itr2 + MOVQ inl, itr1 + CMPQ itr1, $16 + JB openSSETail64LoopB + +openSSETail64LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + SUBQ $16, itr1 + +openSSETail64LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left; shiftC0Left; shiftD0Left + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right; shiftC0Right; shiftD0Right + + CMPQ itr1, $16 + JAE openSSETail64LoopA + + CMPQ itr2, $160 + JNE openSSETail64LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + +openSSETail64DecLoop: + CMPQ inl, $16 + JB openSSETail64DecLoopDone + SUBQ $16, inl + MOVOU (inp), T0 + PXOR T0, A0 + MOVOU A0, (oup) + LEAQ 16(inp), inp + LEAQ 16(oup), oup + MOVO B0, A0 + MOVO C0, B0 + MOVO D0, C0 + JMP openSSETail64DecLoop + +openSSETail64DecLoopDone: + MOVO A0, A1 + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +openSSETail128: + // Need to decrypt up to 128 bytes - prepare two blocks + MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store + XORQ itr2, itr2 + MOVQ inl, itr1 + ANDQ $-16, itr1 + +openSSETail128LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + +openSSETail128LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + + CMPQ itr2, itr1 + JB openSSETail128LoopA + + CMPQ itr2, $160 + JNE openSSETail128LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B0; PADDL state1Store, B1 + PADDL state2Store, C0; PADDL state2Store, C1 + PADDL ctr1Store, D0; PADDL ctr0Store, D1 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) + + SUBQ $64, inl + LEAQ 64(inp), inp + LEAQ 64(oup), oup + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of ciphertext +openSSETail192: + // Need to decrypt up to 192 bytes - prepare three blocks + MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store + MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store + + MOVQ inl, itr1 + MOVQ $160, itr2 + CMPQ itr1, $160 + CMOVQGT itr2, itr1 + ANDQ $-16, itr1 + XORQ itr2, itr2 + +openSSLTail192LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + +openSSLTail192LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + shiftB2Left; shiftC2Left; shiftD2Left + + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + shiftB2Right; shiftC2Right; shiftD2Right + + CMPQ itr2, itr1 + JB openSSLTail192LoopA + + CMPQ itr2, $160 + JNE openSSLTail192LoopB + + CMPQ inl, $176 + JB openSSLTail192Store + + polyAdd(160(inp)) + polyMul + + CMPQ inl, $192 + JB openSSLTail192Store + + polyAdd(176(inp)) + polyMul + +openSSLTail192Store: + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 + PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 + PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 + MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) + + MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + + SUBQ $128, inl + LEAQ 128(inp), inp + LEAQ 128(oup), oup + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +openSSETail256: + // Need to decrypt up to 256 bytes - prepare four blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + XORQ itr2, itr2 + +openSSETail256Loop: + // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication + polyAdd(0(inp)(itr2*1)) + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulStage3 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + ADDQ $2*8, itr2 + CMPQ itr2, $160 + JB openSSETail256Loop + MOVQ inl, itr1 + ANDQ $-16, itr1 + +openSSETail256HashLoop: + polyAdd(0(inp)(itr2*1)) + polyMul + ADDQ $2*8, itr2 + CMPQ itr2, itr1 + JB openSSETail256HashLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + MOVO D3, tmpStore + + // Load - xor - store + MOVOU (0*16)(inp), D3; PXOR D3, A0 + MOVOU (1*16)(inp), D3; PXOR D3, B0 + MOVOU (2*16)(inp), D3; PXOR D3, C0 + MOVOU (3*16)(inp), D3; PXOR D3, D0 + MOVOU A0, (0*16)(oup) + MOVOU B0, (1*16)(oup) + MOVOU C0, (2*16)(oup) + MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) + LEAQ 192(inp), inp + LEAQ 192(oup), oup + SUBQ $192, inl + MOVO A3, A0 + MOVO B3, B0 + MOVO C3, C0 + MOVO tmpStore, D0 + + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- +chacha20Poly1305Open_AVX2: + VZEROUPPER + VMOVDQU ·chacha20Constants<>(SB), AA0 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 + BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 + VPADDD ·avx2InitMask<>(SB), DD0, DD0 + + // Special optimization, for very short buffers + CMPQ inl, $192 + JBE openAVX2192 + CMPQ inl, $320 + JBE openAVX2320 + + // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA BB0, state1StoreAVX2 + VMOVDQA CC0, state2StoreAVX2 + VMOVDQA DD0, ctr3StoreAVX2 + MOVQ $10, itr2 + +openAVX2PreparePolyKey: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + DECQ itr2 + JNE openAVX2PreparePolyKey + + VPADDD ·chacha20Constants<>(SB), AA0, AA0 + VPADDD state1StoreAVX2, BB0, BB0 + VPADDD state2StoreAVX2, CC0, CC0 + VPADDD ctr3StoreAVX2, DD0, DD0 + + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for the first 64 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + + // Hash AD + first 64 bytes + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +openAVX2InitialHash64: + polyAdd(0(inp)(itr1*1)) + polyMulAVX2 + ADDQ $16, itr1 + CMPQ itr1, $64 + JNE openAVX2InitialHash64 + + // Decrypt the first 64 bytes + VPXOR (0*32)(inp), AA0, AA0 + VPXOR (1*32)(inp), BB0, BB0 + VMOVDQU AA0, (0*32)(oup) + VMOVDQU BB0, (1*32)(oup) + LEAQ (2*32)(inp), inp + LEAQ (2*32)(oup), oup + SUBQ $64, inl + +openAVX2MainLoop: + CMPQ inl, $512 + JB openAVX2MainLoopDone + + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + XORQ itr1, itr1 + +openAVX2InternalLoop: + // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications + // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext + polyAdd(0*8(inp)(itr1*1)) + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage1_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulStage2_AVX2 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyMulStage3_AVX2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + polyAdd(2*8(inp)(itr1*1)) + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage1_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage2_AVX2 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage3_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulReduceStage + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(4*8(inp)(itr1*1)) + LEAQ (6*8)(itr1), itr1 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage1_AVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + polyMulStage2_AVX2 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage3_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + CMPQ itr1, $480 + JNE openAVX2InternalLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + polyAdd(480(inp)) + polyMulAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + // and here + polyAdd(496(inp)) + polyMulAVX2 + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 + VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) + LEAQ (32*16)(inp), inp + LEAQ (32*16)(oup), oup + SUBQ $(32*16), inl + JMP openAVX2MainLoop + +openAVX2MainLoopDone: + // Handle the various tail sizes efficiently + TESTQ inl, inl + JE openSSEFinalize + CMPQ inl, $128 + JBE openAVX2Tail128 + CMPQ inl, $256 + JBE openAVX2Tail256 + CMPQ inl, $384 + JBE openAVX2Tail384 + JMP openAVX2Tail512 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes +openAVX2192: + // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks + VMOVDQA AA0, AA1 + VMOVDQA BB0, BB1 + VMOVDQA CC0, CC1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2 + VMOVDQA BB0, BB2 + VMOVDQA CC0, CC2 + VMOVDQA DD0, DD2 + VMOVDQA DD1, TT3 + MOVQ $10, itr2 + +openAVX2192InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr2 + JNE openAVX2192InnerCipherLoop + VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 + VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 + VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 + VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 192 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + +openAVX2ShortOpen: + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openAVX2ShortOpenLoop: + CMPQ inl, $32 + JB openAVX2ShortTail32 + SUBQ $32, inl + + // Load for hashing + polyAdd(0*8(inp)) + polyMulAVX2 + polyAdd(2*8(inp)) + polyMulAVX2 + + // Load for decryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + LEAQ (1*32)(oup), oup + + // Shift stream left + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + VMOVDQA AA1, DD0 + VMOVDQA BB1, AA1 + VMOVDQA CC1, BB1 + VMOVDQA DD1, CC1 + VMOVDQA AA2, DD1 + VMOVDQA BB2, AA2 + JMP openAVX2ShortOpenLoop + +openAVX2ShortTail32: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB openAVX2ShortDone + + SUBQ $16, inl + + // Load for hashing + polyAdd(0*8(inp)) + polyMulAVX2 + + // Load for decryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +openAVX2ShortDone: + VZEROUPPER + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes +openAVX2320: + // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 + MOVQ $10, itr2 + +openAVX2320InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr2 + JNE openAVX2320InnerCipherLoop + + VMOVDQA ·chacha20Constants<>(SB), TT0 + VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 + VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 + VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 + VMOVDQA ·avx2IncMask<>(SB), TT0 + VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD2, DD2 + + // Clamp and store poly key + VPERM2I128 $0x02, AA0, BB0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 320 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x02, AA2, BB2, CC1 + VPERM2I128 $0x02, CC2, DD2, DD1 + VPERM2I128 $0x13, AA2, BB2, AA2 + VPERM2I128 $0x13, CC2, DD2, BB2 + JMP openAVX2ShortOpen + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +openAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + VMOVDQA ·chacha20Constants<>(SB), AA1 + VMOVDQA state1StoreAVX2, BB1 + VMOVDQA state2StoreAVX2, CC1 + VMOVDQA ctr3StoreAVX2, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD1 + VMOVDQA DD1, DD0 + + XORQ itr2, itr2 + MOVQ inl, itr1 + ANDQ $-16, itr1 + TESTQ itr1, itr1 + JE openAVX2Tail128LoopB + +openAVX2Tail128LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMulAVX2 + +openAVX2Tail128LoopB: + ADDQ $16, itr2 + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD1, DD1, DD1 + CMPQ itr2, itr1 + JB openAVX2Tail128LoopA + CMPQ itr2, $160 + JNE openAVX2Tail128LoopB + + VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC1, CC1 + VPADDD DD0, DD1, DD1 + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + +openAVX2TailLoop: + CMPQ inl, $32 + JB openAVX2Tail + SUBQ $32, inl + + // Load for decryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + LEAQ (1*32)(oup), oup + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + JMP openAVX2TailLoop + +openAVX2Tail: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB openAVX2TailDone + SUBQ $16, inl + + // Load for decryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +openAVX2TailDone: + VZEROUPPER + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +openAVX2Tail256: + // Need to decrypt up to 256 bytes - prepare four blocks + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA DD0, TT1 + VMOVDQA DD1, TT2 + + // Compute the number of iterations that will hash data + MOVQ inl, tmpStoreAVX2 + MOVQ inl, itr1 + SUBQ $128, itr1 + SHRQ $4, itr1 + MOVQ $10, itr2 + CMPQ itr1, $10 + CMOVQGT itr2, itr1 + MOVQ inp, inl + XORQ itr2, itr2 + +openAVX2Tail256LoopA: + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + + // Perform ChaCha rounds, while hashing the remaining input +openAVX2Tail256LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + INCQ itr2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + CMPQ itr2, itr1 + JB openAVX2Tail256LoopA + + CMPQ itr2, $10 + JNE openAVX2Tail256LoopB + + MOVQ inl, itr2 + SUBQ inp, inl + MOVQ inl, itr1 + MOVQ tmpStoreAVX2, inl + + // Hash the remainder of data (if any) +openAVX2Tail256Hash: + ADDQ $16, itr1 + CMPQ itr1, inl + JGT openAVX2Tail256HashEnd + polyAdd (0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + JMP openAVX2Tail256Hash + +// Store 128 bytes safely, then go to store loop +openAVX2Tail256HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + + VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 + VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) + LEAQ (4*32)(inp), inp + LEAQ (4*32)(oup), oup + SUBQ $4*32, inl + + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext +openAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare six blocks + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA DD0, ctr0StoreAVX2 + VMOVDQA DD1, ctr1StoreAVX2 + VMOVDQA DD2, ctr2StoreAVX2 + + // Compute the number of iterations that will hash two blocks of data + MOVQ inl, tmpStoreAVX2 + MOVQ inl, itr1 + SUBQ $256, itr1 + SHRQ $4, itr1 + ADDQ $6, itr1 + MOVQ $10, itr2 + CMPQ itr1, $10 + CMOVQGT itr2, itr1 + MOVQ inp, inl + XORQ itr2, itr2 + + // Perform ChaCha rounds, while hashing the remaining input +openAVX2Tail384LoopB: + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + +openAVX2Tail384LoopA: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + INCQ itr2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + + CMPQ itr2, itr1 + JB openAVX2Tail384LoopB + + CMPQ itr2, $10 + JNE openAVX2Tail384LoopA + + MOVQ inl, itr2 + SUBQ inp, inl + MOVQ inl, itr1 + MOVQ tmpStoreAVX2, inl + +openAVX2Tail384Hash: + ADDQ $16, itr1 + CMPQ itr1, inl + JGT openAVX2Tail384HashEnd + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + JMP openAVX2Tail384Hash + +// Store 256 bytes safely, then go to store loop +openAVX2Tail384HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 + VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 + VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 + VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + LEAQ (8*32)(inp), inp + LEAQ (8*32)(oup), oup + SUBQ $8*32, inl + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext +openAVX2Tail512: + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + XORQ itr1, itr1 + MOVQ inp, itr2 + +openAVX2Tail512LoopB: + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ (2*8)(itr2), itr2 + +openAVX2Tail512LoopA: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyAdd(0*8(itr2)) + polyMulAVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(2*8(itr2)) + polyMulAVX2 + LEAQ (4*8)(itr2), itr2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + INCQ itr1 + CMPQ itr1, $4 + JLT openAVX2Tail512LoopB + + CMPQ itr1, $10 + JNE openAVX2Tail512LoopA + + MOVQ inl, itr1 + SUBQ $384, itr1 + ANDQ $-16, itr1 + +openAVX2Tail512HashLoop: + TESTQ itr1, itr1 + JE openAVX2Tail512HashEnd + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + SUBQ $16, itr1 + JMP openAVX2Tail512HashLoop + +openAVX2Tail512HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + + LEAQ (12*32)(inp), inp + LEAQ (12*32)(oup), oup + SUBQ $12*32, inl + + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- +// func chacha20Poly1305Seal(dst, key, src, ad []byte) +TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 + // For aligned stack access + MOVQ SP, BP + ADDQ $32, BP + ANDQ $-32, BP + MOVQ dst+0(FP), oup + MOVQ key+24(FP), keyp + MOVQ src+48(FP), inp + MOVQ src_len+56(FP), inl + MOVQ ad+72(FP), adp + + CMPB ·useAVX2(SB), $1 + JE chacha20Poly1305Seal_AVX2 + + // Special optimization, for very short buffers + CMPQ inl, $128 + JBE sealSSE128 // About 15% faster + + // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration + MOVOU ·chacha20Constants<>(SB), A0 + MOVOU (1*16)(keyp), B0 + MOVOU (2*16)(keyp), C0 + MOVOU (3*16)(keyp), D0 + + // Store state on stack for future use + MOVO B0, state1Store + MOVO C0, state2Store + + // Load state, increment counter blocks + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVQ $10, itr2 + +sealSSEIntroLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr2 + JNE sealSSEIntroLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVO A0, rStore + MOVO B0, sStore + + // Hash AAD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) + + MOVQ $128, itr1 + SUBQ $128, inl + LEAQ 128(inp), inp + + MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 + + CMPQ inl, $64 + JBE sealSSE128SealHash + + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 + MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) + + ADDQ $64, itr1 + SUBQ $64, inl + LEAQ 64(inp), inp + + MOVQ $2, itr1 + MOVQ $8, itr2 + + CMPQ inl, $64 + JBE sealSSETail64 + CMPQ inl, $128 + JBE sealSSETail128 + CMPQ inl, $192 + JBE sealSSETail192 + +sealSSEMainLoop: + // Load state, increment counter blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + +sealSSEInnerLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyAdd(0(oup)) + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + LEAQ (2*8)(oup), oup + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + polyMulStage3 + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr2 + JGE sealSSEInnerLoop + polyAdd(0(oup)) + polyMul + LEAQ (2*8)(oup), oup + DECQ itr1 + JG sealSSEInnerLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + MOVO D3, tmpStore + + // Load - xor - store + MOVOU (0*16)(inp), D3; PXOR D3, A0 + MOVOU (1*16)(inp), D3; PXOR D3, B0 + MOVOU (2*16)(inp), D3; PXOR D3, C0 + MOVOU (3*16)(inp), D3; PXOR D3, D0 + MOVOU A0, (0*16)(oup) + MOVOU B0, (1*16)(oup) + MOVOU C0, (2*16)(oup) + MOVOU D0, (3*16)(oup) + MOVO tmpStore, D3 + + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) + ADDQ $192, inp + MOVQ $192, itr1 + SUBQ $192, inl + MOVO A3, A1 + MOVO B3, B1 + MOVO C3, C1 + MOVO D3, D1 + CMPQ inl, $64 + JBE sealSSE128SealHash + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 + MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) + LEAQ 64(inp), inp + SUBQ $64, inl + MOVQ $6, itr1 + MOVQ $4, itr2 + CMPQ inl, $192 + JG sealSSEMainLoop + + MOVQ inl, itr1 + TESTQ inl, inl + JE sealSSE128SealHash + MOVQ $6, itr1 + CMPQ inl, $64 + JBE sealSSETail64 + CMPQ inl, $128 + JBE sealSSETail128 + JMP sealSSETail192 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of plaintext +sealSSETail64: + // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A1 + MOVO state1Store, B1 + MOVO state2Store, C1 + MOVO ctr3Store, D1 + PADDL ·sseIncMask<>(SB), D1 + MOVO D1, ctr0Store + +sealSSETail64LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail64LoopB: + chachaQR(A1, B1, C1, D1, T1) + shiftB1Left; shiftC1Left; shiftD1Left + chachaQR(A1, B1, C1, D1, T1) + shiftB1Right; shiftC1Right; shiftD1Right + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + + DECQ itr1 + JG sealSSETail64LoopA + + DECQ itr2 + JGE sealSSETail64LoopB + PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B1 + PADDL state2Store, C1 + PADDL ctr0Store, D1 + + JMP sealSSE128Seal + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of plaintext +sealSSETail128: + // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + +sealSSETail128LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail128LoopB: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + + DECQ itr1 + JG sealSSETail128LoopA + + DECQ itr2 + JGE sealSSETail128LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B0; PADDL state1Store, B1 + PADDL state2Store, C0; PADDL state2Store, C1 + PADDL ctr0Store, D0; PADDL ctr1Store, D1 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 + MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) + + MOVQ $64, itr1 + LEAQ 64(inp), inp + SUBQ $64, inl + + JMP sealSSE128SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of plaintext +sealSSETail192: + // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store + +sealSSETail192LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail192LoopB: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + shiftB2Left; shiftC2Left; shiftD2Left + + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + shiftB2Right; shiftC2Right; shiftD2Right + + DECQ itr1 + JG sealSSETail192LoopA + + DECQ itr2 + JGE sealSSETail192LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 + PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 + PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 + MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + + MOVO A2, A1 + MOVO B2, B1 + MOVO C2, C1 + MOVO D2, D1 + MOVQ $128, itr1 + LEAQ 128(inp), inp + SUBQ $128, inl + + JMP sealSSE128SealHash + +// ---------------------------------------------------------------------------- +// Special seal optimization for buffers smaller than 129 bytes +sealSSE128: + // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks + MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 + MOVQ $10, itr2 + +sealSSE128InnerCipherLoop: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftB1Left; shiftB2Left + shiftC0Left; shiftC1Left; shiftC2Left + shiftD0Left; shiftD1Left; shiftD2Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftB1Right; shiftB2Right + shiftC0Right; shiftC1Right; shiftC2Right + shiftD0Right; shiftD1Right; shiftD2Right + DECQ itr2 + JNE sealSSE128InnerCipherLoop + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 + PADDL T2, C1; PADDL T2, C2 + PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PAND ·polyClampMask<>(SB), A0 + MOVOU A0, rStore + MOVOU B0, sStore + + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +sealSSE128SealHash: + // itr1 holds the number of bytes encrypted but not yet hashed + CMPQ itr1, $16 + JB sealSSE128Seal + polyAdd(0(oup)) + polyMul + + SUBQ $16, itr1 + ADDQ $16, oup + + JMP sealSSE128SealHash + +sealSSE128Seal: + CMPQ inl, $16 + JB sealSSETail + SUBQ $16, inl + + // Load for decryption + MOVOU (inp), T0 + PXOR T0, A1 + MOVOU A1, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + + // Extract for hashing + MOVQ A1, t0 + PSRLDQ $8, A1 + MOVQ A1, t1 + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + // Shift the stream "left" + MOVO B1, A1 + MOVO C1, B1 + MOVO D1, C1 + MOVO A2, D1 + MOVO B2, A2 + MOVO C2, B2 + MOVO D2, C2 + JMP sealSSE128Seal + +sealSSETail: + TESTQ inl, inl + JE sealSSEFinalize + + // We can only load the PT one byte at a time to avoid read after end of buffer + MOVQ inl, itr2 + SHLQ $4, itr2 + LEAQ ·andMask<>(SB), t0 + MOVQ inl, itr1 + LEAQ -1(inp)(inl*1), inp + XORQ t2, t2 + XORQ t3, t3 + XORQ AX, AX + +sealSSETailLoadLoop: + SHLQ $8, t2, t3 + SHLQ $8, t2 + MOVB (inp), AX + XORQ AX, t2 + LEAQ -1(inp), inp + DECQ itr1 + JNE sealSSETailLoadLoop + MOVQ t2, 0+tmpStore + MOVQ t3, 8+tmpStore + PXOR 0+tmpStore, A1 + MOVOU A1, (oup) + MOVOU -16(t0)(itr2*1), T0 + PAND T0, A1 + MOVQ A1, t0 + PSRLDQ $8, A1 + MOVQ A1, t1 + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + ADDQ inl, oup + +sealSSEFinalize: + // Hash in the buffer lengths + ADDQ ad_len+80(FP), acc0 + ADCQ src_len+56(FP), acc1 + ADCQ $1, acc2 + polyMul + + // Final reduce + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + SUBQ $-5, acc0 + SBBQ $-1, acc1 + SBBQ $3, acc2 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + CMOVQCS t2, acc2 + + // Add in the "s" part of the key + ADDQ 0+sStore, acc0 + ADCQ 8+sStore, acc1 + + // Finally store the tag at the end of the message + MOVQ acc0, (0*8)(oup) + MOVQ acc1, (1*8)(oup) + RET + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- +chacha20Poly1305Seal_AVX2: + VZEROUPPER + VMOVDQU ·chacha20Constants<>(SB), AA0 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 + BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 + VPADDD ·avx2InitMask<>(SB), DD0, DD0 + + // Special optimizations, for very short buffers + CMPQ inl, $192 + JBE seal192AVX2 // 33% faster + CMPQ inl, $320 + JBE seal320AVX2 // 17% faster + + // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 + VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 + VMOVDQA DD3, ctr3StoreAVX2 + MOVQ $10, itr2 + +sealAVX2IntroLoop: + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 + VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 + VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 + VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 + VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 + DECQ itr2 + JNE sealAVX2IntroLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + + VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 + VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key + VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), DD0, DD0 + VMOVDQA DD0, rsStoreAVX2 + + // Hash AD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + + // Can store at least 320 bytes + VPXOR (0*32)(inp), AA0, AA0 + VPXOR (1*32)(inp), CC0, CC0 + VMOVDQU AA0, (0*32)(oup) + VMOVDQU CC0, (1*32)(oup) + + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 + VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 + VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) + + MOVQ $320, itr1 + SUBQ $320, inl + LEAQ 320(inp), inp + + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 + CMPQ inl, $128 + JBE sealAVX2SealHash + + VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 + VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) + SUBQ $128, inl + LEAQ 128(inp), inp + + MOVQ $8, itr1 + MOVQ $2, itr2 + + CMPQ inl, $128 + JBE sealAVX2Tail128 + CMPQ inl, $256 + JBE sealAVX2Tail256 + CMPQ inl, $384 + JBE sealAVX2Tail384 + CMPQ inl, $512 + JBE sealAVX2Tail512 + + // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 + VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 + VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 + VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 + VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + + SUBQ $16, oup // Adjust the pointer + MOVQ $9, itr1 + JMP sealAVX2InternalLoopStart + +sealAVX2MainLoop: + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + MOVQ $10, itr1 + +sealAVX2InternalLoop: + polyAdd(0*8(oup)) + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage1_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulStage2_AVX2 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyMulStage3_AVX2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + +sealAVX2InternalLoopStart: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + polyAdd(2*8(oup)) + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage1_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage2_AVX2 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage3_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulReduceStage + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(4*8(oup)) + LEAQ (6*8)(oup), oup + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage1_AVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + polyMulStage2_AVX2 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage3_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + DECQ itr1 + JNE sealAVX2InternalLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + polyAdd(0*8(oup)) + polyMulAVX2 + LEAQ (4*8)(oup), oup + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + // and here + polyAdd(-2*8(oup)) + polyMulAVX2 + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 + VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) + LEAQ (32*16)(inp), inp + SUBQ $(32*16), inl + CMPQ inl, $512 + JG sealAVX2MainLoop + + // Tail can only hash 480 bytes + polyAdd(0*8(oup)) + polyMulAVX2 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ 32(oup), oup + + MOVQ $10, itr1 + MOVQ $0, itr2 + CMPQ inl, $128 + JBE sealAVX2Tail128 + CMPQ inl, $256 + JBE sealAVX2Tail256 + CMPQ inl, $384 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes +seal192AVX2: + // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks + VMOVDQA AA0, AA1 + VMOVDQA BB0, BB1 + VMOVDQA CC0, CC1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2 + VMOVDQA BB0, BB2 + VMOVDQA CC0, CC2 + VMOVDQA DD0, DD2 + VMOVDQA DD1, TT3 + MOVQ $10, itr2 + +sealAVX2192InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr2 + JNE sealAVX2192InnerCipherLoop + VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 + VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 + VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 + VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 192 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + +sealAVX2ShortSeal: + // Hash aad + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +sealAVX2SealHash: + // itr1 holds the number of bytes encrypted but not yet hashed + CMPQ itr1, $16 + JB sealAVX2ShortSealLoop + polyAdd(0(oup)) + polyMul + SUBQ $16, itr1 + ADDQ $16, oup + JMP sealAVX2SealHash + +sealAVX2ShortSealLoop: + CMPQ inl, $32 + JB sealAVX2ShortTail32 + SUBQ $32, inl + + // Load for encryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + + // Now can hash + polyAdd(0*8(oup)) + polyMulAVX2 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ (1*32)(oup), oup + + // Shift stream left + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + VMOVDQA AA1, DD0 + VMOVDQA BB1, AA1 + VMOVDQA CC1, BB1 + VMOVDQA DD1, CC1 + VMOVDQA AA2, DD1 + VMOVDQA BB2, AA2 + JMP sealAVX2ShortSealLoop + +sealAVX2ShortTail32: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB sealAVX2ShortDone + + SUBQ $16, inl + + // Load for encryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + + // Hash + polyAdd(0*8(oup)) + polyMulAVX2 + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +sealAVX2ShortDone: + VZEROUPPER + JMP sealSSETail + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes +seal320AVX2: + // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 + MOVQ $10, itr2 + +sealAVX2320InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr2 + JNE sealAVX2320InnerCipherLoop + + VMOVDQA ·chacha20Constants<>(SB), TT0 + VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 + VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 + VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 + VMOVDQA ·avx2IncMask<>(SB), TT0 + VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD2, DD2 + + // Clamp and store poly key + VPERM2I128 $0x02, AA0, BB0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 320 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x02, AA2, BB2, CC1 + VPERM2I128 $0x02, CC2, DD2, DD1 + VPERM2I128 $0x13, AA2, BB2, AA2 + VPERM2I128 $0x13, CC2, DD2, BB2 + JMP sealAVX2ShortSeal + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +sealAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0 + VMOVDQA state1StoreAVX2, BB0 + VMOVDQA state2StoreAVX2, CC0 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VMOVDQA DD0, DD1 + +sealAVX2Tail128LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail128LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(0(oup)) + polyMul + VPALIGNR $4, BB0, BB0, BB0 + VPALIGNR $8, CC0, CC0, CC0 + VPALIGNR $12, DD0, DD0, DD0 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(16(oup)) + polyMul + LEAQ 32(oup), oup + VPALIGNR $12, BB0, BB0, BB0 + VPALIGNR $8, CC0, CC0, CC0 + VPALIGNR $4, DD0, DD0, DD0 + DECQ itr1 + JG sealAVX2Tail128LoopA + DECQ itr2 + JGE sealAVX2Tail128LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA1 + VPADDD state1StoreAVX2, BB0, BB1 + VPADDD state2StoreAVX2, CC0, CC1 + VPADDD DD1, DD0, DD1 + + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 $0x13, CC1, DD1, DD0 + JMP sealAVX2ShortSealLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +sealAVX2Tail256: + // Need to decrypt up to 256 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA DD0, TT1 + VMOVDQA DD1, TT2 + +sealAVX2Tail256LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail256LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(0(oup)) + polyMul + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(16(oup)) + polyMul + LEAQ 32(oup), oup + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr1 + JG sealAVX2Tail256LoopA + DECQ itr2 + JGE sealAVX2Tail256LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + VPERM2I128 $0x02, CC0, DD0, TT1 + VPERM2I128 $0x13, AA0, BB0, TT2 + VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) + MOVQ $128, itr1 + LEAQ 128(inp), inp + SUBQ $128, inl + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 $0x13, CC1, DD1, DD0 + + JMP sealAVX2SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext +sealAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 + +sealAVX2Tail384LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail384LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(0(oup)) + polyMul + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(16(oup)) + polyMul + LEAQ 32(oup), oup + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr1 + JG sealAVX2Tail384LoopA + DECQ itr2 + JGE sealAVX2Tail384LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 + VPERM2I128 $0x02, AA0, BB0, TT0 + VPERM2I128 $0x02, CC0, DD0, TT1 + VPERM2I128 $0x13, AA0, BB0, TT2 + VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, TT0 + VPERM2I128 $0x02, CC1, DD1, TT1 + VPERM2I128 $0x13, AA1, BB1, TT2 + VPERM2I128 $0x13, CC1, DD1, TT3 + VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 + VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) + MOVQ $256, itr1 + LEAQ 256(inp), inp + SUBQ $256, inl + VPERM2I128 $0x02, AA2, BB2, AA0 + VPERM2I128 $0x02, CC2, DD2, BB0 + VPERM2I128 $0x13, AA2, BB2, CC0 + VPERM2I128 $0x13, CC2, DD2, DD0 + + JMP sealAVX2SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext +sealAVX2Tail512: + // Need to decrypt up to 512 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + +sealAVX2Tail512LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail512LoopB: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyAdd(0*8(oup)) + polyMulAVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ (4*8)(oup), oup + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + + DECQ itr1 + JG sealAVX2Tail512LoopA + DECQ itr2 + JGE sealAVX2Tail512LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3 + VPXOR (0*32)(inp), CC3, CC3 + VMOVDQU CC3, (0*32)(oup) + VPERM2I128 $0x02, CC0, DD0, CC3 + VPXOR (1*32)(inp), CC3, CC3 + VMOVDQU CC3, (1*32)(oup) + VPERM2I128 $0x13, AA0, BB0, CC3 + VPXOR (2*32)(inp), CC3, CC3 + VMOVDQU CC3, (2*32)(oup) + VPERM2I128 $0x13, CC0, DD0, CC3 + VPXOR (3*32)(inp), CC3, CC3 + VMOVDQU CC3, (3*32)(oup) + + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + VPERM2I128 $0x02, AA2, BB2, AA0 + VPERM2I128 $0x02, CC2, DD2, BB0 + VPERM2I128 $0x13, AA2, BB2, CC0 + VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + + MOVQ $384, itr1 + LEAQ 384(inp), inp + SUBQ $384, inl + VPERM2I128 $0x02, AA3, BB3, AA0 + VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 + VPERM2I128 $0x13, AA3, BB3, CC0 + VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + + JMP sealAVX2SealHash diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go new file mode 100644 index 0000000..6313898 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go @@ -0,0 +1,81 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package chacha20poly1305 + +import ( + "encoding/binary" + + "golang.org/x/crypto/chacha20" + "golang.org/x/crypto/internal/alias" + "golang.org/x/crypto/internal/poly1305" +) + +func writeWithPadding(p *poly1305.MAC, b []byte) { + p.Write(b) + if rem := len(b) % 16; rem != 0 { + var buf [16]byte + padLen := 16 - rem + p.Write(buf[:padLen]) + } +} + +func writeUint64(p *poly1305.MAC, n int) { + var buf [8]byte + binary.LittleEndian.PutUint64(buf[:], uint64(n)) + p.Write(buf[:]) +} + +func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte { + ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize) + ciphertext, tag := out[:len(plaintext)], out[len(plaintext):] + if alias.InexactOverlap(out, plaintext) { + panic("chacha20poly1305: invalid buffer overlap") + } + + var polyKey [32]byte + s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce) + s.XORKeyStream(polyKey[:], polyKey[:]) + s.SetCounter(1) // set the counter to 1, skipping 32 bytes + s.XORKeyStream(ciphertext, plaintext) + + p := poly1305.New(&polyKey) + writeWithPadding(p, additionalData) + writeWithPadding(p, ciphertext) + writeUint64(p, len(additionalData)) + writeUint64(p, len(plaintext)) + p.Sum(tag[:0]) + + return ret +} + +func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + tag := ciphertext[len(ciphertext)-16:] + ciphertext = ciphertext[:len(ciphertext)-16] + + var polyKey [32]byte + s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce) + s.XORKeyStream(polyKey[:], polyKey[:]) + s.SetCounter(1) // set the counter to 1, skipping 32 bytes + + p := poly1305.New(&polyKey) + writeWithPadding(p, additionalData) + writeWithPadding(p, ciphertext) + writeUint64(p, len(additionalData)) + writeUint64(p, len(ciphertext)) + + ret, out := sliceForAppend(dst, len(ciphertext)) + if alias.InexactOverlap(out, ciphertext) { + panic("chacha20poly1305: invalid buffer overlap") + } + if !p.Verify(tag) { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + s.XORKeyStream(out, ciphertext) + return ret, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go new file mode 100644 index 0000000..f832b33 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go @@ -0,0 +1,16 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || !gc || purego +// +build !amd64 !gc purego + +package chacha20poly1305 + +func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { + return c.sealGeneric(dst, nonce, plaintext, additionalData) +} + +func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + return c.openGeneric(dst, nonce, ciphertext, additionalData) +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/xchacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/xchacha20poly1305.go new file mode 100644 index 0000000..1cebfe9 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/xchacha20poly1305.go @@ -0,0 +1,86 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package chacha20poly1305 + +import ( + "crypto/cipher" + "errors" + + "golang.org/x/crypto/chacha20" +) + +type xchacha20poly1305 struct { + key [KeySize]byte +} + +// NewX returns a XChaCha20-Poly1305 AEAD that uses the given 256-bit key. +// +// XChaCha20-Poly1305 is a ChaCha20-Poly1305 variant that takes a longer nonce, +// suitable to be generated randomly without risk of collisions. It should be +// preferred when nonce uniqueness cannot be trivially ensured, or whenever +// nonces are randomly generated. +func NewX(key []byte) (cipher.AEAD, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20poly1305: bad key length") + } + ret := new(xchacha20poly1305) + copy(ret.key[:], key) + return ret, nil +} + +func (*xchacha20poly1305) NonceSize() int { + return NonceSizeX +} + +func (*xchacha20poly1305) Overhead() int { + return Overhead +} + +func (x *xchacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte { + if len(nonce) != NonceSizeX { + panic("chacha20poly1305: bad nonce length passed to Seal") + } + + // XChaCha20-Poly1305 technically supports a 64-bit counter, so there is no + // size limit. However, since we reuse the ChaCha20-Poly1305 implementation, + // the second half of the counter is not available. This is unlikely to be + // an issue because the cipher.AEAD API requires the entire message to be in + // memory, and the counter overflows at 256 GB. + if uint64(len(plaintext)) > (1<<38)-64 { + panic("chacha20poly1305: plaintext too large") + } + + c := new(chacha20poly1305) + hKey, _ := chacha20.HChaCha20(x.key[:], nonce[0:16]) + copy(c.key[:], hKey) + + // The first 4 bytes of the final nonce are unused counter space. + cNonce := make([]byte, NonceSize) + copy(cNonce[4:12], nonce[16:24]) + + return c.seal(dst, cNonce[:], plaintext, additionalData) +} + +func (x *xchacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + if len(nonce) != NonceSizeX { + panic("chacha20poly1305: bad nonce length passed to Open") + } + if len(ciphertext) < 16 { + return nil, errOpen + } + if uint64(len(ciphertext)) > (1<<38)-48 { + panic("chacha20poly1305: ciphertext too large") + } + + c := new(chacha20poly1305) + hKey, _ := chacha20.HChaCha20(x.key[:], nonce[0:16]) + copy(c.key[:], hKey) + + // The first 4 bytes of the final nonce are unused counter space. + cNonce := make([]byte, NonceSize) + copy(cNonce[4:12], nonce[16:24]) + + return c.open(dst, cNonce[:], ciphertext, additionalData) +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 352e8dd..cbc0c79 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,6 +1,7 @@ # golang.org/x/crypto v0.5.0 ## explicit; go 1.17 golang.org/x/crypto/chacha20 +golang.org/x/crypto/chacha20poly1305 golang.org/x/crypto/internal/alias golang.org/x/crypto/internal/poly1305 golang.org/x/crypto/poly1305