Add AES-NI acceleration for AES-XTS
author     Jussi Kivilinna <jussi.kivilinna@iki.fi>
           Sat, 6 Jan 2018 16:53:20 +0000 (18:53 +0200)
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>
           Tue, 9 Jan 2018 16:44:43 +0000 (18:44 +0200)
* cipher/cipher-internal.h (gcry_cipher_handle): Change the bulk
'xts_crypt' function to take a cipher context instead of the handle.
* cipher/cipher-xts.c (_gcry_cipher_xts_crypt): Ditto.
* cipher/cipher.c (_gcry_cipher_open_internal): Setup AES-NI
XTS bulk function.
* cipher/rijndael-aesni.c (xts_gfmul_const, _gcry_aes_aesni_xts_enc)
(_gcry_aes_aesni_xts_dec, _gcry_aes_aesni_xts_crypt): New.
* cipher/rijndael.c (_gcry_aes_aesni_xts_crypt)
(_gcry_aes_xts_crypt): New.
* src/cipher.h (_gcry_aes_xts_crypt): New.
--

Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo):

Before:
        XTS enc |      1.66 ns/B     575.7 MiB/s      6.63 c/B
        XTS dec |      1.66 ns/B     575.5 MiB/s      6.63 c/B

After (~6x faster):
        XTS enc |     0.270 ns/B    3528.5 MiB/s      1.08 c/B
        XTS dec |     0.272 ns/B    3511.5 MiB/s      1.09 c/B
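
The new bulk path is reached through the ordinary cipher API once the
handle is opened in XTS mode; a minimal sketch (error handling omitted;
AES-128, so the combined XTS key is 32 bytes):

    #include <gcrypt.h>

    int
    main (void)
    {
      gcry_cipher_hd_t hd;
      unsigned char key[32] = { 0 };   /* K1 || K2 for AES-128-XTS.  */
      unsigned char tweak[16] = { 0 }; /* E.g. little-endian sector number.  */
      unsigned char buf[4096] = { 0 };

      gcry_check_version (NULL);
      gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_XTS, 0);
      gcry_cipher_setkey (hd, key, sizeof(key));
      gcry_cipher_setiv (hd, tweak, sizeof(tweak));
      gcry_cipher_encrypt (hd, buf, sizeof(buf), NULL, 0);  /* In-place.  */
      gcry_cipher_close (hd);
      return 0;
    }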

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/cipher-internal.h
cipher/cipher-xts.c
cipher/cipher.c
cipher/rijndael-aesni.c
cipher/rijndael.c
src/cipher.h

diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index b748125..8c897d7 100644
@@ -146,7 +146,7 @@ struct gcry_cipher_handle
                        const void *inbuf_arg, size_t nblocks, int encrypt);
     size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg,
                       size_t nblocks);
-    void (*xts_crypt)(gcry_cipher_hd_t c, unsigned char *tweak,
+    void (*xts_crypt)(void *context, unsigned char *tweak,
                      void *outbuf_arg, const void *inbuf_arg,
                      size_t nblocks, int encrypt);
   } bulk;
diff --git a/cipher/cipher-xts.c b/cipher/cipher-xts.c
index 4da89e5..06cefbe 100644
@@ -93,7 +93,8 @@ _gcry_cipher_xts_crypt (gcry_cipher_hd_t c,
   /* Use a bulk method if available.  */
   if (nblocks && c->bulk.xts_crypt)
     {
-      c->bulk.xts_crypt (c, c->u_ctr.ctr, outbuf, inbuf, nblocks, encrypt);
+      c->bulk.xts_crypt (&c->context.c, c->u_ctr.ctr, outbuf, inbuf, nblocks,
+                        encrypt);
       inbuf  += nblocks * GCRY_XTS_BLOCK_LEN;
       outbuf += nblocks * GCRY_XTS_BLOCK_LEN;
       inbuflen -= nblocks * GCRY_XTS_BLOCK_LEN;
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 9812738..063c13d 100644
@@ -532,6 +532,7 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
               h->bulk.ctr_enc = _gcry_aes_ctr_enc;
               h->bulk.ocb_crypt = _gcry_aes_ocb_crypt;
               h->bulk.ocb_auth  = _gcry_aes_ocb_auth;
+              h->bulk.xts_crypt = _gcry_aes_xts_crypt;
               break;
 #endif /*USE_AES*/
 #ifdef USE_BLOWFISH
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 3d323cf..50a0745 100644
@@ -3007,4 +3007,295 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 }
 
 
+/* Constants for the tweak update: the low qword holds 0x87, the
+   GF(2^128) reduction term folded back in when bit 127 shifts out;
+   the high qword holds 0x01, carrying bit 63 into bit 64.  */
+static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) =
+  { 0x87, 0x01 };
+
+
+static void
+_gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
+                        unsigned char *outbuf, const unsigned char *inbuf,
+                        size_t nblocks)
+{
+  aesni_prepare_2_6_variable;
+
+  aesni_prepare ();
+  aesni_prepare_2_6 ();
+
+  /* Preload Tweak */
+  asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+               "movdqa %[gfmul], %%xmm6\n\t"
+               :
+               : [tweak] "m" (*tweak),
+                 [gfmul] "m" (*xts_gfmul_const)
+               : "memory" );
+
+  /* Process four blocks per iteration; each block's tweak is parked
+     in the output buffer and XORed back in after encryption.  */
+  for ( ;nblocks >= 4; nblocks -= 4 )
+    {
+      asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
+                   "movdqu %[inbuf0], %%xmm1\n\t"
+                   "pxor   %%xmm5,    %%xmm1\n\t"
+                   "movdqu %%xmm5,    %[outbuf0]\n\t"
+
+                   "movdqa %%xmm4,    %%xmm0\n\t"
+                   "paddd  %%xmm4,    %%xmm4\n\t"
+                   "psrad  $31,       %%xmm0\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm0\n\t"
+                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   : [outbuf0] "=m" (*(outbuf + 0 * 16))
+                   : [inbuf0] "m" (*(inbuf + 0 * 16))
+                   : "memory" );
+
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+                   "pxor   %%xmm5,    %%xmm2\n\t"
+                   "movdqu %%xmm5,    %[outbuf1]\n\t"
+
+                   "movdqa %%xmm4,    %%xmm0\n\t"
+                   "paddd  %%xmm4,    %%xmm4\n\t"
+                   "psrad  $31,       %%xmm0\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm0\n\t"
+                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   : [outbuf1] "=m" (*(outbuf + 1 * 16))
+                   : [inbuf1] "m" (*(inbuf + 1 * 16))
+                   : "memory" );
+
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                   "pxor   %%xmm5,    %%xmm3\n\t"
+                   "movdqu %%xmm5,    %[outbuf2]\n\t"
+
+                   "movdqa %%xmm4,    %%xmm0\n\t"
+                   "paddd  %%xmm4,    %%xmm4\n\t"
+                   "psrad  $31,       %%xmm0\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm0\n\t"
+                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   : [outbuf2] "=m" (*(outbuf + 2 * 16))
+                   : [inbuf2] "m" (*(inbuf + 2 * 16))
+                   : "memory" );
+
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+                   "movdqu %[inbuf3], %%xmm4\n\t"
+                   "pxor   %%xmm5,    %%xmm4\n\t"
+                   "movdqu %%xmm5,    %[outbuf3]\n\t"
+
+                   "psrad  $31,       %%xmm0\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm0\n\t"
+                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   : [outbuf3] "=m" (*(outbuf + 3 * 16))
+                   : [inbuf3] "m" (*(inbuf + 3 * 16))
+                   : "memory" );
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+                    "pxor   %%xmm0,     %%xmm1\n\t"
+                   "movdqu %[outbuf1], %%xmm0\n\t"
+                   "movdqu %%xmm1,     %[outbuf0]\n\t"
+                   "movdqu %[outbuf2], %%xmm1\n\t"
+                    "pxor   %%xmm0,     %%xmm2\n\t"
+                   "movdqu %[outbuf3], %%xmm0\n\t"
+                    "pxor   %%xmm1,     %%xmm3\n\t"
+                    "pxor   %%xmm0,     %%xmm4\n\t"
+                   "movdqu %%xmm2,     %[outbuf1]\n\t"
+                   "movdqu %%xmm3,     %[outbuf2]\n\t"
+                   "movdqu %%xmm4,     %[outbuf3]\n\t"
+                   : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+                     [outbuf1] "+m" (*(outbuf + 1 * 16)),
+                     [outbuf2] "+m" (*(outbuf + 2 * 16)),
+                     [outbuf3] "+m" (*(outbuf + 3 * 16))
+                   :
+                   : "memory" );
+
+      outbuf += BLOCKSIZE * 4;
+      inbuf += BLOCKSIZE * 4;
+    }
+
+  for ( ;nblocks; nblocks-- )
+    {
+      asm volatile ("movdqu %[inbuf],  %%xmm0\n\t"
+                   "pxor   %%xmm5,    %%xmm0\n\t"
+                   "movdqa %%xmm5,    %%xmm4\n\t"
+
+                   "pshufd $0x13,     %%xmm5,  %%xmm1\n\t"
+                   "psrad  $31,       %%xmm1\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm1\n\t"
+                   "pxor   %%xmm1,    %%xmm5\n\t"
+                   :
+                   : [inbuf] "m" (*inbuf)
+                   : "memory" );
+
+      do_aesni_enc (ctx);
+
+      asm volatile ("pxor   %%xmm4,    %%xmm0\n\t"
+                   "movdqu %%xmm0,    %[outbuf]\n\t"
+                   : [outbuf] "=m" (*outbuf)
+                   :
+                   : "memory" );
+
+      outbuf += BLOCKSIZE;
+      inbuf += BLOCKSIZE;
+    }
+
+  asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+               : [tweak] "=m" (*tweak)
+               :
+               : "memory" );
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+}
+
+
+static void
+_gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
+                        unsigned char *outbuf, const unsigned char *inbuf,
+                        size_t nblocks)
+{
+  aesni_prepare_2_6_variable;
+
+  aesni_prepare ();
+  aesni_prepare_2_6 ();
+
+  /* Preload Tweak */
+  asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+               "movdqa %[gfmul], %%xmm6\n\t"
+               :
+               : [tweak] "m" (*tweak),
+                 [gfmul] "m" (*xts_gfmul_const)
+               : "memory" );
+
+  /* Same structure as the encryption path, using do_aesni_dec_vec4.  */
+  for ( ;nblocks >= 4; nblocks -= 4 )
+    {
+      asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
+                   "movdqu %[inbuf0], %%xmm1\n\t"
+                   "pxor   %%xmm5,    %%xmm1\n\t"
+                   "movdqu %%xmm5,    %[outbuf0]\n\t"
+
+                   "movdqa %%xmm4,    %%xmm0\n\t"
+                   "paddd  %%xmm4,    %%xmm4\n\t"
+                   "psrad  $31,       %%xmm0\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm0\n\t"
+                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   : [outbuf0] "=m" (*(outbuf + 0 * 16))
+                   : [inbuf0] "m" (*(inbuf + 0 * 16))
+                   : "memory" );
+
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+                   "pxor   %%xmm5,    %%xmm2\n\t"
+                   "movdqu %%xmm5,    %[outbuf1]\n\t"
+
+                   "movdqa %%xmm4,    %%xmm0\n\t"
+                   "paddd  %%xmm4,    %%xmm4\n\t"
+                   "psrad  $31,       %%xmm0\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm0\n\t"
+                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   : [outbuf1] "=m" (*(outbuf + 1 * 16))
+                   : [inbuf1] "m" (*(inbuf + 1 * 16))
+                   : "memory" );
+
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                   "pxor   %%xmm5,    %%xmm3\n\t"
+                   "movdqu %%xmm5,    %[outbuf2]\n\t"
+
+                   "movdqa %%xmm4,    %%xmm0\n\t"
+                   "paddd  %%xmm4,    %%xmm4\n\t"
+                   "psrad  $31,       %%xmm0\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm0\n\t"
+                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   : [outbuf2] "=m" (*(outbuf + 2 * 16))
+                   : [inbuf2] "m" (*(inbuf + 2 * 16))
+                   : "memory" );
+
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+                   "movdqu %[inbuf3], %%xmm4\n\t"
+                   "pxor   %%xmm5,    %%xmm4\n\t"
+                   "movdqu %%xmm5,    %[outbuf3]\n\t"
+
+                   "psrad  $31,       %%xmm0\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm0\n\t"
+                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   : [outbuf3] "=m" (*(outbuf + 3 * 16))
+                   : [inbuf3] "m" (*(inbuf + 3 * 16))
+                   : "memory" );
+
+      do_aesni_dec_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+                    "pxor   %%xmm0,     %%xmm1\n\t"
+                   "movdqu %[outbuf1], %%xmm0\n\t"
+                   "movdqu %%xmm1,     %[outbuf0]\n\t"
+                   "movdqu %[outbuf2], %%xmm1\n\t"
+                    "pxor   %%xmm0,     %%xmm2\n\t"
+                   "movdqu %[outbuf3], %%xmm0\n\t"
+                    "pxor   %%xmm1,     %%xmm3\n\t"
+                    "pxor   %%xmm0,     %%xmm4\n\t"
+                   "movdqu %%xmm2,     %[outbuf1]\n\t"
+                   "movdqu %%xmm3,     %[outbuf2]\n\t"
+                   "movdqu %%xmm4,     %[outbuf3]\n\t"
+                   : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+                     [outbuf1] "+m" (*(outbuf + 1 * 16)),
+                     [outbuf2] "+m" (*(outbuf + 2 * 16)),
+                     [outbuf3] "+m" (*(outbuf + 3 * 16))
+                   :
+                   : "memory" );
+
+      outbuf += BLOCKSIZE * 4;
+      inbuf += BLOCKSIZE * 4;
+    }
+
+  for ( ;nblocks; nblocks-- )
+    {
+      asm volatile ("movdqu %[inbuf],  %%xmm0\n\t"
+                   "pxor   %%xmm5,    %%xmm0\n\t"
+                   "movdqa %%xmm5,    %%xmm4\n\t"
+
+                   "pshufd $0x13,     %%xmm5,  %%xmm1\n\t"
+                   "psrad  $31,       %%xmm1\n\t"
+                   "paddq  %%xmm5,    %%xmm5\n\t"
+                   "pand   %%xmm6,    %%xmm1\n\t"
+                   "pxor   %%xmm1,    %%xmm5\n\t"
+                   :
+                   : [inbuf] "m" (*inbuf)
+                   : "memory" );
+
+      do_aesni_dec (ctx);
+
+      asm volatile ("pxor   %%xmm4,    %%xmm0\n\t"
+                   "movdqu %%xmm0,    %[outbuf]\n\t"
+                   : [outbuf] "=m" (*outbuf)
+                   :
+                   : "memory" );
+
+      outbuf += BLOCKSIZE;
+      inbuf += BLOCKSIZE;
+    }
+
+  asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+                : [tweak] "=m" (*tweak)
+                :
+                : "memory" );
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+}
+
+
+void
+_gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
+                          unsigned char *outbuf, const unsigned char *inbuf,
+                          size_t nblocks, int encrypt)
+{
+  if (encrypt)
+    _gcry_aes_aesni_xts_enc (ctx, tweak, outbuf, inbuf, nblocks);
+  else
+    _gcry_aes_aesni_xts_dec (ctx, tweak, outbuf, inbuf, nblocks);
+}
+
 #endif /* USE_AESNI */
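
The pshufd/psrad/pand/pxor sequence repeated in the blocks above is a
branchless GF(2^128) multiplication of the tweak by x: psrad $31 turns
bits 63 and 127 of the tweak into all-ones dword masks, pand against
xts_gfmul_const keeps 0x01 (the carry from the low half into the high
half) and 0x87 (the reduction term folded back into the low half),
paddq doubles both 64-bit halves, and pxor applies the carries.  A
portable C sketch of the same update, equivalent to the generic path
added in rijndael.c below (xts_mul_x is a hypothetical helper, for
illustration only):

    #include <stdint.h>

    /* Multiply a 128-bit XTS tweak, split into little-endian 64-bit
       halves, by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.  */
    static void
    xts_mul_x (uint64_t *lo, uint64_t *hi)
    {
      uint64_t carry = -(*hi >> 63) & 0x87; /* Reduction if bit 127 is set. */
      *hi = (*hi << 1) | (*lo >> 63);       /* Bit 63 carries into bit 64.  */
      *lo = (*lo << 1) ^ carry;
    }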
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 8637195..548bfa0 100644
@@ -103,6 +103,11 @@ extern void _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                        int encrypt);
 extern void _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                                       size_t nblocks);
+extern void _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx,
+                                      unsigned char *tweak,
+                                      unsigned char *outbuf,
+                                      const unsigned char *inbuf,
+                                      size_t nblocks, int encrypt);
 #endif
 
 #ifdef USE_SSSE3
@@ -1467,6 +1472,85 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
 }
 
 
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+void
+_gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+                    void *outbuf_arg, const void *inbuf_arg,
+                    size_t nblocks, int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned int burn_depth = 0;
+  rijndael_cryptfn_t crypt_fn;
+  u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+
+  if (encrypt)
+    {
+      if (ctx->prefetch_enc_fn)
+        ctx->prefetch_enc_fn();
+
+      crypt_fn = ctx->encrypt_fn;
+    }
+  else
+    {
+      check_decryption_preparation (ctx);
+
+      if (ctx->prefetch_dec_fn)
+        ctx->prefetch_dec_fn();
+
+      crypt_fn = ctx->decrypt_fn;
+    }
+
+  if (0)
+    ;
+#ifdef USE_AESNI
+  else if (ctx->use_aesni)
+    {
+      _gcry_aes_aesni_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
+      burn_depth = 0;
+    }
+#endif /*USE_AESNI*/
+  else
+    {
+      tweak_next_lo = buf_get_le64 (tweak + 0);
+      tweak_next_hi = buf_get_le64 (tweak + 8);
+
+      while (nblocks)
+       {
+         tweak_lo = tweak_next_lo;
+         tweak_hi = tweak_next_hi;
+
+         /* Xor-Encrypt/Decrypt-Xor block. */
+         tmp_lo = buf_get_le64 (inbuf + 0) ^ tweak_lo;
+         tmp_hi = buf_get_le64 (inbuf + 8) ^ tweak_hi;
+
+         buf_put_le64 (outbuf + 0, tmp_lo);
+         buf_put_le64 (outbuf + 8, tmp_hi);
+
+         /* Generate next tweak. */
+         carry = -(tweak_next_hi >> 63) & 0x87;
+         tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+         tweak_next_lo = (tweak_next_lo << 1) ^ carry;
+
+         burn_depth = crypt_fn (ctx, outbuf, outbuf);
+
+         buf_put_le64 (outbuf + 0, buf_get_le64 (outbuf + 0) ^ tweak_lo);
+         buf_put_le64 (outbuf + 8, buf_get_le64 (outbuf + 8) ^ tweak_hi);
+
+         outbuf += GCRY_XTS_BLOCK_LEN;
+         inbuf += GCRY_XTS_BLOCK_LEN;
+         nblocks--;
+       }
+
+      buf_put_le64 (tweak + 0, tweak_next_lo);
+      buf_put_le64 (tweak + 8, tweak_next_hi);
+    }
+
+  if (burn_depth)
+    _gcry_burn_stack (burn_depth + 5 * sizeof(void *));
+}
+
 \f
 /* Run the self-tests for AES 128.  Returns NULL on success. */
 static const char*
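
The carry expression in the generic path is easy to sanity-check in
isolation; a hypothetical standalone test, not part of the patch, with
both carry bits set so one step must yield hi = 1 and lo = 0x87:

    #include <inttypes.h>
    #include <stdio.h>

    int
    main (void)
    {
      uint64_t lo = 0x8000000000000000ULL, hi = 0x8000000000000000ULL;
      uint64_t carry = -(hi >> 63) & 0x87;  /* Bit 127 set -> 0x87.  */
      hi = (hi << 1) + (lo >> 63);          /* Bit 63 carries in.    */
      lo = (lo << 1) ^ carry;
      printf ("hi=%" PRIx64 " lo=%" PRIx64 "\n", hi, lo); /* hi=1 lo=87 */
      return 0;
    }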
diff --git a/src/cipher.h b/src/cipher.h
index a6f257d..7c2e5d9 100644
@@ -164,6 +164,9 @@ size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                            const void *inbuf_arg, size_t nblocks, int encrypt);
 size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                           size_t nblocks);
+void _gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+                         void *outbuf_arg, const void *inbuf_arg,
+                         size_t nblocks, int encrypt);
 
 /*-- blowfish.c --*/
 void _gcry_blowfish_cfb_dec (void *context, unsigned char *iv,