rijndael: add parallel processing for CFB decryption with AES-NI
author Jussi Kivilinna <jussi.kivilinna@iki.fi>
Thu, 23 May 2013 11:15:41 +0000 (14:15 +0300)
committer Werner Koch <wk@gnupg.org>
Thu, 23 May 2013 15:33:54 +0000 (17:33 +0200)
* cipher/cipher-selftest.c (_gcry_selftest_helper_cfb_128): New
function for CFB selftests.
* cipher/cipher-selftest.h (_gcry_selftest_helper_cfb_128): New
prototype.
* cipher/rijndael.c [USE_AESNI] (do_aesni_enc_vec4): New function.
(_gcry_aes_cfb_dec) [USE_AESNI]: Add parallelized CFB decryption.
(selftest_cfb_128): New function.
(selftest): Call selftest_cfb_128.
--

CFB decryption can be parallelized for additional performance. On an Intel
Sandy Bridge processor, this change makes CFB decryption 4.6 times faster.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/cipher-selftest.c
cipher/cipher-selftest.h
cipher/rijndael.c

index 439f3ae..41eb405 100644 (file)
@@ -160,6 +160,119 @@ _gcry_selftest_helper_cbc_128 (const char *cipher,
   return NULL;
 }
 
+/* Run the self-tests for <block cipher>-CFB-128, tests bulk CFB
+   decryption.  Returns NULL on success. */
+const char *
+_gcry_selftest_helper_cfb_128 (const char *cipher,
+                              gcry_cipher_setkey_t setkey_func,
+                              gcry_cipher_encrypt_t encrypt_one,
+                              gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec,
+                              const int nblocks, const int blocksize,
+                              const int context_size)
+{
+  int i, offs;
+  unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
+  unsigned int ctx_aligned_size, memsize;
+
+  static const unsigned char key[16] ATTR_ALIGNED_16 = {
+      0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33
+    };
+
+  /* Allocate buffers, align elements to 16 bytes.  */
+  ctx_aligned_size = context_size + 15;
+  ctx_aligned_size -= ctx_aligned_size & 0xf;
+
+  memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16;
+
+  mem = gcry_calloc (1, memsize);
+  if (!mem)
+    return "failed to allocate memory";
+
+  offs = (16 - ((uintptr_t)mem & 15)) & 15;
+  ctx = (void*)(mem + offs);
+  iv = ctx + ctx_aligned_size;
+  iv2 = iv + blocksize;
+  plaintext = iv2 + blocksize;
+  plaintext2 = plaintext + nblocks * blocksize;
+  ciphertext = plaintext2 + nblocks * blocksize;
+
+  /* Initialize ctx */
+  setkey_func (ctx, key, sizeof(key));
+
+  /* Test single block code path */
+  memset(iv, 0xd3, blocksize);
+  memset(iv2, 0xd3, blocksize);
+  for (i = 0; i < blocksize; i++)
+    plaintext[i] = i;
+
+  /* CFB manually.  */
+  encrypt_one (ctx, ciphertext, iv);
+  buf_xor_2dst (iv, ciphertext, plaintext, blocksize);
+
+  /* CFB decrypt.  */
+  bulk_cfb_dec (ctx, iv2, plaintext2, ciphertext, 1);
+  if (memcmp(plaintext2, plaintext, blocksize))
+    {
+      gcry_free(mem);
+#ifdef HAVE_SYSLOG
+      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+              "%s-128-CFB test failed (plaintext mismatch)", cipher);
+#endif
+      return "selftest for 128 bit CFB failed - see syslog for details";
+    }
+
+  if (memcmp(iv2, iv, blocksize))
+    {
+      gcry_free(mem);
+#ifdef HAVE_SYSLOG
+      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+              "%s-128-CFB test failed (IV mismatch)", cipher);
+#endif
+      return "selftest for 128 bit CFB failed - see syslog for details";
+    }
+
+  /* Test parallelized code paths */
+  memset(iv, 0xe6, blocksize);
+  memset(iv2, 0xe6, blocksize);
+
+  for (i = 0; i < nblocks * blocksize; i++)
+    plaintext[i] = i;
+
+  /* Create CFB ciphertext manually.  */
+  for (i = 0; i < nblocks * blocksize; i+=blocksize)
+    {
+      encrypt_one (ctx, &ciphertext[i], iv);
+      buf_xor_2dst (iv, &ciphertext[i], &plaintext[i], blocksize);
+    }
+
+  /* Decrypt using bulk CFB and compare result.  */
+  bulk_cfb_dec (ctx, iv2, plaintext2, ciphertext, nblocks);
+
+  if (memcmp(plaintext2, plaintext, nblocks * blocksize))
+    {
+      gcry_free(mem);
+#ifdef HAVE_SYSLOG
+      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+              "%s-128-CFB test failed (plaintext mismatch, parallel path)",
+              cipher);
+#endif
+      return "selftest for 128 bit CFB failed - see syslog for details";
+    }
+  if (memcmp(iv2, iv, blocksize))
+    {
+      gcry_free(mem);
+#ifdef HAVE_SYSLOG
+      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+              "%s-128-CFB test failed (IV mismatch, parallel path)", cipher);
+#endif
+      return "selftest for 128 bit CFB failed - see syslog for details";
+    }
+
+  gcry_free(mem);
+  return NULL;
+}
+
 /* Run the self-tests for <block cipher>-CTR-128, tests IV increment of bulk CTR
    encryption.  Returns NULL on success. */
 const char *
index 89d79c2..30bc251 100644 (file)
@@ -30,6 +30,11 @@ typedef void (*gcry_cipher_bulk_cbc_dec_t)(void *context, unsigned char *iv,
                                           const void *inbuf_arg,
                                           unsigned int nblocks);
 
+typedef void (*gcry_cipher_bulk_cfb_dec_t)(void *context, unsigned char *iv,
+                                          void *outbuf_arg,
+                                          const void *inbuf_arg,
+                                          unsigned int nblocks);
+
 typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv,
                                           void *outbuf_arg,
                                           const void *inbuf_arg,
@@ -43,6 +48,14 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey,
                               const int nblocks, const int blocksize,
                               const int context_size);
 
+/* Helper function for bulk CFB decryption selftest */
+const char *
+_gcry_selftest_helper_cfb_128 (const char *cipher, gcry_cipher_setkey_t setkey,
+                              gcry_cipher_encrypt_t encrypt_one,
+                              gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec,
+                              const int nblocks, const int blocksize,
+                              const int context_size);
+
 /* Helper function for bulk CTR encryption selftest */
 const char *
 _gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey,
index 4c81688..9f075ff 100644 (file)
@@ -821,6 +821,115 @@ do_aesni_dec_aligned (const RIJNDAEL_context *ctx,
 }
 
 
+/* Encrypt four blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4.  */
+static void
+do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesenc_xmm0_xmm1      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
+#define aesenc_xmm0_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
+#define aesenc_xmm0_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
+#define aesenc_xmm0_xmm4      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
+#define aesenclast_xmm0_xmm1  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
+#define aesenclast_xmm0_xmm2  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
+#define aesenclast_xmm0_xmm3  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
+#define aesenclast_xmm0_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "cmpl $10, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                aesenclast_xmm0_xmm1
+                aesenclast_xmm0_xmm2
+                aesenclast_xmm0_xmm3
+                aesenclast_xmm0_xmm4
+                : /* no output */
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+#undef aesenc_xmm0_xmm1
+#undef aesenc_xmm0_xmm2
+#undef aesenc_xmm0_xmm3
+#undef aesenc_xmm0_xmm4
+#undef aesenclast_xmm0_xmm1
+#undef aesenclast_xmm0_xmm2
+#undef aesenclast_xmm0_xmm3
+#undef aesenclast_xmm0_xmm4
+}
+
+
 /* Decrypt four blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4.  */
 static void
@@ -1685,7 +1794,7 @@ rijndael_decrypt (void *context, byte *b, const byte *a)
 
 
 /* Bulk decryption of complete blocks in CFB mode.  Caller needs to
-   make sure that IV is aligned on an unisgned lonhg boundary.  This
+   make sure that IV is aligned on an unsigned long boundary.  This
    function is only intended for the bulk encryption feature of
    cipher.c. */
 void
@@ -1716,6 +1825,50 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
   else if (ctx->use_aesni)
     {
       aesni_prepare ();
+
+      /* CFB decryption can be parallelized */
+      for ( ;nblocks >= 4; nblocks -= 4)
+        {
+          asm volatile
+            ("movdqu (%[iv]),        %%xmm1\n\t" /* load input blocks */
+             "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+             "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+             "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+
+             "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */
+             "movdqu %%xmm0,         (%[iv])\n\t"
+             : /* No output */
+             : [inbuf] "r" (inbuf), [iv] "r" (iv)
+             : "memory");
+
+          do_aesni_enc_vec4 (ctx);
+
+          asm volatile
+            ("movdqu 0*16(%[inbuf]), %%xmm5\n\t"
+             "pxor %%xmm5, %%xmm1\n\t"
+             "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+             "movdqu 1*16(%[inbuf]), %%xmm5\n\t"
+             "pxor %%xmm5, %%xmm2\n\t"
+             "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+             "movdqu 2*16(%[inbuf]), %%xmm5\n\t"
+             "pxor %%xmm5, %%xmm3\n\t"
+             "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+             "movdqu 3*16(%[inbuf]), %%xmm5\n\t"
+             "pxor %%xmm5, %%xmm4\n\t"
+             "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+             : /* No output */
+             : [inbuf] "r" (inbuf),
+               [outbuf] "r" (outbuf)
+             : "memory");
+
+          outbuf += 4*BLOCKSIZE;
+          inbuf  += 4*BLOCKSIZE;
+        }
+
       for ( ;nblocks; nblocks-- )
         {
           do_aesni_cfb (ctx, 1, iv, outbuf, inbuf);
@@ -1723,6 +1876,7 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
           inbuf  += BLOCKSIZE;
         }
       aesni_cleanup ();
+      aesni_cleanup_2_5 ();
     }
 #endif /*USE_AESNI*/
   else
@@ -2035,6 +2189,21 @@ selftest_cbc_128 (void)
 }
 
 
+/* Run the self-tests for AES-CFB-128, tests bulk CFB decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+  const int nblocks = 8+2;
+  const int blocksize = BLOCKSIZE;
+  const int context_size = sizeof(RIJNDAEL_context);
+
+  return _gcry_selftest_helper_cfb_128("AES", &rijndael_setkey,
+           &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize,
+          context_size);
+}
+
+
 /* Run all the self-tests and return NULL on success.  This function
    is used for the on-the-fly self-tests. */
 static const char *
@@ -2053,6 +2222,9 @@ selftest (void)
   if ( (r = selftest_cbc_128 ()) )
     return r;
 
+  if ( (r = selftest_cfb_128 ()) )
+    return r;
+
   return r;
 }