AES-NI improvements for AMD64
author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  Sat, 6 Jan 2018 16:53:20 +0000 (18:53 +0200)
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  Tue, 9 Jan 2018 16:44:34 +0000 (18:44 +0200)
* cipher/rijndael-aesni.c [__x86_64__] (aesni_prepare_7_15_variable)
(aesni_prepare_7_15, aesni_cleanup_7_15, do_aesni_enc_vec8)
(do_aesni_dec_vec8, do_aesni_ctr_8): New.
(_gcry_aes_aesni_ctr_enc, _gcry_aes_aesni_cfb_dec)
(_gcry_aes_aesni_cbc_dec, aesni_ocb_enc, aesni_ocb_dec)
(_gcry_aes_aesni_ocb_auth) [__x86_64__]: Add eight-block parallel
processing.
--

Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo, no HT):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |     0.175 ns/B    5448.7 MiB/s     0.700 c/B
        CFB dec |     0.174 ns/B    5466.2 MiB/s     0.698 c/B
        CTR enc |     0.182 ns/B    5226.0 MiB/s     0.730 c/B
        OCB enc |     0.194 ns/B    4913.9 MiB/s     0.776 c/B
        OCB dec |     0.200 ns/B    4769.2 MiB/s     0.800 c/B
       OCB auth |     0.172 ns/B    5545.0 MiB/s     0.688 c/B

After (1.09x to 1.14x faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |     0.157 ns/B    6075.6 MiB/s     0.628 c/B
        CFB dec |     0.158 ns/B    6034.1 MiB/s     0.632 c/B
        CTR enc |     0.159 ns/B    5979.4 MiB/s     0.638 c/B
        OCB enc |     0.175 ns/B    5447.1 MiB/s     0.700 c/B
        OCB dec |     0.183 ns/B    5203.9 MiB/s     0.733 c/B
       OCB auth |     0.156 ns/B    6101.3 MiB/s     0.625 c/B
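
(Editorial note: cycles/byte is just nanosecs/byte scaled by the
4.0 GHz clock, e.g. 0.157 ns/B * 4.0 GHz = 0.628 c/B for CBC dec.  The
per-mode ratios range from roughly 1.09x for OCB dec (0.800/0.733) up
to 1.14x for CTR enc (0.730/0.638).)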

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
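
[Editor's sketch, not part of the commit: hypothetical helper names
(bulk_op, process_8_blocks) illustrating the dispatch shape this patch
gives each bulk entry point -- an 8-block fast path with xmm7-xmm15
saved/cleared around it, ahead of the existing 4-block and 1-block
loops.]

    void bulk_op (RIJNDAEL_context *ctx, unsigned char *out,
                  const unsigned char *in, size_t nblocks)
    {
    #ifdef __x86_64__
      if (nblocks >= 8)
        {
          aesni_prepare_7_15_variable;
          aesni_prepare_7_15 ();
          for ( ; nblocks >= 8; nblocks -= 8)
            {
              process_8_blocks (ctx, out, in); /* e.g. do_aesni_ctr_8 */
              out += 8 * BLOCKSIZE;
              in  += 8 * BLOCKSIZE;
            }
          aesni_cleanup_7_15 ();
        }
    #endif
      for ( ; nblocks >= 4; nblocks -= 4)
        { /* existing 4-block path */ }
      for ( ; nblocks; nblocks--)
        { /* existing 1-block path */ }
    }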
cipher/rijndael-aesni.c

index 735e5cd..3d323cf 100644
@@ -55,6 +55,7 @@ typedef struct u128_s
 #ifdef __WIN64__
 /* XMM6-XMM15 are callee-saved registers on WIN64. */
 # define aesni_prepare_2_6_variable char win64tmp[16]
+# define aesni_prepare_7_15_variable char win64tmp7_15[16 * 9]
 # define aesni_prepare() do { } while (0)
 # define aesni_prepare_2_6()                                            \
    do { asm volatile ("movdqu %%xmm6, %0\n\t"                           \
@@ -62,6 +63,20 @@ typedef struct u128_s
                       :                                                 \
                       : "memory");                                      \
    } while (0)
+# define aesni_prepare_7_15()                                           \
+   do { asm volatile ("movdqu %%xmm7,  0*16(%0)\n\t"                    \
+                      "movdqu %%xmm8,  1*16(%0)\n\t"                    \
+                      "movdqu %%xmm9,  2*16(%0)\n\t"                    \
+                      "movdqu %%xmm10, 3*16(%0)\n\t"                    \
+                      "movdqu %%xmm11, 4*16(%0)\n\t"                    \
+                      "movdqu %%xmm12, 5*16(%0)\n\t"                    \
+                      "movdqu %%xmm13, 6*16(%0)\n\t"                    \
+                      "movdqu %%xmm14, 7*16(%0)\n\t"                    \
+                      "movdqu %%xmm15, 8*16(%0)\n\t"                    \
+                      :                                                 \
+                      : "r" (win64tmp7_15)                              \
+                      : "memory");                                      \
+   } while (0)
 # define aesni_cleanup()                                                \
    do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
                       "pxor %%xmm1, %%xmm1\n" :: );                     \
@@ -76,6 +91,20 @@ typedef struct u128_s
                       : "m" (*win64tmp)                                 \
                       : "memory");                                      \
    } while (0)
+# define aesni_cleanup_7_15()                                           \
+   do { asm volatile ("movdqu 0*16(%0), %%xmm7\n\t"                     \
+                      "movdqu 1*16(%0), %%xmm8\n\t"                     \
+                      "movdqu 2*16(%0), %%xmm9\n\t"                     \
+                      "movdqu 3*16(%0), %%xmm10\n\t"                    \
+                      "movdqu 4*16(%0), %%xmm11\n\t"                    \
+                      "movdqu 5*16(%0), %%xmm12\n\t"                    \
+                      "movdqu 6*16(%0), %%xmm13\n\t"                    \
+                      "movdqu 7*16(%0), %%xmm14\n\t"                    \
+                      "movdqu 8*16(%0), %%xmm15\n\t"                    \
+                      :                                                 \
+                      : "r" (win64tmp7_15)                              \
+                      : "memory");                                      \
+   } while (0)
 #else
 # define aesni_prepare_2_6_variable
 # define aesni_prepare() do { } while (0)
@@ -91,6 +120,21 @@ typedef struct u128_s
                       "pxor %%xmm5, %%xmm5\n"                           \
                       "pxor %%xmm6, %%xmm6\n":: );                      \
    } while (0)
+# ifdef __x86_64__
+#  define aesni_prepare_7_15_variable
+#  define aesni_prepare_7_15() do { } while (0)
+#  define aesni_cleanup_7_15()                                          \
+   do { asm volatile ("pxor %%xmm7, %%xmm7\n\t"                         \
+                      "pxor %%xmm8, %%xmm8\n"                           \
+                      "pxor %%xmm9, %%xmm9\n"                           \
+                      "pxor %%xmm10, %%xmm10\n"                         \
+                      "pxor %%xmm11, %%xmm11\n"                         \
+                      "pxor %%xmm12, %%xmm12\n"                         \
+                      "pxor %%xmm13, %%xmm13\n"                         \
+                      "pxor %%xmm14, %%xmm14\n"                         \
+                      "pxor %%xmm15, %%xmm15\n":: );                    \
+   } while (0)
+# endif
 #endif
 
 void
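
[Editor's note: the three new macros are used together as below,
mirroring the call sites added later in this patch.  The *_variable
macro declares the WIN64 spill buffer in the enclosing scope, prepare
saves the callee-saved xmm7-xmm15 on WIN64, and cleanup restores them
there or, on the sysv ABI, clears them so no key-derived data is left
in registers:

      aesni_prepare_7_15_variable;
      aesni_prepare_7_15 ();
      /* ... use xmm7..xmm15 for 8-block processing ... */
      aesni_cleanup_7_15 ();
]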
@@ -704,6 +748,314 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 }
 
 
+#ifdef __x86_64__
+
+/* Encrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
+static inline void
+do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
+{
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
+                "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
+                "pxor   %%xmm0, %%xmm10\n\t"     /* xmm10 ^= key[0] */
+                "pxor   %%xmm0, %%xmm11\n\t"     /* xmm11 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "jb .Ldeclast%=\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "je .Ldeclast%=\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                "aesenclast %%xmm0, %%xmm1\n\t"
+                "aesenclast %%xmm0, %%xmm2\n\t"
+                "aesenclast %%xmm0, %%xmm3\n\t"
+                "aesenclast %%xmm0, %%xmm4\n\t"
+                "aesenclast %%xmm0, %%xmm8\n\t"
+                "aesenclast %%xmm0, %%xmm9\n\t"
+                "aesenclast %%xmm0, %%xmm10\n\t"
+                "aesenclast %%xmm0, %%xmm11\n\t"
+                : /* no output */
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+}
+
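
[Editor's reference, not patch code: the asm above is a fully unrolled
form of the following round structure, written here with AES-NI
intrinsics (compile with -maes); ctx->rounds is 10/12/14 and the key
schedule holds rounds+1 round keys.]

    #include <wmmintrin.h>

    static void
    enc_vec8_sketch (const __m128i *rk, int rounds, __m128i b[8])
    {
      int i, r;
      for (i = 0; i < 8; i++)
        b[i] = _mm_xor_si128 (b[i], rk[0]);             /* whitening */
      for (r = 1; r < rounds; r++)
        for (i = 0; i < 8; i++)
          b[i] = _mm_aesenc_si128 (b[i], rk[r]);        /* middle rounds */
      for (i = 0; i < 8; i++)
        b[i] = _mm_aesenclast_si128 (b[i], rk[rounds]); /* last round */
    }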
+
+/* Decrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
+static inline void
+do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
+{
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
+                "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
+                "pxor   %%xmm0, %%xmm10\n\t"    /* xmm10 ^= key[0] */
+                "pxor   %%xmm0, %%xmm11\n\t"    /* xmm11 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "jb .Ldeclast%=\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "je .Ldeclast%=\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                "aesdeclast %%xmm0, %%xmm1\n\t"
+                "aesdeclast %%xmm0, %%xmm2\n\t"
+                "aesdeclast %%xmm0, %%xmm3\n\t"
+                "aesdeclast %%xmm0, %%xmm4\n\t"
+                "aesdeclast %%xmm0, %%xmm8\n\t"
+                "aesdeclast %%xmm0, %%xmm9\n\t"
+                "aesdeclast %%xmm0, %%xmm10\n\t"
+                "aesdeclast %%xmm0, %%xmm11\n\t"
+                : /* no output */
+                : [key] "r" (ctx->keyschdec),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+}
+
+#endif /* __x86_64__ */
+
+
 /* Perform a CTR encryption round using the counter CTR and the input
    block A.  Write the result to the output block B and update CTR.
    CTR needs to be a 16 byte aligned little-endian value.  */
@@ -808,7 +1160,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
 #define aesenclast_xmm1_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
 
   /* Register usage:
-      esi   keyschedule
+      [key] keyschedule
       xmm0  CTR-0
       xmm1  temp / round key
       xmm2  CTR-1
@@ -1003,6 +1355,327 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
 }
 
 
+#ifdef __x86_64__
+
+/* Eight blocks at a time variant of do_aesni_ctr.  */
+static void
+do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
+                unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+  static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) =
+    {
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }
+    };
+  const void *bige_addb = bige_addb_const;
+
+  /* Register usage:
+      [key] keyschedule
+      xmm0  CTR-0
+      xmm1  temp / round key
+      xmm2  CTR-1
+      xmm3  CTR-2
+      xmm4  CTR-3
+      xmm5  copy of *ctr
+      xmm6  endian swapping mask
+      xmm8  CTR-4
+      xmm9  CTR-5
+      xmm10 CTR-6
+      xmm11 CTR-7
+      xmm12 temp
+      xmm13 temp
+      xmm14 temp
+      xmm15 temp
+   */
+
+  asm volatile (/* detect if 8-bit carry handling is needed */
+                "cmpb   $0xf7, 15(%[ctr])\n\t"
+                "ja     .Ladd32bit%=\n\t"
+
+                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5) */
+                "movdqa 0*16(%[addb]), %%xmm2\n\t"  /* xmm2 := be(1) */
+                "movdqa 1*16(%[addb]), %%xmm3\n\t"  /* xmm3 := be(2) */
+                "movdqa 2*16(%[addb]), %%xmm4\n\t"  /* xmm4 := be(3) */
+                "movdqa 3*16(%[addb]), %%xmm8\n\t"  /* xmm8 := be(4) */
+                "movdqa 4*16(%[addb]), %%xmm9\n\t"  /* xmm9 := be(5) */
+                "movdqa 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) */
+                "movdqa 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) */
+                "movdqa 7*16(%[addb]), %%xmm5\n\t"  /* xmm5 := be(8) */
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
+                "paddb  %%xmm0, %%xmm2\n\t"     /* xmm2 := be(1) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm3\n\t"     /* xmm3 := be(2) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm4\n\t"     /* xmm4 := be(3) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm8\n\t"     /* xmm8 := be(4) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm9\n\t"     /* xmm9 := be(5) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm10\n\t"    /* xmm10 := be(6) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm11\n\t"    /* xmm11 := be(7) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm5\n\t"     /* xmm5 := be(8) + CTR (xmm0) */
+                "jmp    .Lstore_ctr%=\n\t"
+
+                ".Ladd32bit%=:\n\t"
+                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
+                "movdqa %%xmm0, %%xmm2\n\t"
+                "pcmpeqd %%xmm1, %%xmm1\n\t"
+                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */
+
+                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := le(xmm2) */
+                "psubq  %%xmm1, %%xmm2\n\t"     /* xmm2++           */
+                "movdqa %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2     */
+                "psubq  %%xmm1, %%xmm3\n\t"     /* xmm3++           */
+                "movdqa %%xmm3, %%xmm4\n\t"     /* xmm4 := xmm3     */
+                "psubq  %%xmm1, %%xmm4\n\t"     /* xmm4++           */
+                "movdqa %%xmm4, %%xmm8\n\t"     /* xmm8 := xmm4     */
+                "psubq  %%xmm1, %%xmm8\n\t"     /* xmm8++           */
+                "movdqa %%xmm8, %%xmm9\n\t"     /* xmm9 := xmm8     */
+                "psubq  %%xmm1, %%xmm9\n\t"     /* xmm9++           */
+                "movdqa %%xmm9, %%xmm10\n\t"    /* xmm10 := xmm9    */
+                "psubq  %%xmm1, %%xmm10\n\t"    /* xmm10++          */
+                "movdqa %%xmm10, %%xmm11\n\t"   /* xmm11 := xmm10   */
+                "psubq  %%xmm1, %%xmm11\n\t"    /* xmm11++          */
+                "movdqa %%xmm11, %%xmm5\n\t"    /* xmm5 := xmm11    */
+                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++           */
+
+                /* detect if 64-bit carry handling is needed */
+                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
+                "jne    .Lno_carry%=\n\t"
+                "movl   12(%[ctr]), %%esi\n\t"
+                "bswapl %%esi\n\t"
+                "cmpl   $0xfffffff8, %%esi\n\t"
+                "jb     .Lno_carry%=\n\t"       /* no carry */
+
+                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
+                "je     .Lcarry_xmm5%=\n\t"     /* esi == 0xfffffff8 */
+                "cmpl   $0xfffffffa, %%esi\n\t"
+                "jb     .Lcarry_xmm11%=\n\t"     /* esi == 0xfffffff9 */
+                "je     .Lcarry_xmm10%=\n\t"     /* esi == 0xfffffffa */
+                "cmpl   $0xfffffffc, %%esi\n\t"
+                "jb     .Lcarry_xmm9%=\n\t"     /* esi == 0xfffffffb */
+                "je     .Lcarry_xmm8%=\n\t"     /* esi == 0xfffffffc */
+                "cmpl   $0xfffffffe, %%esi\n\t"
+                "jb     .Lcarry_xmm4%=\n\t"     /* esi == 0xfffffffd */
+                "je     .Lcarry_xmm3%=\n\t"     /* esi == 0xfffffffe */
+                /* esi == 0xffffffff */
+
+                "psubq   %%xmm1, %%xmm2\n\t"
+                ".Lcarry_xmm3%=:\n\t"
+                "psubq   %%xmm1, %%xmm3\n\t"
+                ".Lcarry_xmm4%=:\n\t"
+                "psubq   %%xmm1, %%xmm4\n\t"
+                ".Lcarry_xmm8%=:\n\t"
+                "psubq   %%xmm1, %%xmm8\n\t"
+                ".Lcarry_xmm9%=:\n\t"
+                "psubq   %%xmm1, %%xmm9\n\t"
+                ".Lcarry_xmm10%=:\n\t"
+                "psubq   %%xmm1, %%xmm10\n\t"
+                ".Lcarry_xmm11%=:\n\t"
+                "psubq   %%xmm1, %%xmm11\n\t"
+                ".Lcarry_xmm5%=:\n\t"
+                "psubq   %%xmm1, %%xmm5\n\t"
+
+                ".Lno_carry%=:\n\t"
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */
+
+                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2) */
+                "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
+                "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
+                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
+                "pshufb %%xmm6, %%xmm8\n\t"     /* xmm8 := be(xmm8) */
+                "pshufb %%xmm6, %%xmm9\n\t"     /* xmm9 := be(xmm9) */
+                "pshufb %%xmm6, %%xmm10\n\t"    /* xmm10 := be(xmm10) */
+                "pshufb %%xmm6, %%xmm11\n\t"    /* xmm11 := be(xmm11) */
+
+                ".Lstore_ctr%=:\n\t"
+                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */
+                :
+                : [ctr] "r" (ctr),
+                  [key] "r" (ctx->keyschenc),
+                  [addb] "r" (bige_addb)
+                : "%esi", "cc", "memory");
+
+  asm volatile ("pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm8\n\t"     /* xmm8 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm9\n\t"     /* xmm9 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm10\n\t"    /* xmm10 ^= key[0]   */
+                "pxor   %%xmm1, %%xmm11\n\t"    /* xmm11 ^= key[0]   */
+                "movdqa 0x10(%[key]), %%xmm1\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm1\n\t"
+                "jb .Lenclast%=\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xc0(%[key]), %%xmm1\n\t"
+                "je .Lenclast%=\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xd0(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xe0(%[key]), %%xmm1\n"
+
+                ".Lenclast%=:\n\t"
+                "aesenclast %%xmm1, %%xmm0\n\t"
+                "aesenclast %%xmm1, %%xmm2\n\t"
+                "aesenclast %%xmm1, %%xmm3\n\t"
+                "aesenclast %%xmm1, %%xmm4\n\t"
+                "aesenclast %%xmm1, %%xmm8\n\t"
+                "aesenclast %%xmm1, %%xmm9\n\t"
+                "aesenclast %%xmm1, %%xmm10\n\t"
+                "aesenclast %%xmm1, %%xmm11\n\t"
+                :
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+
+  asm volatile ("movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1.      */
+                "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2.      */
+                "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3.      */
+                "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4.      */
+                "movdqu 4*16(%[src]), %%xmm1\n\t"  /* Get block 5.      */
+                "pxor %%xmm12, %%xmm0\n\t"         /* EncCTR-1 ^= input */
+                "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6.      */
+                "pxor %%xmm13, %%xmm2\n\t"         /* EncCTR-2 ^= input */
+                "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7.      */
+                "pxor %%xmm14, %%xmm3\n\t"         /* EncCTR-3 ^= input */
+                "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8.      */
+                "pxor %%xmm15, %%xmm4\n\t"         /* EncCTR-4 ^= input */
+                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1     */
+                "pxor %%xmm1,  %%xmm8\n\t"         /* EncCTR-5 ^= input */
+                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1     */
+                "pxor %%xmm12, %%xmm9\n\t"         /* EncCTR-6 ^= input */
+                "movdqu %%xmm2, 1*16(%[dst])\n\t"  /* Store block 2.    */
+                "pxor %%xmm13, %%xmm10\n\t"        /* EncCTR-7 ^= input */
+                "movdqu %%xmm3, 2*16(%[dst])\n\t"  /* Store block 3.    */
+                "pxor %%xmm14, %%xmm11\n\t"        /* EncCTR-8 ^= input */
+                "movdqu %%xmm4, 3*16(%[dst])\n\t"  /* Store block 4.    */
+                "movdqu %%xmm8, 4*16(%[dst])\n\t"  /* Store block 8.    */
+                "movdqu %%xmm9, 5*16(%[dst])\n\t"  /* Store block 9.    */
+                "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 10.   */
+                "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 11.   */
+                :
+                : [src] "r" (a),
+                  [dst] "r" (b)
+                : "memory");
+}
+
+#endif /* __x86_64__ */
+
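
[Editor's reference, not patch code: both the paddb fast path and the
carry path above implement a plain big-endian add on the 128-bit
counter; the fast path is taken only when ctr[15] <= 0xf7, so adding
up to 8 cannot carry out of the last byte.  A scalar equivalent:

    static void
    ctr_add_be128 (unsigned char ctr[16], unsigned int add)
    {
      unsigned int carry = add;
      int i;
      for (i = 15; i >= 0 && carry; i--)
        {
          carry += ctr[i];
          ctr[i] = carry & 0xff;
          carry >>= 8;
        }
    }
]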
+
 unsigned int
 _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                          const unsigned char *src)
@@ -1123,7 +1796,25 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
                   [ctr] "m" (*ctr)
                 : "memory");
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+       {
+         do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf);
+         outbuf += 8*BLOCKSIZE;
+         inbuf  += 8*BLOCKSIZE;
+       }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
       outbuf += 4*BLOCKSIZE;
@@ -1175,6 +1866,76 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
                 : "memory" );
 
   /* CFB decryption can be parallelized */
+
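
[Editor's note: CFB decryption computes P_i = C_i xor E_K(C_{i-1});
every block-cipher input is an already-known ciphertext block, which
is why the eight E_K invocations below are independent.]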
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      for ( ;nblocks >= 8; nblocks -= 8)
+       {
+         asm volatile
+           ("movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
+            "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+            "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+            "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+            "movdqu 3*16(%[inbuf]), %%xmm8\n\t"
+            "movdqu 4*16(%[inbuf]), %%xmm9\n\t"
+            "movdqu 5*16(%[inbuf]), %%xmm10\n\t"
+            "movdqu 6*16(%[inbuf]), %%xmm11\n\t"
+
+            "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */
+
+            "movdqa %%xmm2, %%xmm12\n\t"
+            "movdqa %%xmm3, %%xmm13\n\t"
+            "movdqa %%xmm4, %%xmm14\n\t"
+            "movdqa %%xmm8, %%xmm15\n\t"
+            : /* No output */
+            : [inbuf] "r" (inbuf)
+            : "memory");
+
+         do_aesni_enc_vec8 (ctx);
+
+         asm volatile
+           (
+            "pxor %%xmm12, %%xmm1\n\t"
+            "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+            "pxor %%xmm13, %%xmm2\n\t"
+            "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+            "pxor %%xmm14, %%xmm3\n\t"
+            "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+            "pxor %%xmm15, %%xmm4\n\t"
+            "movdqu 7*16(%[inbuf]), %%xmm15\n\t"
+
+            "pxor %%xmm12, %%xmm8\n\t"
+            "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+            "pxor %%xmm13, %%xmm9\n\t"
+            "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+            "pxor %%xmm14, %%xmm10\n\t"
+            "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+            "pxor %%xmm15, %%xmm11\n\t"
+            "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+            "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+            "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+            "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+            "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+            : /* No output */
+            : [inbuf] "r" (inbuf),
+              [outbuf] "r" (outbuf)
+            : "memory");
+
+         outbuf += 8*BLOCKSIZE;
+         inbuf  += 8*BLOCKSIZE;
+       }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
   for ( ;nblocks >= 4; nblocks -= 4)
     {
       asm volatile
@@ -1260,7 +2021,76 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
      : [iv] "m" (*iv)
      : "memory");
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+       {
+         asm volatile
+           ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"        /* load input blocks */
+            "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+            "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+            "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+            "movdqu 4*16(%[inbuf]), %%xmm8\n\t"
+            "movdqu 5*16(%[inbuf]), %%xmm9\n\t"
+            "movdqu 6*16(%[inbuf]), %%xmm10\n\t"
+            "movdqu 7*16(%[inbuf]), %%xmm11\n\t"
+
+            "movdqa %%xmm1, %%xmm12\n\t"
+            "movdqa %%xmm2, %%xmm13\n\t"
+            "movdqa %%xmm3, %%xmm14\n\t"
+            "movdqa %%xmm4, %%xmm15\n\t"
+
+            : /* No output */
+            : [inbuf] "r" (inbuf)
+            : "memory");
+
+         do_aesni_dec_vec8 (ctx);
+
+         asm volatile
+           ("pxor %%xmm5, %%xmm1\n\t"          /* xor IV with output */
+
+            "pxor %%xmm12, %%xmm2\n\t"         /* xor IV with output */
+            "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+
+            "pxor %%xmm13, %%xmm3\n\t"         /* xor IV with output */
+            "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+
+            "pxor %%xmm14, %%xmm4\n\t"         /* xor IV with output */
+            "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+
+            "pxor %%xmm15, %%xmm8\n\t"         /* xor IV with output */
+            "movdqu 7*16(%[inbuf]), %%xmm5\n\t"
+            "pxor %%xmm12, %%xmm9\n\t"         /* xor IV with output */
+            "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+            "pxor %%xmm13, %%xmm10\n\t"                /* xor IV with output */
+            "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+            "pxor %%xmm14, %%xmm11\n\t"                /* xor IV with output */
+            "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+            "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+            "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+            "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+            "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+            "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+            : /* No output */
+            : [inbuf] "r" (inbuf),
+              [outbuf] "r" (outbuf)
+            : "memory");
+
+         outbuf += 8*BLOCKSIZE;
+         inbuf  += 8*BLOCKSIZE;
+       }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       asm volatile
         ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"   /* load input blocks */
@@ -1386,7 +2216,142 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+                   :
+                   : [l0] "m" (*c->u_mode.ocb.L[0])
+                   : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+       {
+         n += 4;
+         l = ocb_get_l(c, n);
+
+         /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+         /* Checksum_i = Checksum_{i-1} xor P_i  */
+         /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+         asm volatile ("movdqu %[l1],     %%xmm10\n\t"
+                       "movdqu %[inbuf0], %%xmm1\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm1,    %%xmm6\n\t"
+                       "pxor   %%xmm5,    %%xmm1\n\t"
+                       "movdqa %%xmm5,    %%xmm12\n\t"
+                       :
+                       : [l1] "m" (*c->u_mode.ocb.L[1]),
+                         [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+                       "pxor   %%xmm10,   %%xmm5\n\t"
+                       "pxor   %%xmm2,    %%xmm6\n\t"
+                       "pxor   %%xmm5,    %%xmm2\n\t"
+                       "movdqa %%xmm5,    %%xmm13\n\t"
+                       :
+                       : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm3,    %%xmm6\n\t"
+                       "pxor   %%xmm5,    %%xmm3\n\t"
+                       "movdqa %%xmm5,    %%xmm14\n\t"
+                       :
+                       : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[l3],     %%xmm15\n\t"
+                       "movdqu %[inbuf3], %%xmm4\n\t"
+                       "pxor   %%xmm15,   %%xmm5\n\t"
+                       "pxor   %%xmm4,    %%xmm6\n\t"
+                       "pxor   %%xmm5,    %%xmm4\n\t"
+                       "movdqa %%xmm5,    %%xmm15\n\t"
+                       :
+                       : [l3] "m" (*l),
+                         [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+                       : "memory" );
+
+         n += 4;
+         l = ocb_get_l(c, n);
+
+         asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm8,    %%xmm6\n\t"
+                       "pxor   %%xmm5,    %%xmm8\n\t"
+                       "movdqu %%xmm5,    %[outbuf4]\n\t"
+                       : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+                       : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+                       "pxor   %%xmm10,   %%xmm5\n\t"
+                       "pxor   %%xmm9,    %%xmm6\n\t"
+                       "pxor   %%xmm5,    %%xmm9\n\t"
+                       "movdqu %%xmm5,    %[outbuf5]\n\t"
+                       : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+                       : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm10,   %%xmm6\n\t"
+                       "pxor   %%xmm5,    %%xmm10\n\t"
+                       "movdqu %%xmm5,    %[outbuf6]\n\t"
+                       : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+                       : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[l7],     %%xmm11\n\t"
+                       "pxor   %%xmm11,   %%xmm5\n\t"
+                       "movdqu %[inbuf7], %%xmm11\n\t"
+                       "pxor   %%xmm11,   %%xmm6\n\t"
+                       "pxor   %%xmm5,    %%xmm11\n\t"
+                       :
+                       : [l7] "m" (*l),
+                         [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+                       : "memory" );
+
+         do_aesni_enc_vec8 (ctx);
+
+         asm volatile ("pxor   %%xmm12,   %%xmm1\n\t"
+                       "pxor   %%xmm13,   %%xmm2\n\t"
+                       "movdqu %[outbuf4],%%xmm0\n\t"
+                       "movdqu %[outbuf5],%%xmm12\n\t"
+                       "movdqu %[outbuf6],%%xmm13\n\t"
+                       "pxor   %%xmm14,   %%xmm3\n\t"
+                       "pxor   %%xmm15,   %%xmm4\n\t"
+                       "pxor   %%xmm0,    %%xmm8\n\t"
+                       "pxor   %%xmm12,   %%xmm9\n\t"
+                       "pxor   %%xmm13,   %%xmm10\n\t"
+                       "pxor   %%xmm5,    %%xmm11\n\t"
+                       "movdqu %%xmm1,    %[outbuf0]\n\t"
+                       "movdqu %%xmm2,    %[outbuf1]\n\t"
+                       "movdqu %%xmm3,    %[outbuf2]\n\t"
+                       "movdqu %%xmm4,    %[outbuf3]\n\t"
+                       "movdqu %%xmm8,    %[outbuf4]\n\t"
+                       "movdqu %%xmm9,    %[outbuf5]\n\t"
+                       "movdqu %%xmm10,   %[outbuf6]\n\t"
+                       "movdqu %%xmm11,   %[outbuf7]\n\t"
+                       : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+                         [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+                         [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+                         [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+                         [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+                         [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+                         [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+                         [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+                       :
+                       : "memory" );
+
+         outbuf += 8*BLOCKSIZE;
+         inbuf  += 8*BLOCKSIZE;
+       }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
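
[Editor's note: OCB computes Offset_i = Offset_{i-1} xor L_{ntz(i)}.
With n kept a multiple of 4, the four blocks of each group need
L[ntz] = L[0], L[1], L[0], L[ntz(n)] in turn, which is why L[0] stays
pinned in xmm7, L[1] is reloaded into xmm10, and only every fourth
offset needs ocb_get_l().  A tiny check, assuming GCC-style builtins:

    unsigned int ntz (unsigned long i) { return __builtin_ctzl (i); }
    /* ntz(n+1) == 0, ntz(n+2) == 1, ntz(n+3) == 0 when n % 4 == 0 */
]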
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
@@ -1394,9 +2359,9 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
                    "movdqu %[inbuf0], %%xmm1\n\t"
-                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   "pxor   %%xmm4,    %%xmm5\n\t"
                    "pxor   %%xmm1,    %%xmm6\n\t"
                    "pxor   %%xmm5,    %%xmm1\n\t"
                    "movdqu %%xmm5,    %[outbuf0]\n\t"
@@ -1414,19 +2379,17 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                    : [l1] "m" (*c->u_mode.ocb.L[1]),
                      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
                    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-                   "movdqu %[inbuf2], %%xmm3\n\t"
-                   "pxor   %%xmm0,    %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                   "pxor   %%xmm4,    %%xmm5\n\t"
                    "pxor   %%xmm3,    %%xmm6\n\t"
                    "pxor   %%xmm5,    %%xmm3\n\t"
                    "movdqu %%xmm5,    %[outbuf2]\n\t"
                    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-                   : [l2] "m" (*c->u_mode.ocb.L[0]),
-                     [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                   : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
                    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l3],     %%xmm4\n\t"
+                   "pxor   %%xmm4,    %%xmm5\n\t"
                    "movdqu %[inbuf3], %%xmm4\n\t"
-                   "pxor   %%xmm0,    %%xmm5\n\t"
                    "pxor   %%xmm4,    %%xmm6\n\t"
                    "pxor   %%xmm5,    %%xmm4\n\t"
                    :
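
[Editor's note: the 4-block paths are retuned in the same spirit: L[0]
is kept live in xmm4 instead of being reloaded through the old %[l2]
operand for block 3; the same register renaming recurs in the OCB dec
and auth hunks below.]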
@@ -1551,7 +2514,142 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+                   :
+                   : [l0] "m" (*c->u_mode.ocb.L[0])
+                   : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+       {
+         n += 4;
+         l = ocb_get_l(c, n);
+
+         /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+         /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+         /* Checksum_i = Checksum_{i-1} xor P_i  */
+
+         asm volatile ("movdqu %[l1],     %%xmm10\n\t"
+                       "movdqu %[inbuf0], %%xmm1\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm1\n\t"
+                       "movdqa %%xmm5,    %%xmm12\n\t"
+                       :
+                       : [l1] "m" (*c->u_mode.ocb.L[1]),
+                         [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+                       "pxor   %%xmm10,   %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm2\n\t"
+                       "movdqa %%xmm5,    %%xmm13\n\t"
+                       :
+                       : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm3\n\t"
+                       "movdqa %%xmm5,    %%xmm14\n\t"
+                       :
+                       : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+                       "movdqu %[inbuf3], %%xmm4\n\t"
+                       "pxor   %%xmm0,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm4\n\t"
+                       "movdqa %%xmm5,    %%xmm15\n\t"
+                       :
+                       : [l3] "m" (*l),
+                         [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+                       : "memory" );
+
+         n += 4;
+         l = ocb_get_l(c, n);
+
+         asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm8\n\t"
+                       "movdqu %%xmm5,    %[outbuf4]\n\t"
+                       : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+                       : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+                       "pxor   %%xmm10,   %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm9\n\t"
+                       "movdqu %%xmm5,    %[outbuf5]\n\t"
+                       : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+                       : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm10\n\t"
+                       "movdqu %%xmm5,    %[outbuf6]\n\t"
+                       : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+                       : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[l7],     %%xmm0\n\t"
+                       "movdqu %[inbuf7], %%xmm11\n\t"
+                       "pxor   %%xmm0,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm11\n\t"
+                       :
+                       : [l7] "m" (*l),
+                         [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+                       : "memory" );
+
+         do_aesni_dec_vec8 (ctx);
+
+         asm volatile ("pxor   %%xmm12,   %%xmm1\n\t"
+                       "pxor   %%xmm13,   %%xmm2\n\t"
+                       "movdqu %[outbuf4],%%xmm0\n\t"
+                       "movdqu %[outbuf5],%%xmm12\n\t"
+                       "movdqu %[outbuf6],%%xmm13\n\t"
+                       "pxor   %%xmm14,   %%xmm3\n\t"
+                       "pxor   %%xmm15,   %%xmm4\n\t"
+                       "pxor   %%xmm0,    %%xmm8\n\t"
+                       "pxor   %%xmm12,   %%xmm9\n\t"
+                       "pxor   %%xmm13,   %%xmm10\n\t"
+                       "pxor   %%xmm5,    %%xmm11\n\t"
+                       "movdqu %%xmm1,    %[outbuf0]\n\t"
+                       "movdqu %%xmm2,    %[outbuf1]\n\t"
+                       "movdqu %%xmm3,    %[outbuf2]\n\t"
+                       "movdqu %%xmm4,    %[outbuf3]\n\t"
+                       "movdqu %%xmm8,    %[outbuf4]\n\t"
+                       "movdqu %%xmm9,    %[outbuf5]\n\t"
+                       "movdqu %%xmm10,   %[outbuf6]\n\t"
+                       "movdqu %%xmm11,   %[outbuf7]\n\t"
+                       "pxor   %%xmm2,    %%xmm1\n\t"
+                       "pxor   %%xmm4,    %%xmm1\n\t"
+                       "pxor   %%xmm9,    %%xmm1\n\t"
+                       "pxor   %%xmm11,   %%xmm1\n\t"
+                       "pxor   %%xmm3,    %%xmm6\n\t"
+                       "pxor   %%xmm8,    %%xmm6\n\t"
+                       "pxor   %%xmm10,   %%xmm6\n\t"
+                       "pxor   %%xmm1,    %%xmm6\n\t"
+                       : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+                         [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+                         [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+                         [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+                         [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+                         [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+                         [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+                         [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+                       :
+                       : "memory" );
+
+         outbuf += 8*BLOCKSIZE;
+         inbuf  += 8*BLOCKSIZE;
+       }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
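
[Editor's note: on the decryption side the checksum is
Checksum_i = Checksum_{i-1} xor P_i over plaintext, so the freshly
decrypted blocks are folded into xmm1/xmm6 at the end of each 8-block
iteration, rather than xoring the inputs into xmm6 up front as the
encryption path does.]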
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
@@ -1559,9 +2657,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
                    "movdqu %[inbuf0], %%xmm1\n\t"
-                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   "pxor   %%xmm4,    %%xmm5\n\t"
                    "pxor   %%xmm5,    %%xmm1\n\t"
                    "movdqu %%xmm5,    %[outbuf0]\n\t"
                    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
@@ -1577,14 +2675,12 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
                    : [l1] "m" (*c->u_mode.ocb.L[1]),
                      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
                    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-                   "movdqu %[inbuf2], %%xmm3\n\t"
-                   "pxor   %%xmm0,    %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                   "pxor   %%xmm4,    %%xmm5\n\t"
                    "pxor   %%xmm5,    %%xmm3\n\t"
                    "movdqu %%xmm5,    %[outbuf2]\n\t"
                    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-                   : [l2] "m" (*c->u_mode.ocb.L[0]),
-                     [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                   : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
                    : "memory" );
       asm volatile ("movdqu %[l3],     %%xmm0\n\t"
                    "movdqu %[inbuf3], %%xmm4\n\t"
@@ -1722,16 +2818,115 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
       abuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+                   "movdqu %[l1], %%xmm12\n\t"
+                   :
+                   : [l0] "m" (*c->u_mode.ocb.L[0]),
+                     [l1] "m" (*c->u_mode.ocb.L[1])
+                   : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+       {
+         n += 4;
+         l = ocb_get_l(c, n);
+
+         /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+         /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+         asm volatile ("movdqu %[abuf0],  %%xmm1\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm1\n\t"
+                       :
+                       : [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[abuf1],  %%xmm2\n\t"
+                       "pxor   %%xmm12,   %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm2\n\t"
+                       :
+                       : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[abuf2],  %%xmm3\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm3\n\t"
+                       :
+                       : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+                       "movdqu %[abuf3],  %%xmm4\n\t"
+                       "pxor   %%xmm0,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm4\n\t"
+                       :
+                       : [l3] "m" (*l),
+                         [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+                       : "memory" );
+
+         n += 4;
+         l = ocb_get_l(c, n);
+
+         asm volatile ("movdqu %[abuf4],  %%xmm8\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm8\n\t"
+                       :
+                       : [abuf4] "m" (*(abuf + 4 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[abuf5],  %%xmm9\n\t"
+                       "pxor   %%xmm12,   %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm9\n\t"
+                       :
+                       : [abuf5] "m" (*(abuf + 5 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[abuf6],  %%xmm10\n\t"
+                       "pxor   %%xmm7,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm10\n\t"
+                       :
+                       : [abuf6] "m" (*(abuf + 6 * BLOCKSIZE))
+                       : "memory" );
+         asm volatile ("movdqu %[l7],     %%xmm0\n\t"
+                       "movdqu %[abuf7],  %%xmm11\n\t"
+                       "pxor   %%xmm0,    %%xmm5\n\t"
+                       "pxor   %%xmm5,    %%xmm11\n\t"
+                       :
+                       : [l7] "m" (*l),
+                         [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
+                       : "memory" );
+
+         do_aesni_enc_vec8 (ctx);
+
+         asm volatile ("pxor   %%xmm2,   %%xmm1\n\t"
+                       "pxor   %%xmm3,   %%xmm1\n\t"
+                       "pxor   %%xmm4,   %%xmm1\n\t"
+                       "pxor   %%xmm8,   %%xmm1\n\t"
+                       "pxor   %%xmm9,   %%xmm6\n\t"
+                       "pxor   %%xmm10,  %%xmm6\n\t"
+                       "pxor   %%xmm11,  %%xmm6\n\t"
+                       "pxor   %%xmm1,   %%xmm6\n\t"
+                       :
+                       :
+                       : "memory" );
+
+         abuf += 8*BLOCKSIZE;
+       }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
                    "movdqu %[abuf0],  %%xmm1\n\t"
-                   "pxor   %%xmm0,    %%xmm5\n\t"
+                   "pxor   %%xmm4,    %%xmm5\n\t"
                    "pxor   %%xmm5,    %%xmm1\n\t"
                    :
                    : [l0] "m" (*c->u_mode.ocb.L[0]),
@@ -1745,9 +2940,8 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                    : [l1] "m" (*c->u_mode.ocb.L[1]),
                      [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
                    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-                   "movdqu %[abuf2],  %%xmm3\n\t"
-                   "pxor   %%xmm0,    %%xmm5\n\t"
+      asm volatile ("movdqu %[abuf2],  %%xmm3\n\t"
+                   "pxor   %%xmm4,    %%xmm5\n\t"
                    "pxor   %%xmm5,    %%xmm3\n\t"
                    :
                    : [l2] "m" (*c->u_mode.ocb.L[0]),