Add ARMv8/AArch64 Crypto Extension implementation of AES
authorJussi Kivilinna <jussi.kivilinna@iki.fi>
Sun, 4 Sep 2016 10:41:02 +0000 (13:41 +0300)
committerJussi Kivilinna <jussi.kivilinna@iki.fi>
Mon, 5 Sep 2016 17:08:48 +0000 (20:08 +0300)
* cipher/Makefile.am: Add 'rijndael-armv8-aarch64-ce.S'.
* cipher/rijndael-armv8-aarch64-ce.S: New.
* cipher/rijndael-internal.h (USE_ARM_CE): Enable for ARMv8/AArch64.
* configure.ac: Add 'rijndael-armv8-aarch64-ce.lo' and
'rijndael-armv8-ce.lo' for ARMv8/AArch64.
--

Improvement vs AArch64 assembly on Cortex-A53:

           AES-128  AES-192  AES-256
CBC enc:    13.19x   13.53x   13.76x
CBC dec:    20.53x   21.91x   22.60x
CFB enc:    14.29x   14.50x   14.63x
CFB dec:    20.42x   21.69x   22.50x
CTR:        18.29x   19.61x   20.53x
OCB enc:    15.21x   16.32x   17.12x
OCB dec:    14.95x   16.11x   16.88x
OCB auth:   16.73x   17.93x   18.66x

Benchmark on Cortex-A53 (1152 MHz):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     21.86 ns/B     43.62 MiB/s     25.19 c/B
        ECB dec |     22.68 ns/B     42.05 MiB/s     26.13 c/B
        CBC enc |     18.66 ns/B     51.10 MiB/s     21.50 c/B
        CBC dec |     18.72 ns/B     50.95 MiB/s     21.56 c/B
        CFB enc |     18.61 ns/B     51.25 MiB/s     21.44 c/B
        CFB dec |     18.61 ns/B     51.25 MiB/s     21.44 c/B
        OFB enc |     22.84 ns/B     41.75 MiB/s     26.31 c/B
        OFB dec |     22.84 ns/B     41.75 MiB/s     26.31 c/B
        CTR enc |     18.89 ns/B     50.50 MiB/s     21.76 c/B
        CTR dec |     18.89 ns/B     50.50 MiB/s     21.76 c/B
        CCM enc |     37.55 ns/B     25.40 MiB/s     43.25 c/B
        CCM dec |     37.55 ns/B     25.40 MiB/s     43.25 c/B
       CCM auth |     18.77 ns/B     50.80 MiB/s     21.63 c/B
        GCM enc |     20.18 ns/B     47.25 MiB/s     23.25 c/B
        GCM dec |     20.18 ns/B     47.25 MiB/s     23.25 c/B
       GCM auth |      1.30 ns/B     732.5 MiB/s      1.50 c/B
        OCB enc |     19.67 ns/B     48.48 MiB/s     22.66 c/B
        OCB dec |     19.73 ns/B     48.34 MiB/s     22.72 c/B
       OCB auth |     19.46 ns/B     49.00 MiB/s     22.42 c/B
                =
 AES192         |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     25.39 ns/B     37.56 MiB/s     29.25 c/B
        ECB dec |     26.15 ns/B     36.47 MiB/s     30.13 c/B
        CBC enc |     22.08 ns/B     43.19 MiB/s     25.44 c/B
        CBC dec |     22.25 ns/B     42.87 MiB/s     25.63 c/B
        CFB enc |     22.03 ns/B     43.30 MiB/s     25.38 c/B
        CFB dec |     22.03 ns/B     43.29 MiB/s     25.38 c/B
        OFB enc |     26.26 ns/B     36.32 MiB/s     30.25 c/B
        OFB dec |     26.26 ns/B     36.32 MiB/s     30.25 c/B
        CTR enc |     22.30 ns/B     42.76 MiB/s     25.69 c/B
        CTR dec |     22.30 ns/B     42.76 MiB/s     25.69 c/B
        CCM enc |     44.38 ns/B     21.49 MiB/s     51.13 c/B
        CCM dec |     44.38 ns/B     21.49 MiB/s     51.13 c/B
       CCM auth |     22.20 ns/B     42.97 MiB/s     25.57 c/B
        GCM enc |     23.60 ns/B     40.41 MiB/s     27.19 c/B
        GCM dec |     23.60 ns/B     40.41 MiB/s     27.19 c/B
       GCM auth |      1.30 ns/B     732.4 MiB/s      1.50 c/B
        OCB enc |     23.09 ns/B     41.31 MiB/s     26.60 c/B
        OCB dec |     23.21 ns/B     41.09 MiB/s     26.74 c/B
       OCB auth |     22.88 ns/B     41.68 MiB/s     26.36 c/B
                =
 AES256         |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     28.76 ns/B     33.17 MiB/s     33.13 c/B
        ECB dec |     29.46 ns/B     32.37 MiB/s     33.94 c/B
        CBC enc |     25.45 ns/B     37.48 MiB/s     29.31 c/B
        CBC dec |     25.50 ns/B     37.40 MiB/s     29.38 c/B
        CFB enc |     25.39 ns/B     37.56 MiB/s     29.25 c/B
        CFB dec |     25.39 ns/B     37.56 MiB/s     29.25 c/B
        OFB enc |     29.62 ns/B     32.19 MiB/s     34.13 c/B
        OFB dec |     29.62 ns/B     32.19 MiB/s     34.13 c/B
        CTR enc |     25.67 ns/B     37.15 MiB/s     29.57 c/B
        CTR dec |     25.67 ns/B     37.15 MiB/s     29.57 c/B
        CCM enc |     51.11 ns/B     18.66 MiB/s     58.88 c/B
        CCM dec |     51.11 ns/B     18.66 MiB/s     58.88 c/B
       CCM auth |     25.56 ns/B     37.32 MiB/s     29.44 c/B
        GCM enc |     26.96 ns/B     35.37 MiB/s     31.06 c/B
        GCM dec |     26.98 ns/B     35.35 MiB/s     31.08 c/B
       GCM auth |      1.30 ns/B     733.4 MiB/s      1.50 c/B
        OCB enc |     26.45 ns/B     36.05 MiB/s     30.47 c/B
        OCB dec |     26.53 ns/B     35.95 MiB/s     30.56 c/B
       OCB auth |     26.24 ns/B     36.34 MiB/s     30.23 c/B
                =

After:
Cipher:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      4.83 ns/B     197.5 MiB/s      5.56 c/B
        ECB dec |      4.99 ns/B     191.1 MiB/s      5.75 c/B
        CBC enc |      1.41 ns/B     675.5 MiB/s      1.63 c/B
        CBC dec |     0.911 ns/B    1046.9 MiB/s      1.05 c/B
        CFB enc |      1.30 ns/B     732.2 MiB/s      1.50 c/B
        CFB dec |     0.911 ns/B    1046.7 MiB/s      1.05 c/B
        OFB enc |      5.81 ns/B     164.3 MiB/s      6.69 c/B
        OFB dec |      5.81 ns/B     164.3 MiB/s      6.69 c/B
        CTR enc |      1.03 ns/B     924.0 MiB/s      1.19 c/B
        CTR dec |      1.03 ns/B     924.1 MiB/s      1.19 c/B
        CCM enc |      2.50 ns/B     381.8 MiB/s      2.88 c/B
        CCM dec |      2.50 ns/B     381.7 MiB/s      2.88 c/B
       CCM auth |      1.57 ns/B     606.1 MiB/s      1.81 c/B
        GCM enc |      2.33 ns/B     408.5 MiB/s      2.69 c/B
        GCM dec |      2.34 ns/B     408.4 MiB/s      2.69 c/B
       GCM auth |      1.30 ns/B     732.1 MiB/s      1.50 c/B
        OCB enc |      1.29 ns/B     736.6 MiB/s      1.49 c/B
        OCB dec |      1.32 ns/B     724.4 MiB/s      1.52 c/B
       OCB auth |      1.16 ns/B     819.6 MiB/s      1.34 c/B
                =
 AES192         |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      5.48 ns/B     174.0 MiB/s      6.31 c/B
        ECB dec |      5.64 ns/B     169.0 MiB/s      6.50 c/B
        CBC enc |      1.63 ns/B     585.8 MiB/s      1.88 c/B
        CBC dec |      1.02 ns/B     935.8 MiB/s      1.17 c/B
        CFB enc |      1.52 ns/B     627.7 MiB/s      1.75 c/B
        CFB dec |      1.02 ns/B     935.9 MiB/s      1.17 c/B
        OFB enc |      6.46 ns/B     147.7 MiB/s      7.44 c/B
        OFB dec |      6.46 ns/B     147.7 MiB/s      7.44 c/B
        CTR enc |      1.14 ns/B     836.1 MiB/s      1.31 c/B
        CTR dec |      1.14 ns/B     835.9 MiB/s      1.31 c/B
        CCM enc |      2.83 ns/B     337.6 MiB/s      3.25 c/B
        CCM dec |      2.82 ns/B     338.0 MiB/s      3.25 c/B
       CCM auth |      1.79 ns/B     532.7 MiB/s      2.06 c/B
        GCM enc |      2.44 ns/B     390.3 MiB/s      2.82 c/B
        GCM dec |      2.44 ns/B     390.2 MiB/s      2.82 c/B
       GCM auth |      1.30 ns/B     731.9 MiB/s      1.50 c/B
        OCB enc |      1.41 ns/B     674.7 MiB/s      1.63 c/B
        OCB dec |      1.44 ns/B     662.0 MiB/s      1.66 c/B
       OCB auth |      1.28 ns/B     746.1 MiB/s      1.47 c/B
                =
 AES256         |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      6.13 ns/B     155.5 MiB/s      7.06 c/B
        ECB dec |      6.29 ns/B     151.5 MiB/s      7.25 c/B
        CBC enc |      1.85 ns/B     516.8 MiB/s      2.13 c/B
        CBC dec |      1.13 ns/B     845.6 MiB/s      1.30 c/B
        CFB enc |      1.74 ns/B     549.5 MiB/s      2.00 c/B
        CFB dec |      1.13 ns/B     846.1 MiB/s      1.30 c/B
        OFB enc |      7.11 ns/B     134.2 MiB/s      8.19 c/B
        OFB dec |      7.11 ns/B     134.2 MiB/s      8.19 c/B
        CTR enc |      1.25 ns/B     763.5 MiB/s      1.44 c/B
        CTR dec |      1.25 ns/B     763.4 MiB/s      1.44 c/B
        CCM enc |      3.15 ns/B     302.9 MiB/s      3.63 c/B
        CCM dec |      3.15 ns/B     302.9 MiB/s      3.63 c/B
       CCM auth |      2.01 ns/B     474.2 MiB/s      2.32 c/B
        GCM enc |      2.55 ns/B     374.2 MiB/s      2.94 c/B
        GCM dec |      2.55 ns/B     373.7 MiB/s      2.94 c/B
       GCM auth |      1.30 ns/B     732.2 MiB/s      1.50 c/B
        OCB enc |      1.54 ns/B     617.6 MiB/s      1.78 c/B
        OCB dec |      1.57 ns/B     606.8 MiB/s      1.81 c/B
       OCB auth |      1.40 ns/B     679.8 MiB/s      1.62 c/B
                =

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/Makefile.am
cipher/rijndael-armv8-aarch32-ce.S
cipher/rijndael-armv8-aarch64-ce.S [new file with mode: 0644]
cipher/rijndael-internal.h
configure.ac

index c31b233..db606ca 100644 (file)
@@ -81,7 +81,7 @@ md5.c \
 poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \
 rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \
   rijndael-padlock.c rijndael-amd64.S rijndael-arm.S rijndael-ssse3-amd64.c \
-  rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \
+  rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S rijndael-armv8-aarch64-ce.S \
   rijndael-aarch64.S \
 rmd160.c \
 rsa.c \
index f3b5400..bf68f20 100644 (file)
@@ -1,4 +1,4 @@
-/* ARMv8 CE accelerated AES
+/* rijndael-armv8-aarch32-ce.S - ARMv8/CE accelerated AES
  * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S
new file mode 100644 (file)
index 0000000..21d0aec
--- /dev/null
@@ -0,0 +1,1265 @@
+/* rijndael-armv8-aarch64-ce.S - ARMv8/CE accelerated AES
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+.arch armv8-a+crypto
+
+.text
+
+
+#if (SIZEOF_VOID_P == 4)
+  #define ptr8   w8
+  #define ptr9   w9
+  #define ptr10  w10
+  #define ptr11  w11
+  #define ptr_sz 4
+#elif (SIZEOF_VOID_P == 8)
+  #define ptr8   x8
+  #define ptr9   x9
+  #define ptr10  x10
+  #define ptr11  x11
+  #define ptr_sz 8
+#else
+  #error "missing SIZEOF_VOID_P"
+#endif
+
+
+#define GET_DATA_POINTER(reg, name) \
+       adrp    reg, :got:name ; \
+       ldr     reg, [reg, #:got_lo12:name] ;
+
+
+/* Register macros */
+
+#define vk0 v17
+#define vk1 v18
+#define vk2 v19
+#define vk3 v20
+#define vk4 v21
+#define vk5 v22
+#define vk6 v23
+#define vk7 v24
+#define vk8 v25
+#define vk9 v26
+#define vk10 v27
+#define vk11 v28
+#define vk12 v29
+#define vk13 v30
+#define vk14 v31
+
+
+/* AES macros */
+
+#define aes_preload_keys(keysched, nrounds) \
+       cmp nrounds, #12; \
+       ld1 {vk0.16b-vk3.16b}, [keysched], #64; \
+       ld1 {vk4.16b-vk7.16b}, [keysched], #64; \
+       ld1 {vk8.16b-vk10.16b}, [keysched], #48; \
+       b.lo 1f; \
+       ld1 {vk11.16b-vk12.16b}, [keysched], #32; \
+       b.eq 1f; \
+       ld1 {vk13.16b-vk14.16b}, [keysched]; \
+1:     ;
+
+#define do_aes_one128(ed, mcimc, vo, vb) \
+       aes##ed    vb.16b, vk0.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk1.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk2.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk3.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk4.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk5.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk6.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk7.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk8.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk9.16b; \
+       eor        vo.16b, vb.16b, vk10.16b;
+
+#define do_aes_one192(ed, mcimc, vo, vb) \
+       aes##ed    vb.16b, vk0.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk1.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk2.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk3.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk4.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk5.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk6.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk7.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk8.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk9.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk10.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk11.16b; \
+       eor        vo.16b, vb.16b, vk12.16b;
+
+#define do_aes_one256(ed, mcimc, vo, vb) \
+       aes##ed    vb.16b, vk0.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk1.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk2.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk3.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk4.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk5.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk6.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk7.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk8.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk9.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk10.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk11.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk12.16b; \
+       aes##mcimc vb.16b, vb.16b; \
+       aes##ed    vb.16b, vk13.16b; \
+       eor        vo.16b, vb.16b, vk14.16b;
+
+#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \
+       aes##ed    b0.16b, key.16b; \
+       aes##mcimc b0.16b, b0.16b; \
+         aes##ed    b1.16b, key.16b; \
+         aes##mcimc b1.16b, b1.16b; \
+           aes##ed    b2.16b, key.16b; \
+           aes##mcimc b2.16b, b2.16b; \
+             aes##ed    b3.16b, key.16b; \
+             aes##mcimc b3.16b, b3.16b;
+
+#define aes_lastround_4(ed, b0, b1, b2, b3, key1, key2) \
+       aes##ed    b0.16b, key1.16b; \
+       eor        b0.16b, b0.16b, key2.16b; \
+         aes##ed    b1.16b, key1.16b; \
+         eor        b1.16b, b1.16b, key2.16b; \
+           aes##ed    b2.16b, key1.16b; \
+           eor        b2.16b, b2.16b, key2.16b; \
+             aes##ed    b3.16b, key1.16b; \
+             eor        b3.16b, b3.16b, key2.16b;
+
+#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
+       aes_lastround_4(ed, b0, b1, b2, b3, vk9, vk10);
+
+#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \
+       aes_lastround_4(ed, b0, b1, b2, b3, vk11, vk12);
+
+#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk11); \
+       aes_round_4(ed, mcimc, b0, b1, b2, b3, vk12); \
+       aes_lastround_4(ed, b0, b1, b2, b3, vk13, vk14);
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
+
+#define aes_clear_keys(nrounds) \
+       cmp nrounds, #12; \
+       CLEAR_REG(vk0); \
+       CLEAR_REG(vk1); \
+       CLEAR_REG(vk2); \
+       CLEAR_REG(vk3); \
+       CLEAR_REG(vk4); \
+       CLEAR_REG(vk5); \
+       CLEAR_REG(vk6); \
+       CLEAR_REG(vk7); \
+       CLEAR_REG(vk9); \
+       CLEAR_REG(vk8); \
+       CLEAR_REG(vk10); \
+       b.lo 1f; \
+       CLEAR_REG(vk11); \
+       CLEAR_REG(vk12); \
+       b.eq 1f; \
+       CLEAR_REG(vk13); \
+       CLEAR_REG(vk14); \
+1:     ;
+
+
+/*
+ * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst,
+ *                                     const byte *src,
+ *                                     unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_enc_armv8_ce
+.type  _gcry_aes_enc_armv8_ce,%function;
+_gcry_aes_enc_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: dst
+   *    x2: src
+   *    w3: nrounds
+   */
+
+  aes_preload_keys(x0, w3);
+
+  ld1 {v0.16b}, [x2]
+
+  b.hi .Lenc1_256
+  b.eq .Lenc1_192
+
+.Lenc1_128:
+  do_aes_one128(e, mc, v0, v0);
+
+.Lenc1_tail:
+  CLEAR_REG(vk0)
+  CLEAR_REG(vk1)
+  CLEAR_REG(vk2)
+  CLEAR_REG(vk3)
+  CLEAR_REG(vk4)
+  CLEAR_REG(vk5)
+  CLEAR_REG(vk6)
+  CLEAR_REG(vk7)
+  CLEAR_REG(vk8)
+  CLEAR_REG(vk9)
+  CLEAR_REG(vk10)
+  st1 {v0.16b}, [x1]
+  CLEAR_REG(v0)
+
+  mov x0, #0
+  ret
+
+.Lenc1_192:
+  do_aes_one192(e, mc, v0, v0);
+
+  CLEAR_REG(vk11)
+  CLEAR_REG(vk12)
+  b .Lenc1_tail
+
+.Lenc1_256:
+  do_aes_one256(e, mc, v0, v0);
+
+  CLEAR_REG(vk11)
+  CLEAR_REG(vk12)
+  CLEAR_REG(vk13)
+  CLEAR_REG(vk14)
+  b .Lenc1_tail
+.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;
+
+
+/*
+ * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst,
+ *                                     const byte *src,
+ *                                     unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_dec_armv8_ce
+.type  _gcry_aes_dec_armv8_ce,%function;
+_gcry_aes_dec_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: dst
+   *    x2: src
+   *    w3: nrounds
+   */
+
+  aes_preload_keys(x0, w3);
+
+  ld1 {v0.16b}, [x2]
+
+  b.hi .Ldec1_256
+  b.eq .Ldec1_192
+
+.Ldec1_128:
+  do_aes_one128(d, imc, v0, v0);
+
+.Ldec1_tail:
+  CLEAR_REG(vk0)
+  CLEAR_REG(vk1)
+  CLEAR_REG(vk2)
+  CLEAR_REG(vk3)
+  CLEAR_REG(vk4)
+  CLEAR_REG(vk5)
+  CLEAR_REG(vk6)
+  CLEAR_REG(vk7)
+  CLEAR_REG(vk8)
+  CLEAR_REG(vk9)
+  CLEAR_REG(vk10)
+  st1 {v0.16b}, [x1]
+  CLEAR_REG(v0)
+
+  mov x0, #0
+  ret
+
+.Ldec1_192:
+  do_aes_one192(d, imc, v0, v0);
+
+  CLEAR_REG(vk11)
+  CLEAR_REG(vk12)
+  b .Ldec1_tail
+
+.Ldec1_256:
+  do_aes_one256(d, imc, v0, v0);
+
+  CLEAR_REG(vk11)
+  CLEAR_REG(vk12)
+  CLEAR_REG(vk13)
+  CLEAR_REG(vk14)
+  b .Ldec1_tail
+.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  unsigned char *iv, size_t nblocks,
+ *                                  int cbc_mac, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_enc_armv8_ce
+.type  _gcry_aes_cbc_enc_armv8_ce,%function;
+_gcry_aes_cbc_enc_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: iv
+   *    x4: nblocks
+   *    w5: cbc_mac
+   *    w6: nrounds
+   */
+
+  cbz x4, .Lcbc_enc_skip
+
+  cmp w5, #0
+  ld1 {v1.16b}, [x3] /* load IV */
+  cset x5, eq
+
+  aes_preload_keys(x0, w6);
+  lsl x5, x5, #4
+
+  b.eq .Lcbc_enc_loop192
+  b.hi .Lcbc_enc_loop256
+
+#define CBC_ENC(bits) \
+  .Lcbc_enc_loop##bits: \
+    ld1 {v0.16b}, [x2], #16; /* load plaintext */ \
+    eor v1.16b, v0.16b, v1.16b; \
+    sub x4, x4, #1; \
+    \
+    do_aes_one##bits(e, mc, v1, v1); \
+    \
+    st1 {v1.16b}, [x1], x5; /* store ciphertext */ \
+    \
+    cbnz x4, .Lcbc_enc_loop##bits; \
+    b .Lcbc_enc_done;
+
+  CBC_ENC(128)
+  CBC_ENC(192)
+  CBC_ENC(256)
+
+#undef CBC_ENC
+
+.Lcbc_enc_done:
+  aes_clear_keys(w6)
+
+  st1 {v1.16b}, [x3] /* store IV */
+
+  CLEAR_REG(v1)
+  CLEAR_REG(v0)
+
+.Lcbc_enc_skip:
+  ret
+.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;
+
+/*
+ * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_dec_armv8_ce
+.type  _gcry_aes_cbc_dec_armv8_ce,%function;
+_gcry_aes_cbc_dec_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: iv
+   *    x4: nblocks
+   *    w5: nrounds
+   */
+
+  cbz x4, .Lcbc_dec_skip
+
+  ld1 {v0.16b}, [x3] /* load IV */
+
+  aes_preload_keys(x0, w5);
+
+  b.eq .Lcbc_dec_entry_192
+  b.hi .Lcbc_dec_entry_256
+
+#define CBC_DEC(bits) \
+  .Lcbc_dec_entry_##bits: \
+    cmp x4, #4; \
+    b.lo .Lcbc_dec_loop_##bits; \
+    \
+  .Lcbc_dec_loop4_##bits: \
+    \
+    ld1 {v1.16b-v4.16b}, [x2], #64; /* load ciphertext */ \
+    sub x4, x4, #4; \
+    mov v5.16b, v1.16b; \
+    mov v6.16b, v2.16b; \
+    mov v7.16b, v3.16b; \
+    mov v16.16b, v4.16b; \
+    cmp x4, #4; \
+    \
+    do_aes_4_##bits(d, imc, v1, v2, v3, v4); \
+    \
+    eor v1.16b, v1.16b, v0.16b; \
+    eor v2.16b, v2.16b, v5.16b; \
+    st1 {v1.16b-v2.16b}, [x1], #32; /* store plaintext */ \
+    eor v3.16b, v3.16b, v6.16b; \
+    eor v4.16b, v4.16b, v7.16b; \
+    mov v0.16b, v16.16b; /* next IV */ \
+    st1 {v3.16b-v4.16b}, [x1], #32; /* store plaintext */ \
+    \
+    b.hs .Lcbc_dec_loop4_##bits; \
+    CLEAR_REG(v3); \
+    CLEAR_REG(v4); \
+    CLEAR_REG(v5); \
+    CLEAR_REG(v6); \
+    CLEAR_REG(v7); \
+    CLEAR_REG(v16); \
+    cbz x4, .Lcbc_dec_done; \
+    \
+  .Lcbc_dec_loop_##bits: \
+    ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+    sub x4, x4, #1; \
+    mov v2.16b, v1.16b; \
+    \
+    do_aes_one##bits(d, imc, v1, v1); \
+    \
+    eor v1.16b, v1.16b, v0.16b; \
+    mov v0.16b, v2.16b; \
+    st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+    \
+    cbnz x4, .Lcbc_dec_loop_##bits; \
+    b .Lcbc_dec_done;
+
+  CBC_DEC(128)
+  CBC_DEC(192)
+  CBC_DEC(256)
+
+#undef CBC_DEC
+
+.Lcbc_dec_done:
+  aes_clear_keys(w5)
+
+  st1 {v0.16b}, [x3] /* store IV */
+
+  CLEAR_REG(v0)
+  CLEAR_REG(v1)
+  CLEAR_REG(v2)
+
+.Lcbc_dec_skip:
+  ret
+.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ctr_enc_armv8_ce
+.type  _gcry_aes_ctr_enc_armv8_ce,%function;
+_gcry_aes_ctr_enc_armv8_ce:
+  /* input:
+   *    r0: keysched
+   *    r1: outbuf
+   *    r2: inbuf
+   *    r3: iv
+   *    x4: nblocks
+   *    w5: nrounds
+   */
+
+  cbz x4, .Lctr_enc_skip
+
+  mov x6, #1
+  movi v16.16b, #0
+  mov v16.D[1], x6
+
+  /* load IV */
+  ldp x9, x10, [x3]
+  ld1 {v0.16b}, [x3]
+  rev x9, x9
+  rev x10, x10
+
+  aes_preload_keys(x0, w5);
+
+  b.eq .Lctr_enc_entry_192
+  b.hi .Lctr_enc_entry_256
+
+#define CTR_ENC(bits) \
+  .Lctr_enc_entry_##bits: \
+    cmp x4, #4; \
+    b.lo .Lctr_enc_loop_##bits; \
+    \
+  .Lctr_enc_loop4_##bits: \
+    cmp x10, #0xfffffffffffffffc; \
+    sub x4, x4, #4; \
+    b.lo .Lctr_enc_loop4_##bits##_nocarry; \
+    \
+    adds x10, x10, #1; \
+    mov v1.16b, v0.16b; \
+    adc x9, x9, xzr; \
+    mov v2.D[1], x10; \
+    mov v2.D[0], x9; \
+    \
+    adds x10, x10, #1; \
+    rev64 v2.16b, v2.16b; \
+    adc x9, x9, xzr; \
+    mov v3.D[1], x10; \
+    mov v3.D[0], x9; \
+    \
+    adds x10, x10, #1; \
+    rev64 v3.16b, v3.16b; \
+    adc x9, x9, xzr; \
+    mov v4.D[1], x10; \
+    mov v4.D[0], x9; \
+    \
+    adds x10, x10, #1; \
+    rev64 v4.16b, v4.16b; \
+    adc x9, x9, xzr; \
+    mov v0.D[1], x10; \
+    mov v0.D[0], x9; \
+    rev64 v0.16b, v0.16b; \
+    \
+    b .Lctr_enc_loop4_##bits##_store_ctr; \
+    \
+  .Lctr_enc_loop4_##bits##_nocarry: \
+    \
+    add v3.2d, v16.2d, v16.2d; /* 2 */ \
+    rev64 v6.16b, v0.16b; \
+    add x10, x10, #4; \
+    add v4.2d, v3.2d, v16.2d;  /* 3 */ \
+    add v0.2d, v3.2d, v3.2d;   /* 4 */ \
+    rev64 v1.16b, v6.16b; \
+    add v2.2d, v6.2d, v16.2d; \
+    add v3.2d, v6.2d, v3.2d; \
+    add v4.2d, v6.2d, v4.2d; \
+    add v0.2d, v6.2d, v0.2d; \
+    rev64 v2.16b, v2.16b; \
+    rev64 v3.16b, v3.16b; \
+    rev64 v0.16b, v0.16b; \
+    rev64 v4.16b, v4.16b; \
+    \
+  .Lctr_enc_loop4_##bits##_store_ctr: \
+    \
+    st1 {v0.16b}, [x3]; \
+    cmp x4, #4; \
+    ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \
+    \
+    do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+    \
+    eor v1.16b, v1.16b, v5.16b; \
+    ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \
+    eor v2.16b, v2.16b, v6.16b; \
+    eor v3.16b, v3.16b, v7.16b; \
+    eor v4.16b, v4.16b, v5.16b; \
+    st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
+    \
+    b.hs .Lctr_enc_loop4_##bits; \
+    CLEAR_REG(v3); \
+    CLEAR_REG(v4); \
+    CLEAR_REG(v5); \
+    CLEAR_REG(v6); \
+    CLEAR_REG(v7); \
+    cbz x4, .Lctr_enc_done; \
+    \
+  .Lctr_enc_loop_##bits: \
+    \
+    adds x10, x10, #1; \
+    mov v1.16b, v0.16b; \
+    adc x9, x9, xzr; \
+    mov v0.D[1], x10; \
+    mov v0.D[0], x9; \
+    sub x4, x4, #1; \
+    ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \
+    rev64 v0.16b, v0.16b; \
+    \
+    do_aes_one##bits(e, mc, v1, v1); \
+    \
+    eor v1.16b, v2.16b, v1.16b; \
+    st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+    \
+    cbnz x4, .Lctr_enc_loop_##bits; \
+    b .Lctr_enc_done;
+
+  CTR_ENC(128)
+  CTR_ENC(192)
+  CTR_ENC(256)
+
+#undef CTR_ENC
+
+.Lctr_enc_done:
+  aes_clear_keys(w5)
+
+  st1 {v0.16b}, [x3] /* store IV */
+
+  CLEAR_REG(v0)
+  CLEAR_REG(v1)
+  CLEAR_REG(v2)
+
+.Lctr_enc_skip:
+  ret
+
+.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_enc_armv8_ce
+.type  _gcry_aes_cfb_enc_armv8_ce,%function;
+_gcry_aes_cfb_enc_armv8_ce:
+  /* input:
+   *    r0: keysched
+   *    r1: outbuf
+   *    r2: inbuf
+   *    r3: iv
+   *    x4: nblocks
+   *    w5: nrounds
+   */
+
+  cbz x4, .Lcfb_enc_skip
+
+  /* load IV */
+  ld1 {v0.16b}, [x3]
+
+  aes_preload_keys(x0, w5);
+
+  b.eq .Lcfb_enc_entry_192
+  b.hi .Lcfb_enc_entry_256
+
+#define CFB_ENC(bits) \
+  .Lcfb_enc_entry_##bits: \
+  .Lcfb_enc_loop_##bits: \
+    ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
+    sub x4, x4, #1; \
+    \
+    do_aes_one##bits(e, mc, v0, v0); \
+    \
+    eor v0.16b, v1.16b, v0.16b; \
+    st1 {v0.16b}, [x1], #16; /* store ciphertext */ \
+    \
+    cbnz x4, .Lcfb_enc_loop_##bits; \
+    b .Lcfb_enc_done;
+
+  CFB_ENC(128)
+  CFB_ENC(192)
+  CFB_ENC(256)
+
+#undef CFB_ENC
+
+.Lcfb_enc_done:
+  aes_clear_keys(w5)
+
+  st1 {v0.16b}, [x3] /* store IV */
+
+  CLEAR_REG(v0)
+  CLEAR_REG(v1)
+
+.Lcfb_enc_skip:
+  ret
+.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_dec_armv8_ce
+.type  _gcry_aes_cfb_dec_armv8_ce,%function;
+_gcry_aes_cfb_dec_armv8_ce:
+  /* input:
+   *    r0: keysched
+   *    r1: outbuf
+   *    r2: inbuf
+   *    r3: iv
+   *    x4: nblocks
+   *    w5: nrounds
+   */
+
+  cbz x4, .Lcfb_dec_skip
+
+  /* load IV */
+  ld1 {v0.16b}, [x3]
+
+  aes_preload_keys(x0, w5);
+
+  b.eq .Lcfb_dec_entry_192
+  b.hi .Lcfb_dec_entry_256
+
+#define CFB_DEC(bits) \
+  .Lcfb_dec_entry_##bits: \
+    cmp x4, #4; \
+    b.lo .Lcfb_dec_loop_##bits; \
+    \
+  .Lcfb_dec_loop4_##bits: \
+    \
+    ld1 {v2.16b-v4.16b}, [x2], #48; /* load ciphertext */ \
+    mov v1.16b, v0.16b; \
+    sub x4, x4, #4; \
+    cmp x4, #4; \
+    mov v5.16b, v2.16b; \
+    mov v6.16b, v3.16b; \
+    mov v7.16b, v4.16b; \
+    ld1 {v0.16b}, [x2], #16; /* load next IV / ciphertext */ \
+    \
+    do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+    \
+    eor v1.16b, v1.16b, v5.16b; \
+    eor v2.16b, v2.16b, v6.16b; \
+    eor v3.16b, v3.16b, v7.16b; \
+    eor v4.16b, v4.16b, v0.16b; \
+    st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
+    \
+    b.hs .Lcfb_dec_loop4_##bits; \
+    CLEAR_REG(v3); \
+    CLEAR_REG(v4); \
+    CLEAR_REG(v5); \
+    CLEAR_REG(v6); \
+    CLEAR_REG(v7); \
+    cbz x4, .Lcfb_dec_done; \
+    \
+  .Lcfb_dec_loop_##bits: \
+    \
+    ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+    \
+    sub x4, x4, #1; \
+    \
+    do_aes_one##bits(e, mc, v0, v0); \
+    \
+    eor v2.16b, v1.16b, v0.16b; \
+    mov v0.16b, v1.16b; \
+    st1 {v2.16b}, [x1], #16; /* store plaintext */ \
+    \
+    cbnz x4, .Lcfb_dec_loop_##bits; \
+    b .Lcfb_dec_done;
+
+  CFB_DEC(128)
+  CFB_DEC(192)
+  CFB_DEC(256)
+
+#undef CFB_DEC
+
+.Lcfb_dec_done:
+  aes_clear_keys(w5)
+
+  st1 {v0.16b}, [x3] /* store IV */
+
+  CLEAR_REG(v0)
+  CLEAR_REG(v1)
+  CLEAR_REG(v2)
+
+.Lcfb_dec_skip:
+  ret
+.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  unsigned char *offset,
+ *                                  unsigned char *checksum,
+ *                                  void **Ls,
+ *                                  size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_enc_armv8_ce
+.type  _gcry_aes_ocb_enc_armv8_ce,%function;
+_gcry_aes_ocb_enc_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: offset
+   *    x4: checksum
+   *    x5: Ls
+   *    x6: nblocks (0 < nblocks <= 32)
+   *    w7: nrounds
+   */
+
+  ld1 {v0.16b}, [x3] /* load offset */
+  ld1 {v16.16b}, [x4] /* load checksum */
+
+  aes_preload_keys(x0, w7); /* NOTE: also leaves flags tested by the branches below */
+
+  b.eq .Locb_enc_entry_192 /* AES-192 path */
+  b.hi .Locb_enc_entry_256 /* AES-256 path; else fall through to AES-128 */
+
+/* Emit the OCB-encrypt body for one key size: a 4-blocks-at-a-time main
+ * loop while at least four blocks remain, then a single-block tail loop.
+ * (No variadic parameter — matches OCB_DEC/OCB_AUTH; it was unused.) */
+#define OCB_ENC(bits) \
+  .Locb_enc_entry_##bits: \
+    cmp x6, #4; \
+    b.lo .Locb_enc_loop_##bits; \
+    \
+  .Locb_enc_loop4_##bits: \
+    \
+    /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+    /* Checksum_i = Checksum_{i-1} xor P_i  */ \
+    /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */ \
+    \
+    ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+    \
+    ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \
+    ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
+    sub x6, x6, #4; \
+    \
+    ld1 {v5.16b}, [x8];               /* load L_{ntz(i+0)} */ \
+    eor v16.16b, v16.16b, v1.16b;     /* Checksum_i+0 */ \
+    ld1 {v6.16b}, [x9];               /* load L_{ntz(i+1)} */ \
+    eor v16.16b, v16.16b, v2.16b;     /* Checksum_i+1 */ \
+    ld1 {v7.16b}, [x10];              /* load L_{ntz(i+2)} */ \
+    eor v16.16b, v16.16b, v3.16b;     /* Checksum_i+2 */ \
+    eor v5.16b, v5.16b, v0.16b;       /* Offset_i+0 */ \
+    ld1 {v0.16b}, [x11];              /* load L_{ntz(i+3)} */ \
+    eor v16.16b, v16.16b, v4.16b;     /* Checksum_i+3 */ \
+    eor v6.16b, v6.16b, v5.16b;       /* Offset_i+1 */ \
+    eor v1.16b, v1.16b, v5.16b;       /* P_i+0 xor Offset_i+0 */ \
+    eor v7.16b, v7.16b, v6.16b;       /* Offset_i+2 */ \
+    eor v2.16b, v2.16b, v6.16b;       /* P_i+1 xor Offset_i+1 */ \
+    eor v0.16b, v0.16b, v7.16b;       /* Offset_i+3 */ \
+    cmp x6, #4; \
+    eor v3.16b, v3.16b, v7.16b;       /* P_i+2 xor Offset_i+2 */ \
+    eor v4.16b, v4.16b, v0.16b;       /* P_i+3 xor Offset_i+3 */ \
+    \
+    do_aes_4_##bits(e, mc, v1, v2, v3, v4); /* encrypt four blocks */ \
+    \
+    eor v1.16b, v1.16b, v5.16b;       /* xor Offset_i+0 */ \
+    eor v2.16b, v2.16b, v6.16b;       /* xor Offset_i+1 */ \
+    eor v3.16b, v3.16b, v7.16b;       /* xor Offset_i+2 */ \
+    eor v4.16b, v4.16b, v0.16b;       /* xor Offset_i+3 */ \
+    st1 {v1.16b-v4.16b}, [x1], #64;   /* store C_i+<0-3> */ \
+    \
+    b.hs .Locb_enc_loop4_##bits;      /* loop while x6 >= 4 */ \
+    CLEAR_REG(v3); \
+    CLEAR_REG(v4); \
+    CLEAR_REG(v5); \
+    CLEAR_REG(v6); \
+    CLEAR_REG(v7); \
+    cbz x6, .Locb_enc_done; \
+    \
+  .Locb_enc_loop_##bits: \
+    \
+    /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+    /* Checksum_i = Checksum_{i-1} xor P_i  */ \
+    /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */ \
+    \
+    ldr ptr8, [x5], #(ptr_sz); \
+    ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
+    ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
+    sub x6, x6, #1; \
+    eor v0.16b, v0.16b, v2.16b; /* Offset_i */ \
+    eor v16.16b, v16.16b, v1.16b; /* Checksum_i */ \
+    eor v1.16b, v1.16b, v0.16b; /* P_i xor Offset_i */ \
+    \
+    do_aes_one##bits(e, mc, v1, v1); \
+    \
+    eor v1.16b, v1.16b, v0.16b; /* C_i */ \
+    st1 {v1.16b}, [x1], #16; /* store ciphertext */ \
+    \
+    cbnz x6, .Locb_enc_loop_##bits; \
+    b .Locb_enc_done;
+
+  OCB_ENC(128)
+  OCB_ENC(192)
+  OCB_ENC(256)
+
+#undef OCB_ENC
+
+.Locb_enc_done:
+  aes_clear_keys(w7)
+
+  st1 {v16.16b}, [x4] /* store checksum */
+  st1 {v0.16b}, [x3] /* store offset */
+
+  CLEAR_REG(v0)
+  CLEAR_REG(v1)
+  CLEAR_REG(v2)
+  CLEAR_REG(v16)
+
+  ret
+.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  unsigned char *offset,
+ *                                  unsigned char *checksum,
+ *                                  void **Ls,
+ *                                  size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_dec_armv8_ce
+.type  _gcry_aes_ocb_dec_armv8_ce,%function;
+_gcry_aes_ocb_dec_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: offset
+   *    x4: checksum
+   *    x5: Ls
+   *    x6: nblocks (0 < nblocks <= 32)
+   *    w7: nrounds
+   */
+
+  ld1 {v0.16b}, [x3] /* load offset */
+  ld1 {v16.16b}, [x4] /* load checksum */
+
+  aes_preload_keys(x0, w7); /* NOTE: also leaves flags tested by the branches below */
+
+  b.eq .Locb_dec_entry_192 /* AES-192 path */
+  b.hi .Locb_dec_entry_256 /* AES-256 path; else fall through to AES-128 */
+
+#define OCB_DEC(bits) \
+  .Locb_dec_entry_##bits: \
+    cmp x6, #4; \
+    b.lo .Locb_dec_loop_##bits; \
+    \
+  .Locb_dec_loop4_##bits: \
+    \
+    /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+    /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */ \
+    /* Checksum_i = Checksum_{i-1} xor P_i  */ \
+    \
+    ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+    \
+    ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \
+    ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
+    sub x6, x6, #4; \
+    \
+    ld1 {v5.16b}, [x8];               /* load L_{ntz(i+0)} */ \
+    ld1 {v6.16b}, [x9];               /* load L_{ntz(i+1)} */ \
+    ld1 {v7.16b}, [x10];              /* load L_{ntz(i+2)} */ \
+    eor v5.16b, v5.16b, v0.16b;       /* Offset_i+0 */ \
+    ld1 {v0.16b}, [x11];              /* load L_{ntz(i+3)} */ \
+    eor v6.16b, v6.16b, v5.16b;       /* Offset_i+1 */ \
+    eor v1.16b, v1.16b, v5.16b;       /* C_i+0 xor Offset_i+0 */ \
+    eor v7.16b, v7.16b, v6.16b;       /* Offset_i+2 */ \
+    eor v2.16b, v2.16b, v6.16b;       /* C_i+1 xor Offset_i+1 */ \
+    eor v0.16b, v0.16b, v7.16b;       /* Offset_i+3 */ \
+    cmp x6, #4; \
+    eor v3.16b, v3.16b, v7.16b;       /* C_i+2 xor Offset_i+2 */ \
+    eor v4.16b, v4.16b, v0.16b;       /* C_i+3 xor Offset_i+3 */ \
+    \
+    do_aes_4_##bits(d, imc, v1, v2, v3, v4); /* decrypt four blocks */ \
+    \
+    eor v1.16b, v1.16b, v5.16b;       /* xor Offset_i+0 */ \
+    eor v2.16b, v2.16b, v6.16b;       /* xor Offset_i+1 */ \
+    eor v16.16b, v16.16b, v1.16b;     /* Checksum_i+0 */ \
+    eor v3.16b, v3.16b, v7.16b;       /* xor Offset_i+2 */ \
+    eor v16.16b, v16.16b, v2.16b;     /* Checksum_i+1 */ \
+    eor v4.16b, v4.16b, v0.16b;       /* xor Offset_i+3 */ \
+    eor v16.16b, v16.16b, v3.16b;     /* Checksum_i+2 */ \
+    eor v16.16b, v16.16b, v4.16b;     /* Checksum_i+3 */ \
+    st1 {v1.16b-v4.16b}, [x1], #64;   /* store P_i+<0-3> */ \
+    \
+    b.hs .Locb_dec_loop4_##bits;      /* loop while x6 >= 4 */ \
+    CLEAR_REG(v3); \
+    CLEAR_REG(v4); \
+    CLEAR_REG(v5); \
+    CLEAR_REG(v6); \
+    CLEAR_REG(v7); \
+    cbz x6, .Locb_dec_done; \
+    \
+  .Locb_dec_loop_##bits: \
+    \
+    /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+    /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */ \
+    /* Checksum_i = Checksum_{i-1} xor P_i  */ \
+    \
+    ldr ptr8, [x5], #(ptr_sz); \
+    ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+    ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
+    sub x6, x6, #1; \
+    eor v0.16b, v0.16b, v2.16b; /* Offset_i */ \
+    eor v1.16b, v1.16b, v0.16b; /* C_i xor Offset_i */ \
+    \
+    do_aes_one##bits(d, imc, v1, v1) \
+    \
+    eor v1.16b, v1.16b, v0.16b; /* P_i */ \
+    st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+    eor v16.16b, v16.16b, v1.16b; /* Checksum_i */ \
+    \
+    cbnz x6, .Locb_dec_loop_##bits; \
+    b .Locb_dec_done;
+
+  OCB_DEC(128)
+  OCB_DEC(192)
+  OCB_DEC(256)
+
+#undef OCB_DEC
+
+.Locb_dec_done:
+  aes_clear_keys(w7)
+
+  st1 {v16.16b}, [x4] /* store checksum */
+  st1 {v0.16b}, [x3] /* store offset */
+
+  CLEAR_REG(v0)
+  CLEAR_REG(v1)
+  CLEAR_REG(v2)
+  CLEAR_REG(v16)
+
+  ret
+.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ *                                   const unsigned char *abuf,
+ *                                   unsigned char *offset,
+ *                                   unsigned char *checksum,
+ *                                   void **Ls,
+ *                                   size_t nblocks,
+ *                                   unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_auth_armv8_ce
+.type  _gcry_aes_ocb_auth_armv8_ce,%function;
+_gcry_aes_ocb_auth_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: abuf
+   *    x2: offset => x3
+   *    x3: checksum => x4
+   *    x4: Ls => x5
+   *    x5: nblocks => x6  (0 < nblocks <= 32)
+   *    w6: nrounds => w7
+   */
+  /* Shuffle arguments into the register layout shared with
+   * ocb_enc/ocb_dec (which take an extra outbuf argument). */
+  mov x7, x6
+  mov x6, x5
+  mov x5, x4
+  mov x4, x3
+  mov x3, x2
+
+  aes_preload_keys(x0, w7); /* NOTE: also leaves flags tested by the branches below */
+
+  ld1 {v0.16b}, [x3] /* load offset */
+  ld1 {v16.16b}, [x4] /* load checksum */
+
+  /* Use AArch64 'B.cond' mnemonics as in the other entry points of this
+   * file; the dotless 'beq'/'bhi' forms are AArch32 style and are not
+   * accepted by all AArch64 assemblers (e.g. clang's integrated one). */
+  b.eq .Locb_auth_entry_192 /* AES-192 path */
+  b.hi .Locb_auth_entry_256 /* AES-256 path; else fall through to AES-128 */
+
+#define OCB_AUTH(bits) \
+  .Locb_auth_entry_##bits: \
+    cmp x6, #4; \
+    b.lo .Locb_auth_loop_##bits; \
+    \
+  .Locb_auth_loop4_##bits: \
+    \
+    /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+    /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */ \
+    \
+    ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+    \
+    ld1 {v1.16b-v4.16b}, [x1], #64; /* load A_i+<0-3> */ \
+    ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
+    sub x6, x6, #4; \
+    \
+    ld1 {v5.16b}, [x8];               /* load L_{ntz(i+0)} */ \
+    ld1 {v6.16b}, [x9];               /* load L_{ntz(i+1)} */ \
+    ld1 {v7.16b}, [x10];              /* load L_{ntz(i+2)} */ \
+    eor v5.16b, v5.16b, v0.16b;       /* Offset_i+0 */ \
+    ld1 {v0.16b}, [x11];              /* load L_{ntz(i+3)} */ \
+    eor v6.16b, v6.16b, v5.16b;       /* Offset_i+1 */ \
+    eor v1.16b, v1.16b, v5.16b;       /* A_i+0 xor Offset_i+0 */ \
+    eor v7.16b, v7.16b, v6.16b;       /* Offset_i+2 */ \
+    eor v2.16b, v2.16b, v6.16b;       /* A_i+1 xor Offset_i+1 */ \
+    eor v0.16b, v0.16b, v7.16b;       /* Offset_i+3 */ \
+    cmp x6, #4; \
+    eor v3.16b, v3.16b, v7.16b;       /* A_i+2 xor Offset_i+2 */ \
+    eor v4.16b, v4.16b, v0.16b;       /* A_i+3 xor Offset_i+3 */ \
+    \
+    do_aes_4_##bits(e, mc, v1, v2, v3, v4); /* encrypt four blocks */ \
+    \
+    eor v1.16b, v1.16b, v2.16b;       /* fold the four cipher outputs... */ \
+    eor v16.16b, v16.16b, v3.16b; \
+    eor v1.16b, v1.16b, v4.16b; \
+    eor v16.16b, v16.16b, v1.16b;     /* ...into the running Sum (v16) */ \
+    \
+    b.hs .Locb_auth_loop4_##bits;     /* loop while x6 >= 4 */ \
+    CLEAR_REG(v3); \
+    CLEAR_REG(v4); \
+    CLEAR_REG(v5); \
+    CLEAR_REG(v6); \
+    CLEAR_REG(v7); \
+    cbz x6, .Locb_auth_done; \
+    \
+  .Locb_auth_loop_##bits: \
+    \
+    /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+    /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */ \
+    \
+    ldr ptr8, [x5], #(ptr_sz); \
+    ld1 {v1.16b}, [x1], #16; /* load aadtext */ \
+    ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
+    sub x6, x6, #1; \
+    eor v0.16b, v0.16b, v2.16b; /* Offset_i */ \
+    eor v1.16b, v1.16b, v0.16b; /* A_i xor Offset_i */ \
+    \
+    do_aes_one##bits(e, mc, v1, v1); \
+    \
+    eor v16.16b, v16.16b, v1.16b; /* Sum_i */ \
+    \
+    cbnz x6, .Locb_auth_loop_##bits; \
+    b .Locb_auth_done;
+
+  OCB_AUTH(128)
+  OCB_AUTH(192)
+  OCB_AUTH(256)
+
+#undef OCB_AUTH
+
+.Locb_auth_done:
+  aes_clear_keys(w7)
+
+  st1 {v16.16b}, [x4] /* store checksum */
+  st1 {v0.16b}, [x3] /* store offset */
+
+  CLEAR_REG(v0)
+  CLEAR_REG(v1)
+  CLEAR_REG(v2)
+  CLEAR_REG(v16)
+
+  ret
+.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;
+
+
+/*
+ * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
+ */
+.align 3
+.globl _gcry_aes_sbox4_armv8_ce
+.type  _gcry_aes_sbox4_armv8_ce,%function;
+_gcry_aes_sbox4_armv8_ce:
+  /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+   * Cryptology — CT-RSA 2015" for details.
+   */
+  movi v0.16b, #0x52 /* fill state with 0x52; SubBytes maps 0x52 to 0x00 */
+  movi v1.16b, #0 /* all-zero round key: AESE == SubBytes(ShiftRows(state)) */
+  mov v0.S[0], w0 /* place the four input bytes in lane 0 (column 0) */
+  aese v0.16b, v1.16b
+  addv s0, v0.4s /* ShiftRows spread the 4 S-boxed bytes over the 4 words at
+                  * disjoint byte positions (rest are zero); the across-vector
+                  * add gathers them back into a single word, carry-free */
+  mov w0, v0.S[0] /* return SubBytes applied to each of the 4 input bytes */
+  CLEAR_REG(v0)
+  ret
+.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;
+
+
+/*
+ * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src);
+ */
+.align 3
+.globl _gcry_aes_invmixcol_armv8_ce
+.type  _gcry_aes_invmixcol_armv8_ce,%function;
+_gcry_aes_invmixcol_armv8_ce:
+  ld1 {v0.16b}, [x1] /* load one 16-byte block from src (x1) */
+  aesimc v0.16b, v0.16b /* apply AES InvMixColumns to the block */
+  st1 {v0.16b}, [x0] /* store result to dst (x0) */
+  CLEAR_REG(v0) /* wipe sensitive data from the vector register */
+  ret
+.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;
+
+#endif
index 340dbc0..160fb8c 100644 (file)
      && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
      && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
 #  define USE_ARM_CE 1
+# elif defined(__AARCH64EL__) \
+       && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+       && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+#  define USE_ARM_CE 1
 # endif
 #endif /* ENABLE_ARM_CRYPTO_SUPPORT */
 
index 9049db7..ca82af9 100644 (file)
@@ -2043,6 +2043,10 @@ if test "$found" = "1" ; then
       aarch64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aarch64.lo"
+
+         # Build with the ARMv8/AArch64 CE implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch64-ce.lo"
       ;;
    esac