Add ARMv8/AArch64 implementation of chacha20
author    Jussi Kivilinna <jussi.kivilinna@iki.fi>
          Sat, 6 Jan 2018 16:58:04 +0000 (18:58 +0200)
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>
          Tue, 9 Jan 2018 16:41:03 +0000 (18:41 +0200)
* cipher/Makefile.am: Add 'chacha20-aarch64.S'.
* cipher/chacha20-aarch64.S: New.
* cipher/chacha20.c (USE_AARCH64_SIMD): New.
(_gcry_chacha20_aarch64_blocks4): New prototype.
(chacha20_do_setkey): Add HWF selection for AArch64 implementation.
* configure.ac: Add 'chacha20-aarch64.lo'.
--

Benchmark on Cortex-A53 (1152 MHz):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      7.91 ns/B     120.6 MiB/s      9.11 c/B
     STREAM dec |      7.91 ns/B     120.6 MiB/s      9.11 c/B

After (1.66x faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      4.74 ns/B     201.2 MiB/s      5.46 c/B
     STREAM dec |      4.74 ns/B     201.3 MiB/s      5.46 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/Makefile.am
cipher/chacha20-aarch64.S [new file with mode: 0644]
cipher/chacha20.c
configure.ac

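For orientation before the patch body: the assembly keeps each of the sixteen ChaCha20 state words in its own vector register (X0-X15), so every NEON instruction advances the same word of four independent blocks at once. The QUARTERROUND2 macro in the new file is the standard ChaCha20 quarter round with rotation counts 16, 12, 8 and 7 (the 16-bit rotate is done with rev32 on halfwords). A scalar C sketch of that quarter round follows; ROTL32 and quarterround are illustrative names, not symbols from the patch.

  #include <stdint.h>

  #define ROTL32(v, c)  (((v) << (c)) | ((v) >> (32 - (c))))

  /* One ChaCha20 quarter round on four 32-bit state words; QUARTERROUND2
   * performs this twice per invocation, on vectors holding the same word
   * of four blocks. */
  static void
  quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
  {
    *a += *b; *d ^= *a; *d = ROTL32 (*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32 (*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32 (*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32 (*b, 7);
  }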
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index a24b117..3c4eae0 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -65,6 +65,7 @@ arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
 chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S chacha20-armv7-neon.S \
+  chacha20-aarch64.S \
 crc.c \
   crc-intel-pclmul.c \
 des.c des-amd64.S \
diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S
new file mode 100644
index 0000000..739ddde
--- /dev/null
+++ b/cipher/chacha20-aarch64.S
@@ -0,0 +1,308 @@
+/* chacha20-aarch64.S - ARMv8/AArch64 accelerated chacha20 blocks function
+ *
+ * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
+    defined(USE_CHACHA20)
+
+.cpu generic+simd
+
+.text
+
+#define GET_DATA_POINTER(reg, name) \
+       adrp    reg, :got:name ; \
+       ldr     reg, [reg, #:got_lo12:name] ;
+
+/* register macros */
+#define INPUT     x0
+#define DST       x1
+#define SRC       x2
+#define NBLKS     x3
+#define ROUND     x4
+#define INPUT_CTR x5
+#define INPUT_POS x6
+#define CTR       x7
+
+/* vector registers */
+#define X0 v16
+#define X1 v17
+#define X2 v18
+#define X3 v19
+#define X4 v20
+#define X5 v21
+#define X6 v22
+#define X7 v23
+#define X8 v24
+#define X9 v25
+#define X10 v26
+#define X11 v27
+#define X12 v28
+#define X13 v29
+#define X14 v30
+#define X15 v31
+
+#define VCTR    v0
+#define VTMP0   v1
+#define VTMP1   v2
+#define VTMP2   v3
+#define VTMP3   v4
+#define X12_TMP v5
+#define X13_TMP v6
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+#define vpunpckldq(s1, s2, dst) \
+       zip1 dst.4s, s2.4s, s1.4s;
+
+#define vpunpckhdq(s1, s2, dst) \
+       zip2 dst.4s, s2.4s, s1.4s;
+
+#define vpunpcklqdq(s1, s2, dst) \
+       zip1 dst.2d, s2.2d, s1.2d;
+
+#define vpunpckhqdq(s1, s2, dst) \
+       zip2 dst.2d, s2.2d, s1.2d;
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+       vpunpckhdq(x1, x0, t2); \
+       vpunpckldq(x1, x0, x0); \
+       \
+       vpunpckldq(x3, x2, t1); \
+       vpunpckhdq(x3, x2, x2); \
+       \
+       vpunpckhqdq(t1, x0, x1); \
+       vpunpcklqdq(t1, x0, x0); \
+       \
+       vpunpckhqdq(x2, t2, x3); \
+       vpunpcklqdq(x2, t2, x2);
+
+#define clear(x) \
+       eor x.16b, x.16b, x.16b;
+
+/**********************************************************************
+  4-way chacha20
+ **********************************************************************/
+
+#define ROTATE2(dst1,dst2,c,src1,src2)         \
+       shl dst1.4s, src1.4s, #(c);             \
+       shl dst2.4s, src2.4s, #(c);             \
+       sri dst1.4s, src1.4s, #(32 - (c));      \
+       sri dst2.4s, src2.4s, #(32 - (c));
+
+#define ROTATE2_16(dst1,dst2,src1,src2)                \
+       rev32 dst1.8h, src1.8h;                 \
+       rev32 dst2.8h, src2.8h;
+
+#define XOR(d,s1,s2) \
+       eor d.16b, s2.16b, s1.16b;
+
+#define PLUS(ds,s) \
+       add ds.4s, ds.4s, s.4s;
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2)           \
+       PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2);     \
+           ROTATE2_16(d1, d2, tmp1, tmp2);                             \
+       PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2);     \
+           ROTATE2(b1, b2, 12, tmp1, tmp2);                            \
+       PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2);     \
+           ROTATE2(d1, d2,  8, tmp1, tmp2);                            \
+       PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2);     \
+           ROTATE2(b1, b2,  7, tmp1, tmp2);
+
+chacha20_data:
+.align 4
+.Linc_counter:
+       .long 0,1,2,3
+
+.align 3
+.globl _gcry_chacha20_aarch64_blocks4
+.type _gcry_chacha20_aarch64_blocks4,%function;
+
+_gcry_chacha20_aarch64_blocks4:
+       /* input:
+        *      x0: input
+        *      x1: dst
+        *      x2: src
+        *      x3: nblks (multiple of 4)
+        */
+
+       GET_DATA_POINTER(CTR, .Linc_counter);
+       add INPUT_CTR, INPUT, #(12*4);
+       mov INPUT_POS, INPUT;
+       ld1 {VCTR.16b}, [CTR];
+
+.Loop4:
+       /* Construct counter vectors X12 and X13 */
+
+       ld1 {X15.16b}, [INPUT_CTR];
+       mov ROUND, #20;
+       ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS];
+
+       dup X12.4s, X15.4s[0];
+       dup X13.4s, X15.4s[1];
+       ldr CTR, [INPUT_CTR];
+       add X12.4s, X12.4s, VCTR.4s;
+       dup X0.4s, VTMP1.4s[0];
+       dup X1.4s, VTMP1.4s[1];
+       dup X2.4s, VTMP1.4s[2];
+       dup X3.4s, VTMP1.4s[3];
+       dup X14.4s, X15.4s[2];
+       cmhi VTMP0.4s, VCTR.4s, X12.4s;
+       dup X15.4s, X15.4s[3];
+       add CTR, CTR, #4; /* Update counter */
+       dup X4.4s, VTMP2.4s[0];
+       dup X5.4s, VTMP2.4s[1];
+       dup X6.4s, VTMP2.4s[2];
+       dup X7.4s, VTMP2.4s[3];
+       sub X13.4s, X13.4s, VTMP0.4s;
+       dup X8.4s, VTMP3.4s[0];
+       dup X9.4s, VTMP3.4s[1];
+       dup X10.4s, VTMP3.4s[2];
+       dup X11.4s, VTMP3.4s[3];
+       mov X12_TMP.16b, X12.16b;
+       mov X13_TMP.16b, X13.16b;
+       str CTR, [INPUT_CTR];
+
+.Lround2:
+       subs ROUND, ROUND, #2
+       QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,VTMP0,VTMP1)
+       QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,VTMP0,VTMP1)
+       QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,VTMP0,VTMP1)
+       QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,VTMP0,VTMP1)
+       b.ne .Lround2;
+
+       ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32;
+
+       PLUS(X12, X12_TMP);        /* INPUT + 12 * 4 + counter */
+       PLUS(X13, X13_TMP);        /* INPUT + 13 * 4 + counter */
+
+       dup VTMP2.4s, VTMP0.4s[0]; /* INPUT + 0 * 4 */
+       dup VTMP3.4s, VTMP0.4s[1]; /* INPUT + 1 * 4 */
+       dup X12_TMP.4s, VTMP0.4s[2]; /* INPUT + 2 * 4 */
+       dup X13_TMP.4s, VTMP0.4s[3]; /* INPUT + 3 * 4 */
+       PLUS(X0, VTMP2);
+       PLUS(X1, VTMP3);
+       PLUS(X2, X12_TMP);
+       PLUS(X3, X13_TMP);
+
+       dup VTMP2.4s, VTMP1.4s[0]; /* INPUT + 4 * 4 */
+       dup VTMP3.4s, VTMP1.4s[1]; /* INPUT + 5 * 4 */
+       dup X12_TMP.4s, VTMP1.4s[2]; /* INPUT + 6 * 4 */
+       dup X13_TMP.4s, VTMP1.4s[3]; /* INPUT + 7 * 4 */
+       ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS];
+       mov INPUT_POS, INPUT;
+       PLUS(X4, VTMP2);
+       PLUS(X5, VTMP3);
+       PLUS(X6, X12_TMP);
+       PLUS(X7, X13_TMP);
+
+       dup VTMP2.4s, VTMP0.4s[0]; /* INPUT + 8 * 4 */
+       dup VTMP3.4s, VTMP0.4s[1]; /* INPUT + 9 * 4 */
+       dup X12_TMP.4s, VTMP0.4s[2]; /* INPUT + 10 * 4 */
+       dup X13_TMP.4s, VTMP0.4s[3]; /* INPUT + 11 * 4 */
+       dup VTMP0.4s, VTMP1.4s[2]; /* INPUT + 14 * 4 */
+       dup VTMP1.4s, VTMP1.4s[3]; /* INPUT + 15 * 4 */
+       PLUS(X8, VTMP2);
+       PLUS(X9, VTMP3);
+       PLUS(X10, X12_TMP);
+       PLUS(X11, X13_TMP);
+       PLUS(X14, VTMP0);
+       PLUS(X15, VTMP1);
+
+       transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2);
+       transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2);
+       transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2);
+       transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2);
+
+       subs NBLKS, NBLKS, #4;
+
+       ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+       ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
+       eor VTMP0.16b, X0.16b, VTMP0.16b;
+       eor VTMP1.16b, X4.16b, VTMP1.16b;
+       eor VTMP2.16b, X8.16b, VTMP2.16b;
+       eor VTMP3.16b, X12.16b, VTMP3.16b;
+       eor X12_TMP.16b, X1.16b, X12_TMP.16b;
+       eor X13_TMP.16b, X5.16b, X13_TMP.16b;
+       st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+       ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+       st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+       ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
+       eor VTMP0.16b, X9.16b, VTMP0.16b;
+       eor VTMP1.16b, X13.16b, VTMP1.16b;
+       eor VTMP2.16b, X2.16b, VTMP2.16b;
+       eor VTMP3.16b, X6.16b, VTMP3.16b;
+       eor X12_TMP.16b, X10.16b, X12_TMP.16b;
+       eor X13_TMP.16b, X14.16b, X13_TMP.16b;
+       st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+       ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+       st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+       eor VTMP0.16b, X3.16b, VTMP0.16b;
+       eor VTMP1.16b, X7.16b, VTMP1.16b;
+       eor VTMP2.16b, X11.16b, VTMP2.16b;
+       eor VTMP3.16b, X15.16b, VTMP3.16b;
+       st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+
+       b.ne .Loop4;
+
+       /* clear the used vector registers and stack */
+       clear(VTMP0);
+       clear(VTMP1);
+       clear(VTMP2);
+       clear(VTMP3);
+       clear(X12_TMP);
+       clear(X13_TMP);
+       clear(X0);
+       clear(X1);
+       clear(X2);
+       clear(X3);
+       clear(X4);
+       clear(X5);
+       clear(X6);
+       clear(X7);
+       clear(X8);
+       clear(X9);
+       clear(X10);
+       clear(X11);
+       clear(X12);
+       clear(X13);
+       clear(X14);
+       clear(X15);
+
+       eor x0, x0, x0
+       ret
+.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;
+
+#endif
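The counter setup at the top of .Loop4 may be easier to follow in scalar form. Words 12 and 13 of the state hold the 64-bit block counter: X12 is word 12 broadcast and offset by {0,1,2,3} (VCTR), cmhi detects wraparound of the low word, and the sub by the resulting all-ones mask propagates the carry into X13; the packed 64-bit counter in memory is then advanced by 4 for the next iteration. A C sketch, with counters_for_4_blocks as an illustrative name only:

  /* Derive the per-block counters for four consecutive blocks, as the
   * .Loop4 prologue does with dup/add/cmhi/sub; assumes little-endian
   * 32-bit state words as in the rest of the file. */
  static void
  counters_for_4_blocks (const uint32_t input[16],
                         uint32_t ctr_lo[4], uint32_t ctr_hi[4])
  {
    int i;
    for (i = 0; i < 4; i++)
      {
        ctr_lo[i] = input[12] + i;                          /* add X12, X12, VCTR */
        ctr_hi[i] = input[13] + (ctr_lo[i] < (uint32_t)i);  /* cmhi + sub carry   */
      }
    /* The assembly additionally reloads words 12..13 as one 64-bit value
     * and stores it back incremented by 4 (ldr/add/str on CTR). */
  }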
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index ac6cc29..e89ad2e 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
 # endif
 #endif
 
+/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
+ * code. */
+#undef USE_AARCH64_SIMD
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(__AARCH64EL__) \
+       && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+       && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
+#  define USE_AARCH64_SIMD 1
+# endif
+#endif
+
 /* Assembly implementations use SystemV ABI, ABI conversion and additional
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
@@ -119,6 +130,13 @@ unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
 
 #endif /* USE_ARMV7_NEON */
 
+#ifdef USE_AARCH64_SIMD
+
+unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst,
+                                           const byte *src, size_t nblks);
+
+#endif /* USE_AARCH64_SIMD */
+
 
 static const char *selftest (void);
 \f
@@ -338,6 +356,10 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
 #ifdef USE_ARMV7_NEON
   ctx->use_neon = (features & HWF_ARM_NEON) != 0;
 #endif
+#ifdef USE_AARCH64_SIMD
+  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
+#endif
+
   (void)features;
 
   chacha20_keysetup (ctx, key, keylen);
@@ -434,6 +456,20 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
     }
 #endif
 
+#ifdef USE_AARCH64_SIMD
+  if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
+    {
+      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+      nblocks -= nblocks % 4;
+      nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf,
+                                            nblocks);
+      burn = nburn > burn ? nburn : burn;
+      length -= nblocks * CHACHA20_BLOCK_SIZE;
+      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+    }
+#endif
+
   if (length >= CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
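The dispatch arithmetic in the new chacha20_encrypt_stream block enters the 4-way code only when at least four 64-byte blocks are available and rounds the block count down to a multiple of four; the remainder falls through to the existing block-at-a-time loop whose opening lines appear in the trailing context above. A concrete illustration (numbers chosen here, not taken from the patch):

  /* length = 1000 bytes, CHACHA20_BLOCK_SIZE = 64 */
  size_t nblocks = 1000 / 64;   /* 15 blocks                               */
  nblocks -= nblocks % 4;       /* 12 blocks, 768 bytes for the 4-way path */
  /* 1000 - 768 = 232 bytes are left for the generic code. */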
diff --git a/configure.ac b/configure.ac
index a5aba14..42cd4c2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2230,6 +2230,10 @@ if test "$found" = "1" ; then
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-ssse3.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-avx2.lo"
       ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-aarch64.lo"
+      ;;
    esac
 
    if test x"$neonsupport" = xyes ; then
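
The new code needs no caller-side changes; it is selected at run time through HWF when NEON is reported. A minimal usage sketch through the public libgcrypt API (error handling omitted; key, IV and buffer contents are arbitrary example values):

  #include <gcrypt.h>
  #include <string.h>

  int
  main (void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[32] = { 0 };     /* example 256-bit key            */
    unsigned char iv[8]   = { 0 };     /* 8-byte IV, 64-bit counter mode */
    unsigned char buf[1024];

    memset (buf, 0, sizeof buf);
    gcry_check_version (NULL);
    gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_STREAM, 0);
    gcry_cipher_setkey (hd, key, sizeof key);
    gcry_cipher_setiv (hd, iv, sizeof iv);
    gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);  /* in-place */
    gcry_cipher_close (hd);
    return 0;
  }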