Add ARMv8/AArch32 Crypto Extension implementation of GCM
author Jussi Kivilinna <jussi.kivilinna@iki.fi>
Thu, 14 Jul 2016 14:55:28 +0000 (17:55 +0300)
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>
Thu, 14 Jul 2016 14:55:28 +0000 (17:55 +0300)
* cipher/Makefile.am: Add 'cipher-gcm-armv8-aarch32-ce.S'.
* cipher/cipher-gcm-armv8-aarch32-ce.S: New.
* cipher/cipher-gcm.c [GCM_USE_ARM_PMULL]
(_gcry_ghash_setup_armv8_ce_pmull, _gcry_ghash_armv8_ce_pmull)
(ghash_setup_armv8_ce_pmull, ghash_armv8_ce_pmull): New.
(setupM) [GCM_USE_ARM_PMULL]: Enable ARM PMULL implementation if
HWF_ARM_PMULL HW feature flag is enabled.
* cipher/cipher-internal.h (GCM_USE_ARM_PMULL): New.
--

Benchmark on Cortex-A53 (1152 MHz):

Before:
                     |  nanosecs/byte   mebibytes/sec   cycles/byte
  GMAC_AES           |     24.10 ns/B     39.57 MiB/s     27.76 c/B

After (~26x faster):
                     |  nanosecs/byte   mebibytes/sec   cycles/byte
  GMAC_AES           |     0.924 ns/B    1032.2 MiB/s      1.06 c/B
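
(For reproduction: the tables above are in the output format of libgcrypt's
tests/bench-slope tool, so an invocation along the lines of
"tests/bench-slope --cpu-mhz 1152 mac" should print the GMAC_AES row;
passing --cpu-mhz is what makes it report cycles/byte.)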

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/Makefile.am
cipher/cipher-gcm-armv8-aarch32-ce.S [new file with mode: 0644]
cipher/cipher-gcm.c
cipher/cipher-internal.h

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 1e97050..5d69a38 100644
@@ -43,6 +43,7 @@ libcipher_la_SOURCES = \
 cipher.c cipher-internal.h \
 cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \
 cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \
+ cipher-gcm-armv8-aarch32-ce.S \
 cipher-poly1305.c cipher-ocb.c \
 cipher-selftest.c cipher-selftest.h \
 pubkey.c pubkey-internal.h pubkey-util.c \
diff --git a/cipher/cipher-gcm-armv8-aarch32-ce.S b/cipher/cipher-gcm-armv8-aarch32-ce.S
new file mode 100644
index 0000000..b879fb2
--- /dev/null
+++ b/cipher/cipher-gcm-armv8-aarch32-ce.S
@@ -0,0 +1,235 @@
+/* cipher-gcm-armv8-aarch32-ce.S - ARM/CE accelerated GHASH
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+
+.syntax unified
+.fpu crypto-neon-fp-armv8
+.arm
+
+.text
+
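+/* Load the address of local data into 'reg'; when built as PIC the
+ * address is resolved through the GOT.  In ARM state reading PC yields
+ * the current instruction's address plus 8, hence the '3f+8' bias. */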
+#ifdef __PIC__
+#  define GET_DATA_POINTER(reg, name, rtmp) \
+               ldr reg, 1f; \
+               ldr rtmp, 2f; \
+               b 3f; \
+       1:      .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+       2:      .word name(GOT); \
+       3:      add reg, pc, reg; \
+               ldr reg, [reg, rtmp];
+#else
+#  define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+/* Constants */
+
+.align 4
+gcry_gcm_reduction_constant:
+.Lrconst64:
+  .quad 0xc200000000000000
+
+
+/* Register macros */
+
+#define rhash q0
+#define rhash_l d0
+#define rhash_h d1
+
+#define rbuf q1
+#define rbuf_l d2
+#define rbuf_h d3
+
+#define rh0 q2
+#define rh0_l d4
+#define rh0_h d5
+
+#define rt0 q3
+#define rt0_l d6
+#define rt0_h d7
+
+#define rr0 q8
+#define rr0_l d16
+#define rr0_h d17
+
+#define rr1 q9
+#define rr1_l d18
+#define rr1_h d19
+
+#define rrconst q15
+#define rrconst_l d30
+#define rrconst_h d31
+
+#define ia rbuf_h
+#define ib rbuf_l
+#define oa rh0_l
+#define ob rh0_h
+#define co rrconst_l
+#define ma rrconst_h
+
+/* GHASH macros */
+
+/* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+
+/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1) */
+#define PMUL_128x128(r0, r1, a, b, t, interleave_op) \
+        veor t##_h, b##_l, b##_h; \
+        veor t##_l, a##_l, a##_h; \
+        vmull.p64 r0, a##_l, b##_l; \
+        vmull.p64 r1, a##_h, b##_h; \
+        vmull.p64 t, t##_h, t##_l; \
+        interleave_op(); \
+        veor t, r0; \
+        veor t, r1; \
+        veor r0##_h, t##_l; \
+        veor r1##_l, t##_h;
+
+/* Input: 'r0:r1', Output: 'a' */
+#define REDUCTION(a, r0, r1, rconst, t, interleave_op) \
+        vmull.p64 t, r0##_l, rconst; \
+        veor r0##_h, t##_l; \
+        veor r1##_l, t##_h; \
+        interleave_op(); \
+        vmull.p64 t, r0##_h, rconst; \
+        veor r1, t; \
+        veor a, r0, r1;
+
+#define _(...) /*_*/
+#define vrev_rbuf() vrev64.8 rbuf, rbuf;
+#define vext_rbuf() vext.8 rbuf, rbuf, rbuf, #8;
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) veor reg, reg;
+
+
+/*
+ * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
+ *                                          const byte *buf, size_t nblocks,
+ *                                          void *gcm_table);
+ */
+.align 3
+.globl _gcry_ghash_armv8_ce_pmull
+.type  _gcry_ghash_armv8_ce_pmull,%function;
+_gcry_ghash_armv8_ce_pmull:
+  /* input:
+   *    r0: gcm_key
+   *    r1: result/hash
+   *    r2: buf
+   *    r3: nblocks
+   *    %st+0: gcm_table
+   */
+  push {r4, lr}
+
+  cmp r3, #0
+  beq .Ldo_nothing
+
+  GET_DATA_POINTER(lr, .Lrconst64, r4)
+
+  subs r3, r3, #1
+  vld1.64 {rhash}, [r1]
+  vld1.64 {rh0}, [r0]
+
+  vrev64.8 rhash, rhash /* byte-swap */
+  vld1.64 {rrconst_h}, [lr]
+  vext.8 rhash, rhash, rhash, #8
+
+  vld1.64 {rbuf}, [r2]!
+
+  vrev64.8 rbuf, rbuf /* byte-swap */
+  vext.8 rbuf, rbuf, rbuf, #8
+
+  veor rhash, rhash, rbuf
+
+  beq .Lend
+
+.Loop:
+  vld1.64 {rbuf}, [r2]!
+  subs r3, r3, #1
+  PMUL_128x128(rr0, rr1, rh0, rhash, rt0, vrev_rbuf)
+  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, vext_rbuf)
+  veor rhash, rhash, rbuf
+
+  bne .Loop
+
+.Lend:
+  PMUL_128x128(rr0, rr1, rh0, rhash, rt0, _)
+  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _)
+
+  CLEAR_REG(rr1)
+  CLEAR_REG(rr0)
+  vrev64.8 rhash, rhash /* byte-swap */
+  CLEAR_REG(rbuf)
+  CLEAR_REG(rt0)
+  vext.8 rhash, rhash, rhash, #8
+  CLEAR_REG(rh0)
+
+  vst1.64 {rhash}, [r1]
+  CLEAR_REG(rhash)
+
+.Ldo_nothing:
+  mov r0, #0
+  pop {r4, pc}
+.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;
+
+
+/*
+ * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
+ */
+.align 3
+.globl _gcry_ghash_setup_armv8_ce_pmull
+.type  _gcry_ghash_setup_armv8_ce_pmull,%function;
+_gcry_ghash_setup_armv8_ce_pmull:
+  /* input:
+   *   r0: gcm_key
+   *   r1: gcm_table
+   */
+
+  push {r4, lr}
+
+  GET_DATA_POINTER(r4, .Lrconst64, lr)
+
+  /* H <<< 1 */
+  vld1.64 {ib,ia}, [r0]
+  vld1.64 {co}, [r4]
+  vrev64.8 ib, ib;
+  vrev64.8 ia, ia;
+  vshr.s64 ma, ib, #63
+  vshr.u64 oa, ib, #63
+  vshr.u64 ob, ia, #63
+  vand ma, co
+  vshl.u64 ib, ib, #1
+  vshl.u64 ia, ia, #1
+  vorr ob, ib
+  vorr oa, ia
+  veor ob, ma
+
+  vst1.64 {oa, ob}, [r0]
+
+  pop {r4, pc}
+.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;
+
+#endif
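
For readers following the new assembly, the sketch below is a plain-C
transcription of the PMUL_128x128 and REDUCTION macros and of the
"H <<< 1" key setup.  It is illustrative only and not part of the patch:
u128_t and the bit-serial clmul64() helper are hypothetical stand-ins,
clmul64() modelling what a single vmull.p64 computes (a carry-less
64x64->128 multiply); the vrev64.8/vext.8 byte-order fixups are omitted.

  #include <stdint.h>

  typedef struct { uint64_t l, h; } u128_t;

  /* Model of one vmull.p64: carry-less (polynomial) 64x64->128 multiply. */
  static u128_t clmul64 (uint64_t a, uint64_t b)
  {
    u128_t r = { 0, 0 };
    int i;
    for (i = 0; i < 64; i++)
      if ((b >> i) & 1)
        {
          r.l ^= a << i;
          if (i)
            r.h ^= a >> (64 - i);
        }
    return r;
  }

  /* PMUL_128x128: 128x128->256 carry-less multiply with only three
   * 64-bit multiplies (Karatsuba); low half in *r0, high half in *r1. */
  static void pmul_128x128 (u128_t *r0, u128_t *r1, u128_t a, u128_t b)
  {
    u128_t t = clmul64 (a.l ^ a.h, b.l ^ b.h);  /* middle term */
    *r0 = clmul64 (a.l, b.l);
    *r1 = clmul64 (a.h, b.h);
    t.l ^= r0->l ^ r1->l;
    t.h ^= r0->h ^ r1->h;
    r0->h ^= t.l;               /* fold middle term into r0:r1 */
    r1->l ^= t.h;
  }

  /* REDUCTION: fold r0:r1 back to 128 bits modulo the GHASH polynomial
   * (bit-reflected convention, 0xc200000000000000 constant). */
  static u128_t reduction (u128_t r0, u128_t r1)
  {
    const uint64_t rconst = 0xc200000000000000ULL;
    u128_t a, t = clmul64 (r0.l, rconst);
    r0.h ^= t.l;
    r1.l ^= t.h;
    t = clmul64 (r0.h, rconst);
    r1.l ^= t.l;
    r1.h ^= t.h;
    a.l = r0.l ^ r1.l;
    a.h = r0.h ^ r1.h;
    return a;
  }

  /* Key setup ("H <<< 1"): rotate the byte-swapped hash key left by one
   * bit, folding the bit shifted out of the top back in with rconst. */
  static void ghash_key_shift (u128_t *h)  /* h->h = high half of H */
  {
    uint64_t mask = (uint64_t) ((int64_t) h->h >> 63) & 0xc200000000000000ULL;
    uint64_t carry = h->h >> 63;
    h->h = ((h->h << 1) | (h->l >> 63)) ^ mask;
    h->l = (h->l << 1) | carry;
  }

The three-multiply schedule is the Karatsuba trick from the Gouvêa and
López paper cited in the file; pre-rotating H at setup time is what lets
REDUCTION finish with the plain XOR a = r0 ^ r1 rather than an extra
one-bit shift of the product on every block.
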
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 6e0959a..2b8b454 100644
@@ -37,6 +37,30 @@ extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result,
                                               const byte *buf, size_t nblocks);
 #endif
 
+#ifdef GCM_USE_ARM_PMULL
+extern void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
+
+extern unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
+                                                const byte *buf, size_t nblocks,
+                                                void *gcm_table);
+
+static void
+ghash_setup_armv8_ce_pmull (gcry_cipher_hd_t c)
+{
+  _gcry_ghash_setup_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key,
+                                   c->u_mode.gcm.gcm_table);
+}
+
+static unsigned int
+ghash_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf,
+                      size_t nblocks)
+{
+  return _gcry_ghash_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, result, buf,
+                                    nblocks, c->u_mode.gcm.gcm_table);
+}
+
+#endif
+
 
 #ifdef GCM_USE_TABLES
 static const u16 gcmR[256] = {
@@ -379,15 +403,26 @@ ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf,
 static void
 setupM (gcry_cipher_hd_t c)
 {
+#if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL)
+  unsigned int features = _gcry_get_hw_features ();
+#endif
+
   if (0)
     ;
 #ifdef GCM_USE_INTEL_PCLMUL
-  else if (_gcry_get_hw_features () & HWF_INTEL_PCLMUL)
+  else if (features & HWF_INTEL_PCLMUL)
     {
       c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
       _gcry_ghash_setup_intel_pclmul (c);
     }
 #endif
+#ifdef GCM_USE_ARM_PMULL
+  else if (features & HWF_ARM_PMULL)
+    {
+      c->u_mode.gcm.ghash_fn = ghash_armv8_ce_pmull;
+      ghash_setup_armv8_ce_pmull (c);
+    }
+#endif
   else
     {
       c->u_mode.gcm.ghash_fn = ghash_internal;
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index 9fd1d91..52504f6 100644
 # endif
 #endif /* GCM_USE_INTEL_PCLMUL */
 
+/* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */
+#undef GCM_USE_ARM_PMULL
+#if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES)
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+     && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+#  define GCM_USE_ARM_PMULL 1
+# endif
+#endif /* GCM_USE_ARM_PMULL */
+
 
 typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result,
                                     const byte *buf, size_t nblocks);