blowfish: add ARMv6 assembly implementation
author     Jussi Kivilinna <jussi.kivilinna@iki.fi>
           Fri, 16 Aug 2013 09:51:52 +0000 (12:51 +0300)
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>
           Fri, 16 Aug 2013 11:42:46 +0000 (14:42 +0300)
* cipher/Makefile.am: Add 'blowfish-armv6.S'.
* cipher/blowfish-armv6.S: New file.
* cipher/blowfish.c (USE_ARMV6_ASM): New macro.
[USE_ARMV6_ASM] (_gcry_blowfish_armv6_do_encrypt)
(_gcry_blowfish_armv6_encrypt_block)
(_gcry_blowfish_armv6_decrypt_block, _gcry_blowfish_armv6_ctr_enc)
(_gcry_blowfish_armv6_cbc_dec, _gcry_blowfish_armv6_cfb_dec): New
prototypes.
[USE_ARMV6_ASM] (do_encrypt, do_encrypt_block, do_decrypt_block)
(encrypt_block, decrypt_block): New functions.
(_gcry_blowfish_ctr_enc) [USE_ARMV6_ASM]: Use ARMv6 assembly function.
(_gcry_blowfish_cbc_dec) [USE_ARMV6_ASM]: Use ARMv6 assembly function.
(_gcry_blowfish_cfb_dec) [USE_ARMV6_ASM]: Use ARMv6 assembly function.
* configure.ac (blowfish) [arm]: Add 'blowfish-armv6.lo'.
--

The patch provides non-parallel implementations for a small speed-up and
2-way parallel implementations that are accelerated on multi-issue CPUs
(hand-tuned for the in-order dual-issue Cortex-A8). Unaligned access
handling is done in assembly.
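
As an illustration of how the unaligned access handling works, a rough C
equivalent of the ldr_unaligned_le/str_unaligned_le helpers defined in
blowfish-armv6.S is sketched below; the function names are only illustrative
and not part of the patch:

    #include <stdint.h>

    /* Assemble a 32-bit little-endian word from four byte loads; this is
     * what the ldr_unaligned_le assembly macro does when the source
     * pointer is not word-aligned.  */
    static uint32_t
    load_le32_unaligned (const unsigned char *p)
    {
      return (uint32_t)p[0]
             | ((uint32_t)p[1] << 8)
             | ((uint32_t)p[2] << 16)
             | ((uint32_t)p[3] << 24);
    }

    /* Store a 32-bit word as four separate byte stores, mirroring the
     * str_unaligned_le assembly macro.  */
    static void
    store_le32_unaligned (unsigned char *p, uint32_t v)
    {
      p[0] = v & 0xff;
      p[1] = (v >> 8) & 0xff;
      p[2] = (v >> 16) & 0xff;
      p[3] = (v >> 24) & 0xff;
    }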

For now, this is only enabled on little-endian systems, as big-endian
correctness has not been tested yet.

Old vs new (Cortex-A8, Debian Wheezy/armhf):

                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
BLOWFISH       1.28x   1.16x   1.21x   2.16x   1.26x   1.86x   1.21x   1.25x   1.89x   1.96x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/blowfish-armv6.S [new file with mode: 0644]
cipher/blowfish.c
configure.ac

diff --git a/cipher/blowfish-armv6.S b/cipher/blowfish-armv6.S
new file mode 100644
index 0000000..b11d27f
--- /dev/null
+++ b/cipher/blowfish-armv6.S
@@ -0,0 +1,737 @@
+/* blowfish-armv6.S  -  ARM assembly implementation of Blowfish cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__arm__) && defined(__ARMEL__) && \
+        ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \
+       || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+       || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \
+       || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \
+       || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+       || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+       || defined(__ARM_ARCH_7EM__))
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of crypto context */
+#define s0     0
+#define s1     (s0 + (1 * 256) * 4)
+#define s2     (s0 + (2 * 256) * 4)
+#define s3     (s0 + (3 * 256) * 4)
+#define p      (s3 + (1 * 256) * 4)
+
+/* register macros */
+#define CTXs0 %r0
+#define CTXs1 %r9
+#define CTXs2 %r8
+#define CTXs3 %r10
+#define RMASK %lr
+#define RKEYL %r2
+#define RKEYR %ip
+
+#define RL0 %r3
+#define RR0 %r4
+
+#define RL1 %r9
+#define RR1 %r10
+
+#define RT0 %r11
+#define RT1 %r7
+#define RT2 %r5
+#define RT3 %r6
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+       ldrb rout, [rsrc, #((offs) + 0)]; \
+       ldrb rtmp, [rsrc, #((offs) + 1)]; \
+       orr rout, rout, rtmp, lsl #8; \
+       ldrb rtmp, [rsrc, #((offs) + 2)]; \
+       orr rout, rout, rtmp, lsl #16; \
+       ldrb rtmp, [rsrc, #((offs) + 3)]; \
+       orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+       mov rtmp0, rin, lsr #8; \
+       strb rin, [rdst, #((offs) + 0)]; \
+       mov rtmp1, rin, lsr #16; \
+       strb rtmp0, [rdst, #((offs) + 1)]; \
+       mov rtmp0, rin, lsr #24; \
+       strb rtmp1, [rdst, #((offs) + 2)]; \
+       strb rtmp0, [rdst, #((offs) + 3)];
+
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+       ldrb rout, [rsrc, #((offs) + 3)]; \
+       ldrb rtmp, [rsrc, #((offs) + 2)]; \
+       orr rout, rout, rtmp, lsl #8; \
+       ldrb rtmp, [rsrc, #((offs) + 1)]; \
+       orr rout, rout, rtmp, lsl #16; \
+       ldrb rtmp, [rsrc, #((offs) + 0)]; \
+       orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+       mov rtmp0, rin, lsr #8; \
+       strb rin, [rdst, #((offs) + 3)]; \
+       mov rtmp1, rin, lsr #16; \
+       strb rtmp0, [rdst, #((offs) + 2)]; \
+       mov rtmp0, rin, lsr #24; \
+       strb rtmp1, [rdst, #((offs) + 1)]; \
+       strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+       #define ldr_unaligned_host ldr_unaligned_le
+       #define str_unaligned_host str_unaligned_le
+
+       /* bswap on little-endian */
+       #define host_to_be(reg) \
+               rev reg, reg;
+       #define be_to_host(reg) \
+               rev reg, reg;
+#else
+       #define ldr_unaligned_host ldr_unaligned_be
+       #define str_unaligned_host str_unaligned_be
+
+       /* nop on big-endian */
+       #define host_to_be(reg) /*_*/
+       #define be_to_host(reg) /*_*/
+#endif
+
+#define host_to_host(x) /*_*/
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F(l, r) \
+       and RT0, RMASK, l, lsr#(24 - 2); \
+       and RT1, RMASK, l, lsr#(16 - 2); \
+       ldr RT0, [CTXs0, RT0]; \
+       and RT2, RMASK, l, lsr#(8 - 2); \
+       ldr RT1, [CTXs1, RT1]; \
+       and RT3, RMASK, l, lsl#2; \
+       ldr RT2, [CTXs2, RT2]; \
+       add RT0, RT1; \
+       ldr RT3, [CTXs3, RT3]; \
+       eor RT0, RT2; \
+       add RT0, RT3; \
+       eor r, RT0;
+
+#define load_roundkey_enc(n) \
+       ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \
+       ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))];
+
+#define add_roundkey_enc() \
+       eor RL0, RKEYL; \
+       eor RR0, RKEYR;
+
+#define round_enc(n) \
+       add_roundkey_enc(); \
+       load_roundkey_enc(n); \
+       \
+       F(RL0, RR0); \
+       F(RR0, RL0);
+
+#define load_roundkey_dec(n) \
+       ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \
+       ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))];
+
+#define add_roundkey_dec() \
+       eor RL0, RKEYL; \
+       eor RR0, RKEYR;
+
+#define round_dec(n) \
+       add_roundkey_dec(); \
+       load_roundkey_dec(n); \
+       \
+       F(RL0, RR0); \
+       F(RR0, RL0);
+
+#define read_block_aligned(rin, offs, l0, r0, convert) \
+       ldr l0, [rin, #((offs) + 0)]; \
+       ldr r0, [rin, #((offs) + 4)]; \
+       convert(l0); \
+       convert(r0);
+
+#define write_block_aligned(rout, offs, l0, r0, convert) \
+       convert(l0); \
+       convert(r0); \
+       str l0, [rout, #((offs) + 0)]; \
+       str r0, [rout, #((offs) + 4)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+       /* unaligned word reads allowed */
+       #define read_block(rin, offs, l0, r0, rtmp0) \
+               read_block_aligned(rin, offs, l0, r0, host_to_be)
+
+       #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
+               write_block_aligned(rout, offs, r0, l0, be_to_host)
+
+       #define read_block_host(rin, offs, l0, r0, rtmp0) \
+               read_block_aligned(rin, offs, l0, r0, host_to_host)
+
+       #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
+               write_block_aligned(rout, offs, r0, l0, host_to_host)
+#else
+       /* need to handle unaligned reads by byte reads */
+       #define read_block(rin, offs, l0, r0, rtmp0) \
+               tst rin, #3; \
+               beq 1f; \
+                       ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
+                       ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
+                       b 2f; \
+               1:;\
+                       read_block_aligned(rin, offs, l0, r0, host_to_be); \
+               2:;
+
+       #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
+               tst rout, #3; \
+               beq 1f; \
+                       str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+                       str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+                       b 2f; \
+               1:;\
+                       write_block_aligned(rout, offs, l0, r0, be_to_host); \
+               2:;
+
+       #define read_block_host(rin, offs, l0, r0, rtmp0) \
+               tst rin, #3; \
+               beq 1f; \
+                       ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
+                       ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
+                       b 2f; \
+               1:;\
+                       read_block_aligned(rin, offs, l0, r0, host_to_host); \
+               2:;
+
+       #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
+               tst rout, #3; \
+               beq 1f; \
+                       str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+                       str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+                       b 2f; \
+               1:;\
+                       write_block_aligned(rout, offs, l0, r0, host_to_host); \
+               2:;
+#endif
+
+.align 3
+.type  __blowfish_enc_blk1,%function;
+
+__blowfish_enc_blk1:
+       /* input:
+        *      preloaded: CTX
+        *      [RL0, RR0]: src
+        * output:
+        *      [RR0, RL0]: dst
+        */
+       push {%lr};
+
+       add CTXs1, CTXs0, #(s1 - s0);
+       add CTXs2, CTXs0, #(s2 - s0);
+       mov RMASK, #(0xff << 2); /* byte mask */
+       add CTXs3, CTXs1, #(s3 - s1);
+
+       load_roundkey_enc(0);
+       round_enc(2);
+       round_enc(4);
+       round_enc(6);
+       round_enc(8);
+       round_enc(10);
+       round_enc(12);
+       round_enc(14);
+       round_enc(16);
+       add_roundkey_enc();
+
+       pop {%pc};
+.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
+
+.align 8
+.globl  _gcry_blowfish_armv6_do_encrypt
+.type   _gcry_blowfish_armv6_do_encrypt,%function;
+
+_gcry_blowfish_armv6_do_encrypt:
+       /* input:
+        *      %r0: ctx, CTX
+        *      %r1: u32 *ret_xl
+        *      %r2: u32 *ret_xr
+        */
+       push {%r2, %r4-%r11, %ip, %lr};
+
+       ldr RL0, [%r1];
+       ldr RR0, [%r2];
+
+       bl __blowfish_enc_blk1;
+
+       pop {%r2};
+       str RR0, [%r1];
+       str RL0, [%r2];
+
+       pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_armv6_do_encrypt,.-_gcry_blowfish_armv6_do_encrypt;
+
+.align 3
+.global _gcry_blowfish_armv6_encrypt_block
+.type   _gcry_blowfish_armv6_encrypt_block,%function;
+
+_gcry_blowfish_armv6_encrypt_block:
+       /* input:
+        *      %r0: ctx, CTX
+        *      %r1: dst
+        *      %r2: src
+        */
+       push {%r4-%r11, %ip, %lr};
+
+       read_block(%r2, 0, RL0, RR0, RT0);
+
+       bl __blowfish_enc_blk1;
+
+       write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+       pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_armv6_encrypt_block,.-_gcry_blowfish_armv6_encrypt_block;
+
+.align 3
+.global _gcry_blowfish_armv6_decrypt_block
+.type   _gcry_blowfish_armv6_decrypt_block,%function;
+
+_gcry_blowfish_armv6_decrypt_block:
+       /* input:
+        *      %r0: ctx, CTX
+        *      %r1: dst
+        *      %r2: src
+        */
+       push {%r4-%r11, %ip, %lr};
+
+       add CTXs1, CTXs0, #(s1 - s0);
+       add CTXs2, CTXs0, #(s2 - s0);
+       mov RMASK, #(0xff << 2); /* byte mask */
+       add CTXs3, CTXs1, #(s3 - s1);
+
+       read_block(%r2, 0, RL0, RR0, RT0);
+
+       load_roundkey_dec(17);
+       round_dec(15);
+       round_dec(13);
+       round_dec(11);
+       round_dec(9);
+       round_dec(7);
+       round_dec(5);
+       round_dec(3);
+       round_dec(1);
+       add_roundkey_dec();
+
+       write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+       pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_armv6_decrypt_block,.-_gcry_blowfish_armv6_decrypt_block;
+
+/***********************************************************************
+ * 2-way blowfish
+ ***********************************************************************/
+#define F2(n, l0, r0, l1, r1, set_nextk, dec) \
+       \
+       and RT0, RMASK, l0, lsr#(24 - 2); \
+       and RT1, RMASK, l0, lsr#(16 - 2); \
+       and RT2, RMASK, l0, lsr#(8 - 2); \
+       add RT1, #(s1 - s0); \
+       \
+       ldr RT0, [CTXs0, RT0]; \
+       and RT3, RMASK, l0, lsl#2; \
+       ldr RT1, [CTXs0, RT1]; \
+       add RT3, #(s3 - s2); \
+       ldr RT2, [CTXs2, RT2]; \
+       add RT0, RT1; \
+       ldr RT3, [CTXs2, RT3]; \
+       \
+       and RT1, RMASK, l1, lsr#(24 - 2); \
+       eor RT0, RT2; \
+       and RT2, RMASK, l1, lsr#(16 - 2); \
+       add RT0, RT3; \
+       add RT2, #(s1 - s0); \
+       and RT3, RMASK, l1, lsr#(8 - 2); \
+       eor r0, RT0; \
+       \
+       ldr RT1, [CTXs0, RT1]; \
+       and RT0, RMASK, l1, lsl#2; \
+       ldr RT2, [CTXs0, RT2]; \
+       add RT0, #(s3 - s2); \
+       ldr RT3, [CTXs2, RT3]; \
+       add RT1, RT2; \
+       ldr RT0, [CTXs2, RT0]; \
+       \
+       and RT2, RMASK, r0, lsr#(24 - 2); \
+       eor RT1, RT3; \
+       and RT3, RMASK, r0, lsr#(16 - 2); \
+       add RT1, RT0; \
+       add RT3, #(s1 - s0); \
+       and RT0, RMASK, r0, lsr#(8 - 2); \
+       eor r1, RT1; \
+       \
+       ldr RT2, [CTXs0, RT2]; \
+       and RT1, RMASK, r0, lsl#2; \
+       ldr RT3, [CTXs0, RT3]; \
+       add RT1, #(s3 - s2); \
+       ldr RT0, [CTXs2, RT0]; \
+       add RT2, RT3; \
+       ldr RT1, [CTXs2, RT1]; \
+       \
+       and RT3, RMASK, r1, lsr#(24 - 2); \
+       eor RT2, RT0; \
+       and RT0, RMASK, r1, lsr#(16 - 2); \
+       add RT2, RT1; \
+       add RT0, #(s1 - s0); \
+       and RT1, RMASK, r1, lsr#(8 - 2); \
+       eor l0, RT2; \
+       \
+       ldr RT3, [CTXs0, RT3]; \
+       and RT2, RMASK, r1, lsl#2; \
+       ldr RT0, [CTXs0, RT0]; \
+       add RT2, #(s3 - s2); \
+       ldr RT1, [CTXs2, RT1]; \
+       eor l1, RKEYL; \
+       ldr RT2, [CTXs2, RT2]; \
+       \
+       eor r0, RKEYR; \
+       add RT3, RT0; \
+       eor r1, RKEYR; \
+       eor RT3, RT1; \
+       eor l0, RKEYL; \
+       add RT3, RT2; \
+       set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \
+       eor l1, RT3; \
+       set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4)));
+
+#define load_n_add_roundkey_enc2(n) \
+       load_roundkey_enc(n); \
+       eor RL0, RKEYL; \
+       eor RR0, RKEYR; \
+       eor RL1, RKEYL; \
+       eor RR1, RKEYR; \
+       load_roundkey_enc((n) + 2);
+
+#define next_key(reg, offs) \
+       ldr reg, [CTXs2, #(offs)];
+
+#define dummy(x, y) /* do nothing */
+
+#define round_enc2(n, load_next_key) \
+       F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0);
+
+#define load_n_add_roundkey_dec2(n) \
+       load_roundkey_dec(n); \
+       eor RL0, RKEYL; \
+       eor RR0, RKEYR; \
+       eor RL1, RKEYL; \
+       eor RR1, RKEYR; \
+       load_roundkey_dec((n) - 2);
+
+#define round_dec2(n, load_next_key) \
+       F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1);
+
+#define read_block2_aligned(rin, l0, r0, l1, r1, convert) \
+       ldr l0, [rin, #(0)]; \
+       ldr r0, [rin, #(4)]; \
+       convert(l0); \
+       ldr l1, [rin, #(8)]; \
+       convert(r0); \
+       ldr r1, [rin, #(12)]; \
+       convert(l1); \
+       convert(r1);
+
+#define write_block2_aligned(rout, l0, r0, l1, r1, convert) \
+       convert(l0); \
+       convert(r0); \
+       convert(l1); \
+       str l0, [rout, #(0)]; \
+       convert(r1); \
+       str r0, [rout, #(4)]; \
+       str l1, [rout, #(8)]; \
+       str r1, [rout, #(12)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+       /* unaligned word reads allowed */
+       #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+               read_block2_aligned(rin, l0, r0, l1, r1, host_to_be)
+
+       #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+               write_block2_aligned(rout, l0, r0, l1, r1, be_to_host)
+
+       #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+               read_block2_aligned(rin, l0, r0, l1, r1, host_to_host)
+
+       #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+               write_block2_aligned(rout, l0, r0, l1, r1, host_to_host)
+#else
+       /* need to handle unaligned reads by byte reads */
+       #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+               tst rin, #3; \
+               beq 1f; \
+                       ldr_unaligned_be(l0, rin, 0, rtmp0); \
+                       ldr_unaligned_be(r0, rin, 4, rtmp0); \
+                       ldr_unaligned_be(l1, rin, 8, rtmp0); \
+                       ldr_unaligned_be(r1, rin, 12, rtmp0); \
+                       b 2f; \
+               1:;\
+                       read_block2_aligned(rin, l0, r0, l1, r1, host_to_be); \
+               2:;
+
+       #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+               tst rout, #3; \
+               beq 1f; \
+                       str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
+                       str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
+                       str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
+                       str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
+                       b 2f; \
+               1:;\
+                       write_block2_aligned(rout, l0, r0, l1, r1, be_to_host); \
+               2:;
+
+       #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+               tst rin, #3; \
+               beq 1f; \
+                       ldr_unaligned_host(l0, rin, 0, rtmp0); \
+                       ldr_unaligned_host(r0, rin, 4, rtmp0); \
+                       ldr_unaligned_host(l1, rin, 8, rtmp0); \
+                       ldr_unaligned_host(r1, rin, 12, rtmp0); \
+                       b 2f; \
+               1:;\
+                       read_block2_aligned(rin, l0, r0, l1, r1, host_to_host); \
+               2:;
+
+       #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+               tst rout, #3; \
+               beq 1f; \
+                       str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
+                       str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
+                       str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
+                       str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
+                       b 2f; \
+               1:;\
+                       write_block2_aligned(rout, l0, r0, l1, r1, host_to_host); \
+               2:;
+#endif
+
+.align 3
+.type  _gcry_blowfish_armv6_enc_blk2,%function;
+
+_gcry_blowfish_armv6_enc_blk2:
+       /* input:
+        *      preloaded: CTX
+        *      [RL0, RR0], [RL1, RR1]: src
+        * output:
+        *      [RR0, RL0], [RR1, RL1]: dst
+        */
+       push {%lr};
+
+       add CTXs2, CTXs0, #(s2 - s0);
+       mov RMASK, #(0xff << 2); /* byte mask */
+
+       load_n_add_roundkey_enc2(0);
+       round_enc2(2, next_key);
+       round_enc2(4, next_key);
+       round_enc2(6, next_key);
+       round_enc2(8, next_key);
+       round_enc2(10, next_key);
+       round_enc2(12, next_key);
+       round_enc2(14, next_key);
+       round_enc2(16, dummy);
+
+       host_to_be(RR0);
+       host_to_be(RL0);
+       host_to_be(RR1);
+       host_to_be(RL1);
+
+       pop {%pc};
+.size _gcry_blowfish_armv6_enc_blk2,.-_gcry_blowfish_armv6_enc_blk2;
+
+.align 3
+.globl _gcry_blowfish_armv6_cfb_dec;
+.type  _gcry_blowfish_armv6_cfb_dec,%function;
+
+_gcry_blowfish_armv6_cfb_dec:
+       /* input:
+        *      %r0: CTX
+        *      %r1: dst (2 blocks)
+        *      %r2: src (2 blocks)
+        *      %r3: iv (64bit)
+        */
+       push {%r2, %r4-%r11, %ip, %lr};
+
+       mov %lr, %r3;
+
+       /* Load input (iv/%r3 is aligned, src/%r2 might not be) */
+       ldm %r3, {RL0, RR0};
+       host_to_be(RL0);
+       host_to_be(RR0);
+       read_block(%r2, 0, RL1, RR1, RT0);
+
+       /* Update IV, load src[1] and save to iv[0] */
+       read_block_host(%r2, 8, %r5, %r6, RT0);
+       stm %lr, {%r5, %r6};
+
+       bl _gcry_blowfish_armv6_enc_blk2;
+       /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+       /* %r1: dst, %r0: src */
+       pop {%r0};
+
+       /* dst = src ^ result */
+       read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
+       eor %r5, %r4;
+       eor %r6, %r3;
+       eor %r7, %r10;
+       eor %r8, %r9;
+       write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
+
+       pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_armv6_cfb_dec,.-_gcry_blowfish_armv6_cfb_dec;
+
+.align 3
+.globl _gcry_blowfish_armv6_ctr_enc;
+.type  _gcry_blowfish_armv6_ctr_enc,%function;
+
+_gcry_blowfish_armv6_ctr_enc:
+       /* input:
+        *      %r0: CTX
+        *      %r1: dst (2 blocks)
+        *      %r2: src (2 blocks)
+        *      %r3: iv (64bit, big-endian)
+        */
+       push {%r2, %r4-%r11, %ip, %lr};
+
+       mov %lr, %r3;
+
+       /* Load IV (big => host endian) */
+       read_block_aligned(%lr, 0, RL0, RR0, be_to_host);
+
+       /* Construct IVs */
+       adds RR1, RR0, #1; /* +1 */
+       adc RL1, RL0, #0;
+       adds %r6, RR1, #1; /* +2 */
+       adc %r5, RL1, #0;
+
+       /* Store new IV (host => big-endian) */
+       write_block_aligned(%lr, 0, %r5, %r6, host_to_be);
+
+       bl _gcry_blowfish_armv6_enc_blk2;
+       /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+       /* %r1: dst, %r0: src */
+       pop {%r0};
+
+       /* XOR key-stream with plaintext */
+       read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
+       eor %r5, %r4;
+       eor %r6, %r3;
+       eor %r7, %r10;
+       eor %r8, %r9;
+       write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
+
+       pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_armv6_ctr_enc,.-_gcry_blowfish_armv6_ctr_enc;
+
+.align 3
+.type  _gcry_blowfish_armv6_dec_blk2,%function;
+
+_gcry_blowfish_armv6_dec_blk2:
+       /* input:
+        *      preloaded: CTX
+        *      [RL0, RR0], [RL1, RR1]: src
+        * output:
+        *      [RR0, RL0], [RR1, RL1]: dst
+        */
+       add CTXs2, CTXs0, #(s2 - s0);
+       mov RMASK, #(0xff << 2); /* byte mask */
+
+       load_n_add_roundkey_dec2(17);
+       round_dec2(15, next_key);
+       round_dec2(13, next_key);
+       round_dec2(11, next_key);
+       round_dec2(9, next_key);
+       round_dec2(7, next_key);
+       round_dec2(5, next_key);
+       round_dec2(3, next_key);
+       round_dec2(1, dummy);
+
+       host_to_be(RR0);
+       host_to_be(RL0);
+       host_to_be(RR1);
+       host_to_be(RL1);
+
+       b .Ldec_cbc_tail;
+.ltorg
+.size _gcry_blowfish_armv6_dec_blk2,.-_gcry_blowfish_armv6_dec_blk2;
+
+.align 3
+.globl _gcry_blowfish_armv6_cbc_dec;
+.type  _gcry_blowfish_armv6_cbc_dec,%function;
+
+_gcry_blowfish_armv6_cbc_dec:
+       /* input:
+        *      %r0: CTX
+        *      %r1: dst (2 blocks)
+        *      %r2: src (2 blocks)
+        *      %r3: iv (64bit)
+        */
+       push {%r2-%r11, %ip, %lr};
+
+       read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+
+       /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+        * of function call. */
+       b _gcry_blowfish_armv6_dec_blk2;
+.Ldec_cbc_tail:
+       /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+       /* %r0: src, %r1: dst, %r2: iv */
+       pop {%r0, %r2};
+
+       /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
+       read_block_host(%r0, 0, %r7, %r8, %r5);
+       /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
+       ldm %r2, {%r5, %r6};
+
+       /* out[1] ^= IV+1 */
+       eor %r10, %r7;
+       eor %r9, %r8;
+       /* out[0] ^= IV */
+       eor %r4, %r5;
+       eor %r3, %r6;
+
+       /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
+       read_block_host(%r0, 8, %r7, %r8, %r5);
+       /* store IV+2 to iv[0] (aligned). */
+       stm %r2, {%r7, %r8};
+
+       /* store result to dst[0-3]. Might be unaligned. */
+       write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6);
+
+       pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_armv6_cbc_dec,.-_gcry_blowfish_armv6_cbc_dec;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARM_ARCH >= 6*/
diff --git a/cipher/blowfish.c b/cipher/blowfish.c
index 69baebe..fe4e280 100644
--- a/cipher/blowfish.c
+++ b/cipher/blowfish.c
 # define USE_AMD64_ASM 1
 #endif
 
+/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */
+#undef USE_ARMV6_ASM
+#if defined(__arm__) && defined(__ARMEL__) && \
+        ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \
+       || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+       || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \
+       || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \
+       || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+       || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+       || defined(__ARM_ARCH_7EM__))
+# if (BLOWFISH_ROUNDS == 16) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+#  define USE_ARMV6_ASM 1
+# endif
+#endif
 
 typedef struct {
     u32 s0[256];
@@ -305,7 +319,61 @@ static void decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   _gcry_burn_stack (2*8);
 }
 
-#else /*USE_AMD64_ASM*/
+#elif defined(USE_ARMV6_ASM)
+
+/* Assembly implementations of Blowfish. */
+extern void _gcry_blowfish_armv6_do_encrypt(BLOWFISH_context *c, u32 *ret_xl,
+                                           u32 *ret_xr);
+
+extern void _gcry_blowfish_armv6_encrypt_block(BLOWFISH_context *c, byte *out,
+                                              const byte *in);
+
+extern void _gcry_blowfish_armv6_decrypt_block(BLOWFISH_context *c, byte *out,
+                                              const byte *in);
+
+/* These assembly implementations process two blocks in parallel. */
+extern void _gcry_blowfish_armv6_ctr_enc(BLOWFISH_context *ctx, byte *out,
+                                        const byte *in, byte *ctr);
+
+extern void _gcry_blowfish_armv6_cbc_dec(BLOWFISH_context *ctx, byte *out,
+                                        const byte *in, byte *iv);
+
+extern void _gcry_blowfish_armv6_cfb_dec(BLOWFISH_context *ctx, byte *out,
+                                        const byte *in, byte *iv);
+
+static void
+do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
+{
+  _gcry_blowfish_armv6_do_encrypt (bc, ret_xl, ret_xr);
+}
+
+static void
+do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+  _gcry_blowfish_armv6_encrypt_block (context, outbuf, inbuf);
+}
+
+static void
+do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+  _gcry_blowfish_armv6_decrypt_block (context, outbuf, inbuf);
+}
+
+static void encrypt_block (void *context , byte *outbuf, const byte *inbuf)
+{
+  BLOWFISH_context *c = (BLOWFISH_context *) context;
+  do_encrypt_block (c, outbuf, inbuf);
+  _gcry_burn_stack (10*4);
+}
+
+static void decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+  BLOWFISH_context *c = (BLOWFISH_context *) context;
+  do_decrypt_block (c, outbuf, inbuf);
+  _gcry_burn_stack (10*4);
+}
+
+#else /*USE_ARMV6_ASM*/
 
 #if BLOWFISH_ROUNDS != 16
 static inline u32
@@ -527,7 +595,7 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   _gcry_burn_stack (64);
 }
 
-#endif /*!USE_AMD64_ASM*/
+#endif /*!USE_AMD64_ASM&&!USE_ARMV6_ASM*/
 
 
 /* Bulk encryption of complete blocks in CTR mode.  This function is only
@@ -562,6 +630,21 @@ _gcry_blowfish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
     /* Use generic code to handle smaller chunks... */
     /* TODO: use caching instead? */
   }
+#elif defined(USE_ARMV6_ASM)
+  {
+    /* Process data in 2 block chunks. */
+    while (nblocks >= 2)
+      {
+        _gcry_blowfish_armv6_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+        nblocks -= 2;
+        outbuf += 2 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 2 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+    /* TODO: use caching instead? */
+  }
 #endif
 
   for ( ;nblocks; nblocks-- )
@@ -615,6 +698,20 @@ _gcry_blowfish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
     /* Use generic code to handle smaller chunks... */
   }
+#elif defined(USE_ARMV6_ASM)
+  {
+    /* Process data in 2 block chunks. */
+    while (nblocks >= 2)
+      {
+        _gcry_blowfish_armv6_cbc_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 2;
+        outbuf += 2 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 2 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
 #endif
 
   for ( ;nblocks; nblocks-- )
@@ -664,6 +761,20 @@ _gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
     /* Use generic code to handle smaller chunks... */
   }
+#elif defined(USE_ARMV6_ASM)
+  {
+    /* Process data in 2 block chunks. */
+    while (nblocks >= 2)
+      {
+        _gcry_blowfish_armv6_cfb_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 2;
+        outbuf += 2 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 2 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
 #endif
 
   for ( ;nblocks; nblocks-- )
diff --git a/configure.ac b/configure.ac
index f05f919..22dab9d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1280,6 +1280,10 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-amd64.lo"
       ;;
+      arm*-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-armv6.lo"
+      ;;
    esac
 fi