Fix unaligned accesses with ldm/stm in ChaCha20 and Poly1305 ARM/NEON
author Jussi Kivilinna <jussi.kivilinna@iki.fi>
Thu, 7 Jul 2016 22:22:58 +0000 (01:22 +0300)
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>
Thu, 7 Jul 2016 22:22:58 +0000 (01:22 +0300)
* cipher/chacha20-armv7-neon.S (UNALIGNED_STMIA8)
(UNALIGNED_LDMIA4): New.
(_gcry_chacha20_armv7_neon_blocks): Use new helper macros instead of
ldm/stm instructions directly.
* cipher/poly1305-armv7-neon.S (UNALIGNED_LDMIA2)
(UNALIGNED_LDMIA4): New.
(_gcry_poly1305_armv7_neon_init_ext, _gcry_poly1305_armv7_neon_blocks)
(_gcry_poly1305_armv7_neon_finish_ext): Use new helper macros instead
of ldm instruction directly.
--
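
On ARM, ldm/stm require a word-aligned address and trigger an alignment
fault when given an unaligned one, whereas NEON vld1.32/vst1.32 element
accesses may be unaligned.  The ChaCha20 and Poly1305 ARM/NEON
implementations used ldm/stm directly on caller-provided buffers, so an
unaligned input or output buffer could trigger such faults.  The new
helper macros check the pointer alignment at run time: aligned pointers
keep the ldm/stm fast path, unaligned ones go through vld1.32/vst1.32.
As the macros clobber the condition flags, "tst r12, r12" is re-issued
in the ChaCha20 code where following instructions still depend on the
message-pointer test.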

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/chacha20-armv7-neon.S
cipher/poly1305-armv7-neon.S

diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S
index 1a395ba..4d3340b 100644
 .fpu neon
 .arm
 
+#define UNALIGNED_STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \
+        tst ptr, #3; \
+        beq 1f; \
+        vpush {d0-d3}; \
+        vmov s0, l0; \
+        vmov s1, l1; \
+        vmov s2, l2; \
+        vmov s3, l3; \
+        vmov s4, l4; \
+        vmov s5, l5; \
+        vmov s6, l6; \
+        vmov s7, l7; \
+        vst1.32 {d0-d3}, [ptr]; \
+        add ptr, #32; \
+        vpop {d0-d3}; \
+        b 2f; \
+     1: stmia ptr!, {l0-l7}; \
+     2: ;
+
+#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \
+        tst ptr, #3; \
+        beq 1f; \
+        vpush {d0-d1}; \
+        vld1.32 {d0-d1}, [ptr]; \
+        add ptr, #16; \
+        vmov l0, s0; \
+        vmov l1, s1; \
+        vmov l2, s2; \
+        vmov l3, s3; \
+        vpop {d0-d1}; \
+        b 2f; \
+     1: ldmia ptr!, {l0-l3}; \
+     2: ;
+
 .text
 
 .globl _gcry_chacha20_armv7_neon_blocks
@@ -352,7 +386,8 @@ _gcry_chacha20_armv7_neon_blocks:
        add r7, r7, r11
        vadd.i32 q11, q11, q14
        beq .Lchacha_blocks_neon_nomessage11
-       ldmia r12!, {r8-r11}
+       UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
+       tst r12, r12
        eor r0, r0, r8
        eor r1, r1, r9
        eor r2, r2, r10
@@ -367,7 +402,8 @@ _gcry_chacha20_armv7_neon_blocks:
        add r12, r12, #16
        eor r7, r7, r11
 .Lchacha_blocks_neon_nomessage11:
-       stmia r14!, {r0-r7}
+       UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
+       tst r12, r12
        ldm sp, {r0-r7}
        ldr r8, [sp, #(64 +32)]
        ldr r9, [sp, #(64 +36)]
@@ -391,7 +427,8 @@ _gcry_chacha20_armv7_neon_blocks:
        tst r12, r12
        str r9, [sp, #(64 +52)]
        beq .Lchacha_blocks_neon_nomessage12
-       ldmia r12!, {r8-r11}
+       UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
+       tst r12, r12
        eor r0, r0, r8
        eor r1, r1, r9
        eor r2, r2, r10
@@ -406,7 +443,8 @@ _gcry_chacha20_armv7_neon_blocks:
        add r12, r12, #16
        eor r7, r7, r11
 .Lchacha_blocks_neon_nomessage12:
-       stmia r14!, {r0-r7}
+       UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
+       tst r12, r12
        beq .Lchacha_blocks_neon_nomessage13
        vld1.32 {q12,q13}, [r12]!
        vld1.32 {q14,q15}, [r12]!
@@ -613,7 +651,8 @@ _gcry_chacha20_armv7_neon_blocks:
        tst r12, r12
        add r7, r7, r11
        beq .Lchacha_blocks_neon_nomessage21
-       ldmia r12!, {r8-r11}
+       UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
+       tst r12, r12
        eor r0, r0, r8
        eor r1, r1, r9
        eor r2, r2, r10
@@ -628,7 +667,7 @@ _gcry_chacha20_armv7_neon_blocks:
        add r12, r12, #16
        eor r7, r7, r11
 .Lchacha_blocks_neon_nomessage21:
-       stmia r14!, {r0-r7}
+       UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
        ldm sp, {r0-r7}
        ldr r8, [sp, #(64 +32)]
        ldr r9, [sp, #(64 +36)]
@@ -652,7 +691,8 @@ _gcry_chacha20_armv7_neon_blocks:
        tst r12, r12
        str r9, [sp, #(64 +52)]
        beq .Lchacha_blocks_neon_nomessage22
-       ldmia r12!, {r8-r11}
+       UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
+       tst r12, r12
        eor r0, r0, r8
        eor r1, r1, r9
        eor r2, r2, r10
@@ -667,7 +707,7 @@ _gcry_chacha20_armv7_neon_blocks:
        add r12, r12, #16
        eor r7, r7, r11
 .Lchacha_blocks_neon_nomessage22:
-       stmia r14!, {r0-r7}
+       UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
        str r12, [sp, #48]
        str r14, [sp, #40]
        ldr r3, [sp, #52]
diff --git a/cipher/poly1305-armv7-neon.S b/cipher/poly1305-armv7-neon.S
index b1554ed..13cb4a5 100644
 #  define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
 #endif
 
+#define UNALIGNED_LDMIA2(ptr, l0, l1) \
+        tst ptr, #3; \
+        beq 1f; \
+        vpush {d0}; \
+        vld1.32 {d0}, [ptr]!; \
+        vmov l0, s0; \
+        vmov l1, s1; \
+        vpop {d0}; \
+        b 2f; \
+     1: ldmia ptr!, {l0-l1}; \
+     2: ;
+
+#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \
+        tst ptr, #3; \
+        beq 1f; \
+        vpush {d0-d1}; \
+        vld1.32 {d0-d1}, [ptr]!; \
+        vmov l0, s0; \
+        vmov l1, s1; \
+        vmov l2, s2; \
+        vmov l3, s3; \
+        vpop {d0-d1}; \
+        b 2f; \
+     1: ldmia ptr!, {l0-l3}; \
+     2: ;
+
 .text
 
 .p2align 2
@@ -64,7 +90,7 @@ _gcry_poly1305_armv7_neon_init_ext:
        mov r14, r2
        and r2, r2, r2
        moveq r14, #-1
-       ldmia r1!, {r2-r5}
+       UNALIGNED_LDMIA4(r1, r2, r3, r4, r5)
        GET_DATA_POINTER(r7,.Lpoly1305_init_constants_neon,r8)
        mov r6, r2
        mov r8, r2, lsr #26
@@ -175,7 +201,7 @@ _gcry_poly1305_armv7_neon_init_ext:
        eor r6, r6, r6
        stmia r0!, {r2-r6}
        stmia r0!, {r2-r6}
-       ldmia r1!, {r2-r5}
+       UNALIGNED_LDMIA4(r1, r2, r3, r4, r5)
        stmia r0, {r2-r6}
        add sp, sp, #32
        ldmfd sp!, {r4-r11, lr}
@@ -286,7 +312,7 @@ _gcry_poly1305_armv7_neon_blocks:
        vmov d14, d12
        vmul.i32 q6, q5, d0[0]
 .Lpoly1305_blocks_neon_mainloop:
-       ldmia r0!, {r2-r5}
+       UNALIGNED_LDMIA4(r0, r2, r3, r4, r5)
        vmull.u32 q0, d25, d12[0]
        mov r7, r2, lsr #26
        vmlal.u32 q0, d24, d12[1]
@@ -302,7 +328,7 @@ _gcry_poly1305_armv7_neon_blocks:
        orr r4, r8, r4, lsl #12
        orr r5, r9, r5, lsl #18
        vmlal.u32 q1, d24, d13[0]
-       ldmia r0!, {r7-r10}
+       UNALIGNED_LDMIA4(r0, r7, r8, r9, r10)
        vmlal.u32 q1, d23, d13[1]
        mov r1, r7, lsr #26
        vmlal.u32 q1, d22, d14[0]
@@ -344,7 +370,7 @@ _gcry_poly1305_armv7_neon_blocks:
        vmlal.u32 q4, d21, d11[1]
        vld1.64 {d21-d24}, [r14, :256]!
        vld1.64 {d25}, [r14, :64]
-       ldmia r0!, {r2-r5}
+       UNALIGNED_LDMIA4(r0, r2, r3, r4, r5)
        vmlal.u32 q0, d25, d26
        mov r7, r2, lsr #26
        vmlal.u32 q0, d24, d27
@@ -360,7 +386,7 @@ _gcry_poly1305_armv7_neon_blocks:
        orr r4, r8, r4, lsl #12
        orr r5, r9, r5, lsl #18
        vmlal.u32 q1, d24, d28
-       ldmia r0!, {r7-r10}
+       UNALIGNED_LDMIA4(r0, r7, r8, r9, r10)
        vmlal.u32 q1, d23, d29
        mov r1, r7, lsr #26
        vmlal.u32 q1, d22, d20
@@ -643,7 +669,7 @@ _gcry_poly1305_armv7_neon_finish_ext:
 .Lpoly1305_finish_ext_neon_skip16:
        tst r7, #8
        beq .Lpoly1305_finish_ext_neon_skip8
-       ldmia r1!, {r10-r11}
+       UNALIGNED_LDMIA2(r1, r10, r11)
        stmia r9!, {r10-r11}
 .Lpoly1305_finish_ext_neon_skip8:
        tst r7, #4
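
Below, for illustration only (not part of the commit), is a minimal C
sketch of how an unaligned buffer can reach these ChaCha20/Poly1305
ARM/NEON routines through the public libgcrypt API.  The one-byte
offset, the 256-byte length, the file name and the in-place encrypt
call are assumptions chosen to force word-unaligned data pointers.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <gcrypt.h>

int
main (void)
{
  uint32_t storage[65];                               /* word-aligned backing */
  unsigned char *buf = (unsigned char *)storage + 1;  /* word-unaligned ptr */
  unsigned char key[32] = { 0 };
  unsigned char nonce[12] = { 0 };
  unsigned char tag[16];
  gcry_cipher_hd_t hd;
  gcry_error_t err;

  if (!gcry_check_version (NULL))
    return 1;
  gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

  memset (buf, 0xaa, 256);

  /* ChaCha20-Poly1305 AEAD; when libgcrypt is built with NEON support on
     ARMv7 this exercises the assembly routines patched above.  */
  err = gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20,
                          GCRY_CIPHER_MODE_POLY1305, 0);
  if (!err)
    err = gcry_cipher_setkey (hd, key, sizeof key);
  if (!err)
    err = gcry_cipher_setiv (hd, nonce, sizeof nonce);
  if (!err)
    /* Passing in == NULL encrypts the output buffer in place, so both the
       source and destination pointers seen by the assembly are unaligned.  */
    err = gcry_cipher_encrypt (hd, buf, 256, NULL, 0);
  if (!err)
    err = gcry_cipher_gettag (hd, tag, sizeof tag);

  if (err)
    fprintf (stderr, "operation failed: %s\n", gcry_strerror (err));
  gcry_cipher_close (hd);
  return err ? 1 : 0;
}

Built with, e.g., "cc chacha20-unaligned-test.c $(libgcrypt-config
--cflags --libs)" (the file name is arbitrary).  Before this change the
unaligned pointer reached the ldm/stm instructions directly and,
depending on the kernel's alignment-trap settings, could fault or be
fixed up slowly; with the helper macros both aligned and unaligned
buffers are handled.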