OCB ARM CE: Move ocb_get_l handling to assembly part
author Jussi Kivilinna <jussi.kivilinna@iki.fi>
Sat, 10 Dec 2016 10:29:12 +0000 (12:29 +0200)
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>
Sat, 10 Dec 2016 10:29:12 +0000 (12:29 +0200)
* cipher/rijndael-armv8-aarch32-ce.S: Add OCB 'L_{ntz(i)}' calculation.
* cipher/rijndael-armv8-aarch64-ce.S: Ditto.
* cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce)
(_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce)
(ocb_crypt_fn_t): Updated arguments.
(_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_ocb_auth): Remove
'ocb_get_l' handling and the splitting of input into 32-block chunks;
instead pass full buffers to assembly.
--

Performance on Cortex-A53 (AArch32):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        OCB enc |      1.63 ns/B     583.8 MiB/s      1.88 c/B
        OCB dec |      1.67 ns/B     572.1 MiB/s      1.92 c/B
       OCB auth |      1.33 ns/B     717.1 MiB/s      1.53 c/B

After (~12% faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        OCB enc |      1.47 ns/B     650.2 MiB/s      1.69 c/B
        OCB dec |      1.48 ns/B     644.5 MiB/s      1.70 c/B
       OCB auth |      1.19 ns/B     798.2 MiB/s      1.38 c/B

Performance on Cortex-A53 (AArch64):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        OCB enc |      1.29 ns/B     738.5 MiB/s      1.49 c/B
        OCB dec |      1.32 ns/B     723.5 MiB/s      1.52 c/B
       OCB auth |      1.15 ns/B     827.0 MiB/s      1.33 c/B

After (~8% faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        OCB enc |      1.21 ns/B     789.1 MiB/s      1.39 c/B
        OCB dec |      1.21 ns/B     789.2 MiB/s      1.39 c/B
       OCB auth |      1.10 ns/B     867.0 MiB/s      1.27 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/rijndael-armv8-aarch32-ce.S
cipher/rijndael-armv8-aarch64-ce.S
cipher/rijndael-armv8-ce.c

diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
index bf68f20..f375f67 100644
@@ -1021,9 +1021,10 @@ _gcry_aes_ctr_enc_armv8_ce:
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
  *                                  unsigned char *checksum,
- *                                  void **Ls,
+ *                                  unsigned char *L_table,
  *                                  size_t nblocks,
- *                                  unsigned int nrounds);
+ *                                  unsigned int nrounds,
+ *                                  unsigned int blkn);
  */
 
 .align 3
@@ -1039,6 +1040,7 @@ _gcry_aes_ocb_enc_armv8_ce:
    *    %st+4: Ls => r5
    *    %st+8: nblocks => r6  (0 < nblocks <= 32)
    *    %st+12: nrounds => r7
+   *    %st+16: blkn => lr
    */
 
   vpush {q4-q7}
@@ -1047,6 +1049,7 @@ _gcry_aes_ocb_enc_armv8_ce:
   ldr r4, [sp, #(104+0)]
   ldr r5, [sp, #(104+4)]
   ldr r6, [sp, #(104+8)]
+  ldr lr, [sp, #(104+16)]
 
   cmp r7, #12
   vld1.8 {q0}, [r3] /* load offset */
@@ -1059,6 +1062,7 @@ _gcry_aes_ocb_enc_armv8_ce:
 #define OCB_ENC(bits, ...) \
   .Locb_enc_entry_##bits: \
     cmp r6, #4; \
+    add lr, #1; \
     blo .Locb_enc_loop_##bits; \
     \
   .Locb_enc_loop4_##bits: \
@@ -1067,7 +1071,23 @@ _gcry_aes_ocb_enc_armv8_ce:
     /* Checksum_i = Checksum_{i-1} xor P_i  */ \
     /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */ \
     \
-    ldm r5!, {r8, r9, r10, r11}; \
+    add r9, lr, #1; \
+    add r10, lr, #2; \
+    add r11, lr, #3; \
+    rbit r8, lr; \
+    add lr, lr, #4; \
+    rbit r9, r9; \
+    rbit r10, r10; \
+    rbit r11, r11; \
+    clz r8, r8; /* ntz(i+0) */ \
+    clz r9, r9; /* ntz(i+1) */ \
+    clz r10, r10; /* ntz(i+2) */ \
+    clz r11, r11; /* ntz(i+3) */ \
+    add r8, r5, r8, lsl #4; \
+    add r9, r5, r9, lsl #4; \
+    add r10, r5, r10, lsl #4; \
+    add r11, r5, r11, lsl #4; \
+    \
     sub r6, #4; \
     \
     vld1.8 {q9}, [r8];     /* load L_{ntz(i+0)} */ \
@@ -1120,7 +1140,11 @@ _gcry_aes_ocb_enc_armv8_ce:
     /* Checksum_i = Checksum_{i-1} xor P_i  */ \
     /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */ \
     \
-    ldr r8, [r5], #4; \
+    rbit r8, lr; \
+    add lr, #1; \
+    clz r8, r8; /* ntz(i) */ \
+    add r8, r5, r8, lsl #4; \
+    \
     vld1.8 {q1}, [r2]!; /* load plaintext */ \
     vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
     vld1.8 {q3}, [r4]; /* load checksum */ \
@@ -1171,9 +1195,10 @@ _gcry_aes_ocb_enc_armv8_ce:
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
  *                                  unsigned char *checksum,
- *                                  void **Ls,
+ *                                  unsigned char *L_table,
  *                                  size_t nblocks,
- *                                  unsigned int nrounds);
+ *                                  unsigned int nrounds,
+ *                                  unsigned int blkn);
  */
 
 .align 3
@@ -1189,6 +1214,7 @@ _gcry_aes_ocb_dec_armv8_ce:
    *    %st+4: Ls => r5
    *    %st+8: nblocks => r6  (0 < nblocks <= 32)
    *    %st+12: nrounds => r7
+   *    %st+16: blkn => lr
    */
 
   vpush {q4-q7}
@@ -1197,6 +1223,7 @@ _gcry_aes_ocb_dec_armv8_ce:
   ldr r4, [sp, #(104+0)]
   ldr r5, [sp, #(104+4)]
   ldr r6, [sp, #(104+8)]
+  ldr lr, [sp, #(104+16)]
 
   cmp r7, #12
   vld1.8 {q0}, [r3] /* load offset */
@@ -1209,6 +1236,7 @@ _gcry_aes_ocb_dec_armv8_ce:
 #define OCB_DEC(bits, ...) \
   .Locb_dec_entry_##bits: \
     cmp r6, #4; \
+    add lr, #1; \
     blo .Locb_dec_loop_##bits; \
     \
   .Locb_dec_loop4_##bits: \
@@ -1217,7 +1245,23 @@ _gcry_aes_ocb_dec_armv8_ce:
     /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */ \
     /* Checksum_i = Checksum_{i-1} xor P_i  */ \
     \
-    ldm r5!, {r8, r9, r10, r11}; \
+    add r9, lr, #1; \
+    add r10, lr, #2; \
+    add r11, lr, #3; \
+    rbit r8, lr; \
+    add lr, lr, #4; \
+    rbit r9, r9; \
+    rbit r10, r10; \
+    rbit r11, r11; \
+    clz r8, r8; /* ntz(i+0) */ \
+    clz r9, r9; /* ntz(i+1) */ \
+    clz r10, r10; /* ntz(i+2) */ \
+    clz r11, r11; /* ntz(i+3) */ \
+    add r8, r5, r8, lsl #4; \
+    add r9, r5, r9, lsl #4; \
+    add r10, r5, r10, lsl #4; \
+    add r11, r5, r11, lsl #4; \
+    \
     sub r6, #4; \
     \
     vld1.8 {q9}, [r8];     /* load L_{ntz(i+0)} */ \
@@ -1270,7 +1314,11 @@ _gcry_aes_ocb_dec_armv8_ce:
     /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */ \
     /* Checksum_i = Checksum_{i-1} xor P_i  */ \
     \
-    ldr r8, [r5], #4; \
+    rbit r8, lr; \
+    add lr, #1; \
+    clz r8, r8; /* ntz(i) */ \
+    add r8, r5, r8, lsl #4; \
+    \
     vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
     vld1.8 {q1}, [r2]!; /* load ciphertext */ \
     subs r6, #1; \
@@ -1320,9 +1368,10 @@ _gcry_aes_ocb_dec_armv8_ce:
  *                                   const unsigned char *abuf,
  *                                   unsigned char *offset,
  *                                   unsigned char *checksum,
- *                                   void **Ls,
+ *                                   unsigned char *L_table,
  *                                   size_t nblocks,
- *                                   unsigned int nrounds);
+ *                                   unsigned int nrounds,
+ *                                   unsigned int blkn);
  */
 
 .align 3
@@ -1337,6 +1386,7 @@ _gcry_aes_ocb_auth_armv8_ce:
    *    %st+0: Ls => r5
    *    %st+4: nblocks => r6  (0 < nblocks <= 32)
    *    %st+8: nrounds => r7
+   *    %st+12: blkn => lr
    */
 
   vpush {q4-q7}
@@ -1344,6 +1394,7 @@ _gcry_aes_ocb_auth_armv8_ce:
   ldr r7, [sp, #(104+8)]
   ldr r5, [sp, #(104+0)]
   ldr r6, [sp, #(104+4)]
+  ldr lr, [sp, #(104+12)]
 
   cmp r7, #12
   vld1.8 {q0}, [r2] /* load offset */
@@ -1356,6 +1407,7 @@ _gcry_aes_ocb_auth_armv8_ce:
 #define OCB_AUTH(bits, ...) \
   .Locb_auth_entry_##bits: \
     cmp r6, #4; \
+    add lr, #1; \
     blo .Locb_auth_loop_##bits; \
     \
   .Locb_auth_loop4_##bits: \
@@ -1363,7 +1415,23 @@ _gcry_aes_ocb_auth_armv8_ce:
     /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
     /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */ \
     \
-    ldm r5!, {r8, r9, r10, r11}; \
+    add r9, lr, #1; \
+    add r10, lr, #2; \
+    add r11, lr, #3; \
+    rbit r8, lr; \
+    add lr, lr, #4; \
+    rbit r9, r9; \
+    rbit r10, r10; \
+    rbit r11, r11; \
+    clz r8, r8; /* ntz(i+0) */ \
+    clz r9, r9; /* ntz(i+1) */ \
+    clz r10, r10; /* ntz(i+2) */ \
+    clz r11, r11; /* ntz(i+3) */ \
+    add r8, r5, r8, lsl #4; \
+    add r9, r5, r9, lsl #4; \
+    add r10, r5, r10, lsl #4; \
+    add r11, r5, r11, lsl #4; \
+    \
     sub r6, #4; \
     \
     vld1.8 {q9}, [r8];     /* load L_{ntz(i+0)} */ \
@@ -1401,8 +1469,12 @@ _gcry_aes_ocb_auth_armv8_ce:
     /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
     /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */ \
     \
-    ldr r8, [r5], #4; \
-    vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
+    rbit r8, lr; \
+    add lr, #1; \
+    clz r8, r8; /* ntz(i) */ \
+    add r8, r5, r8, lsl #4; \
+    \
+    vld1.8 {q2}, [r8];  /* load L_{ntz(i)} */ \
     vld1.8 {q1}, [r1]!; /* load aadtext */ \
     subs r6, #1; \
     veor q0, q0, q2; \
diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S
index 21d0aec..1ebb363 100644
 .text
 
 
-#if (SIZEOF_VOID_P == 4)
-  #define ptr8   w8
-  #define ptr9   w9
-  #define ptr10  w10
-  #define ptr11  w11
-  #define ptr_sz 4
-#elif (SIZEOF_VOID_P == 8)
-  #define ptr8   x8
-  #define ptr9   x9
-  #define ptr10  x10
-  #define ptr11  x11
-  #define ptr_sz 8
-#else
-  #error "missing SIZEOF_VOID_P"
-#endif
-
-
 #define GET_DATA_POINTER(reg, name) \
        adrp    reg, :got:name ; \
        ldr     reg, [reg, #:got_lo12:name] ;
@@ -855,9 +838,10 @@ _gcry_aes_cfb_dec_armv8_ce:
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
  *                                  unsigned char *checksum,
- *                                  void **Ls,
+ *                                  unsigned char *L_table,
  *                                  size_t nblocks,
- *                                  unsigned int nrounds);
+ *                                  unsigned int nrounds,
+ *                                  unsigned int blkn);
  */
 
 .align 3
@@ -870,11 +854,13 @@ _gcry_aes_ocb_enc_armv8_ce:
    *    x2: inbuf
    *    x3: offset
    *    x4: checksum
-   *    x5: Ls
+   *    x5: Ltable
    *    x6: nblocks (0 < nblocks <= 32)
    *    w7: nrounds
+   *    %st+0: blkn => w12
    */
 
+  ldr w12, [sp]
   ld1 {v0.16b}, [x3] /* load offset */
   ld1 {v16.16b}, [x4] /* load checksum */
 
@@ -886,6 +872,7 @@ _gcry_aes_ocb_enc_armv8_ce:
 #define OCB_ENC(bits, ...) \
   .Locb_enc_entry_##bits: \
     cmp x6, #4; \
+    add x12, x12, #1; \
     b.lo .Locb_enc_loop_##bits; \
     \
   .Locb_enc_loop4_##bits: \
@@ -894,10 +881,24 @@ _gcry_aes_ocb_enc_armv8_ce:
     /* Checksum_i = Checksum_{i-1} xor P_i  */ \
     /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */ \
     \
-    ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+    add w9, w12, #1; \
+    add w10, w12, #2; \
+    add w11, w12, #3; \
+    rbit w8, w12; \
+    add w12, w12, #4; \
+    rbit w9, w9; \
+    rbit w10, w10; \
+    rbit w11, w11; \
+    clz w8, w8; /* ntz(i+0) */ \
+    clz w9, w9; /* ntz(i+1) */ \
+    clz w10, w10; /* ntz(i+2) */ \
+    clz w11, w11; /* ntz(i+3) */ \
+    add x8, x5, x8, lsl #4; \
+    ld1 {v1.16b-v4.16b}, [x2], #64;   /* load P_i+<0-3> */ \
+    add x9, x5, x9, lsl #4; \
+    add x10, x5, x10, lsl #4; \
+    add x11, x5, x11, lsl #4; \
     \
-    ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \
-    ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
     sub x6, x6, #4; \
     \
     ld1 {v5.16b}, [x8];               /* load L_{ntz(i+0)} */ \
@@ -940,7 +941,11 @@ _gcry_aes_ocb_enc_armv8_ce:
     /* Checksum_i = Checksum_{i-1} xor P_i  */ \
     /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */ \
     \
-    ldr ptr8, [x5], #(ptr_sz); \
+    rbit x8, x12; \
+    add x12, x12, #1; \
+    clz x8, x8; /* ntz(i) */ \
+    add x8, x5, x8, lsl #4; \
+    \
     ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
     ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
     sub x6, x6, #1; \
@@ -983,9 +988,10 @@ _gcry_aes_ocb_enc_armv8_ce:
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
  *                                  unsigned char *checksum,
- *                                  void **Ls,
+ *                                  unsigned char *L_table,
  *                                  size_t nblocks,
- *                                  unsigned int nrounds);
+ *                                  unsigned int nrounds,
+ *                                  unsigned int blkn);
  */
 
 .align 3
@@ -998,11 +1004,13 @@ _gcry_aes_ocb_dec_armv8_ce:
    *    x2: inbuf
    *    x3: offset
    *    x4: checksum
-   *    x5: Ls
+   *    x5: Ltable
    *    x6: nblocks (0 < nblocks <= 32)
    *    w7: nrounds
+   *    %st+0: blkn => w12
    */
 
+  ldr w12, [sp]
   ld1 {v0.16b}, [x3] /* load offset */
   ld1 {v16.16b}, [x4] /* load checksum */
 
@@ -1014,6 +1022,7 @@ _gcry_aes_ocb_dec_armv8_ce:
 #define OCB_DEC(bits) \
   .Locb_dec_entry_##bits: \
     cmp x6, #4; \
+    add w12, w12, #1; \
     b.lo .Locb_dec_loop_##bits; \
     \
   .Locb_dec_loop4_##bits: \
@@ -1022,10 +1031,24 @@ _gcry_aes_ocb_dec_armv8_ce:
     /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */ \
     /* Checksum_i = Checksum_{i-1} xor P_i  */ \
     \
-    ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+    add w9, w12, #1; \
+    add w10, w12, #2; \
+    add w11, w12, #3; \
+    rbit w8, w12; \
+    add w12, w12, #4; \
+    rbit w9, w9; \
+    rbit w10, w10; \
+    rbit w11, w11; \
+    clz w8, w8; /* ntz(i+0) */ \
+    clz w9, w9; /* ntz(i+1) */ \
+    clz w10, w10; /* ntz(i+2) */ \
+    clz w11, w11; /* ntz(i+3) */ \
+    add x8, x5, x8, lsl #4; \
+    ld1 {v1.16b-v4.16b}, [x2], #64;   /* load C_i+<0-3> */ \
+    add x9, x5, x9, lsl #4; \
+    add x10, x5, x10, lsl #4; \
+    add x11, x5, x11, lsl #4; \
     \
-    ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \
-    ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
     sub x6, x6, #4; \
     \
     ld1 {v5.16b}, [x8];               /* load L_{ntz(i+0)} */ \
@@ -1068,7 +1091,11 @@ _gcry_aes_ocb_dec_armv8_ce:
     /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */ \
     /* Checksum_i = Checksum_{i-1} xor P_i  */ \
     \
-    ldr ptr8, [x5], #(ptr_sz); \
+    rbit w8, w12; \
+    add w12, w12, #1; \
+    clz w8, w8; /* ntz(i) */ \
+    add x8, x5, x8, lsl #4; \
+    \
     ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
     ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
     sub x6, x6, #1; \
@@ -1110,9 +1137,10 @@ _gcry_aes_ocb_dec_armv8_ce:
  *                                   const unsigned char *abuf,
  *                                   unsigned char *offset,
  *                                   unsigned char *checksum,
- *                                   void **Ls,
+ *                                   unsigned char *L_table,
  *                                   size_t nblocks,
- *                                   unsigned int nrounds);
+ *                                   unsigned int nrounds,
+ *                                   unsigned int blkn);
  */
 
 .align 3
@@ -1124,10 +1152,12 @@ _gcry_aes_ocb_auth_armv8_ce:
    *    x1: abuf
    *    x2: offset => x3
    *    x3: checksum => x4
-   *    x4: Ls => x5
+   *    x4: Ltable => x5
    *    x5: nblocks => x6  (0 < nblocks <= 32)
    *    w6: nrounds => w7
+   *    w7: blkn => w12
    */
+  mov x12, x7
   mov x7, x6
   mov x6, x5
   mov x5, x4
@@ -1145,6 +1175,7 @@ _gcry_aes_ocb_auth_armv8_ce:
 #define OCB_AUTH(bits) \
   .Locb_auth_entry_##bits: \
     cmp x6, #4; \
+    add w12, w12, #1; \
     b.lo .Locb_auth_loop_##bits; \
     \
   .Locb_auth_loop4_##bits: \
@@ -1152,10 +1183,24 @@ _gcry_aes_ocb_auth_armv8_ce:
     /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
     /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */ \
     \
-    ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+    add w9, w12, #1; \
+    add w10, w12, #2; \
+    add w11, w12, #3; \
+    rbit w8, w12; \
+    add w12, w12, #4; \
+    rbit w9, w9; \
+    rbit w10, w10; \
+    rbit w11, w11; \
+    clz w8, w8; /* ntz(i+0) */ \
+    clz w9, w9; /* ntz(i+1) */ \
+    clz w10, w10; /* ntz(i+2) */ \
+    clz w11, w11; /* ntz(i+3) */ \
+    add x8, x5, x8, lsl #4; \
+    ld1 {v1.16b-v4.16b}, [x1], #64;   /* load A_i+<0-3> */ \
+    add x9, x5, x9, lsl #4; \
+    add x10, x5, x10, lsl #4; \
+    add x11, x5, x11, lsl #4; \
     \
-    ld1 {v1.16b-v4.16b}, [x1], #64; /* load A_i+<0-3> */ \
-    ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
     sub x6, x6, #4; \
     \
     ld1 {v5.16b}, [x8];               /* load L_{ntz(i+0)} */ \
@@ -1192,7 +1237,11 @@ _gcry_aes_ocb_auth_armv8_ce:
     /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
     /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */ \
     \
-    ldr ptr8, [x5], #(ptr_sz); \
+    rbit w8, w12; \
+    add w12, w12, #1; \
+    clz w8, w8; /* ntz(i) */ \
+    add x8, x5, x8, lsl #4; \
+    \
     ld1 {v1.16b}, [x1], #16; /* load aadtext */ \
     ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
     sub x6, x6, #1; \
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
index 1bf74da..334cf68 100644
@@ -80,30 +80,33 @@ extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
                                         const unsigned char *inbuf,
                                         unsigned char *offset,
                                         unsigned char *checksum,
-                                        void **Ls,
+                                        unsigned char *L_table,
                                         size_t nblocks,
-                                        unsigned int nrounds);
+                                        unsigned int nrounds,
+                                        unsigned int blkn);
 extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
                                         unsigned char *outbuf,
                                         const unsigned char *inbuf,
                                         unsigned char *offset,
                                         unsigned char *checksum,
-                                        void **Ls,
+                                        unsigned char *L_table,
                                         size_t nblocks,
-                                        unsigned int nrounds);
+                                        unsigned int nrounds,
+                                        unsigned int blkn);
 extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
                                          const unsigned char *abuf,
                                          unsigned char *offset,
                                          unsigned char *checksum,
-                                         void **Ls,
+                                         unsigned char *L_table,
                                          size_t nblocks,
-                                         unsigned int nrounds);
+                                         unsigned int nrounds,
+                                         unsigned int blkn);
 
 typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
                                 const unsigned char *inbuf,
                                 unsigned char *offset, unsigned char *checksum,
-                                void **Ls, size_t nblocks,
-                                unsigned int nrounds);
+                                unsigned char *L_table, size_t nblocks,
+                                unsigned int nrounds, unsigned int blkn);
 
 void
 _gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key)
@@ -334,62 +337,11 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   const unsigned char *inbuf = inbuf_arg;
   unsigned int nrounds = ctx->rounds;
   u64 blkn = c->u_mode.ocb.data_nblocks;
-  u64 blkn_offs = blkn - blkn % 32;
-  unsigned int n = 32 - blkn % 32;
-  void *Ls[32];
-  void **l;
-  size_t i;
 
   c->u_mode.ocb.data_nblocks = blkn + nblocks;
 
-  if (nblocks >= 32)
-    {
-      for (i = 0; i < 32; i += 8)
-        {
-          Ls[(i + 0 + n) % 32] = (void *)c->u_mode.ocb.L[0];
-          Ls[(i + 1 + n) % 32] = (void *)c->u_mode.ocb.L[1];
-          Ls[(i + 2 + n) % 32] = (void *)c->u_mode.ocb.L[0];
-          Ls[(i + 3 + n) % 32] = (void *)c->u_mode.ocb.L[2];
-          Ls[(i + 4 + n) % 32] = (void *)c->u_mode.ocb.L[0];
-          Ls[(i + 5 + n) % 32] = (void *)c->u_mode.ocb.L[1];
-          Ls[(i + 6 + n) % 32] = (void *)c->u_mode.ocb.L[0];
-        }
-
-      Ls[(7 + n) % 32] = (void *)c->u_mode.ocb.L[3];
-      Ls[(15 + n) % 32] = (void *)c->u_mode.ocb.L[4];
-      Ls[(23 + n) % 32] = (void *)c->u_mode.ocb.L[3];
-      l = &Ls[(31 + n) % 32];
-
-      /* Process data in 32 block chunks. */
-      while (nblocks >= 32)
-        {
-          blkn_offs += 32;
-          *l = (void *)ocb_get_l(c, blkn_offs);
-
-          crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls, 32,
-                    nrounds);
-
-          nblocks -= 32;
-          outbuf += 32 * 16;
-          inbuf  += 32 * 16;
-        }
-
-      if (nblocks && l < &Ls[nblocks])
-        {
-          *l = (void *)ocb_get_l(c, 32 + blkn_offs);
-        }
-    }
-  else
-    {
-      for (i = 0; i < nblocks; i++)
-        Ls[i] = (void *)ocb_get_l(c, ++blkn);
-    }
-
-  if (nblocks)
-    {
-      crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls, nblocks,
-               nrounds);
-    }
+  crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
+           c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn);
 }
 
 void
@@ -401,61 +353,12 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
   const unsigned char *abuf = abuf_arg;
   unsigned int nrounds = ctx->rounds;
   u64 blkn = c->u_mode.ocb.aad_nblocks;
-  u64 blkn_offs = blkn - blkn % 32;
-  unsigned int n = 32 - blkn % 32;
-  void *Ls[32];
-  void **l;
-  size_t i;
 
   c->u_mode.ocb.aad_nblocks = blkn + nblocks;
 
-  if (nblocks >= 32)
-    {
-      for (i = 0; i < 32; i += 8)
-        {
-          Ls[(i + 0 + n) % 32] = (void *)c->u_mode.ocb.L[0];
-          Ls[(i + 1 + n) % 32] = (void *)c->u_mode.ocb.L[1];
-          Ls[(i + 2 + n) % 32] = (void *)c->u_mode.ocb.L[0];
-          Ls[(i + 3 + n) % 32] = (void *)c->u_mode.ocb.L[2];
-          Ls[(i + 4 + n) % 32] = (void *)c->u_mode.ocb.L[0];
-          Ls[(i + 5 + n) % 32] = (void *)c->u_mode.ocb.L[1];
-          Ls[(i + 6 + n) % 32] = (void *)c->u_mode.ocb.L[0];
-        }
-
-      Ls[(7 + n) % 32] = (void *)c->u_mode.ocb.L[3];
-      Ls[(15 + n) % 32] = (void *)c->u_mode.ocb.L[4];
-      Ls[(23 + n) % 32] = (void *)c->u_mode.ocb.L[3];
-      l = &Ls[(31 + n) % 32];
-
-      /* Process data in 32 block chunks. */
-      while (nblocks >= 32)
-        {
-          blkn_offs += 32;
-          *l = (void *)ocb_get_l(c, blkn_offs);
-
-          _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
-                                      c->u_mode.ocb.aad_sum, Ls, 32, nrounds);
-
-          nblocks -= 32;
-          abuf += 32 * 16;
-        }
-
-      if (nblocks && l < &Ls[nblocks])
-        {
-          *l = (void *)ocb_get_l(c, 32 + blkn_offs);
-        }
-    }
-  else
-    {
-      for (i = 0; i < nblocks; i++)
-        Ls[i] = (void *)ocb_get_l(c, ++blkn);
-    }
-
-  if (nblocks)
-    {
-      _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
-                                  c->u_mode.ocb.aad_sum, Ls, nblocks, nrounds);
-    }
+  _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
+                             c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0],
+                             nblocks, nrounds, (unsigned int)blkn);
 }
 
 #endif /* USE_ARM_CE */