1 /* salsa-armv7-neon.S - ARM NEON implementation of Salsa20 cipher
3 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
5 * This file is part of Libgcrypt.
7 * Libgcrypt is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation; either version 2.1 of
10 * the License, or (at your option) any later version.
12 * Libgcrypt is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
23 #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
24 defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
25 defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_SALSA20)
28 * Based on public domain implementation from SUPERCOP benchmarking framework
29 * by Peter Schwabe and D. J. Bernstein. Paper about the implementation at:
30 * http://cryptojedi.org/papers/#neoncrypto
/*
 * _gcry_arm_neon_salsa20_encrypt
 *
 * Salsa20 encryption routine for ARM cores with NEON.  According to the
 * modification notes that follow the entry label, the C-level arguments
 * are
 *   (void *c, const void *m, unsigned int nblks, void *ctx,
 *    unsigned int rounds)
 * Sigma, key, nonce and the block counter are all taken from 'ctx'; the
 * counter is read at the start and written back at the end.  The length
 * is given in whole blocks, so tail bytes are left to salsa20.c.
 *
 * NOTE(review): under the ARM procedure call standard the first four
 * arguments would arrive in r0-r3 and 'rounds' on the stack; confirm
 * against the prologue.
 */
39 .globl _gcry_arm_neon_salsa20_encrypt
40 .type _gcry_arm_neon_salsa20_encrypt,%function;
41 _gcry_arm_neon_salsa20_encrypt:
43 * - arguments changed to (void *c, const void *m, unsigned int nblks,
44 * void *ctx, unsigned int rounds) from (void *c, const void *m,
45 * unsigned long long mlen, const void *n, const void *k)
46 * - nonce and key read from 'ctx' as well as sigma and counter.
47 * - read in counter from 'ctx' at the start.
48 * - update counter in 'ctx' at the end.
49 * - length is input as number of blocks, so don't handle tail bytes
50 * (this is done in salsa20.c).
/* Store q2 (d4-d5) and q1 (d2-d3), 16 bytes each, to the addresses in r12 and r14. */
/* The ":128" qualifier requires those addresses to be 16-byte aligned. */
107 vst1.8 {d4-d5},[r12,: 128]
110 vst1.8 {d2-d3},[r14,: 128]
/* Unsigned "lower than" branch. */
/* NOTE(review): judging by the label, this skips the wide path when fewer than 192 bytes remain; confirm against the compare that sets the flags. */
120 blo .L_mlenlowbelow192
/* Reload 16 bytes into q9 (d18-d19) from the aligned slot addressed by r12. */
126 vld1.8 {d18-d19},[r12,: 128]
/* vext.32 Dd,Dn,Dm,#1 yields Dd = { Dn[1], Dm[0] }: the upper word of Dn followed by the lower word of Dm. */
131 vext.32 d10,d18,d7,#1
134 vld1.8 {d24-d25},[r14,: 128]
/* Write the rearranged vectors to four separate 16-byte-aligned slots (r8, r9, r2, r6). */
141 vst1.8 {d4-d5},[r8,: 128]
147 vst1.8 {d10-d11},[r9,: 128]
149 vext.32 d12,d7,d19,#1
153 vext.32 d14,d18,d7,#1
156 vst1.8 {d12-d13},[r2,: 128]
158 vst1.8 {d14-d15},[r6,: 128]
/* Scalar (core-register) mixing steps.  Each eor XORs a rotated word into a state word. */
/* ROR by 25, 23, 19 and 14 equals rotate-left by 7, 9, 13 and 18, the Salsa20 quarter-round rotation amounts. */
/* In the three-operand forms with a different first source (e.g. "eor r6,r14,r6,ROR #23") the rotated operand is the old destination value. */
/* NOTE(review): r12 and r14 presumably hold sums produced by add instructions issued just before each pair; confirm. */
170 eor r4,r4,r12,ROR #25
171 eor r7,r7,r14,ROR #25
178 eor r6,r6,r12,ROR #23
179 eor r3,r3,r14,ROR #23
194 eor r2,r2,r12,ROR #19
225 eor r6,r14,r6,ROR #23
234 eor r7,r12,r7,ROR #19
235 eor r11,r11,r14,ROR #19
244 eor r10,r10,r12,ROR #14
245 eor r9,r9,r14,ROR #14
250 eor r1,r1,r12,ROR #25
251 eor r7,r7,r14,ROR #25
258 eor r5,r5,r12,ROR #23
259 eor r6,r6,r14,ROR #23
272 eor r3,r3,r12,ROR #19
274 eor r4,r4,r14,ROR #19
280 eor r0,r0,r12,ROR #14
300 eor r11,r11,r6,ROR #25
301 eor r2,r2,r12,ROR #25
311 eor r3,r12,r3,ROR #23
320 eor r7,r14,r7,ROR #19
321 eor r8,r8,r12,ROR #19
330 eor r10,r10,r12,ROR #14
332 eor r9,r9,r14,ROR #14
/* The same eor sequence as above appears a second time with identical registers and rotations. */
/* NOTE(review): this looks like a second unrolled copy of the round code; confirm against the loop structure. */
342 eor r4,r4,r12,ROR #25
343 eor r7,r7,r14,ROR #25
350 eor r6,r6,r12,ROR #23
351 eor r3,r3,r14,ROR #23
366 eor r2,r2,r12,ROR #19
397 eor r6,r14,r6,ROR #23
406 eor r7,r12,r7,ROR #19
407 eor r11,r11,r14,ROR #19
416 eor r10,r10,r12,ROR #14
417 eor r9,r9,r14,ROR #14
422 eor r1,r1,r12,ROR #25
423 eor r7,r7,r14,ROR #25
430 eor r5,r5,r12,ROR #23
431 eor r6,r6,r14,ROR #23
444 eor r3,r3,r12,ROR #19
446 eor r4,r4,r14,ROR #19
452 eor r0,r0,r12,ROR #14
472 eor r11,r11,r6,ROR #25
473 eor r2,r2,r12,ROR #25
483 eor r3,r12,r3,ROR #23
492 eor r7,r14,r7,ROR #19
493 eor r8,r8,r12,ROR #19
502 eor r10,r10,r12,ROR #14
505 eor r9,r9,r14,ROR #14
/* Two aligned 16-byte loads into q8 (d16-d17) through r0. */
/* NOTE(review): r0 is presumably repointed between the two loads; confirm. */
552 vld1.8 {d16-d17},[r0,: 128]
558 vld1.8 {d16-d17},[r0,: 128]
/* q8 = 0x00000000ffffffff in each 64-bit lane, i.e. a mask covering the low 32 bits of every doubleword. */
/* NOTE(review): presumably consumed by a bitwise-select instruction that merges words from two vectors; confirm. */
564 vmov.i64 q8,#0xffffffff
/* Word shuffles: each vext.32 ...,#1 pairs the upper word of its first source with the lower word of its second. */
566 vext.32 d20,d8,d10,#1
568 vext.32 d25,d9,d11,#1
572 vext.32 d21,d5,d19,#1
574 vext.32 d24,d4,d18,#1
/* Four 16-byte loads through r12; the "!" writeback advances r12 by 16 after each, covering 64 consecutive bytes. */
/* NOTE(review): presumably the input block that is XORed with the keystream between loads; confirm. */
610 vld1.8 {d20-d21},[r12]!
613 vld1.8 {d20-d21},[r12]!
616 vld1.8 {d20-d21},[r12]!
619 vld1.8 {d20-d21},[r12]!
/* 16-byte stores through r8 with writeback (r8 += 16 each); no alignment qualifier is given on these. */
623 vst1.8 {d10-d11},[r8]!
626 vst1.8 {d18-d19},[r8]!
/* Reload two saved vectors from the aligned slots addressed by r6 and r7. */
633 vld1.8 {d10-d11},[r6,: 128]
635 vld1.8 {d14-d15},[r7,: 128]
/* Same word-shuffle pattern as before, applied to a different set of registers. */
639 vext.32 d14,d4,d10,#1
641 vext.32 d21,d5,d11,#1
645 vext.32 d15,d13,d19,#1
648 vext.32 d20,d12,d18,#1
/* Another run of four post-incrementing 16-byte loads through r12, this time into q8. */
658 vld1.8 {d16-d17},[r12]!
660 vld1.8 {d16-d17},[r12]!
662 vld1.8 {d16-d17},[r12]!
664 vld1.8 {d16-d17},[r12]!
/* Post-incrementing 16-byte stores of q5, q6 and q7 through r8. */
668 vst1.8 {d10-d11},[r8]!
669 vst1.8 {d12-d13},[r8]!
670 vst1.8 {d14-d15},[r8]!
/* Unsigned "higher or same" branch. */
/* NOTE(review): judging by the label, this repeats the wide path while at least 192 bytes remain; confirm against the compare that sets the flags. */
676 bhs .L_mlenatleast192
/* A further run of scalar mixing steps using the same rotation set (ROR 25/23/19/14 = ROL 7/9/13/18). */
/* The register assignments differ from the earlier runs. */
/* NOTE(review): presumably the round code for the path taken when less than 192 bytes remain; confirm. */
701 eor r4,r4,r12,ROR #25
702 eor r7,r7,r14,ROR #25
705 eor r6,r6,r12,ROR #23
706 eor r3,r3,r14,ROR #23
711 eor r2,r2,r12,ROR #19
729 eor r6,r12,r6,ROR #23
733 eor r7,r14,r7,ROR #19
734 eor r11,r11,r12,ROR #19
737 eor r10,r10,r12,ROR #14
738 eor r9,r9,r14,ROR #14
741 eor r1,r1,r12,ROR #25
742 eor r7,r7,r14,ROR #25
745 eor r6,r6,r12,ROR #23
746 eor r3,r3,r14,ROR #23
751 eor r5,r5,r12,ROR #19
759 eor r0,r0,r12,ROR #14
765 eor r11,r11,r2,ROR #25
766 eor r2,r6,r12,ROR #25
769 eor r6,r14,r6,ROR #23
770 eor r3,r3,r12,ROR #23
773 eor r7,r7,r12,ROR #19
774 eor r8,r8,r14,ROR #19
777 eor r10,r10,r12,ROR #14
778 eor r9,r9,r14,ROR #14
/* Record the symbol size as the distance from the entry label to this point. */
897 .size _gcry_arm_neon_salsa20_encrypt,.-_gcry_arm_neon_salsa20_encrypt;