1 /* blowfish-amd64.S - AMD64 assembly implementation of Blowfish cipher
3 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
5 * This file is part of Libgcrypt.
7 * Libgcrypt is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation; either version 2.1 of
10 * the License, or (at your option) any later version.
12 * Libgcrypt is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
23 #if defined(USE_BLOWFISH) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
27 /* structure of BLOWFISH_context: */
29 #define s1 ((s0) + 256 * 4)
30 #define s2 ((s1) + 256 * 4)
31 #define s3 ((s2) + 256 * 4)
32 #define p ((s3) + 256 * 4)
70 /***********************************************************************
72 ***********************************************************************/
80 movl s0(CTX,RT0,4), RT0d; \
81 addl s1(CTX,RT2,4), RT0d; \
82 xorl s2(CTX,RT1,4), RT0d; \
83 addl s3(CTX,RT3,4), RT0d; \
86 #define load_roundkey_enc(n) \
87 movq p+4*(n)(CTX), RX3;
89 #define add_roundkey_enc() \
92 #define round_enc(n) \
94 load_roundkey_enc(n); \
99 #define load_roundkey_dec(n) /* fetch round-key material for index n (decryption direction): the load starts one 4-byte slot below index n */ \
100 movq p+4*((n)-1)(CTX), RX3; /* RX3 = the 8 bytes at byte offset p + 4*(n-1); n is parenthesised like in preload_roundkey_dec so expression arguments expand correctly */ \
103 #define add_roundkey_dec() \
106 #define round_dec(n) \
107 add_roundkey_dec(); \
108 load_roundkey_dec(n); \
113 #define read_block() \
118 #define write_block() \
123 .type __blowfish_enc_blk1,@function;
128 * RX0: input plaintext block
130 * RX0: output ciphertext block
134 load_roundkey_enc(0);
148 .size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
151 .globl _gcry_blowfish_amd64_do_encrypt
152 .type _gcry_blowfish_amd64_do_encrypt,@function;
154 _gcry_blowfish_amd64_do_encrypt:
167 call __blowfish_enc_blk1;
174 .size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;
177 .globl _gcry_blowfish_amd64_encrypt_block
178 .type _gcry_blowfish_amd64_encrypt_block,@function;
180 _gcry_blowfish_amd64_encrypt_block:
192 call __blowfish_enc_blk1;
198 .size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;
201 .globl _gcry_blowfish_amd64_decrypt_block
202 .type _gcry_blowfish_amd64_decrypt_block,@function;
204 _gcry_blowfish_amd64_decrypt_block:
217 load_roundkey_dec(17);
234 .size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;
236 /**********************************************************************
237 4-way blowfish, four blocks parallel
238 **********************************************************************/
240 movzbl x ## bh, RT1d; \
241 movzbl x ## bl, RT3d; \
243 movzbl x ## bh, RT0d; \
244 movzbl x ## bl, RT2d; \
246 movl s0(CTX,RT0,4), RT0d; \
247 addl s1(CTX,RT2,4), RT0d; \
248 xorl s2(CTX,RT1,4), RT0d; \
249 addl s3(CTX,RT3,4), RT0d; \
252 #define add_preloaded_roundkey4() \
258 #define preload_roundkey_enc(n) \
259 movq p+4*(n)(CTX), RKEY;
261 #define add_roundkey_enc4(n) \
262 add_preloaded_roundkey4(); \
263 preload_roundkey_enc(n + 2);
265 #define round_enc4(n) \
266 add_roundkey_enc4(n); \
278 #define preload_roundkey_dec(n) \
279 movq p+4*((n)-1)(CTX), RKEY; \
282 #define add_roundkey_dec4(n) \
283 add_preloaded_roundkey4(); \
284 preload_roundkey_dec(n - 2);
286 #define round_dec4(n) \
287 add_roundkey_dec4(n); \
299 #define inbswap_block4() \
309 #define inctrswap_block4() \
315 #define outbswap_block4() \
322 .type __blowfish_enc_blk4,@function;
327 * RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks
329 * RX0,RX1,RX2,RX3: four output ciphertext blocks
331 preload_roundkey_enc(0);
341 add_preloaded_roundkey4();
346 .size __blowfish_enc_blk4,.-__blowfish_enc_blk4;
349 .type __blowfish_dec_blk4,@function;
354 * RX0,RX1,RX2,RX3: four input ciphertext blocks
356 * RX0,RX1,RX2,RX3: four output plaintext blocks
358 preload_roundkey_dec(17);
370 add_preloaded_roundkey4();
375 .size __blowfish_dec_blk4,.-__blowfish_dec_blk4;
378 .globl _gcry_blowfish_amd64_ctr_enc
379 .type _gcry_blowfish_amd64_ctr_enc,@function;
380 _gcry_blowfish_amd64_ctr_enc:
383 * %rsi: dst (4 blocks)
384 * %rdx: src (4 blocks)
385 * %rcx: iv (big endian, 64bit)
392 /* %r11-%r13 are not used by __blowfish_enc_blk4 */
393 movq %rcx, %r13; /*iv*/
394 movq %rdx, %r12; /*src*/
395 movq %rsi, %r11; /*dst*/
397 /* load IV and byteswap */
414 call __blowfish_enc_blk4;
416 /* XOR key-stream with plaintext */
417 xorq 0 * 8(%r12), RX0;
418 xorq 1 * 8(%r12), RX1;
419 xorq 2 * 8(%r12), RX2;
420 xorq 3 * 8(%r12), RX3;
421 movq RX0, 0 * 8(%r11);
422 movq RX1, 1 * 8(%r11);
423 movq RX2, 2 * 8(%r11);
424 movq RX3, 3 * 8(%r11);
432 .size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;
435 .globl _gcry_blowfish_amd64_cbc_dec
436 .type _gcry_blowfish_amd64_cbc_dec,@function;
437 _gcry_blowfish_amd64_cbc_dec:
440 * %rsi: dst (4 blocks)
441 * %rdx: src (4 blocks)
449 /* %r11-%r13 are not used by __blowfish_dec_blk4 */
450 movq %rsi, %r11; /*dst*/
451 movq %rdx, %r12; /*src*/
452 movq %rcx, %r13; /*iv*/
455 movq 0 * 8(%r12), RX0;
456 movq 1 * 8(%r12), RX1;
457 movq 2 * 8(%r12), RX2;
458 movq 3 * 8(%r12), RX3;
460 call __blowfish_dec_blk4;
462 movq 3 * 8(%r12), RT0;
464 xorq 0 * 8(%r12), RX1;
465 xorq 1 * 8(%r12), RX2;
466 xorq 2 * 8(%r12), RX3;
467 movq RT0, (%r13); /* store new IV */
469 movq RX0, 0 * 8(%r11);
470 movq RX1, 1 * 8(%r11);
471 movq RX2, 2 * 8(%r11);
472 movq RX3, 3 * 8(%r11);
480 .size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;
483 .globl _gcry_blowfish_amd64_cfb_dec
484 .type _gcry_blowfish_amd64_cfb_dec,@function;
485 _gcry_blowfish_amd64_cfb_dec:
488 * %rsi: dst (4 blocks)
489 * %rdx: src (4 blocks)
497 /* %r11-%r13 are not used by __blowfish_enc_blk4 */
498 movq %rcx, %r13; /*iv*/
499 movq %rdx, %r12; /*src*/
500 movq %rsi, %r11; /*dst*/
504 movq 0 * 8(%r12), RX1;
505 movq 1 * 8(%r12), RX2;
506 movq 2 * 8(%r12), RX3;
511 movq 3 * 8(%r12), RT0;
514 call __blowfish_enc_blk4;
516 xorq 0 * 8(%r12), RX0;
517 xorq 1 * 8(%r12), RX1;
518 xorq 2 * 8(%r12), RX2;
519 xorq 3 * 8(%r12), RX3;
520 movq RX0, 0 * 8(%r11);
521 movq RX1, 1 * 8(%r11);
522 movq RX2, 2 * 8(%r11);
523 movq RX3, 3 * 8(%r11);
530 .size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;
532 #endif /*defined(USE_BLOWFISH)*/