Enable AMD64 CAST5 implementation on WIN64
author Jussi Kivilinna <jussi.kivilinna@iki.fi>
Tue, 5 May 2015 17:46:10 +0000 (20:46 +0300)
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>
Thu, 14 May 2015 10:43:59 +0000 (13:43 +0300)
* cipher/cast5-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(RIP): Remove.
(GET_EXTERN_POINTER): Use 'leaq' version on WIN64.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/cast5.c (USE_AMD64_ASM): Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
[HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): New.
(do_encrypt_block, do_decrypt_block)
[HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]: Call assembly
function through 'call_sysv_fn'.
(cast5_amd64_ctr_enc, cast5_amd64_cbc_dec)
(cast5_amd64_cfb_dec): New wrapper functions for bulk
assembly functions.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/cast5-amd64.S
cipher/cast5.c

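diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S
index 41fbb74..a5f078e 100644
--- a/cipher/cast5-amd64.S
+++ b/cipher/cast5-amd64.S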
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_CAST5)
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5)
 
-#ifdef __PIC__
-#  define RIP %rip
+#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__)
+#  define GET_EXTERN_POINTER(name, reg) leaq name, reg
+#else
 #  define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg
+#endif
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
 #else
-#  define RIP
-#  define GET_EXTERN_POINTER(name, reg) leaq name, reg
+# define ELF(...) /*_*/
 #endif
 
 .text
 
 .align 8
 .globl _gcry_cast5_amd64_encrypt_block
-.type   _gcry_cast5_amd64_encrypt_block,@function;
+ELF(.type   _gcry_cast5_amd64_encrypt_block,@function;)
 
 _gcry_cast5_amd64_encrypt_block:
        /* input:
@@ -216,11 +221,11 @@ _gcry_cast5_amd64_encrypt_block:
        popq %rbx;
        popq %rbp;
        ret;
-.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;
+ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;)
 
 .align 8
 .globl _gcry_cast5_amd64_decrypt_block
-.type   _gcry_cast5_amd64_decrypt_block,@function;
+ELF(.type   _gcry_cast5_amd64_decrypt_block,@function;)
 
 _gcry_cast5_amd64_decrypt_block:
        /* input:
@@ -256,7 +261,7 @@ _gcry_cast5_amd64_decrypt_block:
        popq %rbx;
        popq %rbp;
        ret;
-.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;
+ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;)
 
 /**********************************************************************
   4-way cast5, four blocks parallel
@@ -359,7 +364,7 @@ _gcry_cast5_amd64_decrypt_block:
        rorq $32,               d;
 
 .align 8
-.type   __cast5_enc_blk4,@function;
+ELF(.type   __cast5_enc_blk4,@function;)
 
 __cast5_enc_blk4:
        /* input:
@@ -384,10 +389,10 @@ __cast5_enc_blk4:
 
        outbswap_block4(RLR0, RLR1, RLR2, RLR3);
        ret;
-.size __cast5_enc_blk4,.-__cast5_enc_blk4;
+ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;)
 
 .align 8
-.type   __cast5_dec_blk4,@function;
+ELF(.type   __cast5_dec_blk4,@function;)
 
 __cast5_dec_blk4:
        /* input:
@@ -414,11 +419,11 @@ __cast5_dec_blk4:
 
        outbswap_block4(RLR0, RLR1, RLR2, RLR3);
        ret;
-.size __cast5_dec_blk4,.-__cast5_dec_blk4;
+ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;)
 
 .align 8
 .globl _gcry_cast5_amd64_ctr_enc
-.type   _gcry_cast5_amd64_ctr_enc,@function;
+ELF(.type   _gcry_cast5_amd64_ctr_enc,@function;)
 _gcry_cast5_amd64_ctr_enc:
        /* input:
         *      %rdi: ctx, CTX
@@ -472,11 +477,11 @@ _gcry_cast5_amd64_ctr_enc:
        popq %rbx;
        popq %rbp;
        ret
-.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;
+ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;)
 
 .align 8
 .globl _gcry_cast5_amd64_cbc_dec
-.type   _gcry_cast5_amd64_cbc_dec,@function;
+ELF(.type   _gcry_cast5_amd64_cbc_dec,@function;)
 _gcry_cast5_amd64_cbc_dec:
        /* input:
         *      %rdi: ctx, CTX
@@ -526,11 +531,11 @@ _gcry_cast5_amd64_cbc_dec:
        popq %rbp;
        ret;
 
-.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;
+ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;)
 
 .align 8
 .globl _gcry_cast5_amd64_cfb_dec
-.type   _gcry_cast5_amd64_cfb_dec,@function;
+ELF(.type   _gcry_cast5_amd64_cfb_dec,@function;)
 _gcry_cast5_amd64_cfb_dec:
        /* input:
         *      %rdi: ctx, CTX
@@ -581,7 +586,7 @@ _gcry_cast5_amd64_cfb_dec:
        popq %rbp;
        ret;
 
-.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;
+ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;)
 
 #endif /*defined(USE_CAST5)*/
 #endif /*__x86_64*/
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 115e1e6..94dcee7 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -48,7 +48,8 @@
 
 /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
 #undef USE_AMD64_ASM
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_AMD64_ASM 1
 #endif
 
@@ -372,16 +373,72 @@ extern void _gcry_cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out,
 extern void _gcry_cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out,
                                      const byte *in, byte *iv);
 
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+static inline void
+call_sysv_fn (const void *fn, const void *arg1, const void *arg2,
+              const void *arg3, const void *arg4)
+{
+  /* Call SystemV ABI function without storing non-volatile XMM registers,
+   * as target function does not use vector instruction sets. */
+  asm volatile ("callq *%0\n\t"
+                : "+a" (fn),
+                  "+D" (arg1),
+                  "+S" (arg2),
+                  "+d" (arg3),
+                  "+c" (arg4)
+                :
+                : "cc", "memory", "r8", "r9", "r10", "r11");
+}
+#endif
+
 static void
 do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
 {
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_encrypt_block, context, outbuf, inbuf, NULL);
+#else
   _gcry_cast5_amd64_encrypt_block (context, outbuf, inbuf);
+#endif
 }
 
 static void
 do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
 {
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_decrypt_block, context, outbuf, inbuf, NULL);
+#else
   _gcry_cast5_amd64_decrypt_block (context, outbuf, inbuf);
+#endif
+}
+
+static void
+cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out, const byte *in, byte *ctr)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_ctr_enc, ctx, out, in, ctr);
+#else
+  _gcry_cast5_amd64_ctr_enc (ctx, out, in, ctr);
+#endif
+}
+
+static void
+cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_cbc_dec, ctx, out, in, iv);
+#else
+  _gcry_cast5_amd64_cbc_dec (ctx, out, in, iv);
+#endif
+}
+
+static void
+cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_cfb_dec, ctx, out, in, iv);
+#else
+  _gcry_cast5_amd64_cfb_dec (ctx, out, in, iv);
+#endif
 }
 
 static unsigned int
@@ -396,7 +453,7 @@ static unsigned int
 decrypt_block (void *context, byte *outbuf, const byte *inbuf)
 {
   CAST5_context *c = (CAST5_context *) context;
-  _gcry_cast5_amd64_decrypt_block (c, outbuf, inbuf);
+  do_decrypt_block (c, outbuf, inbuf);
   return /*burn_stack*/ (2*8);
 }
 
@@ -582,7 +639,7 @@ _gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
     /* Process data in 4 block chunks. */
     while (nblocks >= 4)
       {
-        _gcry_cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+        cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
 
         nblocks -= 4;
         outbuf += 4 * CAST5_BLOCKSIZE;
@@ -651,7 +708,7 @@ _gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 4 block chunks. */
     while (nblocks >= 4)
       {
-        _gcry_cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+        cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 4;
         outbuf += 4 * CAST5_BLOCKSIZE;
@@ -710,7 +767,7 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 4 block chunks. */
     while (nblocks >= 4)
       {
-        _gcry_cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+        cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 4;
         outbuf += 4 * CAST5_BLOCKSIZE;