Enable AMD64 Camellia implementations on WIN64
author Jussi Kivilinna <jussi.kivilinna@iki.fi>
Thu, 14 May 2015 10:33:07 +0000 (13:33 +0300)
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>
Thu, 14 May 2015 10:43:57 +0000 (13:43 +0300)
* cipher/camellia-aesni-avx-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/camellia-aesni-avx2-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/camellia-glue.c (USE_AESNI_AVX, USE_AESNI_AVX2): Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
[USE_AESNI_AVX || USE_AESNI_AVX2] (ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
(_gcry_camellia_aesni_avx_ctr_enc, _gcry_camellia_aesni_avx_cbc_dec)
(_gcry_camellia_aesni_avx_cfb_dec, _gcry_camellia_aesni_avx_keygen)
(_gcry_camellia_aesni_avx2_ctr_enc, _gcry_camellia_aesni_avx2_cbc_dec)
(_gcry_camellia_aesni_avx2_cfb_dec): Add ASM_FUNC_ABI.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
cipher/camellia-aesni-avx-amd64.S
cipher/camellia-aesni-avx2-amd64.S
cipher/camellia-glue.c

index 6d157a7..c047a21 100644 (file)
@@ -20,7 +20,8 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
 
 #ifdef __PIC__
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
 /* struct CAMELLIA_context: */
 .text
 
 .align 8
-.type   __camellia_enc_blk16,@function;
+ELF(.type   __camellia_enc_blk16,@function;)
 
 __camellia_enc_blk16:
        /* input:
@@ -853,10 +860,10 @@ __camellia_enc_blk16:
                     %xmm15, %rax, %rcx, 24);
 
        jmp .Lenc_done;
-.size __camellia_enc_blk16,.-__camellia_enc_blk16;
+ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)
 
 .align 8
-.type   __camellia_dec_blk16,@function;
+ELF(.type   __camellia_dec_blk16,@function;)
 
 __camellia_dec_blk16:
        /* input:
@@ -938,7 +945,7 @@ __camellia_dec_blk16:
              ((key_table + (24) * 8) + 4)(CTX));
 
        jmp .Ldec_max24;
-.size __camellia_dec_blk16,.-__camellia_dec_blk16;
+ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
 
 #define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
@@ -948,7 +955,7 @@ __camellia_dec_blk16:
 
 .align 8
 .globl _gcry_camellia_aesni_avx_ctr_enc
-.type   _gcry_camellia_aesni_avx_ctr_enc,@function;
+ELF(.type   _gcry_camellia_aesni_avx_ctr_enc,@function;)
 
 _gcry_camellia_aesni_avx_ctr_enc:
        /* input:
@@ -1062,11 +1069,11 @@ _gcry_camellia_aesni_avx_ctr_enc:
 
        leave;
        ret;
-.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;
+ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx_cbc_dec
-.type   _gcry_camellia_aesni_avx_cbc_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx_cbc_dec,@function;)
 
 _gcry_camellia_aesni_avx_cbc_dec:
        /* input:
@@ -1130,11 +1137,11 @@ _gcry_camellia_aesni_avx_cbc_dec:
 
        leave;
        ret;
-.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
+ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx_cfb_dec
-.type   _gcry_camellia_aesni_avx_cfb_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx_cfb_dec,@function;)
 
 _gcry_camellia_aesni_avx_cfb_dec:
        /* input:
@@ -1202,7 +1209,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
 
        leave;
        ret;
-.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;
+ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
 
 /*
  * IN:
@@ -1309,7 +1316,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
 .text
 
 .align 8
-.type  __camellia_avx_setup128,@function;
+ELF(.type  __camellia_avx_setup128,@function;)
 __camellia_avx_setup128:
        /* input:
         *      %rdi: ctx, CTX; subkey storage at key_table(CTX)
@@ -1650,10 +1657,10 @@ __camellia_avx_setup128:
        vzeroall;
 
        ret;
-.size __camellia_avx_setup128,.-__camellia_avx_setup128;
+ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)
 
 .align 8
-.type  __camellia_avx_setup256,@function;
+ELF(.type  __camellia_avx_setup256,@function;)
 
 __camellia_avx_setup256:
        /* input:
@@ -2127,11 +2134,11 @@ __camellia_avx_setup256:
        vzeroall;
 
        ret;
-.size __camellia_avx_setup256,.-__camellia_avx_setup256;
+ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx_keygen
-.type  _gcry_camellia_aesni_avx_keygen,@function;
+ELF(.type  _gcry_camellia_aesni_avx_keygen,@function;)
 
 _gcry_camellia_aesni_avx_keygen:
        /* input:
@@ -2159,7 +2166,7 @@ _gcry_camellia_aesni_avx_keygen:
        vpor %xmm2, %xmm1, %xmm1;
 
        jmp __camellia_avx_setup256;
-.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;
+ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;)
 
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
 #endif /*__x86_64*/
index 25f48bc..a3fa229 100644 (file)
@@ -20,7 +20,8 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
 
 #ifdef __PIC__
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
 /* struct CAMELLIA_context: */
 .text
 
 .align 8
-.type   __camellia_enc_blk32,@function;
+ELF(.type   __camellia_enc_blk32,@function;)
 
 __camellia_enc_blk32:
        /* input:
@@ -832,10 +839,10 @@ __camellia_enc_blk32:
                     %ymm15, %rax, %rcx, 24);
 
        jmp .Lenc_done;
-.size __camellia_enc_blk32,.-__camellia_enc_blk32;
+ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;)
 
 .align 8
-.type   __camellia_dec_blk32,@function;
+ELF(.type   __camellia_dec_blk32,@function;)
 
 __camellia_dec_blk32:
        /* input:
@@ -917,7 +924,7 @@ __camellia_dec_blk32:
              ((key_table + (24) * 8) + 4)(CTX));
 
        jmp .Ldec_max24;
-.size __camellia_dec_blk32,.-__camellia_dec_blk32;
+ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;)
 
 #define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
@@ -927,7 +934,7 @@ __camellia_dec_blk32:
 
 .align 8
 .globl _gcry_camellia_aesni_avx2_ctr_enc
-.type   _gcry_camellia_aesni_avx2_ctr_enc,@function;
+ELF(.type   _gcry_camellia_aesni_avx2_ctr_enc,@function;)
 
 _gcry_camellia_aesni_avx2_ctr_enc:
        /* input:
@@ -1111,11 +1118,11 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 
        leave;
        ret;
-.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;
+ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx2_cbc_dec
-.type   _gcry_camellia_aesni_avx2_cbc_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx2_cbc_dec,@function;)
 
 _gcry_camellia_aesni_avx2_cbc_dec:
        /* input:
@@ -1183,11 +1190,11 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 
        leave;
        ret;
-.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;
+ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx2_cfb_dec
-.type   _gcry_camellia_aesni_avx2_cfb_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx2_cfb_dec,@function;)
 
 _gcry_camellia_aesni_avx2_cfb_dec:
        /* input:
@@ -1257,7 +1264,7 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 
        leave;
        ret;
-.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;
+ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;)
 
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
 #endif /*__x86_64*/
index f18d135..5032321 100644 (file)
@@ -75,7 +75,8 @@
 /* USE_AESNI inidicates whether to compile with Intel AES-NI/AVX code. */
 #undef USE_AESNI_AVX
 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
-# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 #  define USE_AESNI_AVX 1
 # endif
 #endif
@@ -83,7 +84,8 @@
 /* USE_AESNI_AVX2 inidicates whether to compile with Intel AES-NI/AVX2 code. */
 #undef USE_AESNI_AVX2
 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
-# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 #  define USE_AESNI_AVX2 1
 # endif
 #endif
@@ -100,6 +102,20 @@ typedef struct
 #endif /*USE_AESNI_AVX2*/
 } CAMELLIA_context;
 
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+#  define ASM_FUNC_ABI __attribute__((sysv_abi))
+#  define ASM_EXTRA_STACK (10 * 16)
+# else
+#  define ASM_FUNC_ABI
+#  define ASM_EXTRA_STACK 0
+# endif
+#endif
+
 #ifdef USE_AESNI_AVX
 /* Assembler implementations of Camellia using AES-NI and AVX.  Process data
    in 16 block same time.
@@ -107,21 +123,21 @@ typedef struct
 extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
                                             unsigned char *out,
                                             const unsigned char *in,
-                                            unsigned char *ctr);
+                                            unsigned char *ctr) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
                                             unsigned char *out,
                                             const unsigned char *in,
-                                            unsigned char *iv);
+                                            unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
                                             unsigned char *out,
                                             const unsigned char *in,
-                                            unsigned char *iv);
+                                            unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
                                            const unsigned char *key,
-                                           unsigned int keylen);
+                                           unsigned int keylen) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_AESNI_AVX2
@@ -131,17 +147,17 @@ extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
 extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx,
                                              unsigned char *out,
                                              const unsigned char *in,
-                                             unsigned char *ctr);
+                                             unsigned char *ctr) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx,
                                              unsigned char *out,
                                              const unsigned char *in,
-                                             unsigned char *iv);
+                                             unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
                                              unsigned char *out,
                                              const unsigned char *in,
-                                             unsigned char *iv);
+                                             unsigned char *iv) ASM_FUNC_ABI;
 #endif
 
 static const char *selftest(void);
@@ -318,7 +334,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
       if (did_use_aesni_avx2)
         {
           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *);
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
 
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
@@ -347,8 +363,11 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
 
       if (did_use_aesni_avx)
         {
-          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
-            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */
@@ -409,7 +428,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
       if (did_use_aesni_avx2)
         {
           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *);
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;;
 
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
@@ -437,8 +456,11 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
-            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */
@@ -491,7 +513,7 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
       if (did_use_aesni_avx2)
         {
           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *);
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
 
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
@@ -519,8 +541,11 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
-            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */