SHA-1: Add SSSE3 implementation
[libgcrypt.git] / cipher / camellia-glue.c
index f9bbb33..f18d135 100644 (file)
 /* USE_AESNI inidicates whether to compile with Intel AES-NI/AVX code. */
 #undef USE_AESNI_AVX
 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
-# if defined(__x86_64__)
+# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
 #  define USE_AESNI_AVX 1
 # endif
 #endif
 
+/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
+#undef USE_AESNI_AVX2
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#  define USE_AESNI_AVX2 1
+# endif
+#endif
+
 typedef struct
 {
-  int keybitlength;
   KEY_TABLE_TYPE keytable;
+  int keybitlength;
 #ifdef USE_AESNI_AVX
-  int use_aesni_avx;           /* AES-NI/AVX implementation shall be used.  */
+  unsigned int use_aesni_avx:1;        /* AES-NI/AVX implementation shall be used.  */
 #endif /*USE_AESNI_AVX*/
+#ifdef USE_AESNI_AVX2
+  unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used.  */
+#endif /*USE_AESNI_AVX2*/
 } CAMELLIA_context;
 
 #ifdef USE_AESNI_AVX
@@ -107,6 +118,30 @@ extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
                                             unsigned char *out,
                                             const unsigned char *in,
                                             unsigned char *iv);
+
+extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
+                                           const unsigned char *key,
+                                           unsigned int keylen);
+#endif
+
+#ifdef USE_AESNI_AVX2
+/* Assembler implementations of Camellia using AES-NI and AVX2.  These process
+   32 blocks at the same time.
+ */
+extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx,
+                                             unsigned char *out,
+                                             const unsigned char *in,
+                                             unsigned char *ctr);
+
+extern void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx,
+                                             unsigned char *out,
+                                             const unsigned char *in,
+                                             unsigned char *iv);
+
+extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
+                                             unsigned char *out,
+                                             const unsigned char *in,
+                                             unsigned char *iv);
 #endif
 
 static const char *selftest(void);
@@ -117,6 +152,9 @@ camellia_setkey(void *c, const byte *key, unsigned keylen)
   CAMELLIA_context *ctx=c;
   static int initialized=0;
   static const char *selftest_failed=NULL;
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+  unsigned int hwf = _gcry_get_hw_features ();
+#endif
 
   if(keylen!=16 && keylen!=24 && keylen!=32)
     return GPG_ERR_INV_KEYLEN;
@@ -132,28 +170,85 @@ camellia_setkey(void *c, const byte *key, unsigned keylen)
   if(selftest_failed)
     return GPG_ERR_SELFTEST_FAILED;
 
+#ifdef USE_AESNI_AVX
+  ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
+#endif
+#ifdef USE_AESNI_AVX2
+  ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+
   ctx->keybitlength=keylen*8;
-  Camellia_Ekeygen(ctx->keybitlength,key,ctx->keytable);
-  _gcry_burn_stack
-    ((19+34+34)*sizeof(u32)+2*sizeof(void*) /* camellia_setup256 */
-     +(4+32)*sizeof(u32)+2*sizeof(void*)    /* camellia_setup192 */
-     +0+sizeof(int)+2*sizeof(void*)         /* Camellia_Ekeygen */
-     +3*2*sizeof(void*)                     /* Function calls.  */
-     );
 
+  if (0)
+    { }
 #ifdef USE_AESNI_AVX
-  ctx->use_aesni_avx = 0;
-  if ((_gcry_get_hw_features () & HWF_INTEL_AESNI) &&
-      (_gcry_get_hw_features () & HWF_INTEL_AVX))
+  else if (ctx->use_aesni_avx)
+    _gcry_camellia_aesni_avx_keygen(ctx, key, keylen);
+  else
+#endif
     {
-      ctx->use_aesni_avx = 1;
+      Camellia_Ekeygen(ctx->keybitlength,key,ctx->keytable);
+      _gcry_burn_stack
+        ((19+34+34)*sizeof(u32)+2*sizeof(void*) /* camellia_setup256 */
+         +(4+32)*sizeof(u32)+2*sizeof(void*)    /* camellia_setup192 */
+         +0+sizeof(int)+2*sizeof(void*)         /* Camellia_Ekeygen */
+         +3*2*sizeof(void*)                     /* Function calls.  */
+         );
     }
-#endif
 
   return 0;
 }
 
-static void
+#ifdef USE_ARM_ASM
+
+/* Assembly implementations of Camellia. */
+extern void _gcry_camellia_arm_encrypt_block(const KEY_TABLE_TYPE keyTable,
+                                              byte *outbuf, const byte *inbuf,
+                                              const int keybits);
+
+extern void _gcry_camellia_arm_decrypt_block(const KEY_TABLE_TYPE keyTable,
+                                              byte *outbuf, const byte *inbuf,
+                                              const int keybits);
+
+static void Camellia_EncryptBlock(const int keyBitLength,
+                                 const unsigned char *plaintext,
+                                 const KEY_TABLE_TYPE keyTable,
+                                 unsigned char *cipherText)
+{
+  _gcry_camellia_arm_encrypt_block(keyTable, cipherText, plaintext,
+                                    keyBitLength);
+}
+
+static void Camellia_DecryptBlock(const int keyBitLength,
+                                 const unsigned char *cipherText,
+                                 const KEY_TABLE_TYPE keyTable,
+                                 unsigned char *plaintext)
+{
+  _gcry_camellia_arm_decrypt_block(keyTable, plaintext, cipherText,
+                                    keyBitLength);
+}
+
+static unsigned int
+camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+  CAMELLIA_context *ctx = c;
+  Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
+#define CAMELLIA_encrypt_stack_burn_size (15*4)
+  return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size);
+}
+
+static unsigned int
+camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+  CAMELLIA_context *ctx=c;
+  Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
+#define CAMELLIA_decrypt_stack_burn_size (15*4)
+  return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size);
+}
+
+#else /*USE_ARM_ASM*/
+
+static unsigned int
 camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
 {
   CAMELLIA_context *ctx=c;
@@ -167,10 +262,10 @@ camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
      +2*2*sizeof(void*) /* Function calls.  */ \
     )
 
-  _gcry_burn_stack(CAMELLIA_encrypt_stack_burn_size);
+  return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size);
 }
 
-static void
+static unsigned int
 camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
 {
   CAMELLIA_context *ctx=c;
@@ -184,16 +279,18 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
      +2*2*sizeof(void*) /* Function calls.  */ \
     )
 
-  _gcry_burn_stack(CAMELLIA_decrypt_stack_burn_size);
+  return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size);
 }
 
+#endif /*!USE_ARM_ASM*/
+
 /* Bulk encryption of complete blocks in CTR mode.  This function is only
    intended for the bulk encryption feature of cipher.c.  CTR is expected to be
    of size CAMELLIA_BLOCK_SIZE. */
 void
 _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
                        void *outbuf_arg, const void *inbuf_arg,
-                       unsigned int nblocks)
+                       size_t nblocks)
 {
   CAMELLIA_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
@@ -202,6 +299,36 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
   int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
   int i;
 
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      int did_use_aesni_avx2 = 0;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+          nblocks -= 32;
+          outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx2 = 1;
+        }
+
+      if (did_use_aesni_avx2)
+        {
+          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+                                        2 * sizeof(void *);
+
+          if (burn_stack_depth < avx2_burn_stack_depth)
+            burn_stack_depth = avx2_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+      /* TODO: use caching instead? */
+    }
+#endif
+
 #ifdef USE_AESNI_AVX
   if (ctx->use_aesni_avx)
     {
@@ -220,9 +347,6 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
 
       if (did_use_aesni_avx)
         {
-          /* clear AVX registers */
-          asm volatile ("vzeroall;\n":::);
-
           if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
             burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
         }
@@ -258,7 +382,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
 void
 _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
                        void *outbuf_arg, const void *inbuf_arg,
-                       unsigned int nblocks)
+                       size_t nblocks)
 {
   CAMELLIA_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
@@ -266,6 +390,35 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
   unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
   int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
 
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      int did_use_aesni_avx2 = 0;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+          nblocks -= 32;
+          outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx2 = 1;
+        }
+
+      if (did_use_aesni_avx2)
+        {
+          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+                                        2 * sizeof(void *);
+
+          if (burn_stack_depth < avx2_burn_stack_depth)
+            burn_stack_depth = avx2_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
 #ifdef USE_AESNI_AVX
   if (ctx->use_aesni_avx)
     {
@@ -284,9 +437,6 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          /* clear AVX registers */
-          asm volatile ("vzeroall;\n":::);
-
           if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
             burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
         }
@@ -297,14 +447,11 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
   for ( ;nblocks; nblocks-- )
     {
-      /* We need to save INBUF away because it may be identical to
-         OUTBUF.  */
-      memcpy(savebuf, inbuf, CAMELLIA_BLOCK_SIZE);
-
-      Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, outbuf);
+      /* INBUF is needed later and it may be identical to OUTBUF, so store
+         the intermediate result to SAVEBUF.  */
+      Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, savebuf);
 
-      buf_xor(outbuf, outbuf, iv, CAMELLIA_BLOCK_SIZE);
-      memcpy(iv, savebuf, CAMELLIA_BLOCK_SIZE);
+      buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
       inbuf += CAMELLIA_BLOCK_SIZE;
       outbuf += CAMELLIA_BLOCK_SIZE;
     }
@@ -318,13 +465,42 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 void
 _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
                        void *outbuf_arg, const void *inbuf_arg,
-                       unsigned int nblocks)
+                       size_t nblocks)
 {
   CAMELLIA_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
 
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      int did_use_aesni_avx2 = 0;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+          nblocks -= 32;
+          outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx2 = 1;
+        }
+
+      if (did_use_aesni_avx2)
+        {
+          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+                                        2 * sizeof(void *);
+
+          if (burn_stack_depth < avx2_burn_stack_depth)
+            burn_stack_depth = avx2_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
 #ifdef USE_AESNI_AVX
   if (ctx->use_aesni_avx)
     {
@@ -343,9 +519,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          /* clear AVX registers */
-          asm volatile ("vzeroall;\n":::);
-
           if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
             burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
         }
@@ -370,11 +543,11 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
 static const char*
 selftest_ctr_128 (void)
 {
-  const int nblocks = 16+1;
+  const int nblocks = 32+16+1;
   const int blocksize = CAMELLIA_BLOCK_SIZE;
   const int context_size = sizeof(CAMELLIA_context);
 
-  return _gcry_selftest_helper_ctr_128("CAMELLIA", &camellia_setkey,
+  return _gcry_selftest_helper_ctr("CAMELLIA", &camellia_setkey,
            &camellia_encrypt, &_gcry_camellia_ctr_enc, nblocks, blocksize,
           context_size);
 }
@@ -384,11 +557,11 @@ selftest_ctr_128 (void)
 static const char*
 selftest_cbc_128 (void)
 {
-  const int nblocks = 16+2;
+  const int nblocks = 32+16+2;
   const int blocksize = CAMELLIA_BLOCK_SIZE;
   const int context_size = sizeof(CAMELLIA_context);
 
-  return _gcry_selftest_helper_cbc_128("CAMELLIA", &camellia_setkey,
+  return _gcry_selftest_helper_cbc("CAMELLIA", &camellia_setkey,
            &camellia_encrypt, &_gcry_camellia_cbc_dec, nblocks, blocksize,
           context_size);
 }
@@ -398,11 +571,11 @@ selftest_cbc_128 (void)
 static const char*
 selftest_cfb_128 (void)
 {
-  const int nblocks = 16+2;
+  const int nblocks = 32+16+2;
   const int blocksize = CAMELLIA_BLOCK_SIZE;
   const int context_size = sizeof(CAMELLIA_context);
 
-  return _gcry_selftest_helper_cfb_128("CAMELLIA", &camellia_setkey,
+  return _gcry_selftest_helper_cfb("CAMELLIA", &camellia_setkey,
            &camellia_encrypt, &_gcry_camellia_cfb_dec, nblocks, blocksize,
           context_size);
 }
@@ -415,38 +588,38 @@ selftest(void)
   const char *r;
 
   /* These test vectors are from RFC-3713 */
-  const byte plaintext[]=
+  static const byte plaintext[]=
     {
       0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,
       0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10
     };
-  const byte key_128[]=
+  static const byte key_128[]=
     {
       0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,
       0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10
     };
-  const byte ciphertext_128[]=
+  static const byte ciphertext_128[]=
     {
       0x67,0x67,0x31,0x38,0x54,0x96,0x69,0x73,
       0x08,0x57,0x06,0x56,0x48,0xea,0xbe,0x43
     };
-  const byte key_192[]=
+  static const byte key_192[]=
     {
       0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba,0x98,
       0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77
     };
-  const byte ciphertext_192[]=
+  static const byte ciphertext_192[]=
     {
       0xb4,0x99,0x34,0x01,0xb3,0xe9,0x96,0xf8,
       0x4e,0xe5,0xce,0xe7,0xd7,0x9b,0x09,0xb9
     };
-  const byte key_256[]=
+  static const byte key_256[]=
     {
       0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba,
       0x98,0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55,
       0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
     };
-  const byte ciphertext_256[]=
+  static const byte ciphertext_256[]=
     {
       0x9a,0xcc,0x23,0x7d,0xff,0x16,0xd7,0x6c,
       0x20,0xef,0x7c,0x91,0x9e,0x3a,0x75,0x09
@@ -521,18 +694,21 @@ static gcry_cipher_oid_spec_t camellia256_oids[] =
 
 gcry_cipher_spec_t _gcry_cipher_spec_camellia128 =
   {
+    GCRY_CIPHER_CAMELLIA128, {0, 0},
     "CAMELLIA128",NULL,camellia128_oids,CAMELLIA_BLOCK_SIZE,128,
     sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt
   };
 
 gcry_cipher_spec_t _gcry_cipher_spec_camellia192 =
   {
+    GCRY_CIPHER_CAMELLIA192, {0, 0},
     "CAMELLIA192",NULL,camellia192_oids,CAMELLIA_BLOCK_SIZE,192,
     sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt
   };
 
 gcry_cipher_spec_t _gcry_cipher_spec_camellia256 =
   {
+    GCRY_CIPHER_CAMELLIA256, {0, 0},
     "CAMELLIA256",NULL,camellia256_oids,CAMELLIA_BLOCK_SIZE,256,
     sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt
   };