Add Aarch64 assembly implementation of Camellia
[libgcrypt.git] / cipher / camellia-glue.c
1 /* camellia-glue.c - Glue for the Camellia cipher
2  * Copyright (C) 2007 Free Software Foundation, Inc.
3  *
4  * This file is part of Libgcrypt.
5  *
6  * Libgcrypt is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as
8  * published by the Free Software Foundation; either version 2.1 of
9  * the License, or (at your option) any later version.
10  *
11  * Libgcrypt is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21
22 /* I put all the libgcrypt-specific stuff in this file to keep the
23    camellia.c/camellia.h files exactly as provided by NTT.  If they
24    update their code, this should make it easier to bring the changes
25    in. - dshaw
26
27    There is one small change which needs to be done: Include the
28    following code at the top of camellia.h: */
29 #if 0
30
31 /* To use Camellia with libraries it is often useful to keep the name
32  * space of the library clean.  The following macro is thus useful:
33  *
34  *     #define CAMELLIA_EXT_SYM_PREFIX foo_
35  *
36  * This prefixes all external symbols with "foo_".
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41 #ifdef CAMELLIA_EXT_SYM_PREFIX
42 #define CAMELLIA_PREFIX1(x,y) x ## y
43 #define CAMELLIA_PREFIX2(x,y) CAMELLIA_PREFIX1(x,y)
44 #define CAMELLIA_PREFIX(x)    CAMELLIA_PREFIX2(CAMELLIA_EXT_SYM_PREFIX,x)
45 #define Camellia_Ekeygen      CAMELLIA_PREFIX(Camellia_Ekeygen)
46 #define Camellia_EncryptBlock CAMELLIA_PREFIX(Camellia_EncryptBlock)
47 #define Camellia_DecryptBlock CAMELLIA_PREFIX(Camellia_DecryptBlock)
48 #define camellia_decrypt128   CAMELLIA_PREFIX(camellia_decrypt128)
49 #define camellia_decrypt256   CAMELLIA_PREFIX(camellia_decrypt256)
50 #define camellia_encrypt128   CAMELLIA_PREFIX(camellia_encrypt128)
51 #define camellia_encrypt256   CAMELLIA_PREFIX(camellia_encrypt256)
52 #define camellia_setup128     CAMELLIA_PREFIX(camellia_setup128)
53 #define camellia_setup192     CAMELLIA_PREFIX(camellia_setup192)
54 #define camellia_setup256     CAMELLIA_PREFIX(camellia_setup256)
55 #endif /*CAMELLIA_EXT_SYM_PREFIX*/
56
57 #endif /* Code sample. */
58
59
60 #include <config.h>
61 #include "types.h"
62 #include "g10lib.h"
63 #include "cipher.h"
64 #include "camellia.h"
65 #include "bufhelp.h"
66 #include "cipher-internal.h"
67 #include "cipher-selftest.h"
68
/* Alignment helper: expands to a 16-byte alignment attribute when the
   compiler supports it, and to nothing otherwise.  */
#ifndef HAVE_GCC_ATTRIBUTE_ALIGNED
# define ATTR_ALIGNED_16
#else
# define ATTR_ALIGNED_16  __attribute__ ((aligned (16)))
#endif
75
/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code.
   It requires AES-NI and AVX support to be enabled, an x86-64 target and a
   compatible AMD64 or Win64 assembler. */
#undef USE_AESNI_AVX
#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) && \
    defined(__x86_64__) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AESNI_AVX 1
#endif

/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code.
   Same requirements as above, with AVX2 support in place of AVX. */
#undef USE_AESNI_AVX2
#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) && \
    defined(__x86_64__) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AESNI_AVX2 1
#endif
93
94 typedef struct
95 {
96   KEY_TABLE_TYPE keytable;
97   int keybitlength;
98 #ifdef USE_AESNI_AVX
99   unsigned int use_aesni_avx:1; /* AES-NI/AVX implementation shall be used.  */
100 #endif /*USE_AESNI_AVX*/
101 #ifdef USE_AESNI_AVX2
102   unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used.  */
103 #endif /*USE_AESNI_AVX2*/
104 } CAMELLIA_context;
105
/* The assembly implementations follow the SystemV ABI.  On Win64 the
 * prototypes therefore need an ABI conversion attribute, and extra stack
 * (10 * 16 bytes) is accounted for storing XMM6-XMM15.  Elsewhere both
 * helpers are no-ops. */
#undef ASM_FUNC_ABI
#undef ASM_EXTRA_STACK
#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
# ifndef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
#  define ASM_FUNC_ABI
#  define ASM_EXTRA_STACK 0
# else
#  define ASM_FUNC_ABI __attribute__((sysv_abi))
#  define ASM_EXTRA_STACK (10 * 16)
# endif
#endif
119
#ifdef USE_AESNI_AVX
/* Assembler implementations of Camellia using AES-NI and AVX.  The bulk
   routines process 16 blocks at a time; all of them use the calling
   convention selected by ASM_FUNC_ABI.
 */

/* CTR mode encryption.  */
void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
                                      unsigned char *out,
                                      const unsigned char *in,
                                      unsigned char *ctr) ASM_FUNC_ABI;

/* CBC mode decryption.  */
void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
                                      unsigned char *out,
                                      const unsigned char *in,
                                      unsigned char *iv) ASM_FUNC_ABI;

/* CFB mode decryption.  */
void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
                                      unsigned char *out,
                                      const unsigned char *in,
                                      unsigned char *iv) ASM_FUNC_ABI;

/* OCB mode encryption; LS holds 16 pointers stored as 64-bit integers.  */
void _gcry_camellia_aesni_avx_ocb_enc(CAMELLIA_context *ctx,
                                      unsigned char *out,
                                      const unsigned char *in,
                                      unsigned char *offset,
                                      unsigned char *checksum,
                                      const u64 Ls[16]) ASM_FUNC_ABI;

/* OCB mode decryption; same argument layout as the encryption routine.  */
void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx,
                                      unsigned char *out,
                                      const unsigned char *in,
                                      unsigned char *offset,
                                      unsigned char *checksum,
                                      const u64 Ls[16]) ASM_FUNC_ABI;

/* OCB mode authentication of additional data.  */
void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx,
                                       const unsigned char *abuf,
                                       unsigned char *offset,
                                       unsigned char *checksum,
                                       const u64 Ls[16]) ASM_FUNC_ABI;

/* Key setup; KEYLEN is given in bytes by the caller in this file.  */
void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
                                     const unsigned char *key,
                                     unsigned int keylen) ASM_FUNC_ABI;
#endif
163
#ifdef USE_AESNI_AVX2
/* Assembler implementations of Camellia using AES-NI and AVX2.  The bulk
   routines process 32 blocks at a time; all of them use the calling
   convention selected by ASM_FUNC_ABI.
 */

/* CTR mode encryption.  */
void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *ctr) ASM_FUNC_ABI;

/* CBC mode decryption.  */
void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *iv) ASM_FUNC_ABI;

/* CFB mode decryption.  */
void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *iv) ASM_FUNC_ABI;

/* OCB mode encryption; LS holds 32 pointers stored as 64-bit integers.  */
void _gcry_camellia_aesni_avx2_ocb_enc(CAMELLIA_context *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *offset,
                                       unsigned char *checksum,
                                       const u64 Ls[32]) ASM_FUNC_ABI;

/* OCB mode decryption; same argument layout as the encryption routine.  */
void _gcry_camellia_aesni_avx2_ocb_dec(CAMELLIA_context *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *offset,
                                       unsigned char *checksum,
                                       const u64 Ls[32]) ASM_FUNC_ABI;

/* OCB mode authentication of additional data.  */
void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
                                        const unsigned char *abuf,
                                        unsigned char *offset,
                                        unsigned char *checksum,
                                        const u64 Ls[32]) ASM_FUNC_ABI;
#endif
203
204 static const char *selftest(void);
205
206 static gcry_err_code_t
207 camellia_setkey(void *c, const byte *key, unsigned keylen)
208 {
209   CAMELLIA_context *ctx=c;
210   static int initialized=0;
211   static const char *selftest_failed=NULL;
212 #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
213   unsigned int hwf = _gcry_get_hw_features ();
214 #endif
215
216   if(keylen!=16 && keylen!=24 && keylen!=32)
217     return GPG_ERR_INV_KEYLEN;
218
219   if(!initialized)
220     {
221       initialized=1;
222       selftest_failed=selftest();
223       if(selftest_failed)
224         log_error("%s\n",selftest_failed);
225     }
226
227   if(selftest_failed)
228     return GPG_ERR_SELFTEST_FAILED;
229
230 #ifdef USE_AESNI_AVX
231   ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
232 #endif
233 #ifdef USE_AESNI_AVX2
234   ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
235 #endif
236
237   ctx->keybitlength=keylen*8;
238
239   if (0)
240     { }
241 #ifdef USE_AESNI_AVX
242   else if (ctx->use_aesni_avx)
243     _gcry_camellia_aesni_avx_keygen(ctx, key, keylen);
244   else
245 #endif
246     {
247       Camellia_Ekeygen(ctx->keybitlength,key,ctx->keytable);
248       _gcry_burn_stack
249         ((19+34+34)*sizeof(u32)+2*sizeof(void*) /* camellia_setup256 */
250          +(4+32)*sizeof(u32)+2*sizeof(void*)    /* camellia_setup192 */
251          +0+sizeof(int)+2*sizeof(void*)         /* Camellia_Ekeygen */
252          +3*2*sizeof(void*)                     /* Function calls.  */
253          );
254     }
255
256   return 0;
257 }
258
259 #ifdef USE_ARM_ASM
260
261 /* Assembly implementations of Camellia. */
262 extern void _gcry_camellia_arm_encrypt_block(const KEY_TABLE_TYPE keyTable,
263                                                byte *outbuf, const byte *inbuf,
264                                                const int keybits);
265
266 extern void _gcry_camellia_arm_decrypt_block(const KEY_TABLE_TYPE keyTable,
267                                                byte *outbuf, const byte *inbuf,
268                                                const int keybits);
269
270 static void Camellia_EncryptBlock(const int keyBitLength,
271                                   const unsigned char *plaintext,
272                                   const KEY_TABLE_TYPE keyTable,
273                                   unsigned char *cipherText)
274 {
275   _gcry_camellia_arm_encrypt_block(keyTable, cipherText, plaintext,
276                                      keyBitLength);
277 }
278
279 static void Camellia_DecryptBlock(const int keyBitLength,
280                                   const unsigned char *cipherText,
281                                   const KEY_TABLE_TYPE keyTable,
282                                   unsigned char *plaintext)
283 {
284   _gcry_camellia_arm_decrypt_block(keyTable, plaintext, cipherText,
285                                      keyBitLength);
286 }
287
/* Number of stack bytes to wipe after a single-block call into the ARM
   assembly: 15 words on 32-bit ARM, nothing on AArch64.  */
#ifndef __aarch64__
#  define CAMELLIA_encrypt_stack_burn_size (15*4)
#  define CAMELLIA_decrypt_stack_burn_size (15*4)
#else
#  define CAMELLIA_encrypt_stack_burn_size (0)
#  define CAMELLIA_decrypt_stack_burn_size (0)
#endif
295
296 static unsigned int
297 camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
298 {
299   CAMELLIA_context *ctx = c;
300   Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
301   return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size);
302 }
303
304 static unsigned int
305 camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
306 {
307   CAMELLIA_context *ctx=c;
308   Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
309   return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size);
310 }
311
312 #else /*USE_ARM_ASM*/
313
314 static unsigned int
315 camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
316 {
317   CAMELLIA_context *ctx=c;
318
319   Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
320
321 #define CAMELLIA_encrypt_stack_burn_size \
322   (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \
323      +4*sizeof(u32)+4*sizeof(u32) \
324      +2*sizeof(u32*)+4*sizeof(u32) \
325      +2*2*sizeof(void*) /* Function calls.  */ \
326     )
327
328   return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size);
329 }
330
331 static unsigned int
332 camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
333 {
334   CAMELLIA_context *ctx=c;
335
336   Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
337
338 #define CAMELLIA_decrypt_stack_burn_size \
339     (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \
340      +4*sizeof(u32)+4*sizeof(u32) \
341      +2*sizeof(u32*)+4*sizeof(u32) \
342      +2*2*sizeof(void*) /* Function calls.  */ \
343     )
344
345   return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size);
346 }
347
348 #endif /*!USE_ARM_ASM*/
349
350 /* Bulk encryption of complete blocks in CTR mode.  This function is only
351    intended for the bulk encryption feature of cipher.c.  CTR is expected to be
352    of size CAMELLIA_BLOCK_SIZE. */
353 void
354 _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
355                        void *outbuf_arg, const void *inbuf_arg,
356                        size_t nblocks)
357 {
358   CAMELLIA_context *ctx = context;
359   unsigned char *outbuf = outbuf_arg;
360   const unsigned char *inbuf = inbuf_arg;
361   unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE];
362   int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
363   int i;
364
365 #ifdef USE_AESNI_AVX2
366   if (ctx->use_aesni_avx2)
367     {
368       int did_use_aesni_avx2 = 0;
369
370       /* Process data in 32 block chunks. */
371       while (nblocks >= 32)
372         {
373           _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
374
375           nblocks -= 32;
376           outbuf += 32 * CAMELLIA_BLOCK_SIZE;
377           inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
378           did_use_aesni_avx2 = 1;
379         }
380
381       if (did_use_aesni_avx2)
382         {
383           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
384                                         2 * sizeof(void *) + ASM_EXTRA_STACK;
385
386           if (burn_stack_depth < avx2_burn_stack_depth)
387             burn_stack_depth = avx2_burn_stack_depth;
388         }
389
390       /* Use generic code to handle smaller chunks... */
391       /* TODO: use caching instead? */
392     }
393 #endif
394
395 #ifdef USE_AESNI_AVX
396   if (ctx->use_aesni_avx)
397     {
398       int did_use_aesni_avx = 0;
399
400       /* Process data in 16 block chunks. */
401       while (nblocks >= 16)
402         {
403           _gcry_camellia_aesni_avx_ctr_enc(ctx, outbuf, inbuf, ctr);
404
405           nblocks -= 16;
406           outbuf += 16 * CAMELLIA_BLOCK_SIZE;
407           inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
408           did_use_aesni_avx = 1;
409         }
410
411       if (did_use_aesni_avx)
412         {
413           int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
414                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
415
416           if (burn_stack_depth < avx_burn_stack_depth)
417             burn_stack_depth = avx_burn_stack_depth;
418         }
419
420       /* Use generic code to handle smaller chunks... */
421       /* TODO: use caching instead? */
422     }
423 #endif
424
425   for ( ;nblocks; nblocks-- )
426     {
427       /* Encrypt the counter. */
428       Camellia_EncryptBlock(ctx->keybitlength, ctr, ctx->keytable, tmpbuf);
429       /* XOR the input with the encrypted counter and store in output.  */
430       buf_xor(outbuf, tmpbuf, inbuf, CAMELLIA_BLOCK_SIZE);
431       outbuf += CAMELLIA_BLOCK_SIZE;
432       inbuf  += CAMELLIA_BLOCK_SIZE;
433       /* Increment the counter.  */
434       for (i = CAMELLIA_BLOCK_SIZE; i > 0; i--)
435         {
436           ctr[i-1]++;
437           if (ctr[i-1])
438             break;
439         }
440     }
441
442   wipememory(tmpbuf, sizeof(tmpbuf));
443   _gcry_burn_stack(burn_stack_depth);
444 }
445
446 /* Bulk decryption of complete blocks in CBC mode.  This function is only
447    intended for the bulk encryption feature of cipher.c. */
448 void
449 _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
450                        void *outbuf_arg, const void *inbuf_arg,
451                        size_t nblocks)
452 {
453   CAMELLIA_context *ctx = context;
454   unsigned char *outbuf = outbuf_arg;
455   const unsigned char *inbuf = inbuf_arg;
456   unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
457   int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
458
459 #ifdef USE_AESNI_AVX2
460   if (ctx->use_aesni_avx2)
461     {
462       int did_use_aesni_avx2 = 0;
463
464       /* Process data in 32 block chunks. */
465       while (nblocks >= 32)
466         {
467           _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
468
469           nblocks -= 32;
470           outbuf += 32 * CAMELLIA_BLOCK_SIZE;
471           inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
472           did_use_aesni_avx2 = 1;
473         }
474
475       if (did_use_aesni_avx2)
476         {
477           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
478                                         2 * sizeof(void *) + ASM_EXTRA_STACK;;
479
480           if (burn_stack_depth < avx2_burn_stack_depth)
481             burn_stack_depth = avx2_burn_stack_depth;
482         }
483
484       /* Use generic code to handle smaller chunks... */
485     }
486 #endif
487
488 #ifdef USE_AESNI_AVX
489   if (ctx->use_aesni_avx)
490     {
491       int did_use_aesni_avx = 0;
492
493       /* Process data in 16 block chunks. */
494       while (nblocks >= 16)
495         {
496           _gcry_camellia_aesni_avx_cbc_dec(ctx, outbuf, inbuf, iv);
497
498           nblocks -= 16;
499           outbuf += 16 * CAMELLIA_BLOCK_SIZE;
500           inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
501           did_use_aesni_avx = 1;
502         }
503
504       if (did_use_aesni_avx)
505         {
506           int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
507                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
508
509           if (burn_stack_depth < avx_burn_stack_depth)
510             burn_stack_depth = avx_burn_stack_depth;
511         }
512
513       /* Use generic code to handle smaller chunks... */
514     }
515 #endif
516
517   for ( ;nblocks; nblocks-- )
518     {
519       /* INBUF is needed later and it may be identical to OUTBUF, so store
520          the intermediate result to SAVEBUF.  */
521       Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, savebuf);
522
523       buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
524       inbuf += CAMELLIA_BLOCK_SIZE;
525       outbuf += CAMELLIA_BLOCK_SIZE;
526     }
527
528   wipememory(savebuf, sizeof(savebuf));
529   _gcry_burn_stack(burn_stack_depth);
530 }
531
532 /* Bulk decryption of complete blocks in CFB mode.  This function is only
533    intended for the bulk encryption feature of cipher.c. */
534 void
535 _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
536                        void *outbuf_arg, const void *inbuf_arg,
537                        size_t nblocks)
538 {
539   CAMELLIA_context *ctx = context;
540   unsigned char *outbuf = outbuf_arg;
541   const unsigned char *inbuf = inbuf_arg;
542   int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
543
544 #ifdef USE_AESNI_AVX2
545   if (ctx->use_aesni_avx2)
546     {
547       int did_use_aesni_avx2 = 0;
548
549       /* Process data in 32 block chunks. */
550       while (nblocks >= 32)
551         {
552           _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
553
554           nblocks -= 32;
555           outbuf += 32 * CAMELLIA_BLOCK_SIZE;
556           inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
557           did_use_aesni_avx2 = 1;
558         }
559
560       if (did_use_aesni_avx2)
561         {
562           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
563                                         2 * sizeof(void *) + ASM_EXTRA_STACK;
564
565           if (burn_stack_depth < avx2_burn_stack_depth)
566             burn_stack_depth = avx2_burn_stack_depth;
567         }
568
569       /* Use generic code to handle smaller chunks... */
570     }
571 #endif
572
573 #ifdef USE_AESNI_AVX
574   if (ctx->use_aesni_avx)
575     {
576       int did_use_aesni_avx = 0;
577
578       /* Process data in 16 block chunks. */
579       while (nblocks >= 16)
580         {
581           _gcry_camellia_aesni_avx_cfb_dec(ctx, outbuf, inbuf, iv);
582
583           nblocks -= 16;
584           outbuf += 16 * CAMELLIA_BLOCK_SIZE;
585           inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
586           did_use_aesni_avx = 1;
587         }
588
589       if (did_use_aesni_avx)
590         {
591           int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
592                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
593
594           if (burn_stack_depth < avx_burn_stack_depth)
595             burn_stack_depth = avx_burn_stack_depth;
596         }
597
598       /* Use generic code to handle smaller chunks... */
599     }
600 #endif
601
602   for ( ;nblocks; nblocks-- )
603     {
604       Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv);
605       buf_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
606       outbuf += CAMELLIA_BLOCK_SIZE;
607       inbuf  += CAMELLIA_BLOCK_SIZE;
608     }
609
610   _gcry_burn_stack(burn_stack_depth);
611 }
612
613 /* Bulk encryption/decryption of complete blocks in OCB mode. */
614 size_t
615 _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
616                           const void *inbuf_arg, size_t nblocks, int encrypt)
617 {
618 #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
619   CAMELLIA_context *ctx = (void *)&c->context.c;
620   unsigned char *outbuf = outbuf_arg;
621   const unsigned char *inbuf = inbuf_arg;
622   unsigned char l_tmp[CAMELLIA_BLOCK_SIZE];
623   int burn_stack_depth;
624   u64 blkn = c->u_mode.ocb.data_nblocks;
625
626   burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
627                               CAMELLIA_decrypt_stack_burn_size;
628 #else
629   (void)c;
630   (void)outbuf_arg;
631   (void)inbuf_arg;
632   (void)encrypt;
633 #endif
634
635 #ifdef USE_AESNI_AVX2
636   if (ctx->use_aesni_avx2)
637     {
638       int did_use_aesni_avx2 = 0;
639       u64 Ls[32];
640       unsigned int n = 32 - (blkn % 32);
641       u64 *l;
642       int i;
643
644       if (nblocks >= 32)
645         {
646           for (i = 0; i < 32; i += 8)
647             {
648               /* Use u64 to store pointers for x32 support (assembly function
649                * assumes 64-bit pointers). */
650               Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
651               Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
652               Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
653               Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
654               Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
655               Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
656               Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
657             }
658
659           Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
660           Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
661           Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
662           l = &Ls[(31 + n) % 32];
663
664           /* Process data in 32 block chunks. */
665           while (nblocks >= 32)
666             {
667               /* l_tmp will be used only every 65536-th block. */
668               blkn += 32;
669               *l = (uintptr_t)(void *)ocb_get_l(c, l_tmp, blkn - blkn % 32);
670
671               if (encrypt)
672                 _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
673                                                   c->u_ctr.ctr, Ls);
674               else
675                 _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
676                                                   c->u_ctr.ctr, Ls);
677
678               nblocks -= 32;
679               outbuf += 32 * CAMELLIA_BLOCK_SIZE;
680               inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
681               did_use_aesni_avx2 = 1;
682             }
683         }
684
685       if (did_use_aesni_avx2)
686         {
687           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
688                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
689
690           if (burn_stack_depth < avx2_burn_stack_depth)
691             burn_stack_depth = avx2_burn_stack_depth;
692         }
693
694       /* Use generic code to handle smaller chunks... */
695     }
696 #endif
697
698 #ifdef USE_AESNI_AVX
699   if (ctx->use_aesni_avx)
700     {
701       int did_use_aesni_avx = 0;
702       u64 Ls[16];
703       unsigned int n = 16 - (blkn % 16);
704       u64 *l;
705       int i;
706
707       if (nblocks >= 16)
708         {
709           for (i = 0; i < 16; i += 8)
710             {
711               /* Use u64 to store pointers for x32 support (assembly function
712                * assumes 64-bit pointers). */
713               Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
714               Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
715               Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
716               Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
717               Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
718               Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
719               Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
720             }
721
722           Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
723           l = &Ls[(15 + n) % 16];
724
725           /* Process data in 16 block chunks. */
726           while (nblocks >= 16)
727             {
728               /* l_tmp will be used only every 65536-th block. */
729               blkn += 16;
730               *l = (uintptr_t)(void *)ocb_get_l(c, l_tmp, blkn - blkn % 16);
731
732               if (encrypt)
733                 _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
734                                                 c->u_ctr.ctr, Ls);
735               else
736                 _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
737                                                 c->u_ctr.ctr, Ls);
738
739               nblocks -= 16;
740               outbuf += 16 * CAMELLIA_BLOCK_SIZE;
741               inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
742               did_use_aesni_avx = 1;
743             }
744         }
745
746       if (did_use_aesni_avx)
747         {
748           int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
749                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
750
751           if (burn_stack_depth < avx_burn_stack_depth)
752             burn_stack_depth = avx_burn_stack_depth;
753         }
754
755       /* Use generic code to handle smaller chunks... */
756     }
757 #endif
758
759 #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
760   c->u_mode.ocb.data_nblocks = blkn;
761
762   wipememory(&l_tmp, sizeof(l_tmp));
763
764   if (burn_stack_depth)
765     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
766 #endif
767
768   return nblocks;
769 }
770
771 /* Bulk authentication of complete blocks in OCB mode. */
772 size_t
773 _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
774                          size_t nblocks)
775 {
776 #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
777   CAMELLIA_context *ctx = (void *)&c->context.c;
778   const unsigned char *abuf = abuf_arg;
779   unsigned char l_tmp[CAMELLIA_BLOCK_SIZE];
780   int burn_stack_depth;
781   u64 blkn = c->u_mode.ocb.aad_nblocks;
782
783   burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
784 #else
785   (void)c;
786   (void)abuf_arg;
787 #endif
788
789 #ifdef USE_AESNI_AVX2
790   if (ctx->use_aesni_avx2)
791     {
792       int did_use_aesni_avx2 = 0;
793       u64 Ls[32];
794       unsigned int n = 32 - (blkn % 32);
795       u64 *l;
796       int i;
797
798       if (nblocks >= 32)
799         {
800           for (i = 0; i < 32; i += 8)
801             {
802               /* Use u64 to store pointers for x32 support (assembly function
803                * assumes 64-bit pointers). */
804               Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
805               Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
806               Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
807               Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
808               Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
809               Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
810               Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
811             }
812
813           Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
814           Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
815           Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
816           l = &Ls[(31 + n) % 32];
817
818           /* Process data in 32 block chunks. */
819           while (nblocks >= 32)
820             {
821               /* l_tmp will be used only every 65536-th block. */
822               blkn += 32;
823               *l = (uintptr_t)(void *)ocb_get_l(c, l_tmp, blkn - blkn % 32);
824
825               _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
826                                                  c->u_mode.ocb.aad_offset,
827                                                  c->u_mode.ocb.aad_sum, Ls);
828
829               nblocks -= 32;
830               abuf += 32 * CAMELLIA_BLOCK_SIZE;
831               did_use_aesni_avx2 = 1;
832             }
833         }
834
835       if (did_use_aesni_avx2)
836         {
837           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
838                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
839
840           if (burn_stack_depth < avx2_burn_stack_depth)
841             burn_stack_depth = avx2_burn_stack_depth;
842         }
843
844       /* Use generic code to handle smaller chunks... */
845     }
846 #endif
847
848 #ifdef USE_AESNI_AVX
849   if (ctx->use_aesni_avx)
850     {
851       int did_use_aesni_avx = 0;
852       u64 Ls[16];
853       unsigned int n = 16 - (blkn % 16);
854       u64 *l;
855       int i;
856
857       if (nblocks >= 16)
858         {
859           for (i = 0; i < 16; i += 8)
860             {
861               /* Use u64 to store pointers for x32 support (assembly function
862                * assumes 64-bit pointers). */
863               Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
864               Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
865               Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
866               Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
867               Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
868               Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
869               Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
870             }
871
872           Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
873           l = &Ls[(15 + n) % 16];
874
875           /* Process data in 16 block chunks. */
876           while (nblocks >= 16)
877             {
878               /* l_tmp will be used only every 65536-th block. */
879               blkn += 16;
880               *l = (uintptr_t)(void *)ocb_get_l(c, l_tmp, blkn - blkn % 16);
881
882               _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf,
883                                                 c->u_mode.ocb.aad_offset,
884                                                 c->u_mode.ocb.aad_sum, Ls);
885
886               nblocks -= 16;
887               abuf += 16 * CAMELLIA_BLOCK_SIZE;
888               did_use_aesni_avx = 1;
889             }
890         }
891
892       if (did_use_aesni_avx)
893         {
894           int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
895                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
896
897           if (burn_stack_depth < avx_burn_stack_depth)
898             burn_stack_depth = avx_burn_stack_depth;
899         }
900
901       /* Use generic code to handle smaller chunks... */
902     }
903 #endif
904
905 #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
906   c->u_mode.ocb.aad_nblocks = blkn;
907
908   wipememory(&l_tmp, sizeof(l_tmp));
909
910   if (burn_stack_depth)
911     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
912 #endif
913
914   return nblocks;
915 }
916
917 /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
918    encryption.  Returns NULL on success. */
919 static const char*
920 selftest_ctr_128 (void)
921 {
922   const int nblocks = 32+16+1;
923   const int blocksize = CAMELLIA_BLOCK_SIZE;
924   const int context_size = sizeof(CAMELLIA_context);
925
926   return _gcry_selftest_helper_ctr("CAMELLIA", &camellia_setkey,
927            &camellia_encrypt, &_gcry_camellia_ctr_enc, nblocks, blocksize,
928            context_size);
929 }
930
931 /* Run the self-tests for CAMELLIA-CBC-128, tests bulk CBC decryption.
932    Returns NULL on success. */
933 static const char*
934 selftest_cbc_128 (void)
935 {
936   const int nblocks = 32+16+2;
937   const int blocksize = CAMELLIA_BLOCK_SIZE;
938   const int context_size = sizeof(CAMELLIA_context);
939
940   return _gcry_selftest_helper_cbc("CAMELLIA", &camellia_setkey,
941            &camellia_encrypt, &_gcry_camellia_cbc_dec, nblocks, blocksize,
942            context_size);
943 }
944
945 /* Run the self-tests for CAMELLIA-CFB-128, tests bulk CFB decryption.
946    Returns NULL on success. */
947 static const char*
948 selftest_cfb_128 (void)
949 {
950   const int nblocks = 32+16+2;
951   const int blocksize = CAMELLIA_BLOCK_SIZE;
952   const int context_size = sizeof(CAMELLIA_context);
953
954   return _gcry_selftest_helper_cfb("CAMELLIA", &camellia_setkey,
955            &camellia_encrypt, &_gcry_camellia_cfb_dec, nblocks, blocksize,
956            context_size);
957 }
958
959 static const char *
960 selftest(void)
961 {
962   CAMELLIA_context ctx;
963   byte scratch[16];
964   const char *r;
965
966   /* These test vectors are from RFC-3713 */
967   static const byte plaintext[]=
968     {
969       0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,
970       0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10
971     };
972   static const byte key_128[]=
973     {
974       0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,
975       0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10
976     };
977   static const byte ciphertext_128[]=
978     {
979       0x67,0x67,0x31,0x38,0x54,0x96,0x69,0x73,
980       0x08,0x57,0x06,0x56,0x48,0xea,0xbe,0x43
981     };
982   static const byte key_192[]=
983     {
984       0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba,0x98,
985       0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77
986     };
987   static const byte ciphertext_192[]=
988     {
989       0xb4,0x99,0x34,0x01,0xb3,0xe9,0x96,0xf8,
990       0x4e,0xe5,0xce,0xe7,0xd7,0x9b,0x09,0xb9
991     };
992   static const byte key_256[]=
993     {
994       0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba,
995       0x98,0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55,
996       0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
997     };
998   static const byte ciphertext_256[]=
999     {
1000       0x9a,0xcc,0x23,0x7d,0xff,0x16,0xd7,0x6c,
1001       0x20,0xef,0x7c,0x91,0x9e,0x3a,0x75,0x09
1002     };
1003
1004   camellia_setkey(&ctx,key_128,sizeof(key_128));
1005   camellia_encrypt(&ctx,scratch,plaintext);
1006   if(memcmp(scratch,ciphertext_128,sizeof(ciphertext_128))!=0)
1007     return "CAMELLIA-128 test encryption failed.";
1008   camellia_decrypt(&ctx,scratch,scratch);
1009   if(memcmp(scratch,plaintext,sizeof(plaintext))!=0)
1010     return "CAMELLIA-128 test decryption failed.";
1011
1012   camellia_setkey(&ctx,key_192,sizeof(key_192));
1013   camellia_encrypt(&ctx,scratch,plaintext);
1014   if(memcmp(scratch,ciphertext_192,sizeof(ciphertext_192))!=0)
1015     return "CAMELLIA-192 test encryption failed.";
1016   camellia_decrypt(&ctx,scratch,scratch);
1017   if(memcmp(scratch,plaintext,sizeof(plaintext))!=0)
1018     return "CAMELLIA-192 test decryption failed.";
1019
1020   camellia_setkey(&ctx,key_256,sizeof(key_256));
1021   camellia_encrypt(&ctx,scratch,plaintext);
1022   if(memcmp(scratch,ciphertext_256,sizeof(ciphertext_256))!=0)
1023     return "CAMELLIA-256 test encryption failed.";
1024   camellia_decrypt(&ctx,scratch,scratch);
1025   if(memcmp(scratch,plaintext,sizeof(plaintext))!=0)
1026     return "CAMELLIA-256 test decryption failed.";
1027
1028   if ( (r = selftest_ctr_128 ()) )
1029     return r;
1030
1031   if ( (r = selftest_cbc_128 ()) )
1032     return r;
1033
1034   if ( (r = selftest_cfb_128 ()) )
1035     return r;
1036
1037   return NULL;
1038 }
1039
1040 /* These oids are from
1041    <http://info.isl.ntt.co.jp/crypt/eng/camellia/specifications_oid.html>,
1042    retrieved May 1, 2007. */
1043
1044 static gcry_cipher_oid_spec_t camellia128_oids[] =
1045   {
1046     {"1.2.392.200011.61.1.1.1.2", GCRY_CIPHER_MODE_CBC},
1047     {"0.3.4401.5.3.1.9.1", GCRY_CIPHER_MODE_ECB},
1048     {"0.3.4401.5.3.1.9.3", GCRY_CIPHER_MODE_OFB},
1049     {"0.3.4401.5.3.1.9.4", GCRY_CIPHER_MODE_CFB},
1050     { NULL }
1051   };
1052
1053 static gcry_cipher_oid_spec_t camellia192_oids[] =
1054   {
1055     {"1.2.392.200011.61.1.1.1.3", GCRY_CIPHER_MODE_CBC},
1056     {"0.3.4401.5.3.1.9.21", GCRY_CIPHER_MODE_ECB},
1057     {"0.3.4401.5.3.1.9.23", GCRY_CIPHER_MODE_OFB},
1058     {"0.3.4401.5.3.1.9.24", GCRY_CIPHER_MODE_CFB},
1059     { NULL }
1060   };
1061
1062 static gcry_cipher_oid_spec_t camellia256_oids[] =
1063   {
1064     {"1.2.392.200011.61.1.1.1.4", GCRY_CIPHER_MODE_CBC},
1065     {"0.3.4401.5.3.1.9.41", GCRY_CIPHER_MODE_ECB},
1066     {"0.3.4401.5.3.1.9.43", GCRY_CIPHER_MODE_OFB},
1067     {"0.3.4401.5.3.1.9.44", GCRY_CIPHER_MODE_CFB},
1068     { NULL }
1069   };
1070
1071 gcry_cipher_spec_t _gcry_cipher_spec_camellia128 =
1072   {
1073     GCRY_CIPHER_CAMELLIA128, {0, 0},
1074     "CAMELLIA128",NULL,camellia128_oids,CAMELLIA_BLOCK_SIZE,128,
1075     sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt
1076   };
1077
1078 gcry_cipher_spec_t _gcry_cipher_spec_camellia192 =
1079   {
1080     GCRY_CIPHER_CAMELLIA192, {0, 0},
1081     "CAMELLIA192",NULL,camellia192_oids,CAMELLIA_BLOCK_SIZE,192,
1082     sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt
1083   };
1084
1085 gcry_cipher_spec_t _gcry_cipher_spec_camellia256 =
1086   {
1087     GCRY_CIPHER_CAMELLIA256, {0, 0},
1088     "CAMELLIA256",NULL,camellia256_oids,CAMELLIA_BLOCK_SIZE,256,
1089     sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt
1090   };