* configure.ac: Check for funopen and fopencookie as part of the
[gnupg.git] / jnlib / utf8conv.c
1 /* utf8conv.c -  UTF8 character set conversion
2  * Copyright (C) 1994, 1998, 1999, 2000, 2001,
3  *               2003  Free Software Foundation, Inc.
4  *
5  * This file is part of GnuPG.
6  *
7  * GnuPG is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * GnuPG is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
20  */
21
22 #include <config.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <stdarg.h>
26 #include <ctype.h>
27 #ifdef HAVE_LANGINFO_CODESET
28 #include <langinfo.h>
29 #endif
30
31 #include "libjnlib-config.h"
32 #include "stringhelp.h"
33 #include "utf8conv.h"
34
35
36 static ushort koi8_unicode[128] = {
37   0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
38   0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
39   0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248,
40   0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
41   0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556,
42   0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e,
43   0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565,
44   0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9,
45   0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
46   0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
47   0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
48   0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
49   0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
50   0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
51   0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
52   0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a
53 };
54
55 static ushort latin2_unicode[128] = {
56   0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
57   0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
58   0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
59   0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
60   0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
61   0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
62   0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
63   0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
64   0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
65   0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
66   0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
67   0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
68   0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
69   0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
70   0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
71   0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9
72 };
73
74
75 static const char *active_charset_name = "iso-8859-1";
76 static ushort *active_charset = NULL;
77 static int no_translation = 0;
78
79 int
80 set_native_charset (const char *newset)
81 {
82   if (!newset)
83 #ifdef HAVE_LANGINFO_CODESET
84     newset = nl_langinfo (CODESET);
85 #else
86     newset = "8859-1";
87 #endif
88
89   if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
90     {
91       newset += 3;
92       if (*newset == '-' || *newset == '_')
93         newset++;
94     }
95
96   if (!*newset
97       || !ascii_strcasecmp (newset, "8859-1")
98       || !ascii_strcasecmp (newset, "8859-15"))
99     {
100       active_charset_name = "iso-8859-1";
101       no_translation = 0;
102       active_charset = NULL;
103     }
104   else if (!ascii_strcasecmp (newset, "8859-2"))
105     {
106       active_charset_name = "iso-8859-2";
107       no_translation = 0;
108       active_charset = latin2_unicode;
109     }
110   else if (!ascii_strcasecmp (newset, "koi8-r"))
111     {
112       active_charset_name = "koi8-r";
113       no_translation = 0;
114       active_charset = koi8_unicode;
115     }
116   else if (!ascii_strcasecmp (newset, "utf8")
117            || !ascii_strcasecmp (newset, "utf-8"))
118     {
119       active_charset_name = "utf-8";
120       no_translation = 1;
121       active_charset = NULL;
122     }
123   else
124     return -1;
125   return 0;
126 }
127
128 const char *
129 get_native_charset ()
130 {
131   return active_charset_name;
132 }
133
134 /****************
135  * Convert string, which is in native encoding to UTF8 and return the
136  * new allocated UTF8 string.
137  */
138 char *
139 native_to_utf8 (const char *string)
140 {
141   const byte *s;
142   char *buffer;
143   byte *p;
144   size_t length = 0;
145
146   if (no_translation)
147     {
148       buffer = jnlib_xstrdup (string);
149     }
150   else if (active_charset)
151     {
152       for (s = string; *s; s++)
153         {
154           length++;
155           if (*s & 0x80)
156             length += 2;        /* we may need 3 bytes */
157         }
158       buffer = jnlib_xmalloc (length + 1);
159       for (p = buffer, s = string; *s; s++)
160         {
161           if ((*s & 0x80))
162             {
163               ushort val = active_charset[*s & 0x7f];
164               if (val < 0x0800)
165                 {
166                   *p++ = 0xc0 | ((val >> 6) & 0x1f);
167                   *p++ = 0x80 | (val & 0x3f);
168                 }
169               else
170                 {
171                   *p++ = 0xe0 | ((val >> 12) & 0x0f);
172                   *p++ = 0x80 | ((val >> 6) & 0x3f);
173                   *p++ = 0x80 | (val & 0x3f);
174                 }
175             }
176           else
177             *p++ = *s;
178         }
179       *p = 0;
180     }
181   else
182     {
183       for (s = string; *s; s++)
184         {
185           length++;
186           if (*s & 0x80)
187             length++;
188         }
189       buffer = jnlib_xmalloc (length + 1);
190       for (p = buffer, s = string; *s; s++)
191         {
192           if (*s & 0x80)
193             {
194               *p++ = 0xc0 | ((*s >> 6) & 3);
195               *p++ = 0x80 | (*s & 0x3f);
196             }
197           else
198             *p++ = *s;
199         }
200       *p = 0;
201     }
202   return buffer;
203 }
204
205
206 /* Convert string, which is in UTF8 to native encoding.  Replace
207  * illegal encodings by some "\xnn" and quote all control
208  * characters. A character with value DELIM will always be quoted, it
209  * must be a vanilla ASCII character.  */
210 char *
211 utf8_to_native (const char *string, size_t length, int delim)
212 {
213   int nleft;
214   int i;
215   byte encbuf[8];
216   int encidx;
217   const byte *s;
218   size_t n;
219   byte *buffer = NULL, *p = NULL;
220   unsigned long val = 0;
221   size_t slen;
222   int resync = 0;
223
224   /* 1. pass (p==NULL): count the extended utf-8 characters */
225   /* 2. pass (p!=NULL): create string */
226   for (;;)
227     {
228       for (slen = length, nleft = encidx = 0, n = 0, s = string; slen;
229            s++, slen--)
230         {
231           if (resync)
232             {
233               if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
234                 {
235                   /* still invalid */
236                   if (p)
237                     {
238                       sprintf (p, "\\x%02x", *s);
239                       p += 4;
240                     }
241                   n += 4;
242                   continue;
243                 }
244               resync = 0;
245             }
246           if (!nleft)
247             {
248               if (!(*s & 0x80))
249                 {               /* plain ascii */
250                   if (*s < 0x20 || *s == 0x7f || *s == delim ||
251                       (delim && *s == '\\'))
252                     {
253                       n++;
254                       if (p)
255                         *p++ = '\\';
256                       switch (*s)
257                         {
258                         case '\n':
259                           n++;
260                           if (p)
261                             *p++ = 'n';
262                           break;
263                         case '\r':
264                           n++;
265                           if (p)
266                             *p++ = 'r';
267                           break;
268                         case '\f':
269                           n++;
270                           if (p)
271                             *p++ = 'f';
272                           break;
273                         case '\v':
274                           n++;
275                           if (p)
276                             *p++ = 'v';
277                           break;
278                         case '\b':
279                           n++;
280                           if (p)
281                             *p++ = 'b';
282                           break;
283                         case 0:
284                           n++;
285                           if (p)
286                             *p++ = '0';
287                           break;
288                         default:
289                           n += 3;
290                           if (p)
291                             {
292                               sprintf (p, "x%02x", *s);
293                               p += 3;
294                             }
295                           break;
296                         }
297                     }
298                   else
299                     {
300                       if (p)
301                         *p++ = *s;
302                       n++;
303                     }
304                 }
305               else if ((*s & 0xe0) == 0xc0)
306                 {               /* 110x xxxx */
307                   val = *s & 0x1f;
308                   nleft = 1;
309                   encidx = 0;
310                   encbuf[encidx++] = *s;
311                 }
312               else if ((*s & 0xf0) == 0xe0)
313                 {               /* 1110 xxxx */
314                   val = *s & 0x0f;
315                   nleft = 2;
316                   encidx = 0;
317                   encbuf[encidx++] = *s;
318                 }
319               else if ((*s & 0xf8) == 0xf0)
320                 {               /* 1111 0xxx */
321                   val = *s & 0x07;
322                   nleft = 3;
323                   encidx = 0;
324                   encbuf[encidx++] = *s;
325                 }
326               else if ((*s & 0xfc) == 0xf8)
327                 {               /* 1111 10xx */
328                   val = *s & 0x03;
329                   nleft = 4;
330                   encidx = 0;
331                   encbuf[encidx++] = *s;
332                 }
333               else if ((*s & 0xfe) == 0xfc)
334                 {               /* 1111 110x */
335                   val = *s & 0x01;
336                   nleft = 5;
337                   encidx = 0;
338                   encbuf[encidx++] = *s;
339                 }
340               else
341                 {               /* invalid encoding: print as \xnn */
342                   if (p)
343                     {
344                       sprintf (p, "\\x%02x", *s);
345                       p += 4;
346                     }
347                   n += 4;
348                   resync = 1;
349                 }
350             }
351           else if (*s < 0x80 || *s >= 0xc0)
352             {                   /* invalid */
353               if (p)
354                 {
355                   for (i = 0; i < encidx; i++)
356                     {
357                       sprintf (p, "\\x%02x", encbuf[i]);
358                       p += 4;
359                     }
360                   sprintf (p, "\\x%02x", *s);
361                   p += 4;
362                 }
363               n += 4 + 4 * encidx;
364               nleft = 0;
365               encidx = 0;
366               resync = 1;
367             }
368           else
369             {
370               encbuf[encidx++] = *s;
371               val <<= 6;
372               val |= *s & 0x3f;
373               if (!--nleft)
374                 {               /* ready */
375                   if (no_translation)
376                     {
377                       if (p)
378                         {
379                           for (i = 0; i < encidx; i++)
380                             *p++ = encbuf[i];
381                         }
382                       n += encidx;
383                       encidx = 0;
384                     }
385                   else if (active_charset)
386                     {           /* table lookup */
387                       for (i = 0; i < 128; i++)
388                         {
389                           if (active_charset[i] == val)
390                             break;
391                         }
392                       if (i < 128)
393                         {       /* we can print this one */
394                           if (p)
395                             *p++ = i + 128;
396                           n++;
397                         }
398                       else
399                         {       /* we do not have a translation: print utf8 */
400                           if (p)
401                             {
402                               for (i = 0; i < encidx; i++)
403                                 {
404                                   sprintf (p, "\\x%02x", encbuf[i]);
405                                   p += 4;
406                                 }
407                             }
408                           n += encidx * 4;
409                           encidx = 0;
410                         }
411                     }
412                   else
413                     {           /* native set */
414                       if (val >= 0x80 && val < 256)
415                         {
416                           n++;  /* we can simply print this character */
417                           if (p)
418                             *p++ = val;
419                         }
420                       else
421                         {       /* we do not have a translation: print utf8 */
422                           if (p)
423                             {
424                               for (i = 0; i < encidx; i++)
425                                 {
426                                   sprintf (p, "\\x%02x", encbuf[i]);
427                                   p += 4;
428                                 }
429                             }
430                           n += encidx * 4;
431                           encidx = 0;
432                         }
433                     }
434                 }
435
436             }
437         }
438       if (!buffer)
439         {                       /* allocate the buffer after the first pass */
440           buffer = p = jnlib_xmalloc (n + 1);
441         }
442       else
443         {
444           *p = 0;               /* make a string */
445           return buffer;
446         }
447     }
448 }