Updated FSF's address.
[gnupg.git] / jnlib / utf8conv.c
1 /* utf8conv.c -  UTF8 character set conversion
2  * Copyright (C) 1994, 1998, 1999, 2000, 2001,
3  *               2003  Free Software Foundation, Inc.
4  *
5  * This file is part of GnuPG.
6  *
7  * GnuPG is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * GnuPG is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
20  * USA.
21  */
22
23 #include <config.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <stdarg.h>
27 #include <ctype.h>
28 #ifdef HAVE_LANGINFO_CODESET
29 #include <langinfo.h>
30 #endif
31
32 #include "libjnlib-config.h"
33 #include "stringhelp.h"
34 #include "utf8conv.h"
35
36
/* Mapping table installed as the active charset by set_native_charset
 * when "koi8-r" is requested.  Entry I holds the Unicode code point
 * used for the native byte 0x80 + I: native_to_utf8 indexes it with
 * the low seven bits of each high-bit byte, and utf8_to_native
 * searches it for a decoded code point and emits index + 128.  */
static ushort koi8_unicode[128] = {
  0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
  0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248,
  0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
  0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556,
  0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e,
  0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565,
  0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9,
  0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
  0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
  0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
  0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
  0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
  0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
  0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
  0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a
};
55
/* Mapping table installed as the active charset by set_native_charset
 * when "8859-2" (with or without an "iso" prefix) is requested.
 * Entry I holds the Unicode code point used for the native byte
 * 0x80 + I; the first 32 entries map 0x80..0x9f to themselves.  */
static ushort latin2_unicode[128] = {
  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
  0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
  0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
  0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
  0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
  0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
  0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
  0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
  0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
  0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
  0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
  0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
  0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
  0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
  0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9
};
74
75
/* Conversion state shared by the functions in this file; it is only
 * changed by set_native_charset.  */

/* Canonical name of the selected native charset, as returned by
 * get_native_charset.  */
static const char *active_charset_name = "iso-8859-1";
/* 128-entry mapping table for native bytes 0x80..0xff, or NULL when
 * no table is in use (the iso-8859-1 and utf-8 selections).  */
static ushort *active_charset = NULL;
/* Set to 1 when the native charset is utf-8, in which case the
 * converters copy multi-byte sequences through unchanged.  */
static int no_translation = 0;
79
80 int
81 set_native_charset (const char *newset)
82 {
83   if (!newset)
84 #ifdef HAVE_LANGINFO_CODESET
85     newset = nl_langinfo (CODESET);
86 #else
87     newset = "8859-1";
88 #endif
89
90   if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
91     {
92       newset += 3;
93       if (*newset == '-' || *newset == '_')
94         newset++;
95     }
96
97   if (!*newset
98       || !ascii_strcasecmp (newset, "8859-1")
99       || !ascii_strcasecmp (newset, "8859-15"))
100     {
101       active_charset_name = "iso-8859-1";
102       no_translation = 0;
103       active_charset = NULL;
104     }
105   else if (!ascii_strcasecmp (newset, "8859-2"))
106     {
107       active_charset_name = "iso-8859-2";
108       no_translation = 0;
109       active_charset = latin2_unicode;
110     }
111   else if (!ascii_strcasecmp (newset, "koi8-r"))
112     {
113       active_charset_name = "koi8-r";
114       no_translation = 0;
115       active_charset = koi8_unicode;
116     }
117   else if (!ascii_strcasecmp (newset, "utf8")
118            || !ascii_strcasecmp (newset, "utf-8"))
119     {
120       active_charset_name = "utf-8";
121       no_translation = 1;
122       active_charset = NULL;
123     }
124   else
125     return -1;
126   return 0;
127 }
128
129 const char *
130 get_native_charset ()
131 {
132   return active_charset_name;
133 }
134
135 /****************
136  * Convert string, which is in native encoding to UTF8 and return the
137  * new allocated UTF8 string.
138  */
139 char *
140 native_to_utf8 (const char *orig_string)
141 {
142   const unsigned char *string = (const unsigned char *)orig_string;
143   const unsigned char *s;
144   char *buffer;
145   unsigned char *p;
146   size_t length = 0;
147
148   if (no_translation)
149     {
150       buffer = jnlib_xstrdup (orig_string);
151     }
152   else if (active_charset)
153     {
154       for (s = string; *s; s++)
155         {
156           length++;
157           if (*s & 0x80)
158             length += 2;        /* we may need 3 bytes */
159         }
160       buffer = jnlib_xmalloc (length + 1);
161       for (p = (unsigned char *)buffer, s = string; *s; s++)
162         {
163           if ((*s & 0x80))
164             {
165               ushort val = active_charset[*s & 0x7f];
166               if (val < 0x0800)
167                 {
168                   *p++ = 0xc0 | ((val >> 6) & 0x1f);
169                   *p++ = 0x80 | (val & 0x3f);
170                 }
171               else
172                 {
173                   *p++ = 0xe0 | ((val >> 12) & 0x0f);
174                   *p++ = 0x80 | ((val >> 6) & 0x3f);
175                   *p++ = 0x80 | (val & 0x3f);
176                 }
177             }
178           else
179             *p++ = *s;
180         }
181       *p = 0;
182     }
183   else
184     {
185       for (s = string; *s; s++)
186         {
187           length++;
188           if (*s & 0x80)
189             length++;
190         }
191       buffer = jnlib_xmalloc (length + 1);
192       for (p = (unsigned char *)buffer, s = string; *s; s++)
193         {
194           if (*s & 0x80)
195             {
196               *p++ = 0xc0 | ((*s >> 6) & 3);
197               *p++ = 0x80 | (*s & 0x3f);
198             }
199           else
200             *p++ = *s;
201         }
202       *p = 0;
203     }
204   return buffer;
205 }
206
207
208 /* Convert string, which is in UTF8 to native encoding.  Replace
209  * illegal encodings by some "\xnn" and quote all control
210  * characters. A character with value DELIM will always be quoted, it
211  * must be a vanilla ASCII character.  */
212 char *
213 utf8_to_native (const char *string, size_t length, int delim)
214 {
215   int nleft;
216   int i;
217   unsigned char encbuf[8];
218   int encidx;
219   const byte *s;
220   size_t n;
221   char *buffer = NULL;
222   char *p = NULL;
223   unsigned long val = 0;
224   size_t slen;
225   int resync = 0;
226
227   /* 1. pass (p==NULL): count the extended utf-8 characters */
228   /* 2. pass (p!=NULL): create string */
229   for (;;)
230     {
231       for (slen = length, nleft = encidx = 0, n = 0,
232              s = (const unsigned char *)string; slen;
233            s++, slen--)
234         {
235           if (resync)
236             {
237               if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
238                 {
239                   /* still invalid */
240                   if (p)
241                     {
242                       sprintf (p, "\\x%02x", *s);
243                       p += 4;
244                     }
245                   n += 4;
246                   continue;
247                 }
248               resync = 0;
249             }
250           if (!nleft)
251             {
252               if (!(*s & 0x80))
253                 {               /* plain ascii */
254                   if (*s < 0x20 || *s == 0x7f || *s == delim ||
255                       (delim && *s == '\\'))
256                     {
257                       n++;
258                       if (p)
259                         *p++ = '\\';
260                       switch (*s)
261                         {
262                         case '\n':
263                           n++;
264                           if (p)
265                             *p++ = 'n';
266                           break;
267                         case '\r':
268                           n++;
269                           if (p)
270                             *p++ = 'r';
271                           break;
272                         case '\f':
273                           n++;
274                           if (p)
275                             *p++ = 'f';
276                           break;
277                         case '\v':
278                           n++;
279                           if (p)
280                             *p++ = 'v';
281                           break;
282                         case '\b':
283                           n++;
284                           if (p)
285                             *p++ = 'b';
286                           break;
287                         case 0:
288                           n++;
289                           if (p)
290                             *p++ = '0';
291                           break;
292                         default:
293                           n += 3;
294                           if (p)
295                             {
296                               sprintf (p, "x%02x", *s);
297                               p += 3;
298                             }
299                           break;
300                         }
301                     }
302                   else
303                     {
304                       if (p)
305                         *p++ = *s;
306                       n++;
307                     }
308                 }
309               else if ((*s & 0xe0) == 0xc0)
310                 {               /* 110x xxxx */
311                   val = *s & 0x1f;
312                   nleft = 1;
313                   encidx = 0;
314                   encbuf[encidx++] = *s;
315                 }
316               else if ((*s & 0xf0) == 0xe0)
317                 {               /* 1110 xxxx */
318                   val = *s & 0x0f;
319                   nleft = 2;
320                   encidx = 0;
321                   encbuf[encidx++] = *s;
322                 }
323               else if ((*s & 0xf8) == 0xf0)
324                 {               /* 1111 0xxx */
325                   val = *s & 0x07;
326                   nleft = 3;
327                   encidx = 0;
328                   encbuf[encidx++] = *s;
329                 }
330               else if ((*s & 0xfc) == 0xf8)
331                 {               /* 1111 10xx */
332                   val = *s & 0x03;
333                   nleft = 4;
334                   encidx = 0;
335                   encbuf[encidx++] = *s;
336                 }
337               else if ((*s & 0xfe) == 0xfc)
338                 {               /* 1111 110x */
339                   val = *s & 0x01;
340                   nleft = 5;
341                   encidx = 0;
342                   encbuf[encidx++] = *s;
343                 }
344               else
345                 {               /* invalid encoding: print as \xnn */
346                   if (p)
347                     {
348                       sprintf (p, "\\x%02x", *s);
349                       p += 4;
350                     }
351                   n += 4;
352                   resync = 1;
353                 }
354             }
355           else if (*s < 0x80 || *s >= 0xc0)
356             {                   /* invalid */
357               if (p)
358                 {
359                   for (i = 0; i < encidx; i++)
360                     {
361                       sprintf (p, "\\x%02x", encbuf[i]);
362                       p += 4;
363                     }
364                   sprintf (p, "\\x%02x", *s);
365                   p += 4;
366                 }
367               n += 4 + 4 * encidx;
368               nleft = 0;
369               encidx = 0;
370               resync = 1;
371             }
372           else
373             {
374               encbuf[encidx++] = *s;
375               val <<= 6;
376               val |= *s & 0x3f;
377               if (!--nleft)
378                 {               /* ready */
379                   if (no_translation)
380                     {
381                       if (p)
382                         {
383                           for (i = 0; i < encidx; i++)
384                             *p++ = encbuf[i];
385                         }
386                       n += encidx;
387                       encidx = 0;
388                     }
389                   else if (active_charset)
390                     {           /* table lookup */
391                       for (i = 0; i < 128; i++)
392                         {
393                           if (active_charset[i] == val)
394                             break;
395                         }
396                       if (i < 128)
397                         {       /* we can print this one */
398                           if (p)
399                             *p++ = i + 128;
400                           n++;
401                         }
402                       else
403                         {       /* we do not have a translation: print utf8 */
404                           if (p)
405                             {
406                               for (i = 0; i < encidx; i++)
407                                 {
408                                   sprintf (p, "\\x%02x", encbuf[i]);
409                                   p += 4;
410                                 }
411                             }
412                           n += encidx * 4;
413                           encidx = 0;
414                         }
415                     }
416                   else
417                     {           /* native set */
418                       if (val >= 0x80 && val < 256)
419                         {
420                           n++;  /* we can simply print this character */
421                           if (p)
422                             *p++ = val;
423                         }
424                       else
425                         {       /* we do not have a translation: print utf8 */
426                           if (p)
427                             {
428                               for (i = 0; i < encidx; i++)
429                                 {
430                                   sprintf (p, "\\x%02x", encbuf[i]);
431                                   p += 4;
432                                 }
433                             }
434                           n += encidx * 4;
435                           encidx = 0;
436                         }
437                     }
438                 }
439
440             }
441         }
442       if (!buffer)
443         {                       /* allocate the buffer after the first pass */
444           buffer = p = jnlib_xmalloc (n + 1);
445         }
446       else
447         {
448           *p = 0;               /* make a string */
449           return buffer;
450         }
451     }
452 }