Qt 4.8
qiconvcodec.cpp
Go to the documentation of this file.
1 /****************************************************************************
2 **
3 ** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/legal
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and Digia. For licensing terms and
14 ** conditions see http://qt.digia.com/licensing. For further information
15 ** use the contact form at http://qt.digia.com/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 2.1 requirements
23 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
24 **
25 ** In addition, as a special exception, Digia gives you certain additional
26 ** rights. These rights are described in the Digia Qt LGPL Exception
27 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
28 **
29 ** GNU General Public License Usage
30 ** Alternatively, this file may be used under the terms of the GNU
31 ** General Public License version 3.0 as published by the Free Software
32 ** Foundation and appearing in the file LICENSE.GPL included in the
33 ** packaging of this file. Please review the following information to
34 ** ensure the GNU General Public License version 3.0 requirements will be
35 ** met: http://www.gnu.org/copyleft/gpl.html.
36 **
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41 
42 #include "qiconvcodec_p.h"
43 #include "qtextcodec_p.h"
44 #include <qlibrary.h>
45 #include <qdebug.h>
46 #include <qthreadstorage.h>
47 
48 #include <errno.h>
49 #include <locale.h>
50 #include <stdio.h>
51 #include <dlfcn.h>
52 
53 // unistd.h is needed for the _XOPEN_UNIX macro
54 #include <unistd.h>
55 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
56 # include <langinfo.h>
57 #endif
58 
// Per-platform configuration for the 16-bit Unicode side of the conversion.
//  - UTF16 is the encoding-name string chosen for this platform; on FreeBSD
//    and Mac an explicit-endianness name matching Q_BYTE_ORDER is used.
//    NOTE(review): presumably this is the name handed to createIconv_t();
//    the call sites are not visible in this listing -- confirm.
//  - NO_BOM, when defined, compiles out the body of setByteOrder(), so no
//    byte order mark is fed to the conversion descriptor on that platform.
59 #if defined(Q_OS_HPUX)
60 # define NO_BOM
61 # define UTF16 "ucs2"
62 #elif defined(Q_OS_AIX)
63 # define NO_BOM
64 # define UTF16 "UCS-2"
65 #elif defined(Q_OS_FREEBSD) || defined(Q_OS_MAC)
66 # define NO_BOM
67 # if Q_BYTE_ORDER == Q_BIG_ENDIAN
68 # define UTF16 "UTF-16BE"
69 # else
70 # define UTF16 "UTF-16LE"
71 # endif
72 #else
73 # define UTF16 "UTF-16"
74 #endif
75 
// Mac only: the iconv entry points are not called directly but through
// function pointers that the QIconvCodec constructor resolves at run time
// with QLibrary. GNU_LIBICONV is forced on here, which selects the
// 'const char **' input-pointer variants used further down in this file.
76 #if defined(Q_OS_MAC)
77 #ifndef GNU_LIBICONV
78 #define GNU_LIBICONV
79 #endif
// Signatures of iconv_open(), iconv() and iconv_close() respectively.
80 typedef iconv_t (*Ptr_iconv_open) (const char*, const char*);
81 typedef size_t (*Ptr_iconv) (iconv_t, const char **, size_t *, char **, size_t *);
82 typedef int (*Ptr_iconv_close) (iconv_t);
83 
// Resolved pointer to iconv(); stays 0 until the constructor has run.
// NOTE(review): the matching ptr_iconv_open / ptr_iconv_close declarations
// are used later in the file but do not appear in this listing.
85 static Ptr_iconv ptr_iconv = 0;
87 #endif
88 
90 
91 extern bool qt_locale_initialized;
92 
94  : utf16Codec(0)
95 {
98  "QIconvCodec::convertToUnicode",
99  "internal error, UTF-16 codec not found");
100  if (!utf16Codec) {
101  fprintf(stderr, "QIconvCodec::convertToUnicode: internal error, UTF-16 codec not found\n");
102  utf16Codec = reinterpret_cast<QTextCodec *>(~0);
103  }
104 #if defined(Q_OS_MAC)
105  if (ptr_iconv_open == 0) {
106  QLibrary libiconv(QLatin1String("/usr/lib/libiconv"));
108 
109  ptr_iconv_open = reinterpret_cast<Ptr_iconv_open>(libiconv.resolve("libiconv_open"));
110  if (!ptr_iconv_open)
111  ptr_iconv_open = reinterpret_cast<Ptr_iconv_open>(libiconv.resolve("iconv_open"));
112  ptr_iconv = reinterpret_cast<Ptr_iconv>(libiconv.resolve("libiconv"));
113  if (!ptr_iconv)
114  ptr_iconv = reinterpret_cast<Ptr_iconv>(libiconv.resolve("iconv"));
115  ptr_iconv_close = reinterpret_cast<Ptr_iconv_close>(libiconv.resolve("libiconv_close"));
116  if (!ptr_iconv_close)
117  ptr_iconv_close = reinterpret_cast<Ptr_iconv_close>(libiconv.resolve("iconv_close"));
118 
120  "QIconvCodec::QIconvCodec()",
121  "internal error, could not resolve the iconv functions");
122 
123 # undef iconv_open
124 # define iconv_open ptr_iconv_open
125 # undef iconv
126 # define iconv ptr_iconv
127 # undef iconv_close
128 # define iconv_close ptr_iconv_close
129  }
130 #endif
131 }
132 
134 {
135 }
136 
138  : buffer(array), bufferLen(sizeof array), cd(x)
139 {
140 }
141 
143 {
144  if (cd != reinterpret_cast<iconv_t>(-1))
145  iconv_close(cd);
146  if (buffer != array)
147  delete[] buffer;
148 }
149 
150 void QIconvCodec::IconvState::saveChars(const char *c, int count)
151 {
152  if (count > bufferLen) {
153  if (buffer != array)
154  delete[] buffer;
155  buffer = new char[bufferLen = count];
156  }
157 
158  memcpy(buffer, c, count);
159 }
160 
162 {
163  delete reinterpret_cast<QIconvCodec::IconvState *>(state->d);
164 }
165 
167 
// Converts the len bytes at chars into a QString.
//
// The bytes are run through iconv() into a temporary byte array, and that
// array is then decoded by utf16Codec.
//
// convState selects the mode:
//  - non-null: stateful. The IconvState lives in convState->d. A trailing
//    incomplete multi-byte sequence is saved in the state and reported via
//    convState->remainingChars; it is prepended to the input of the next
//    call. convState->invalidChars receives the number of EILSEQ errors.
//  - null: stateless. The IconvState comes from the thread-local
//    toUnicodeState() storage, or from a temporary that is deleted before
//    returning when qt_locale_initialized is false or the storage is gone.
//    The iconv descriptor is reset after the conversion.
//
// Falls back to QString::fromLatin1(chars, len) when utf16Codec is the ~0
// sentinel, when the conversion descriptor is (iconv_t)-1, or when iconv()
// fails with an error other than E2BIG, EILSEQ or EINVAL.
168 QString QIconvCodec::convertToUnicode(const char* chars, int len, ConverterState *convState) const
169 {
170  if (utf16Codec == reinterpret_cast<QTextCodec *>(~0))
171  return QString::fromLatin1(chars, len);
172 
173  int invalidCount = 0;
174  int remainingCount = 0;
175  char *remainingBuffer = 0;
176  IconvState *temporaryState = 0;
177  IconvState **pstate;
178 
179  if (convState) {
180  // stateful conversion
181  pstate = reinterpret_cast<IconvState **>(&convState->d);
182  if (convState->d) {
183  // restore state
184  remainingCount = convState->remainingChars;
185  remainingBuffer = (*pstate)->buffer;
186  } else {
187  // first time
188  convState->flags |= FreeFunction;
// NOTE(review): one statement of this branch (source line 189) is missing
// from this listing; presumably it registers the function that frees
// convState->d -- confirm against the real source.
190  }
191  } else {
192  QThreadStorage<QIconvCodec::IconvState *> *ts = toUnicodeState();
193  if (!qt_locale_initialized || !ts) {
194  // we're running after the Q_GLOBAL_STATIC has been deleted
195  // or before the QCoreApplication initialization
196  // bad programmer, no cookie for you
197  pstate = &temporaryState;
198  } else {
199  // stateless conversion -- use thread-local data
200  pstate = &toUnicodeState()->localData();
201  }
202  }
203 
204  if (!*pstate) {
205  // first time, create the state
// NOTE(review): 'cd' is used below without a visible declaration; the line
// that creates the conversion descriptor (source line 206) is missing from
// this listing.
207  if (cd == reinterpret_cast<iconv_t>(-1)) {
// the warning is printed only on the first failure
208  static int reported = 0;
209  if (!reported++) {
210  fprintf(stderr,
211  "QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv_open failed\n");
212  }
213  return QString::fromLatin1(chars, len);
214  }
215 
216  *pstate = new IconvState(cd);
217  }
218 
219  IconvState *state = *pstate;
220  size_t inBytesLeft = len;
221  // best case assumption, each byte is converted into one UTF-16 character, plus 2 bytes for the BOM
222 #ifdef GNU_LIBICONV
223  // GNU doesn't disagree with POSIX :/
224  const char *inBytes = chars;
225 #else
226  char *inBytes = const_cast<char *>(chars);
227 #endif
228 
229  QByteArray in;
230  if (remainingCount) {
231  // we have to prepend the remaining bytes from the previous conversion
232  inBytesLeft += remainingCount;
233  in.resize(inBytesLeft);
234  inBytes = in.data();
235 
236  memcpy(in.data(), remainingBuffer, remainingCount);
237  memcpy(in.data() + remainingCount, chars, len);
238 
239  remainingCount = 0;
240  }
241 
// Initial output size: two bytes per input byte of this call plus two more.
// Bytes carried over from a previous call are not counted here; the E2BIG
// branch below grows the array when this turns out to be too small.
242  size_t outBytesLeft = len * 2 + 2;
243  QByteArray ba(outBytesLeft, Qt::Uninitialized);
244  char *outBytes = ba.data();
245  do {
246  size_t ret = iconv(state->cd, &inBytes, &inBytesLeft, &outBytes, &outBytesLeft);
247  if (ret == (size_t) -1) {
248  if (errno == E2BIG) {
// output array full: double it and continue writing at the same offset
// (resize may reallocate, so outBytes is recomputed from ba.data())
249  int offset = ba.size() - outBytesLeft;
250  ba.resize(ba.size() * 2);
251  outBytes = ba.data() + offset;
252  outBytesLeft = ba.size() - offset;
253 
254  continue;
255  }
256 
257  if (errno == EILSEQ) {
258  // conversion stopped because of an invalid character in the sequence
259  ++invalidCount;
260  } else if (errno == EINVAL && convState) {
261  // conversion stopped because the remaining inBytesLeft make up
262  // an incomplete multi-byte sequence; save them for later
263  state->saveChars(inBytes, inBytesLeft);
264  remainingCount = inBytesLeft;
265  break;
266  }
267 
// reached for EILSEQ, and for EINVAL in stateless mode (no convState)
268  if (errno == EILSEQ || errno == EINVAL) {
269  // skip the next character
270  ++inBytes;
271  --inBytesLeft;
272  continue;
273  }
274 
275  // some other error
276  // note, cannot use qWarning() since we are implementing the codecForLocale :)
277  perror("QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv failed");
278 
279  if (!convState) {
280  // reset state
281  iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
282  }
283 
284  delete temporaryState;
285  return QString::fromLatin1(chars, len);
286  }
287  } while (inBytesLeft != 0);
288 
289  QString s;
290 
// ba.size() - outBytesLeft is the number of bytes iconv() actually wrote
291  if (convState) {
292  s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft, &state->internalState);
293 
294  convState->invalidChars = invalidCount;
295  convState->remainingChars = remainingCount;
296  } else {
297  s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft);
298 
299  // reset state
300  iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
301  }
302 
303  delete temporaryState;
304  return s;
305 }
306 
308 
// Primes the conversion descriptor cd by feeding it the contents of 'bom'
// through iconv(); the converted output goes to a local scratch buffer and
// is discarded.
//
// Returns false if that iconv() call fails ((size_t)-1), true otherwise.
// When NO_BOM is defined the body is compiled out and the function simply
// returns true.
309 static bool setByteOrder(iconv_t cd)
310 {
311 #if !defined(NO_BOM)
312  // give iconv() a BOM
313  char buf[4];
// NOTE(review): 'bom' is used below but its declaration (source line 314)
// is missing from this listing; presumably it holds a single byte order
// mark character -- confirm against the real source.
315 
316  char *outBytes = buf;
317  char *inBytes = reinterpret_cast<char *>(bom);
318  size_t outBytesLeft = sizeof buf;
319  size_t inBytesLeft = sizeof bom;
320 
// GNU libiconv declares the input pointer as 'const char **'
321 #if defined(GNU_LIBICONV)
322  const char **inBytesPtr = const_cast<const char **>(&inBytes);
323 #else
324  char **inBytesPtr = &inBytes;
325 #endif
326 
327  if (iconv(cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) {
328  return false;
329  }
330 #endif // NO_BOM
331 
332  return true;
333 }
334 
335 QByteArray QIconvCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *convState) const
336 {
337  char *inBytes;
338  char *outBytes;
339  size_t inBytesLeft;
340 
341 #if defined(GNU_LIBICONV)
342  const char **inBytesPtr = const_cast<const char **>(&inBytes);
343 #else
344  char **inBytesPtr = &inBytes;
345 #endif
346 
347  IconvState *temporaryState = 0;
348  QThreadStorage<QIconvCodec::IconvState *> *ts = fromUnicodeState();
349  IconvState *&state = (qt_locale_initialized && ts) ? ts->localData() : temporaryState;
350  if (!state) {
352  if (cd != reinterpret_cast<iconv_t>(-1)) {
353  if (!setByteOrder(cd)) {
354  perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed for BOM");
355 
356  iconv_close(cd);
357  cd = reinterpret_cast<iconv_t>(-1);
358 
359  return QString(uc, len).toLatin1();
360  }
361  }
362  state = new IconvState(cd);
363  }
364  if (state->cd == reinterpret_cast<iconv_t>(-1)) {
365  static int reported = 0;
366  if (!reported++) {
367  fprintf(stderr,
368  "QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv_open failed\n");
369  }
370  delete temporaryState;
371  return QString(uc, len).toLatin1();
372  }
373 
374  size_t outBytesLeft = len;
375  QByteArray ba(outBytesLeft, Qt::Uninitialized);
376  outBytes = ba.data();
377 
378  // now feed iconv() the real data
379  inBytes = const_cast<char *>(reinterpret_cast<const char *>(uc));
380  inBytesLeft = len * sizeof(QChar);
381 
382  QByteArray in;
383  if (convState && convState->remainingChars) {
384  // we have one surrogate char to be prepended
385  in.resize(sizeof(QChar) + len);
386  inBytes = in.data();
387 
388  QChar remaining = convState->state_data[0];
389  memcpy(in.data(), &remaining, sizeof(QChar));
390  memcpy(in.data() + sizeof(QChar), uc, inBytesLeft);
391 
392  inBytesLeft += sizeof(QChar);
393  convState->remainingChars = 0;
394  }
395 
396  int invalidCount = 0;
397  while (inBytesLeft != 0) {
398  if (iconv(state->cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) {
399  if (errno == EINVAL && convState) {
400  // buffer ends in a surrogate
401  Q_ASSERT(inBytesLeft == 2);
402  convState->remainingChars = 1;
403  convState->state_data[0] = uc[len - 1].unicode();
404  break;
405  }
406 
407  switch (errno) {
408  case EILSEQ:
409  ++invalidCount;
410  // fall through
411  case EINVAL:
412  {
413  inBytes += sizeof(QChar);
414  inBytesLeft -= sizeof(QChar);
415  break;
416  }
417  case E2BIG:
418  {
419  int offset = ba.size() - outBytesLeft;
420  ba.resize(ba.size() * 2);
421  outBytes = ba.data() + offset;
422  outBytesLeft = ba.size() - offset;
423  break;
424  }
425  default:
426  {
427  // note, cannot use qWarning() since we are implementing the codecForLocale :)
428  perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed");
429 
430  // reset to initial state
431  iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
432 
433  delete temporaryState;
434  return QString(uc, len).toLatin1();
435  }
436  }
437  }
438  }
439 
440  // reset to initial state
441  iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
442  setByteOrder(state->cd);
443 
444  ba.resize(ba.size() - outBytesLeft);
445 
446  if (convState)
447  convState->invalidChars = invalidCount;
448 
449  delete temporaryState;
450  return ba;
451 }
452 
454 {
455  return "System";
456 }
457 
459 {
460  return 0;
461 }
462 
463 iconv_t QIconvCodec::createIconv_t(const char *to, const char *from)
464 {
465  Q_ASSERT((to == 0 && from != 0) || (to != 0 && from == 0));
466 
467  iconv_t cd = (iconv_t) -1;
468 #if defined(__GLIBC__) || defined(GNU_LIBICONV) || defined(Q_OS_QNX)
469 #if defined(Q_OS_QNX)
470  // on QNX the default locale is UTF-8, and an empty string will cause iconv_open to fail
471  static const char empty_codeset[] = "UTF-8";
472 #else
473  // both GLIBC and libgnuiconv will use the locale's encoding if from or to is an empty string
474  static const char empty_codeset[] = "";
475 #endif
476  const char *codeset = empty_codeset;
477  cd = iconv_open(to ? to : codeset, from ? from : codeset);
478 #else
479  char *codeset = 0;
480 #endif
481 
482 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
483  if (cd == (iconv_t) -1) {
484  codeset = nl_langinfo(CODESET);
485  if (codeset)
486  cd = iconv_open(to ? to : codeset, from ? from : codeset);
487  }
488 #endif
489 
490  if (cd == (iconv_t) -1) {
491  // Very poorly defined and followed standards causes lots of
492  // code to try to get all the cases... This logic is
493  // duplicated in QTextCodec, so if you change it here, change
494  // it there too.
495 
496  // Try to determine locale codeset from locale name assigned to
497  // LC_CTYPE category.
498 
499  // First part is getting that locale name. First try setlocale() which
500  // definitely knows it, but since we cannot fully trust it, get ready
501  // to fall back to environment variables.
502  char * ctype = qstrdup(setlocale(LC_CTYPE, 0));
503 
504  // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
505  // environment variables.
506  char * lang = qstrdup(qgetenv("LC_ALL").constData());
507  if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
508  if (lang) delete [] lang;
509  lang = qstrdup(qgetenv("LC_CTYPE").constData());
510  }
511  if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
512  if (lang) delete [] lang;
513  lang = qstrdup(qgetenv("LANG").constData());
514  }
515 
516  // Now try these in order:
517  // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
518  // 2. CODESET from lang if it contains a .CODESET part
519  // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
520  // 4. locale (ditto)
521  // 5. check for "@euro"
522 
523  // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
524  codeset = ctype ? strchr(ctype, '.') : 0;
525  if (codeset && *codeset == '.') {
526  ++codeset;
527  cd = iconv_open(to ? to : codeset, from ? from : codeset);
528  }
529 
530  // 2. CODESET from lang if it contains a .CODESET part
531  codeset = lang ? strchr(lang, '.') : 0;
532  if (cd == (iconv_t) -1 && codeset && *codeset == '.') {
533  ++codeset;
534  cd = iconv_open(to ? to : codeset, from ? from : codeset);
535  }
536 
537  // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
538  if (cd == (iconv_t) -1 && ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
539  cd = iconv_open(to ? to : ctype, from ? from : ctype);
540 
541 
542  // 4. locale (ditto)
543  if (cd == (iconv_t) -1 && lang && *lang != 0)
544  cd = iconv_open(to ? to : lang, from ? from : lang);
545 
546  // 5. "@euro"
547  if ((cd == (iconv_t) -1 && ctype && strstr(ctype, "@euro")) || (lang && strstr(lang, "@euro")))
548  cd = iconv_open(to ? to : "ISO8859-15", from ? from : "ISO8859-15");
549 
550  delete [] ctype;
551  delete [] lang;
552  }
553 
554  return cd;
555 }
556 
Q_CORE_EXPORT QTextStream & bom(QTextStream &s)
static Ptr_iconv ptr_iconv
Definition: qiconvcodec.cpp:85
ConverterState internalState
Definition: qiconvcodec_p.h:90
Q_CORE_EXPORT QByteArray qgetenv(const char *varName)
#define iconv_close
unsigned char c[8]
Definition: qnumeric_p.h:62
#define QT_END_NAMESPACE
This macro expands to.
Definition: qglobal.h:90
char * data()
Returns a pointer to the data stored in the byte array.
Definition: qbytearray.h:429
ushort unicode() const
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: qchar.h:251
QTextCodec * utf16Codec
Definition: qiconvcodec_p.h:71
The QByteArray class provides an array of bytes.
Definition: qbytearray.h:135
QLatin1String(DBUS_INTERFACE_DBUS))) Q_GLOBAL_STATIC_WITH_ARGS(QString
void * resolve(const char *symbol)
Returns the address of the exported symbol symbol.
Definition: qlibrary.cpp:1155
The QString class provides a Unicode character string.
Definition: qstring.h:83
static bool setByteOrder(iconv_t cd)
#define Q_ASSERT(cond)
Definition: qglobal.h:1823
QByteArray name() const
QTextCodec subclasses must reimplement this function.
The QChar class provides a 16-bit Unicode character.
Definition: qchar.h:72
#define iconv
#define QT_BEGIN_NAMESPACE
This macro expands to.
Definition: qglobal.h:89
int mibEnum() const
Subclasses of QTextCodec must reimplement this function.
#define Q_GLOBAL_STATIC(TYPE, NAME)
Declares a global static variable with the given type and name.
Definition: qglobal.h:1968
T & localData()
Returns a reference to the data that was set by the calling thread.
iconv_t(* Ptr_iconv_open)(const char *, const char *)
Definition: qiconvcodec.cpp:80
void * iconv_t
Definition: qiconvcodec_p.h:61
static QTextCodec * codecForMib(int mib)
Returns the QTextCodec which matches the MIBenum mib.
QByteArray toLatin1() const Q_REQUIRED_RESULT
Returns a Latin-1 representation of the string as a QByteArray.
Definition: qstring.cpp:3993
static Ptr_iconv_open ptr_iconv_open
Definition: qiconvcodec.cpp:84
const char * constData() const
Returns a pointer to the data stored in the byte array.
Definition: qbytearray.h:433
QString toUnicode(const QByteArray &) const
Converts the byte array a from the encoding of this codec to Unicode, and returns the result in a QString.
#define Q_ASSERT_X(cond, where, what)
Definition: qglobal.h:1837
static void qIconvCodecStateFree(QTextCodec::ConverterState *state)
static void encode(uint *dst, QTextCodecStateFreeFunction fn)
Definition: qtextcodec_p.h:73
static Ptr_iconv_close ptr_iconv_close
Definition: qiconvcodec.cpp:86
Q_CORE_EXPORT char * qstrdup(const char *)
unsigned short ushort
Definition: qglobal.h:995
QString convertToUnicode(const char *, int, ConverterState *) const
QTextCodec subclasses must reimplement this function.
static QString fromLatin1(const char *, int size=-1)
Returns a QString initialized with the first size characters of the Latin-1 string str...
Definition: qstring.cpp:4188
QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const
QTextCodec subclasses must reimplement this function.
void saveChars(const char *c, int count)
void resize(int size)
Sets the size of the byte array to size bytes.
int size() const
Returns the number of bytes in this byte array.
Definition: qbytearray.h:402
void setLoadHints(LoadHints hints)
Definition: qlibrary.cpp:1304
#define iconv_open
bool qt_locale_initialized
int(* Ptr_iconv_close)(iconv_t)
Definition: qiconvcodec.cpp:82
The QTextCodec class provides conversions between text encodings.
Definition: qtextcodec.h:62
size_t(* Ptr_iconv)(iconv_t, const char **, size_t *, char **, size_t *)
Definition: qiconvcodec.cpp:81
The QThreadStorage class provides per-thread data storage.
#define UTF16
Definition: qiconvcodec.cpp:68
int errno
The QLibrary class loads shared libraries at runtime.
Definition: qlibrary.h:62
static iconv_t createIconv_t(const char *to, const char *from)