Qt 4.8
Static Public Functions | List of all members
QUtf8 Struct Reference

#include <qutfcodec_p.h>

Static Public Functions

static QByteArray convertFromUnicode (const QChar *, int, QTextCodec::ConverterState *)
 
static QString convertToUnicode (const char *, int, QTextCodec::ConverterState *)
 

Detailed Description

Definition at line 68 of file qutfcodec_p.h.

Functions

◆ convertFromUnicode()

QByteArray QUtf8::convertFromUnicode ( const QChar *  uc,
int  len,
QTextCodec::ConverterState *  state 
)
static

Definition at line 64 of file qutfcodec.cpp.

Referenced by QUtf8Codec::convertFromUnicode(), QUtf16Codec::QUtf16Codec(), QUtf32Codec::QUtf32Codec(), QString::toUtf8(), and QStringRef::toUtf8().

65 {
66  uchar replacement = '?';
67  int rlen = 3*len;
68  int surrogate_high = -1;
69  if (state) {
71  replacement = 0;
72  if (!(state->flags & QTextCodec::IgnoreHeader))
73  rlen += 3;
74  if (state->remainingChars)
75  surrogate_high = state->state_data[0];
76  }
77 
78  QByteArray rstr;
79  rstr.resize(rlen);
80  uchar* cursor = (uchar*)rstr.data();
81  const QChar *ch = uc;
82  int invalid = 0;
83  if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
84  *cursor++ = 0xef;
85  *cursor++ = 0xbb;
86  *cursor++ = 0xbf;
87  }
88 
89  const QChar *end = ch + len;
90  while (ch < end) {
91  uint u = ch->unicode();
92  if (surrogate_high >= 0) {
93  if (ch->isLowSurrogate()) {
94  u = QChar::surrogateToUcs4(surrogate_high, u);
95  surrogate_high = -1;
96  } else {
97  // high surrogate without low
98  *cursor = replacement;
99  ++ch;
100  ++invalid;
101  surrogate_high = -1;
102  continue;
103  }
104  } else if (ch->isLowSurrogate()) {
105  // low surrogate without high
106  *cursor = replacement;
107  ++ch;
108  ++invalid;
109  continue;
110  } else if (ch->isHighSurrogate()) {
111  surrogate_high = u;
112  ++ch;
113  continue;
114  }
115 
116  if (u < 0x80) {
117  *cursor++ = (uchar)u;
118  } else {
119  if (u < 0x0800) {
120  *cursor++ = 0xc0 | ((uchar) (u >> 6));
121  } else {
122  // is it one of the Unicode non-characters?
123  if (isUnicodeNonCharacter(u)) {
124  *cursor++ = replacement;
125  ++ch;
126  ++invalid;
127  continue;
128  }
129 
130  if (u > 0xffff) {
131  *cursor++ = 0xf0 | ((uchar) (u >> 18));
132  *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
133  } else {
134  *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
135  }
136  *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
137  }
138  *cursor++ = 0x80 | ((uchar) (u&0x3f));
139  }
140  ++ch;
141  }
142 
143  rstr.resize(cursor - (const uchar*)rstr.constData());
144  if (state) {
145  state->invalidChars += invalid;
147  state->remainingChars = 0;
148  if (surrogate_high >= 0) {
149  state->remainingChars = 1;
150  state->state_data[0] = surrogate_high;
151  }
152  }
153  return rstr;
154 }
char * data()
Returns a pointer to the data stored in the byte array.
Definition: qbytearray.h:429
ushort unicode() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: qchar.h:251
The QByteArray class provides an array of bytes.
Definition: qbytearray.h:135
bool isLowSurrogate() const
Returns true if the QChar is the low part of a utf16 surrogate (ie.
Definition: qchar.h:279
quint16 u
bool isHighSurrogate() const
Returns true if the QChar is the high part of a utf16 surrogate (ie.
Definition: qchar.h:276
The QChar class provides a 16-bit Unicode character.
Definition: qchar.h:72
unsigned char uchar
Definition: qglobal.h:994
unsigned int uint
Definition: qglobal.h:996
ConversionFlags flags
Definition: qtextcodec.h:106
const char * constData() const
Returns a pointer to the data stored in the byte array.
Definition: qbytearray.h:433
static bool isUnicodeNonCharacter(uint ucs4)
Definition: qutfcodec.cpp:51
void resize(int size)
Sets the size of the byte array to size bytes.
static uint surrogateToUcs4(ushort high, ushort low)
Converts a UTF16 surrogate pair with the given high and low values to its UCS-4 code point...
Definition: qchar.h:297
static const KeyPair *const end

◆ convertToUnicode()

QString QUtf8::convertToUnicode ( const char *  chars,
int  len,
QTextCodec::ConverterState *  state 
)
static

Definition at line 156 of file qutfcodec.cpp.

Referenced by QUtf8Codec::convertToUnicode(), QString::fromUtf8(), QUtf16Codec::QUtf16Codec(), and QUtf32Codec::QUtf32Codec().

157 {
158  bool headerdone = false;
159  ushort replacement = QChar::ReplacementCharacter;
160  int need = 0;
161  int error = -1;
162  uint uc = 0;
163  uint min_uc = 0;
164  if (state) {
165  if (state->flags & QTextCodec::IgnoreHeader)
166  headerdone = true;
168  replacement = QChar::Null;
169  need = state->remainingChars;
170  if (need) {
171  uc = state->state_data[0];
172  min_uc = state->state_data[1];
173  }
174  }
175  if (!headerdone && len > 3
176  && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
177  // starts with a byte order mark
178  chars += 3;
179  len -= 3;
180  headerdone = true;
181  }
182 
183  QString result(need + len + 1, Qt::Uninitialized); // worst case
184  ushort *qch = (ushort *)result.unicode();
185  uchar ch;
186  int invalid = 0;
187 
188  for (int i = 0; i < len; ++i) {
189  ch = chars[i];
190  if (need) {
191  if ((ch&0xc0) == 0x80) {
192  uc = (uc << 6) | (ch & 0x3f);
193  --need;
194  if (!need) {
195  // utf-8 bom composes into 0xfeff code point
196  bool nonCharacter;
197  if (!headerdone && uc == 0xfeff) {
198  // don't do anything, just skip the BOM
199  } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) {
200  // surrogate pair
201  Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
202  *qch++ = QChar::highSurrogate(uc);
203  *qch++ = QChar::lowSurrogate(uc);
204  } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) {
205  // error: overlong sequence, UTF16 surrogate or non-character
206  *qch++ = replacement;
207  ++invalid;
208  } else {
209  *qch++ = uc;
210  }
211  headerdone = true;
212  }
213  } else {
214  // error
215  i = error;
216  *qch++ = replacement;
217  ++invalid;
218  need = 0;
219  headerdone = true;
220  }
221  } else {
222  if (ch < 128) {
223  *qch++ = ushort(ch);
224  headerdone = true;
225  } else if ((ch & 0xe0) == 0xc0) {
226  uc = ch & 0x1f;
227  need = 1;
228  error = i;
229  min_uc = 0x80;
230  headerdone = true;
231  } else if ((ch & 0xf0) == 0xe0) {
232  uc = ch & 0x0f;
233  need = 2;
234  error = i;
235  min_uc = 0x800;
236  } else if ((ch&0xf8) == 0xf0) {
237  uc = ch & 0x07;
238  need = 3;
239  error = i;
240  min_uc = 0x10000;
241  headerdone = true;
242  } else {
243  // error
244  *qch++ = replacement;
245  ++invalid;
246  headerdone = true;
247  }
248  }
249  }
250  if (!state && need > 0) {
251  // unterminated UTF sequence
252  for (int i = error; i < len; ++i) {
253  *qch++ = replacement;
254  ++invalid;
255  }
256  }
257  result.truncate(qch - (ushort *)result.unicode());
258  if (state) {
259  state->invalidChars += invalid;
260  state->remainingChars = need;
261  if (headerdone)
263  state->state_data[0] = need ? uc : 0;
264  state->state_data[1] = need ? min_uc : 0;
265  }
266  return result;
267 }
#define error(msg)
The QString class provides a Unicode character string.
Definition: qstring.h:83
#define Q_ASSERT(cond)
Definition: qglobal.h:1823
static ushort highSurrogate(uint ucs4)
Returns the high surrogate value of a ucs4 code point.
Definition: qchar.h:303
static ushort lowSurrogate(uint ucs4)
Returns the low surrogate value of a ucs4 code point.
Definition: qchar.h:306
unsigned char uchar
Definition: qglobal.h:994
unsigned int uint
Definition: qglobal.h:996
ConversionFlags flags
Definition: qtextcodec.h:106
static bool isUnicodeNonCharacter(uint ucs4)
Definition: qutfcodec.cpp:51
unsigned short ushort
Definition: qglobal.h:995

The documentation for this struct was generated from the following files: