Qt 4.8
Static Public Functions | Private Types | Private Functions | Static Private Functions | List of all members
QPatternist::CompressedWhitespace Class Reference

A compression facility for whitespace nodes. More...

#include <qcompressedwhitespace_p.h>

Static Public Functions

static QString compress (const QStringRef &input)
 Compresses input into a compressed format, returned as a QString. More...
 
static QString decompress (const QString &input)
 Decompresses input into a usual QString. More...
 

Private Types

enum  CharIdentifier { Space = 0x0, CR = 0x80, LF = 0x40, Tab = 0xC0 }
 
enum  Constants { MaxCharCount = (1 << 6) - 1, Lower8Bits = (1 << 8) - 1, Lower6Bits = (1 << 6) - 1, UpperTwoBits = 3 << 6 }
 

Private Functions

 CompressedWhitespace ()
 This class can only be used via its static members. More...
 

Static Private Functions

static bool isEven (const int number)
 Returns true if number is an even number, otherwise false. More...
 
static QChar toChar (const CharIdentifier id)
 
static quint8 toCompressedChar (const QChar ch, const int len)
 
static CharIdentifier toIdentifier (const QChar ch)
 

Detailed Description

A compression facility for whitespace nodes.

CompressedWhitespace compresses and decompresses strings that consist of whitespace only, and does so with a scheme designed to perform this specialized task efficiently. The approach is simple: each sequence of equal whitespace characters in the input is coded into one byte, where the upper two bits signal the type (see CharIdentifier) and the remaining six bits hold the count.

For instance, this scheme manages to compress a sequence of spaces (up to 63, the largest count that fits in six bits) followed by a newline into 16 bits (one QChar), and QString stores strings of one QChar quite efficiently, by avoiding a heap allocation.

There is no way to tell whether a QString is compressed or not.

The compression scheme originates from Saxon, by Michael Kay.

Author
Frans Englich frans.nosp@m..eng.nosp@m.lich@.nosp@m.noki.nosp@m.a.com

Definition at line 87 of file qcompressedwhitespace_p.h.

Enumerations

◆ CharIdentifier

We use the two upper bits to communicate which whitespace character it is.

Enumerator
Space 
CR 

0xD, \r

Binary: 10000000

LF 

0xA, \n

Binary: 01000000

Tab 

Binary: 11000000

Definition at line 117 of file qcompressedwhitespace_p.h.

◆ Constants

Enumerator
MaxCharCount 
Lower8Bits 

Binary: 11111111

Lower6Bits 

Binary: 111111

UpperTwoBits 

Definition at line 141 of file qcompressedwhitespace_p.h.

142  {
143  /* We can at maximum store this many consecutive characters
144  * of one type. We use 6 bits for the count. */
145  MaxCharCount = (1 << 6) - 1,
146 
150  Lower8Bits = (1 << 8) - 1,
151 
155  Lower6Bits = (1 << 6) - 1,
156 
157  /*
158  * Binary: 11000000
159  */
160  UpperTwoBits = 3 << 6
161  };

Constructors and Destructors

◆ CompressedWhitespace()

QPatternist::CompressedWhitespace::CompressedWhitespace ( )
inline private

This class can only be used via its static members.

Functions

◆ compress()

QString CompressedWhitespace::compress ( const QStringRef & input)
static

Compresses input into a compressed format, returned as a QString.

The caller guarantees that input is not empty and consists only of whitespace.

The returned format is opaque. There is no way to find out whether a QString contains compressed data or not.

See also
decompress()

Definition at line 101 of file qcompressedwhitespace.cpp.

102 {
103  Q_ASSERT(!isEven(1) && isEven(0) && isEven(2));
104  Q_ASSERT(!input.isEmpty());
105 
106  QString result;
107  const int len = input.length();
108 
109  /* The amount of compressed characters. For instance, if input is
110  * four spaces followed by one tab, compressedChars will be 2, and the resulting
111  * QString will have a length of 1, two compressedChars stored in one QChar. */
112  int compressedChars = 0;
113 
114  for(int i = 0; i < len; ++i)
115  {
116  const QChar c(input.at(i));
117 
118  int start = i;
119 
120  while(true)
121  {
122  if(i + 1 == input.length() || input.at(i + 1) != c)
123  break;
124  else
125  ++i;
126  }
127 
128  /* The length of subsequent whitespace characters in the input. */
129  int wsLen = (i - start) + 1;
130 
131  /* We might get a sequence of whitespace that is so long, that we can't
132  * store it in one unit/byte. In that case we chop it into as many subsequent
133  * ones that is needed. */
134  while(true)
135  {
136  const int unitLength = qMin(wsLen, int(MaxCharCount));
137  wsLen -= unitLength;
138 
139  ushort resultCP = toCompressedChar(c, unitLength);
140 
141  if(isEven(compressedChars))
142  result += QChar(resultCP);
143  else
144  {
145  resultCP = resultCP << 8;
146  resultCP |= result.at(result.size() - 1).unicode();
147  result[result.size() - 1] = resultCP;
148  }
149 
150  ++compressedChars;
151 
152  if(wsLen == 0)
153  break;
154  }
155  }
156 
157  return result;
158 }
unsigned char c[8]
Definition: qnumeric_p.h:62
Q_DECL_CONSTEXPR const T & qMin(const T &a, const T &b)
Definition: qglobal.h:1215
int length() const
Returns the number of characters referred to by the string reference.
Definition: qstring.h:1116
static quint8 toCompressedChar(const QChar ch, const int len)
The QString class provides a Unicode character string.
Definition: qstring.h:83
#define Q_ASSERT(cond)
Definition: qglobal.h:1823
The QChar class provides a 16-bit Unicode character.
Definition: qchar.h:72
bool isEmpty() const
Returns true if the string reference has no characters; otherwise returns false.
Definition: qstring.h:1169
static bool isEven(const int number)
Returns true if number is an even number, otherwise false.
unsigned short ushort
Definition: qglobal.h:995
const QChar at(int i) const
Returns the character at the given index position in the string reference.
Definition: qstring.h:1174

◆ decompress()

QString CompressedWhitespace::decompress ( const QString & input)
static

Decompresses input into a usual QString.

input must be a QString as returned by compress().

See also
compress()

Definition at line 160 of file qcompressedwhitespace.cpp.

Referenced by QPatternist::AccelTree::printStats(), and QPatternist::AccelTree::stringValue().

161 {
162  Q_ASSERT(!input.isEmpty());
163  const int len = input.length() * 2;
164  QString retval;
165 
166  for(int i = 0; i < len; ++i)
167  {
168  ushort cp = input.at(i / 2).unicode();
169 
170  if(isEven(i))
171  cp &= Lower8Bits;
172  else
173  {
174  cp = cp >> 8;
175 
176  if(cp == 0)
177  return retval;
178  }
179 
180  const quint8 wsLen = cp & Lower6Bits;
181  const quint8 id = cp & UpperTwoBits;
182 
183  /* Resize retval, and fill in on the top. */
184  const int oldSize = retval.size();
185  const int newSize = retval.size() + wsLen;
186  retval.resize(newSize);
187  const QChar ch(toChar(CharIdentifier(id)));
188 
189  for(int f = oldSize; f < newSize; ++f)
190  retval[f] = ch;
191  }
192 
193  return retval;
194 }
const QChar at(int i) const
Returns the character at the given index position in the string.
Definition: qstring.h:698
ushort unicode() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: qchar.h:251
int length() const
Returns the number of characters in this string.
Definition: qstring.h:696
unsigned char quint8
Definition: qglobal.h:934
The QString class provides a Unicode character string.
Definition: qstring.h:83
#define Q_ASSERT(cond)
Definition: qglobal.h:1823
The QChar class provides a 16-bit Unicode character.
Definition: qchar.h:72
int size() const
Returns the number of characters in this string.
Definition: qstring.h:102
bool isEmpty() const
Returns true if the string has no characters; otherwise returns false.
Definition: qstring.h:704
void resize(int size)
Sets the size of the string to size characters.
Definition: qstring.cpp:1353
static QChar toChar(const CharIdentifier id)
static bool isEven(const int number)
Returns true if number is an even number, otherwise false.
unsigned short ushort
Definition: qglobal.h:995

◆ isEven()

bool CompressedWhitespace::isEven ( const int  number)
inline static private

Returns true if number is an even number, otherwise false.

Definition at line 71 of file qcompressedwhitespace.cpp.

Referenced by compress(), and decompress().

72 {
73  Q_ASSERT(number >= 0);
74  return number % 2 == 0;
75 }
#define Q_ASSERT(cond)
Definition: qglobal.h:1823

◆ toChar()

QChar CompressedWhitespace::toChar ( const CharIdentifier  id)
inline static private

Definition at line 85 of file qcompressedwhitespace.cpp.

Referenced by decompress().

86 {
87  switch(id)
88  {
89  case Space: return QLatin1Char(' ');
90  case CR: return QLatin1Char('\r');
91  case LF: return QLatin1Char('\n');
92  case Tab: return QLatin1Char('\t');
93  default:
94  {
95  Q_ASSERT_X(false, Q_FUNC_INFO, "Unexpected input");
96  return QChar();
97  }
98  }
99 }
The QChar class provides a 16-bit Unicode character.
Definition: qchar.h:72
#define Q_ASSERT_X(cond, where, what)
Definition: qglobal.h:1837
The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
Definition: qchar.h:55
#define Q_FUNC_INFO
Definition: qglobal.h:1871

◆ toCompressedChar()

quint8 CompressedWhitespace::toCompressedChar ( const QChar  ch,
const int  len 
)
inline static private

Definition at line 77 of file qcompressedwhitespace.cpp.

Referenced by compress().

78 {
79  Q_ASSERT(len > 0);
80  Q_ASSERT(len <= MaxCharCount);
81 
82  return len + toIdentifier(ch);
83 }
#define Q_ASSERT(cond)
Definition: qglobal.h:1823
static CharIdentifier toIdentifier(const QChar ch)

◆ toIdentifier()

CompressedWhitespace::CharIdentifier CompressedWhitespace::toIdentifier ( const QChar  ch)
inline static private

Definition at line 50 of file qcompressedwhitespace.cpp.

Referenced by toCompressedChar().

51 {
52  switch(ch.unicode())
53  {
54  case ' ':
55  return Space;
56  case '\n':
57  return LF;
58  case '\r':
59  return CR;
60  case '\t':
61  return Tab;
62  default:
63  {
64  Q_ASSERT_X(false, Q_FUNC_INFO,
65  "The caller must guarantee only whitespace is passed.");
66  return Tab;
67  }
68  }
69 }
ushort unicode() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: qchar.h:251
#define Q_ASSERT_X(cond, where, what)
Definition: qglobal.h:1837
#define Q_FUNC_INFO
Definition: qglobal.h:1871

The documentation for this class was generated from the following files: