A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0, and delivers tokens to the Bison generated parser. More...

#include <qxquerytokenizer_p.h>

Inheritance diagram for QPatternist::XQueryTokenizer:

Public Types
enum	State { AfterAxisSeparator, AposAttributeContent, Axis, Default, ElementContent, EndTag, ItemType, KindTest, KindTestForPI, NamespaceDecl, NamespaceKeyword, OccurrenceIndicator, Operator, Pragma, PragmaContent, ProcessingInstructionContent, ProcessingInstructionName, QuotAttributeContent, StartTag, VarName, XMLComment, XMLSpaceDecl, XQueryVersion }

Public Types inherited from QPatternist::Tokenizer
typedef QExplicitlySharedDataPointer< Tokenizer >	Ptr

Public Types inherited from QPatternist::TokenSource
typedef QExplicitlySharedDataPointer< TokenSource >	Ptr

typedef QQueue< Ptr >	Queue

typedef yytokentype	TokenType

Public Functions
virtual int	commenceScanOnly ()

virtual Token	nextToken (YYLTYPE *const sourceLocator)

virtual void	resumeTokenizationFrom (const int position)

virtual void	setParserContext (const ParserContext::Ptr &parseInfo)

	XQueryTokenizer (const QString &query, const QUrl &location, const State startingState=Default)

Public Functions inherited from QPatternist::Tokenizer
const QUrl &	queryURI () const

	Tokenizer (const QUrl &queryU)

Public Functions inherited from QPatternist::TokenSource
	TokenSource ()

virtual	~TokenSource ()

Public Functions inherited from QSharedData
	QSharedData ()
	Constructs a QSharedData object with a reference count of 0. More...

	QSharedData (const QSharedData &)
	Constructs a QSharedData object with reference count 0. More...

Private Types
typedef QSet< int >	CharacterSkips

Private Functions
bool	aheadEquals (const char *const chs, const int len, const int offset=1) const

bool	atEnd () const

Token	attributeAsRaw (const QChar separator, int &stack, const int startPos, const bool inLiteral, QString &result)

QChar	charForReference (const QString &reference)

Tokenizer::TokenType	consumeComment ()
	Parses comments: `(: comment content :)`. It recurses for parsing nested comments. More...

bool	consumeRawWhitespace ()

TokenType	consumeWhitespace ()

const QChar	current () const

Token	nextToken ()

char	peekAhead (const int length=1) const

char	peekCurrent () const
	Returns the character at the current position, converted to `ASCII`. More...

int	peekForColonColon () const

void	popState ()

void	pushState (const State state)

void	pushState ()

int	scanUntil (const char *const content)

void	setState (const State s)

State	state () const

Token	tokenAndAdvance (const TokenType code, const int advance=1)

Token	tokenAndChangeState (const TokenType code, const State state, const int advance=1)

Token	tokenAndChangeState (const TokenType code, const QString &value, const State state)

QString	tokenizeCharacterReference ()

Token	tokenizeNCName ()

Token	tokenizeNCNameOrQName ()

Token	tokenizeNumberLiteral ()

Token	tokenizeStringLiteral ()

Static Private Functions
static Token	error ()

static bool	isDigit (const char ch)

static bool	isNCNameBody (const QChar ch)

static bool	isNCNameStart (const QChar ch)

static bool	isOperatorKeyword (const TokenType)

static bool	isPhraseKeyword (const TokenType code)

static bool	isTypeToken (const TokenType t)

static const TokenMap *	lookupKeyword (const QString &keyword)

static QString	normalizeEOL (const QString &input, const CharacterSkips &characterSkips)

Properties
QHash< QString, QChar >	m_charRefs

int	m_columnOffset

const QString	m_data

const int	m_length

int	m_line

const NamePool::Ptr	m_namePool

int	m_pos

bool	m_scanOnly

State	m_state

QStack< State >	m_stateStack

QStack< Token >	m_tokenStack

Additional Inherited Members
Public Variables inherited from QSharedData
QAtomicInt	ref

Static Protected Functions inherited from QPatternist::Tokenizer
static QString	tokenToString (const Token &token)

Detailed Description

A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0, and delivers tokens to the Bison generated parser.

Author: Frans Englich frans.nosp@m..eng.nosp@m.lich@.nosp@m.noki.nosp@m.a.com

Definition at line 76 of file qxquerytokenizer_p.h.

Typedefs

◆ CharacterSkips

typedef QSet<int> QPatternist::XQueryTokenizer::CharacterSkips

private

A set of indexes into a QString, the one being passed to normalizeEOL() whose characters shouldn't be normalized.

Definition at line 260 of file qxquerytokenizer_p.h.

Enumerations

◆ State

enum QPatternist::XQueryTokenizer::State

Tokenizer states. Organized alphabetically.

Enumerator
AfterAxisSeparator
AposAttributeContent
Axis
Default
ElementContent
EndTag
ItemType
KindTest
KindTestForPI
NamespaceDecl
NamespaceKeyword
OccurrenceIndicator
Operator
Pragma
PragmaContent
ProcessingInstructionContent
ProcessingInstructionName
QuotAttributeContent
StartTag
VarName
XMLComment
XMLSpaceDecl
XQueryVersion

Definition at line 82 of file qxquerytokenizer_p.h.

         {
             /* Tokenizer states. Keep the enumerators in alphabetical order. */
             AfterAxisSeparator,
             AposAttributeContent,
             Axis,
             Default,
             ElementContent,
             EndTag,
             ItemType,
             KindTest,
             KindTestForPI,
             NamespaceDecl,
             NamespaceKeyword,
             OccurrenceIndicator,
             Operator,
             Pragma,
             PragmaContent,
             ProcessingInstructionContent,
             ProcessingInstructionName,
             QuotAttributeContent,
             StartTag,
             VarName,
             XMLComment,
             XMLSpaceDecl,
             XQueryVersion
         };

Constructors and Destructors

◆ XQueryTokenizer()

QPatternist::XQueryTokenizer::XQueryTokenizer	(	const QString &	query,
		const QUrl &	location,
		const State	startingState = `Default`
	)

Definition at line 62 of file qxquerytokenizer.cpp.

                                                             : Tokenizer(location)
                                                             , m_data(query)
                                                             , m_length(query.length())
                                                             , m_state(startingState)
                                                             , m_pos(0)           /* Scanning starts at the first character of the query. */
                                                             , m_line(1)          /* Line counting starts at 1. */
                                                             , m_columnOffset(0)
                                                             , m_scanOnly(false)  /* Switched on by commenceScanOnly(). */
 {
     /* An empty location is accepted, but a non-empty one must be a valid URI. */
     Q_ASSERT(location.isValid() || location.isEmpty());
 }

Functions

◆ aheadEquals()

bool QPatternist::XQueryTokenizer::aheadEquals	(	const char *const	chs,
		const int	len,
		const int	offset = `1`
	)		const

inlineprivate

Returns: whether the stream, starting offset characters after the current position, matches chs. The length of chs is len.

Definition at line 693 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     Q_ASSERT(len > 0);
     Q_ASSERT(qstrlen(chs) == uint(len));
 
     if(m_pos + len >= m_length)
         return false;
 
     for(int i = offset; i < (len + offset); ++i)
     {
         if(m_data.at(m_pos + i).toAscii() != chs[i - offset])
             return false;
     }
 
     return true;
 }

◆ atEnd()

bool QPatternist::XQueryTokenizer::atEnd ( ) const

inlineprivate

Definition at line 270 of file qxquerytokenizer_p.h.

Referenced by attributeAsRaw(), consumeComment(), and nextToken().

         {
             return m_pos == m_length;
         }

◆ attributeAsRaw()

Tokenizer::Token QPatternist::XQueryTokenizer::attributeAsRaw	(	const QChar	separator,
		int &	stack,
		const int	startPos,
		const bool	inLiteral,
		QString &	result
	)

private

Instead of recognizing and tokenizing embedded expressions in direct attribute constructors, this function is essentially a mini recursive-descent parser that has the necessary logic to recognize embedded expressions and their potentially interfering string literals, in order to scan to the very end of the attribute value, and return the whole as a string.

There are of course syntax errors this function will not detect, but that is ok since the attributes will be parsed once more.

An inelegant solution, but which gets the job done.

See also: commenceScanOnly(), resumeTokenizationFrom()

Definition at line 2061 of file qxquerytokenizer.cpp.

Referenced by atEnd(), and nextToken().

 {
     /* Scans an attribute value from m_pos onwards and appends its raw text to
      * result. Enclosed expressions, "{ ... }", are handled by recursing.
      *
      * NOTE(review): this body uses the names sep, sepStack and aInLiteral;
      * presumably they are the definition's names for the parameters that the
      * declaration calls separator, stack and inLiteral -- confirm.
      *
      * Returns, as far as this body shows:
      * - END_OF_FILE if the input ends before the value does
      * - QUOTE or APOS if the separator is found at startPos
      * - STRING_LITERAL, carrying result, when the separator is found while
      *   sepStack is zero
      * - ERROR if a character reference couldn't be tokenized
      * - SUCCESS when a recursive call hits the '}' that closes its expression
      */
     bool inLiteral = aInLiteral;
     /* The quote character that is not the separator of this attribute. */
     const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
 
     while(true)
     {
         if(atEnd())
             return END_OF_FILE;
 
         if(peekCurrent() == sep.unicode())
         {
             /* Each separator flips the in-literal flag. */
             if(inLiteral)
                 inLiteral = false;
             else
                 inLiteral = true;
 
             if(peekAhead() == sep.unicode())
             {
                 /* The quoting mechanism was used. */
                 /* Only one of the two separators is appended; both are skipped. */
                 result.append(current());
                 m_pos += 2;
                 continue;
             }
             else
             {
                 /* Don't consume the separator, such that we
                  * return a token for it next time. */
                 if(m_pos == startPos)
                 {
                     /* The separator sits at the start position: consume it,
                      * go back to the StartTag state and report the quote. */
                     ++m_pos;
                     setState(StartTag);
                     return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
                 }
 
 
                 if(sepStack == 0)
                 {
                     /* Not inside any enclosed expression: the separator ends
                      * the value. m_pos is left pointing at the separator. */
                     return Token(STRING_LITERAL, result);
                 }
                 else
                 {
                     /* Inside an enclosed expression the separator is copied
                      * like any other character. */
                     result.append(current());
                     ++m_pos;
                     continue;
                 }
             }
         }
         else if(peekCurrent() == '&')
         {
             /* A null string from tokenizeCharacterReference() is treated as
              * an error; anything else is appended to the result. */
             const QString ret(tokenizeCharacterReference());
             if(ret.isNull())
                 return Token(ERROR);
             else
             {
                 result.append(ret);
                 /* NOTE(review): presumably tokenizeCharacterReference() leaves
                  * m_pos on the last character of the reference -- confirm. */
                 ++m_pos;
                 continue;
             }
         }
         else if(peekCurrent() == otherSep)
         {
             result.append(current());
             ++m_pos;
 
             /* A second, directly following, quote of the other kind is
              * skipped without being appended. */
             if(peekCurrent() == otherSep)
                 ++m_pos;
 
             if(inLiteral)
                 inLiteral = false;
             else
                 inLiteral = true;
 
             continue;
         }
         else if(peekCurrent() == '{')
         {
             result.append(current());
 
             if(peekAhead() == '{')
             {
                 /* "{{": one brace has been appended, both are skipped. */
                 m_pos += 2;
                 continue;
             }
             else
             {
                 /* Start of an enclosed expression: increase the nesting depth
                  * and scan it recursively, starting outside any literal. Any
                  * result other than SUCCESS ends the scan and is passed on. */
                 ++m_pos;
                 ++sepStack;
                 const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
                 if(t.type != SUCCESS)
                     return t;
             }
 
         }
         else if(peekCurrent() == '}')
         {
             if(inLiteral && peekAhead() == '}')
             {
                 /* "}}" inside a literal: one brace is appended, both are skipped. */
                 result.append(current());
                 m_pos += 2;
                 continue;
             }
             else
             {
                 /* End of the enclosed expression: the brace is consumed, but
                  * not appended to result, and the nesting depth decreases. */
                 ++m_pos;
                 --sepStack;
                 return Token(SUCCESS); /* The return value is arbitrary. */
             }
         }
         else
         {
             /* Any other character is copied verbatim. */
             result.append(current());
             ++m_pos;
         }
     }
 }

◆ charForReference()

QChar QPatternist::XQueryTokenizer::charForReference ( const QString & reference )

private

Returns the character corresponding to the builtin reference reference. For instance, passing gt will give you '>' in return.

If reference is an invalid character reference, a null QChar is returned.

See also: QChar::isNull()

Definition at line 607 of file qxquerytokenizer.cpp.

Referenced by tokenizeCharacterReference().

 {
     if(m_charRefs.isEmpty())
     {
         /* Initialize. */
         m_charRefs.reserve(5);
         m_charRefs.insert(QLatin1String("lt"),     QLatin1Char('<'));
         m_charRefs.insert(QLatin1String("gt"),     QLatin1Char('>'));
         m_charRefs.insert(QLatin1String("amp"),    QLatin1Char('&'));
         m_charRefs.insert(QLatin1String("quot"),   QLatin1Char('"'));
         m_charRefs.insert(QLatin1String("apos"),   QLatin1Char('\''));
     }
 
     return m_charRefs.value(reference);
 }

◆ commenceScanOnly()

int QPatternist::XQueryTokenizer::commenceScanOnly ( )

virtual

Switches the Tokenizer to only do scanning, and returns complete strings for attribute value templates as opposed to the tokens for the contained expressions.

The current position in the stream is returned. It can be used to later resume regular tokenization.

Implements QPatternist::Tokenizer.

Definition at line 2229 of file qxquerytokenizer.cpp.

 {
     m_scanOnly = true;
     return m_pos;
 }

◆ consumeComment()

Tokenizer::TokenType QPatternist::XQueryTokenizer::consumeComment ( )

private

Parses comments: (: comment content :). It recurses for parsing nested comments.

It is assumed that the start token for the comment, "(:", has already been parsed.

Typically, don't call this function directly, but consumeWhitespace(), which calls it when it encounters the start of a comment.

See also: XML Path Language (XPath) 2.0, 2.6 Comments

Returns

SUCCESS if everything went ok
ERROR if there was an error in parsing one or more comments
END_OF_FILE if the end was reached

Definition at line 188 of file qxquerytokenizer.cpp.

Referenced by consumeWhitespace().

 {
     /* Below, we return ERROR instead of END_OF_FILE such that the parser
      * sees an invalid comment. */
     while(m_pos < m_length)
     {
         switch(peekCurrent())
         {
             case ':':
             {
                 ++m_pos; /* Consume ':' */
                 if(atEnd())
                     return ERROR;
 
                 if(peekCurrent() == ')')
                 {
                     ++m_pos; /* Consume ')' */
                     return SUCCESS; /* The comment closed nicely. */
                 }
                 continue; /* We don't want to increment m_pos twice. */
             }
             case '(':
             { /* It looks like the start of a comment. */
                 ++m_pos;
 
                 if(atEnd())
                     return END_OF_FILE;
                 else if(peekCurrent() == ':')
                 {
                     /* And it is a nested comment -- parse it. */
                     const TokenType retval = consumeComment();
                     if(retval == SUCCESS)
                         continue; /* Continue with our "own" comment. */
                     else
                         return retval; /* Return the error in the nested comment. */
                 }
                 break;
             }
             case '\n':
             /* Fallthrough. */
             case '\r':
             {
                 /* We want to count \r\n as a single line break. */
                 if(peekAhead() == '\n')
                     ++m_pos;
 
                 m_columnOffset = m_pos;
                 ++m_line;
 
                 break;
             }
         }
         ++m_pos;
     }
 
     return ERROR; /* Error: we reached the end while inside a comment. */
 }

◆ consumeRawWhitespace()

bool QPatternist::XQueryTokenizer::consumeRawWhitespace ( )

inlineprivate

Consumes only whitespace, in the traditional sense. The function exits if non-whitespace is encountered, such as the start of a comment.

Returns: true if the end was reached, otherwise false

Definition at line 246 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     while(m_pos < m_length)
     {
         switch(peekCurrent())
         {
             case ' ':
             case '\t':
                 break;
             case '\n':
             case '\r':
             {
                 if(peekAhead() == '\n')
                     ++m_pos;
 
                 m_columnOffset = m_pos;
                 ++m_line;
 
                 break;
             }
             default:
                 return false;
         }
         ++m_pos;
     }
     return true;
 }

◆ consumeWhitespace()

Tokenizer::TokenType QPatternist::XQueryTokenizer::consumeWhitespace ( )

inlineprivate

Definition at line 274 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     while(m_pos < m_length)
     {
         switch(peekCurrent())
         {
             case ' ':
             case '\t':
                 break;
             case '\n':
             case '\r':
             {
                 /* We want to count \r\n as a single line break. */
                 if(peekAhead() == '\n')
                     ++m_pos;
 
                 m_columnOffset = m_pos;
                 ++m_line;
 
                 break;
             }
             case '(':
             {
                 if(peekAhead() == ':')
                 {
                     m_pos += 2; /* Consume "(:" */
 
                     const TokenType comment = consumeComment();
                     if(comment == SUCCESS)
                         continue;
                     else
                         return comment;
                 }
             }
             default:
                 return SUCCESS;
         }
         ++m_pos;
     }
 
     return END_OF_FILE;
 }

◆ current()

const QChar QPatternist::XQueryTokenizer::current ( ) const

inlineprivate

Disregarding encoding conversion, equivalent to calling:

peekAhead(0);

Definition at line 76 of file qxquerytokenizer.cpp.

Referenced by attributeAsRaw(), nextToken(), peekCurrent(), tokenizeNCName(), tokenizeNumberLiteral(), and tokenizeStringLiteral().

 {
     if(m_pos < m_length)
         return m_data.at(m_pos);
     else
         return QChar();
 }

◆ error()

Tokenizer::Token QPatternist::XQueryTokenizer::error ( )

inlinestaticprivate

Definition at line 325 of file qxquerytokenizer.cpp.

Referenced by nextToken(), tokenizeNCName(), tokenizeNumberLiteral(), and tokenizeStringLiteral().

 {
     return Token(ERROR);
 }

◆ isDigit()

bool QPatternist::XQueryTokenizer::isDigit ( const char ch )

inlinestaticprivate

Definition at line 330 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     return ch >= '0' && ch <= '9';
 }

◆ isNCNameBody()

bool QPatternist::XQueryTokenizer::isNCNameBody ( const QChar ch )

inlinestaticprivate

Definition at line 354 of file qxquerytokenizer.cpp.

Referenced by tokenizeNCName().

 {
     switch(ch.unicode())
     {
         case '.':
         case '_':
         case '-':
             return true;
     }
 
     switch(ch.category())
     {
         case QChar::Letter_Lowercase:
         case QChar::Letter_Uppercase:
         case QChar::Letter_Other:
         case QChar::Letter_Titlecase:
         case QChar::Number_Letter:
         case QChar::Mark_SpacingCombining:
         case QChar::Mark_Enclosing:
         case QChar::Mark_NonSpacing:
         case QChar::Letter_Modifier:
         case QChar::Number_DecimalDigit:
             return true;
         default:
             return false;
     }
 }

◆ isNCNameStart()

bool QPatternist::XQueryTokenizer::isNCNameStart ( const QChar ch )

inlinestaticprivate

Definition at line 336 of file qxquerytokenizer.cpp.

Referenced by nextToken(), tokenizeNCName(), and tokenizeNumberLiteral().

 {
     if(ch == QLatin1Char('_'))
         return true;
 
     switch(ch.category())
     {
         case QChar::Letter_Lowercase:
         case QChar::Letter_Uppercase:
         case QChar::Letter_Other:
         case QChar::Letter_Titlecase:
         case QChar::Number_Letter:
             return true;
         default:
             return false;
     }
 }

◆ isOperatorKeyword()

bool QPatternist::XQueryTokenizer::isOperatorKeyword ( const TokenType code )

inlinestaticprivate

Definition at line 406 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     switch(code)
     {
         /* Fallthrough all these. */
         case AS:
         case ASCENDING:
         case AT:
         case CASE:
         case CAST:
         case CASTABLE:
         case EQ:
         case EXTERNAL:
         case GE:
         case G_EQ:
         case G_GT:
         case G_LT:
         case G_NE:
         case GT:
         case IN:
         case INHERIT:
         case INSTANCE:
         case IS:
         case ITEM:
         case LE:
         case LT:
         case NE:
         case NO_INHERIT:
         case NO_PRESERVE:
         case OF:
         case PRESERVE:
         case RETURN:
         case STABLE:
         case TO:
         case TREAT:
             return true;
         default:
             return false;
     };
 }

◆ isPhraseKeyword()

bool QPatternist::XQueryTokenizer::isPhraseKeyword ( const TokenType code )

inlinestaticprivate

Determines whether code is a keyword that is followed by a second keyword, for instance the `declare` in `declare function`.

Definition at line 382 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     switch(code)
     {
         /* Fallthrough all these. */
         case CASTABLE:
         case CAST:
         case COPY_NAMESPACES:
         case DECLARE:
         case EMPTY:
         case MODULE:
         case IMPORT:
         case INSTANCE:
         case ORDER:
         case ORDERING:
         case XQUERY:
         case STABLE:
         case TREAT:
             return true;
         default:
             return false;
     }
 }

◆ isTypeToken()

bool QPatternist::XQueryTokenizer::isTypeToken ( const TokenType t )

staticprivate

Definition at line 447 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     switch(t)
     {
         /* Fallthrough all these. */
         case ATTRIBUTE:
         case COMMENT:
         case DOCUMENT:
         case DOCUMENT_NODE:
         case ELEMENT:
         case ITEM:
         case NODE:
         case PROCESSING_INSTRUCTION:
         case SCHEMA_ATTRIBUTE:
         case SCHEMA_ELEMENT:
         case TEXT:
             return true;
         default:
             return false;
     }
 }

◆ lookupKeyword()

const TokenMap * QPatternist::XQueryTokenizer::lookupKeyword ( const QString & keyword )

inlinestaticprivate

Definition at line 712 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     return TokenLookup::value(keyword.toAscii().constData(), keyword.length());
 }

◆ nextToken() [1/2]

Tokenizer::Token QPatternist::XQueryTokenizer::nextToken ( YYLTYPE *const sourceLocator )

virtual

Returns: the next token.

Implements QPatternist::TokenSource.

Definition at line 2182 of file qxquerytokenizer.cpp.

 {
     sourceLocator->first_line = m_line;
     sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
 
     if(m_tokenStack.isEmpty())
         return nextToken();
     else
     {
         const Token retval(m_tokenStack.pop());
 
         switch(retval.type)
         {
             case MODULE:
             /* Fallthrough.*/
             case SCHEMA:
             /* Fallthrough.*/
             case COPY_NAMESPACES:
             {
                 setState(NamespaceKeyword);
                 break;
             }
             case VERSION:
             {
                 setState(XQueryVersion);
                 break;
             }
             case AS:
             /* Fallthrough. */
             case OF:
             {
                 setState(ItemType);
                 break;
             }
             default:
             {
                 if(isOperatorKeyword(retval.type))
                     setState(Default);
 
                 break;
             }
         };
 
         return retval;
     }
 }

◆ nextToken() [2/2]

Tokenizer::Token QPatternist::XQueryTokenizer::nextToken ( )

private

Definition at line 745 of file qxquerytokenizer.cpp.

Referenced by atEnd(), and nextToken().

 {
     switch(state())
     {
         /* We want to skip or do special whitespace handling for these
          * states. So fallthrough all of the following. */
         case AposAttributeContent:
         case Axis:
         case ElementContent:
         case EndTag:
         case Pragma:
         case PragmaContent:
         case ProcessingInstructionName:
         case QuotAttributeContent:
         case StartTag:
         case XMLComment:
             break;
         default:
             handleWhitespace();
     }
 
     switch(state())
     {
         case XMLSpaceDecl:
         /* Fallthrough. */
         case NamespaceKeyword:
         {
             switch(peekCurrent())
             {
                 case ',':
                     return tokenAndAdvance(COMMA);
                 case '"':
                 /* Fallthrough. */
                 case '\'':
                 {
                     setState(NamespaceDecl);
                     return tokenizeStringLiteral();
                 }
             }
 
             const Token id(tokenizeNCName());
 
             if(id.type != NCNAME)
                 return id;
 
             const TokenMap *const keyword = lookupKeyword(id.value);
             if(keyword)
             {
                 switch(keyword->token)
                 {
                     case INHERIT:
                     /* Fallthrough. */
                     case NO_INHERIT:
                     {
                         setState(Default);
                         break;
                     }
                     case NAMESPACE:
                     {
                         setState(NamespaceDecl);
                         break;
                     }
                     case ORDERED:
                     /* Fallthrough. */
                     case UNORDERED:
                     /* Fallthrough. */
                     case STRIP:
                     {
                         setState(Default);
                         break;
                     }
                     case PRESERVE:
                     {
                         if(state() != NamespaceKeyword)
                             setState(Default);
                     }
                     default:
                         break;
                 }
 
                 return Token(keyword->token);
             }
             else
                 return id;
 
             Q_ASSERT(false);
         }
         case NamespaceDecl:
         {
             switch(peekCurrent())
             {
                 case '=':
                     return tokenAndAdvance(G_EQ);
                 case ';':
                     return tokenAndChangeState(SEMI_COLON, Default);
                 case '\'':
                 /* Fallthrough. */
                 case '\"':
                     return tokenizeStringLiteral();
             }
 
             const Token nc(tokenizeNCName());
 
             handleWhitespace();
 
             const char pc = peekCurrent();
             const TokenMap* const t = lookupKeyword(nc.value);
 
             if(pc == '\'' || (pc == '"' && t))
                 return tokenAndChangeState(t->token, Default, 0);
             else
                 return nc;
 
             Q_ASSERT(false);
         }
         case Axis:
         {
             if(peekCurrent() == ':')
             {
                 Q_ASSERT(peekAhead() == ':');
                 m_pos += 2;
                 setState(AfterAxisSeparator);
                 return Token(COLONCOLON);
             }
             /* Fallthrough. */
         }
         case AfterAxisSeparator:
         /* Fallthrough. */
         case Default:
            /* State Operator and state Default have a lot of tokens in common except
             * for minor differences. So we treat them the same way, and sprinkles logic
             * here and there to handle the small differences. */
         /* Fallthrough. */
         case Operator:
         {
             switch(peekCurrent())
             {
                 case '=':
                     return tokenAndChangeState(G_EQ, Default);
                 case '-':
                     return tokenAndChangeState(MINUS, Default);
                 case '+':
                     return tokenAndChangeState(PLUS, Default);
                 case '[':
                     return tokenAndChangeState(LBRACKET, Default);
                 case ']':
                     return tokenAndChangeState(RBRACKET, Operator);
                 case ',':
                     return tokenAndChangeState(COMMA, Default);
                 case ';':
                     return tokenAndChangeState(SEMI_COLON, Default);
                 case '$':
                     return tokenAndChangeState(DOLLAR, VarName);
                 case '|':
                     return tokenAndChangeState(BAR, Default);
                 case '?':
                     return tokenAndChangeState(QUESTION, Operator);
                 case ')':
                     return tokenAndChangeState(RPAREN, Operator);
                 case '@':
                     return tokenAndChangeState(AT_SIGN, Default);
                 /* Fallthrough all these. */
                 case '1':
                 case '2':
                 case '3':
                 case '4':
                 case '5':
                 case '6':
                 case '7':
                 case '8':
                 case '9':
                 case '0':
                     return tokenizeNumberLiteral();
                 case '.':
                 {
                     const char next = peekAhead();
                     if(next == '.')
                         return tokenAndChangeState(DOTDOT, Operator, 2);
                     /* .5 is allowed, as short form for 0.5:
                      * <tt>[142]     DecimalLiteral     ::=     ("." Digits) | (Digits "." [0-9]*)</tt>
                      */
                     else if(isDigit(next))
                         return tokenizeNumberLiteral();
                     else
                         return tokenAndChangeState(DOT, Operator);
                 }
                 case '\'':
                 /* Fallthrough. */
                 case '"':
                 {
                     setState(Operator);
                     return tokenizeStringLiteral();
 
                 }
                 case '(':
                 {
                     if(peekAhead() == '#')
                         return tokenAndChangeState(PRAGMA_START, Pragma, 2);
                     else
                         return tokenAndChangeState(LPAREN, Default);
                 }
                 case '*':
                 {
                     if(peekAhead() == ':')
                     {
                         m_pos += 2; /* Consume *:. */
                         const Token nc = tokenizeNCName();
 
                         if(nc.hasError())
                             return error();
                         else
                             return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
                     }
                     else
                         return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
                 }
                 case ':':
                 {
                     switch(peekAhead())
                     {
                         case '=':
                             return tokenAndChangeState(ASSIGN, Default, 2);
                         case ':':
                             return tokenAndChangeState(COLONCOLON, Default, 2);
                         default:
                             return error();
                     }
                 }
                 case '!':
                 {
                     if(peekAhead() == '=')
                         return tokenAndChangeState(G_NE, Default, 2);
                     else
                         return error();
                 }
                 case '<':
                 {
                     switch(peekAhead())
                     {
                         case '=':
                             return tokenAndChangeState(G_LE, Default, 2);
                         case '<':
                             return tokenAndChangeState(PRECEDES, Default, 2);
                         case '?':
                         {
                             pushState(Operator);
                             return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
                         }
                         case '!':
                         {
                             if(aheadEquals("!--", 3))
                             {
                                 m_pos += 3; /* Consume "!--". */
                                 pushState(Operator);
                                 return tokenAndChangeState(COMMENT_START, XMLComment);
                             }
                             /* Fallthrough. It's a syntax error, and this is a good way to report it. */
                         }
                         default:
                         {
                             if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
                             {
                                 /* We assume it's an element constructor. */
                                 pushState(Operator);
                             }
 
                             return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
                         }
                     }
                 }
                 case '>':
                 {
                     switch(peekAhead())
                     {
                         case '=':
                             return tokenAndChangeState(G_GE, Default, 2);
                         case '>':
                             return tokenAndChangeState(FOLLOWS, Default, 2);
                         default:
                             return tokenAndChangeState(G_GT, Default);
                     }
                 }
                 case '/':
                 {
                     if(peekAhead() == '/')
                         return tokenAndChangeState(SLASHSLASH, Default, 2);
                     else
                         return tokenAndChangeState(SLASH, Default);
                 }
                 case '{':
                 {
                     pushState(Operator);
                     return tokenAndChangeState(CURLY_LBRACE, Default);
                 }
                 case '}':
                 {
                     popState();
 
                     return tokenAndAdvance(CURLY_RBRACE);
                 }
             }
 
             /* Ok. We're in state Default or Operator, and it wasn't a simple
              * character. */
 
             const Token id(tokenizeNCName());
 
             if(id.type != NCNAME)
                 return id;
 
             const TokenMap *const keyword = lookupKeyword(id.value);
 
             if(state() == Operator)
             {
                 if(keyword)
                 {
                     if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING)
                         setState(Operator);
                     else if(keyword->token == RETURN)
                         setState(Default);
                     else if(isPhraseKeyword(keyword->token))
                     {
                         const TokenType ws = consumeWhitespace();
                         if(ws == ERROR)
                             return error();
 
                         const Token id2(tokenizeNCName());
                         const TokenMap *const keyword2 = lookupKeyword(id2.value);
 
                         if(keyword2)
                         {
                             if(keyword->token == TREAT && keyword2->token == AS)
                                 setState(ItemType);
                             else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY)
                                 setState(Default);
 
                             m_tokenStack.push(Token(keyword2->token));
                         }
                         else
                             m_tokenStack.push(id2);
 
                         return Token(keyword->token);
                     }
                     else
                     {
                         /* Such that we tokenize the second token in "empty greatest". */
                         if(keyword->token != EMPTY)
                             setState(Default);
                     }
 
                     if(keyword->token == AS || keyword->token == CASE)
                         setState(ItemType);
 
                     return Token(keyword->token);
                 }
                 else
                     return id;
             }
 
             Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
 
             /*
              * This is hard. Consider this:
              *
              * Valid:           child       ::nameTest
              * Valid:           child::     nameTest
              * Syntax Error:    child       :localName
              * Syntax Error:    child:      localName
              *
              * Consider "child ::name". Right now, we're here:
              *                ^
              * We don't know whether "child" is a prefix and hence the whitespace is invalid,
              * or whether it's an axis and hence skippable. */
             {
                 const int wsLength = peekForColonColon();
                 /* We cannot call handleWhitespace() because it returns on
                  * END_OF_FILE, and we have parsed up keyword, and we need to
                  * deal with that.
                  *
                  * If we have a colon colon, which means the whitespace is
                  * allowed, we skip it. */
                 if(wsLength != -1)
                     m_pos += wsLength;
             }
 
             /* Handle name tests. */
             if(peekCurrent() == ':')
             {
                 switch(peekAhead())
                 {
                     case '=':
                         return id;
                     case '*':
                     {
                         m_pos += 2;
                         return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
                     }
                     case ':':
                     {
                         /* We have an axis. */
                         setState(Axis);
                         return keyword ? Token(keyword->token) : id;
                     }
                     default:
                     {
                         /* It's a QName. */
                         ++m_pos; /* Consume the colon. */
 
                         const Token id2(tokenizeNCName());
 
                         if(id2.type != NCNAME)
                         {
                             --m_pos;
                             return id;
                         }
 
                         setState(Operator);
                         const int qNameLen = id.value.length() + id2.value.length() + 1;
                         return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
                     }
                 }
             }
 
             if(!keyword || isOperatorKeyword(keyword->token))
             {
                 setState(Operator);
                 return id;
             }
 
             const TokenType ws = consumeWhitespace();
             if(ws == ERROR) // TODO this should test for success. Write test.
                 return Token(ERROR);
 
             if(atEnd())
             {
                 setState(Operator);
                 return id;
             }
 
             /* Let the if-body apply for constructors, and node type tests. */
             if(isTypeToken(keyword->token) ||
                keyword->token == TYPESWITCH ||
                keyword->token == ORDERED ||
                keyword->token == UNORDERED ||
                keyword->token == IF)
             {
                 switch(peekCurrent())
                 {
                     case '(':
                     {
                         // TODO See if we can remove DOCUMENT from isTypeToken.
                         if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
                         {
                             m_tokenStack.push(Token(LPAREN));
                             ++m_pos; /* Consume '('. */
                             pushState(Operator);
 
                             if(keyword->token == PROCESSING_INSTRUCTION)
                                 setState(KindTestForPI);
                             else
                                 setState(KindTest);
 
                             return Token(keyword->token);
                         }
                         else if(keyword->token == TYPESWITCH || keyword->token == IF)
                             return Token(keyword->token);
                         else /* It's a function call. */
                             return id;
                     }
                     case '{':
                     {
                         m_tokenStack.push(Token(CURLY_LBRACE));
                         ++m_pos; /* Consume '{'. */
                         pushState(Operator);
                         /* Stay in state Default. */
                         return Token(keyword->token);
                     }
                     default:
                     {
                         /* We have read in a token which is for instance
                          * "return", and now it can be an element
                          * test("element") a node kind test("element()"), or a
                          * computed element constructor("element name {...").
                          * We need to do a two-token lookahead here, because
                          * "element return" can be an element test followed by
                          * the return keyword, but it can also be an element
                          * constructor("element return {"). */
                         if(isNCNameStart(current()))
                         {
                             const int currentPos = m_pos;
                             const Token token2 = tokenizeNCNameOrQName();
 
                             if(token2.hasError())
                                 return token2;
 
                             handleWhitespace();
 
                             if(peekCurrent() == '{')
                             {
                                 /* An element constructor. */
                                 m_tokenStack.push(token2);
                                 return Token(keyword->token);
                             }
 
                             /* We jump back in the stream, we need to tokenize token2 according
                              * to the state. */
                             m_pos = currentPos;
                             setState(Operator);
                             return Token(NCNAME, QLatin1String(keyword->name));
                         }
                     }
                 }
             }
 
             if(peekCurrent() == '$')
             {
                 setState(VarName);
                 return Token(keyword->token);
             }
 
             /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
             if(peekCurrent() == '(')
                 return id;
             else if(peekCurrent() == '{' && keyword->token == VALIDATE)
                 return Token(keyword->token);
 
             if(!isNCNameStart(current()))
             {
                 setState(Operator);
                 return id;
             }
 
             const Token id2(tokenizeNCName());
             const TokenMap *const keyword2 = lookupKeyword(id2.value);
 
             if(!keyword2)
             {
                 /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
                 setState(Operator);
                 return id;
             }
 
             switch(keyword->token)
             {
                 case DECLARE:
                 {
                     switch(keyword2->token)
                     {
                         case VARIABLE:
                         /* Fallthrough. */
                         case FUNCTION:
                         {
                             m_tokenStack.push(Token(keyword2->token));
                             setState(Default);
                             return Token(keyword->token);
                         }
                         case OPTION:
                         {
                             m_tokenStack.push(Token(keyword2->token));
                             setState(Default);
                             return Token(keyword->token);
                         }
                         case COPY_NAMESPACES:
                         /* Fallthrough. */
                         case ORDERING:
                         {
                             m_tokenStack.push(Token(keyword2->token));
                             setState(NamespaceKeyword);
                             return Token(keyword->token);
                         }
                         case CONSTRUCTION:
                         {
                             // TODO identical to CONSTRUCTION?
                             m_tokenStack.push(Token(keyword2->token));
                             setState(Operator);
                             return Token(keyword->token);
                         }
                         case NAMESPACE:
                         /* Fallthrough. */
                         case BASEURI:
                         {
                             m_tokenStack.push(Token(keyword2->token));
                             setState(NamespaceDecl);
                             return Token(keyword->token);
                         }
                         case BOUNDARY_SPACE:
                         {
                             m_tokenStack.push(Token(keyword2->token));
                             setState(XMLSpaceDecl);
                             return Token(keyword->token);
                         }
                         case DEFAULT:
                         {
                             m_tokenStack.push(Token(keyword2->token));
 
                             const TokenType ws2 = consumeWhitespace();
                             if(ws2 != SUCCESS)
                             {
                                 m_tokenStack.prepend(Token(ws2));
                                 return Token(keyword->token);
                             }
 
                             const Token id3(tokenizeNCName());
 
                             if(id3.type != NCNAME)
                             {
                                 m_tokenStack.prepend(id3);
                                 return Token(keyword->token);
                             }
 
                             const TokenMap *const keyword3 = lookupKeyword(id3.value);
                             if(!keyword3)
                             {
                                 m_tokenStack.prepend(id3);
                                 return Token(keyword->token);
                             }
                             else
                             {
                                 m_tokenStack.prepend(Token(keyword3->token));
 
                                 if(keyword3->token == ORDER)
                                     setState(Operator);
                                 else
                                     setState(NamespaceDecl);
                             }
 
                             return Token(keyword->token);
                         }
                         default:
                         {
                             m_tokenStack.push(Token(keyword2->token));
                             setState(Default);
                             return id;
                         }
                     }
                 }
                 case XQUERY:
                 {
                     m_tokenStack.push(Token(keyword2->token));
 
                     if(keyword2->token == VERSION)
                     {
                         setState(NamespaceDecl);
                         return Token(keyword->token);
                     }
                     else
                     {
                         setState(Operator);
                         return id;
                     }
                 }
                 case IMPORT:
                 {
                     m_tokenStack.push(Token(keyword2->token));
 
                     switch(keyword2->token)
                     {
                         case SCHEMA:
                         /* Fallthrough. */
                         case MODULE:
                         {
                             setState(NamespaceKeyword);
                             return Token(keyword->token);
                         }
                         default:
                         {
                             setState(Operator);
                             return id;
                         }
                     }
                 }
                 case VALIDATE:
                 {
                     m_tokenStack.push(Token(keyword2->token));
 
                     switch(keyword2->token)
                     {
                         case LAX:
                         case STRICT:
                         {
                             pushState(Operator);
                             return Token(keyword->token);
                         }
                         default:
                         {
                             setState(Operator);
                             return id;
                         }
                     }
                 }
                 default:
                 {
                     m_tokenStack.push(Token(keyword2->token));
                     setState(Operator);
                     return id;
                 }
             }
 
             Q_ASSERT(false);
 
         }
         case VarName:
         {
             if(peekCurrent() == '$')
                 return tokenAndAdvance(DOLLAR);
 
             setState(Operator);
             return tokenizeNCNameOrQName();
             Q_ASSERT(false);
         }
         case ItemType:
         {
             switch(peekCurrent())
             {
                 case '(':
                     return tokenAndChangeState(LPAREN, KindTest);
                 case '$':
                     return tokenAndChangeState(DOLLAR, VarName);
             }
 
             const Token name(tokenizeNCNameOrQName());
 
             if(name.hasError())
                 return error();
 
             else if(name.type == QNAME)
             {
                 setState(OccurrenceIndicator);
                 return name;
             }
             else
             {
                 const TokenMap *const keyword = lookupKeyword(name.value);
 
                 if(keyword)
                 {
                     pushState(OccurrenceIndicator);
                     return Token(keyword->token);
                 }
                 else
                 {
                     setState(Default);
                     return name;
                 }
             }
             Q_ASSERT(false);
         }
         case KindTest:
         {
             switch(peekCurrent())
             {
                 case ')':
                 {
                     popState();
                     return tokenAndAdvance(RPAREN);
                 }
                 case '(':
                     return tokenAndAdvance(LPAREN);
                 case ',':
                     return tokenAndAdvance(COMMA);
                 case '*':
                     return tokenAndAdvance(STAR);
                 case '?':
                     return tokenAndAdvance(QUESTION);
                 case '\'':
                 /* Fallthrough. */
                 case '"':
                     return tokenizeStringLiteral();
             }
 
             const Token nc(tokenizeNCNameOrQName());
             if(nc.hasError())
                 return nc;
 
             const TokenType ws = consumeWhitespace();
             if(ws == ERROR)
                 return error();
 
             if(peekCurrent() == '(')
             {
                 const TokenMap *const keyword = lookupKeyword(nc.value);
                 if(keyword)
                 {
                     pushState(KindTest);
                     return Token(keyword->token);
                 }
                 else
                     return nc;
             }
             else
                 return nc;
             Q_ASSERT(false);
         }
         case KindTestForPI:
         {
             switch(peekCurrent())
             {
                 case ')':
                 {
                     popState();
                     return tokenAndAdvance(RPAREN);
                 }
                 case '\'':
                 /* Fallthrough. */
                 case '"':
                     return tokenizeStringLiteral();
                 default:
                     return tokenizeNCName();
             }
             Q_ASSERT(false);
         }
         case OccurrenceIndicator:
         {
             switch(peekCurrent())
             {
                 case '?':
                     return tokenAndChangeState(QUESTION, Operator);
                 case '*':
                     return tokenAndChangeState(STAR, Operator);
                 case '+':
                     return tokenAndChangeState(PLUS, Operator);
                 default:
                 {
                     setState(Operator);
                     return nextToken();
                 }
             }
             Q_ASSERT(false);
         }
         case XQueryVersion:
         {
             switch(peekCurrent())
             {
                 case '\'':
                 /* Fallthrough. */
                 case '"':
                     return tokenizeStringLiteral();
                 case ';':
                     return tokenAndChangeState(SEMI_COLON, Default);
             }
 
             const Token id(tokenizeNCName());
 
             if(id.type != NCNAME)
                 return id;
 
             const TokenMap *const keyword = lookupKeyword(id.value);
             if(keyword)
                 return tokenAndChangeState(keyword->token, Default);
             else
                 return id;
             Q_ASSERT(false);
         }
         case StartTag:
         {
             if(peekAhead(-1) == '<')
             {
                 if(current().isSpace())
                     return Token(ERROR);
             }
             else
             {
                 if(consumeRawWhitespace())
                     return Token(END_OF_FILE);
             }
 
             switch(peekCurrent())
             {
                 case '/':
                 {
                     if(peekAhead() == '>')
                     {
                         m_pos += 2;
 
                         if(m_scanOnly)
                             return Token(POSITION_SET);
                         else
                         {
                             popState();
                             return Token(QUICK_TAG_END);
                         }
                     }
                     else
                         return error();
                 }
                 case '>':
                 {
                     if(m_scanOnly)
                         return tokenAndChangeState(POSITION_SET, StartTag);
                     else
                         return tokenAndChangeState(G_GT, ElementContent);
                 }
                 case '=':
                     return tokenAndAdvance(G_EQ);
                 case '\'':
                     return tokenAndChangeState(APOS, AposAttributeContent);
                 case '"':
                     return tokenAndChangeState(QUOTE, QuotAttributeContent);
                 default:
                     return tokenizeNCNameOrQName();
             }
             Q_ASSERT(false);
         }
         /* States AposAttributeContent and QuotAttributeContent: tokenizes the
          * content of an attribute value. The two states share this code and
          * differ only in which character, ' or ", is the separator. Text is
          * accumulated in result until something is hit that must be returned
          * as a token of its own. */
         case AposAttributeContent:
         /* Fallthrough. */
         case QuotAttributeContent:
         {
             /* The separator that matches the state we are in. */
             const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
             QString result;
             result.reserve(20);
 
             /* When only scanning, the work is delegated to attributeAsRaw(). */
             if(m_scanOnly)
             {
                 int stack = 0;
                 return attributeAsRaw(sep, stack, m_pos, true, result);
             }
 
             Q_ASSERT(!m_scanOnly);
             while(true)
             {
                 if(atEnd())
                 {
                     /* In the case that the XSL-T tokenizer invokes us with
                      * default state QuotAttributeContent, we need to be able
                      * to return a single string, in case that is all we have
                      * accumulated. */
                     if(result.isEmpty())
                         return Token(END_OF_FILE);
                     else
                         return Token(STRING_LITERAL, result);
                 }
 
                 const QChar curr(current());
 
                 if(curr == sep)
                 {
                     /* The separator is the very last character of the input. */
                     if(m_pos + 1 == m_length)
                         return Token(END_OF_FILE);
 
                     if(m_data.at(m_pos + 1) == sep)
                     {
                         /* The quoting mechanism was used. */
                         m_pos += 2;
                         result.append(sep);
                         continue;
                     }
 
                     /* A lone separator. It must be followed by whitespace, '/' or '>'. */
                     const QChar next(m_data.at(m_pos + 1));
                     if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
                         return Token(ERROR); // i18n Space must separate attributes
                     else if(result.isEmpty())
                     {
                         /* Nothing is pending, so return the token for the
                          * separator itself, and switch to state StartTag. */
                         return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
                                                    StartTag, 1);
                     }
                     else
                     {
                         /* Don't consume the sep, but leave it so we next time return a token for it. */
                         return Token(STRING_LITERAL, result);
                     }
 
                     /* NOTE: Not reachable, every branch of the if-statement above returns. */
                     ++m_pos;
                     continue;
                 }
                 else if(curr == QLatin1Char('{'))
                 {
                     if(m_pos + 1 == m_length)
                         return Token(END_OF_FILE);
                     else if(peekAhead() == '{')
                     {
                         /* "{{" yields a literal '{'. We step over the first
                          * brace here, and the increment at the end of the loop
                          * steps over the second. */
                         ++m_pos;
                         result.append(QLatin1Char('{'));
                     }
                     else
                     {
                         if(result.isEmpty())
                         {
                             /* The Attribute Value Template appeared directly in the attribute. */
                             pushState();
                             return tokenAndChangeState(CURLY_LBRACE, Default);
                         }
                         else
                         {
                             /* We don't advance, keep '{' as next token. */
                             return Token(STRING_LITERAL, result);
                         }
                     }
                 }
                 else if(curr == QLatin1Char('}'))
                 {
                     if(m_pos + 1 == m_length)
                         return Token(END_OF_FILE);
                     else if(peekAhead() == '}')
                     {
                         /* "}}" yields a literal '}', handled the same way as "{{" above. */
                         ++m_pos;
                         result.append(QLatin1Char('}'));
                     }
                     else
                         return Token(ERROR); /* A single '}' is not accepted here. */
                 }
                 else if(curr == QLatin1Char('&'))
                 {
                     /* A null string from tokenizeCharacterReference() is treated
                      * as an error, anything else is appended to the value. */
                     const QString ret(tokenizeCharacterReference());
                     if(ret.isNull())
                         return Token(ERROR);
                     else
                         result.append(ret);
                 }
                 else if(curr == QLatin1Char('<'))
                     return Token(STRING_LITERAL, result); /* Returned without advancing, even if result is empty. */
                 else
                 {
                     /* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
                      * 3.3.3 Attribute-Value Normalization.
                      *
                      * However, it is complicated a bit by that AVN is defined on top of
                      * EOL normalization and we do those two in one go here. */
                     switch(curr.unicode())
                     {
                         case 0xD:
                         {
                             /* 0xD followed by a newline becomes one space, and
                              * both characters are consumed. */
                             if(peekAhead() == '\n')
                             {
                                 result.append(QLatin1Char(' '));
                                 ++m_pos;
                                 break;
                             }
                             /* Fallthrough. A 0xD on its own is replaced by a
                              * space by the code below. */
                         }
                         case 0xA:
                         /* Fallthrough. */
                         case 0x9:
                         {
                             result.append(QLatin1Char(' '));
                             break;
                         }
                         default:
                             result.append(curr);
                     }
                 }
 
                 /* Step past the character that was just handled. */
                 ++m_pos;
             }
             Q_ASSERT(false);
         }
         case ElementContent:
         {
             QString result;
             result.reserve(20);
 
             /* Whether the text node, result, may be whitespace only. Character references
              * and CDATA sections disables that. */
             bool mayBeWS = true;
 
             CharacterSkips skipEOLNormalization;
 
             while(true)
             {
                 if(atEnd())
                     return Token(END_OF_FILE);
 
                 switch(peekCurrent())
                 {
                     case '<':
                     {
                         if(!result.isEmpty() && peekAhead(2) != '[')
                         {
                             /* We encountered the end, and it was not a CDATA section. */
                             /* We don't advance. Next time we'll handle the <... stuff. */
                             return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
                         }
 
                         ++m_pos;
                         if(atEnd())
                             return Token(END_OF_FILE);
 
                         const QChar ahead(current());
                         if(ahead.isSpace())
                             return error();
                         else if(ahead == QLatin1Char('/'))
                         {
                             if(m_pos + 1 == m_length)
                                 return Token(END_OF_FILE);
                             else if(m_data.at(m_pos + 1).isSpace())
                                 return error();
                             else
                                 return tokenAndChangeState(BEGIN_END_TAG, EndTag);
                         }
                         else if(isNCNameStart(ahead))
                         {
                             pushState();
                             return tokenAndChangeState(G_LT, StartTag, 0);
                         }
                         else if(aheadEquals("!--", 3, 0))
                         {
                             pushState();
                             m_pos += 3;
                             return tokenAndChangeState(COMMENT_START, XMLComment, 0);
                         }
                         else if(aheadEquals("![CDATA[", 8, 0))
                         {
                             mayBeWS = false;
                             m_pos += 8;
                             const int start = m_pos;
                             const int len = scanUntil("]]>");
 
                             if(len == -1)
                                 return Token(END_OF_FILE);
 
                             m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
                             result.append(m_data.mid(start, len));
                             break;
                         }
                         else if(ahead == QLatin1Char('?'))
                         {
                             pushState();
                             return tokenAndChangeState(PI_START, ProcessingInstructionName);
                         }
                         else
                             return Token(G_LT);
                     }
                     case '&':
                     {
                         const QString ret(tokenizeCharacterReference());
                         if(ret.isNull())
                             return Token(ERROR);
                         else
                         {
                             skipEOLNormalization.insert(result.count());
                             result.append(ret);
                             mayBeWS = false;
                             break;
                         }
                     }
                     case '{':
                     {
                         // TODO remove this check, also below.
                         if(m_pos + 1 == m_length)
                             return Token(END_OF_FILE);
                         else if(peekAhead() == '{')
                         {
                             ++m_pos;
                             result.append(QLatin1Char('{'));
                         }
                         else
                         {
                             if(result.isEmpty())
                             {
                                 pushState();
                                 return tokenAndChangeState(CURLY_LBRACE, Default);
                             }
                             else
                             {
                                 /* We don't advance here. */
                                 return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
                             }
                         }
                         break;
                     }
                     case '}':
                     {
                         if(m_pos + 1 == m_length)
                             return Token(END_OF_FILE);
                         else if(peekAhead() == '}')
                         {
                             ++m_pos;
                             result.append(QLatin1Char('}'));
                         }
                         else
                         {
                             /* This is a parse error, and the grammar won't be able
                              * to reduce this CURLY_RBRACE. */
                             return tokenAndChangeState(CURLY_RBRACE, Default);
                         }
                         break;
                     }
                     case '\n':
                     {
                         /* We want to translate \r\n into \n. */
                         if(peekAhead(-1) == '\r')
                             break;
                         /* else, fallthrough. */
                     }
                     case '\r':
                     {
                         result.append(QLatin1Char('\n'));
                         break;
                     }
                     default:
                     {
                         result.append(current());
                         break;
                     }
                 }
                 ++m_pos;
             }
             Q_ASSERT(false);
         }
         case ProcessingInstructionName:
         {
             const int start = m_pos;
 
             while(true)
             {
                 ++m_pos;
                 if(m_pos >= m_length)
                     return Token(END_OF_FILE);
 
                 const QChar next(current());
                 if(next.isSpace() || next == QLatin1Char('?'))
                 {
                     return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
                                                ProcessingInstructionContent);
                 }
             }
             Q_ASSERT(false);
         }
         case ProcessingInstructionContent:
         {
             /* Consume whitespace between the name and the content. */
             if(consumeRawWhitespace())
                 return Token(END_OF_FILE);
 
             const int start = m_pos;
             const int len = scanUntil("?>");
 
             if(len == -1)
                 return Token(END_OF_FILE);
             else
             {
                 m_pos += 2; /* Consume "?>" */
                 popState();
                 return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
             }
             Q_ASSERT(false);
         }
         case EndTag:
         {
             if(consumeRawWhitespace())
                 return END_OF_FILE;
 
             if(peekCurrent() == '>')
             {
                 popState();
                 return tokenAndAdvance(G_GT);
             }
             else
                 return tokenizeNCNameOrQName();
             Q_ASSERT(false);
         }
         case XMLComment:
         {
             const int start = m_pos;
             const int len = scanUntil("--");
 
             if(len == -1)
                 return END_OF_FILE;
             else
             {
                 m_pos += 2; /* Consume "--". */
                 popState();
 
                 if(peekCurrent() == '>')
                 {
                     ++m_pos;
                     return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
                 }
                 else
                     return error();
             }
             Q_ASSERT(false);
         }
         case Pragma:
         {
             /* Consume whitespace. */
             if(consumeRawWhitespace())
                 return Token(END_OF_FILE);
 
             setState(PragmaContent);
             return tokenizeNCNameOrQName();
         }
         case PragmaContent:
         {
             QString result;
             result.reserve(20);
 
             const bool hasWS = m_pos < m_length && current().isSpace();
 
             /* Consume all whitespace up to the pragma content(if any). */
             if(consumeRawWhitespace())
                 return Token(END_OF_FILE);
 
             if(peekCurrent() == '#' && peekAhead() == ')')
             {
                 /* We reached the end, and there's no pragma content. */
                 return tokenAndChangeState(PRAGMA_END, Default, 2);
             }
             else if(!hasWS)
             {
                 /* A separating space is required if there's pragma content. */
                 return error(); /* i18n */
             }
 
             const int start = m_pos;
             const int len = scanUntil("#)");
             if(len == -1)
                 return Token(END_OF_FILE);
 
             return Token(STRING_LITERAL, m_data.mid(start, len));
             Q_ASSERT(false);
         }
     }
 
     Q_ASSERT(false);
     return error();
 }

◆ normalizeEOL()

QString QPatternist::XQueryTokenizer::normalizeEOL	(	const QString &	input,
		const CharacterSkips &	characterSkips
	)

staticprivate

Returns input, normalized according to XQuery 1.0: An XML Query Language, A.2.3 End-of-Line Handling

Definition at line 146 of file qxquerytokenizer.cpp.

Referenced by nextToken(), and tokenizeStringLiteral().

 {
     const int len = input.count();
     QString result;
 
     /* The likely hood is rather high it'll be the same content. */
     result.reserve(len);
 
     for(int i = 0; i < len; ++i)
     {
         const QChar &at = input.at(i);
 
         if(characterSkips.contains(i))
         {
             result.append(at);
             continue;
         }
         switch(input.at(i).unicode())
         {
             case '\r':
             {
                 if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
                     ++i;
 
                 /* Else, fallthrough. */
             }
             case '\n':
             {
                 result.append(QLatin1Char('\n'));
                 continue;
             }
             default:
             {
                 result.append(at);
             }
         }
     }
 
     return result;
 }

◆ peekAhead()

char QPatternist::XQueryTokenizer::peekAhead ( const int length = 1 ) const

inlineprivate

Returns: the character located length characters from the current position, converted to ASCII, or 0 if that position is beyond the end of the input.

Definition at line 317 of file qxquerytokenizer.cpp.

Referenced by attributeAsRaw(), consumeComment(), consumeRawWhitespace(), consumeWhitespace(), nextToken(), peekForColonColon(), and tokenizeNCNameOrQName().

 {
     if(m_pos + length < m_length)
         return m_data.at(m_pos + length).toAscii();
     else
         return 0;
 }

◆ peekCurrent()

char QPatternist::XQueryTokenizer::peekCurrent ( ) const

inlineprivate

Returns the character at the current position, converted to ASCII.

Equivalent to calling:

current().toAscii();

Definition at line 84 of file qxquerytokenizer.cpp.

Referenced by attributeAsRaw(), consumeComment(), consumeRawWhitespace(), consumeWhitespace(), nextToken(), tokenizeCharacterReference(), and tokenizeNCNameOrQName().

 {
     return current().toAscii();
 }

◆ peekForColonColon()

int QPatternist::XQueryTokenizer::peekForColonColon ( ) const

private

hadWhitespace is always set to a proper value.

Returns: the length of whitespace scanned before reaching "::", or -1 if something else was found.

Definition at line 89 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     /* Note, we don't modify m_pos in this function, so we need to do offset
      * calculations. */
     int pos = m_pos;
 
     while(pos < m_length)
     {
         switch(m_data.at(pos).toAscii())
         {
             /* Fallthrough these four. */
             case ' ':
             case '\t':
             case '\n':
             case '\r':
                 break;
             case ':':
             {
                 if(peekAhead((pos - m_pos) + 1) == ':')
                     return pos - m_pos;
                 /* Fallthrough. */
             }
             default:
                 return -1;
         }
         ++pos;
     }
 
     return -1;
 }

◆ popState()

void QPatternist::XQueryTokenizer::popState ( )

inlineprivate

Definition at line 737 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     /* QStack::pop() asserts if it's empty, so we need to check
      * it, since we might receive unbalanced curlies. */
     if(!m_stateStack.isEmpty())
         m_state = m_stateStack.pop();
 }

◆ pushState() [1/2]

void QPatternist::XQueryTokenizer::pushState ( const State state )

inlineprivate

Definition at line 727 of file qxquerytokenizer.cpp.

 {
     /* Saves s on top of the state stack. The current state, m_state, is
      * not changed by this call. */
     m_stateStack.push(s);
 }

◆ pushState() [2/2]

void QPatternist::XQueryTokenizer::pushState ( )

inlineprivate

Same as calling:

pushState(currentState());

Definition at line 732 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     /* Saves the current state on top of the state stack, such that a later
      * popState() can restore it. */
     m_stateStack.push(m_state);
 }

◆ resumeTokenizationFrom()

void QPatternist::XQueryTokenizer::resumeTokenizationFrom ( const int position )

virtual

Resumes regular parsing from position. The tokenizer must be in the scan-only state, which the commenceScanOnly() call transitions to.

The tokenizer will return the token POSITION_SET once after this function has been called.

Implements QPatternist::Tokenizer.

Definition at line 2235 of file qxquerytokenizer.cpp.

 {
     m_scanOnly = false;
     m_pos = pos;
 }

◆ scanUntil()

int QPatternist::XQueryTokenizer::scanUntil ( const char *const content )

private

Advances m_pos until content is encountered.

Returns the length stretching from m_pos when starting, until content is encountered, or -1 if content is not found, in which case m_pos is left unchanged. content is not included in the length.

Definition at line 593 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
 
     if(end == -1)
         return -1;
     else
     {
         const int len = end - m_pos;
         m_pos += len;
         return len;
     }
 }

◆ setParserContext()

void QPatternist::XQueryTokenizer::setParserContext ( const ParserContext::Ptr & parseInfo )

virtual

Does nothing.

Implements QPatternist::Tokenizer.

Definition at line 2241 of file qxquerytokenizer.cpp.

 {

 }

◆ setState()

void QPatternist::XQueryTokenizer::setState ( const State s )

inlineprivate

Definition at line 722 of file qxquerytokenizer.cpp.

Referenced by attributeAsRaw(), nextToken(), tokenAndChangeState(), and tokenizeNumberLiteral().

 {
     /* Replaces the current state with s. The state stack is not touched. */
     m_state = s;
 }

◆ state()

XQueryTokenizer::State QPatternist::XQueryTokenizer::state ( ) const

inlineprivate

Definition at line 717 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     /* Returns the state the tokenizer currently is in. */
     return m_state;
 }

◆ tokenAndAdvance()

Tokenizer::Token QPatternist::XQueryTokenizer::tokenAndAdvance	(	const TokenType	code,
		const int	advance = `1`
	)

inlineprivate

Definition at line 138 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     Q_ASSERT(advance >= 0);
     m_pos += advance;
     return Token(code);
 }

◆ tokenAndChangeState() [1/2]

Tokenizer::Token QPatternist::XQueryTokenizer::tokenAndChangeState	(	const TokenType	code,
		const State	state,
		const int	advance = `1`
	)

inlineprivate

Definition at line 120 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     Q_ASSERT(advance >= 0);
     m_pos += advance;
     setState(s);
     return Token(code);
 }

◆ tokenAndChangeState() [2/2]

Tokenizer::Token QPatternist::XQueryTokenizer::tokenAndChangeState	(	const TokenType	code,
		const QString &	value,
		const State	state
	)

inlineprivate

Definition at line 130 of file qxquerytokenizer.cpp.

 {
     setState(s);
     return Token(code, value);
 }

◆ tokenizeCharacterReference()

QString QPatternist::XQueryTokenizer::tokenizeCharacterReference ( )

private

Definition at line 532 of file qxquerytokenizer.cpp.

Referenced by attributeAsRaw(), nextToken(), and tokenizeStringLiteral().

 {
     Q_ASSERT(peekCurrent() == '&');
 
     const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
 
     if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
         return QString();
 
     QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
     m_pos = theEnd;
 
     const QChar charRef(charForReference(content));
 
     if(!charRef.isNull())
         return charRef;
     else if(content.startsWith(QLatin1Char('#')))
     {
         int base;
 
         /* It is only '#' or '#x'. */
         if(content.length() < 2)
             return QString();
 
         /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
         if(content.at(1) == QLatin1Char('x'))
         {
             base = 16;
             content = content.mid(2); /* Remove "#x". */
         }
         else
         {
             base = 10;
             content = content.mid(1); /* Remove "#". */
         }
 
         bool conversionOK = false;
         const int codepoint = content.toInt(&conversionOK, base);
 
         if(conversionOK)
         {
             const QChar ch(codepoint);
 
             if(ch.isNull())
             {
                 /* We likely have something which require surrogate pairs. */
                 QString result;
                 result += QChar(QChar::highSurrogate(codepoint));
                 result += QChar(QChar::lowSurrogate(codepoint));
                 return result;
             }
             else
                 return ch;
         }
         else
             return QString();
     }
     else
         return QString();
 }

◆ tokenizeNCName()

Tokenizer::Token QPatternist::XQueryTokenizer::tokenizeNCName ( )

inlineprivate

Definition at line 673 of file qxquerytokenizer.cpp.

Referenced by nextToken(), and tokenizeNCNameOrQName().

 {
     const int startPos = m_pos;
 
     if(m_pos < m_length && isNCNameStart(current()))
     {
         ++m_pos;
 
         for(; m_pos < m_length; ++m_pos)
         {
             if(!isNCNameBody(current()))
                 break;
         }
 
         return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
     }
     else
         return error();
 }

◆ tokenizeNCNameOrQName()

Tokenizer::Token QPatternist::XQueryTokenizer::tokenizeNCNameOrQName ( )

inlineprivate

Definition at line 469 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     const int start = m_pos;
 
     const Token t1 = tokenizeNCName();
     if(t1.hasError())
         return t1;
 
     if(peekCurrent() != ':' || peekAhead() == '=')
         return t1;
 
     ++m_pos;
 
     const Token t2 = tokenizeNCName();
     if(t2.hasError())
         return t2;
     else
         return Token(QNAME, m_data.mid(start, m_pos - start));
 }

◆ tokenizeNumberLiteral()

Tokenizer::Token QPatternist::XQueryTokenizer::tokenizeNumberLiteral ( )

inlineprivate

Definition at line 489 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     /* Tokenizes a number literal starting at m_pos.
      *
      * Returns a NUMBER token, or an XPATH2_NUMBER token if an exponent
      * marker was seen, carrying the text from the start position up to
      * where scanning stopped. error() is returned if an NCName start
      * character is met while scanning.
      *
      * The state is set to Operator before anything is scanned, so it also
      * applies when error() is returned. */
     setState(Operator);
     const int startPos = m_pos;
     bool hasDot = false;
     bool isXPath20 = false;

     for(; m_pos < m_length; ++m_pos)
     {
         QChar ch(current());

         /* NOTE(review): only the low byte is compared here, ch.row() isn't
          * consulted. Presumably a character outside Latin-1 whose low byte
          * equals 'e' or 'E' is therefore treated as an exponent marker --
          * confirm whether that is intended. */
         char cell = ch.cell();

         if(cell == 'e' || cell == 'E')
         {
             isXPath20 = true;

             /* Step over the exponent marker and look at what follows. */
             ++m_pos;
             ch = current();

             if(ch.row() != 0)
                 break;

             cell = ch.cell();

             /* A sign directly after the marker is accepted. The loop's
              * ++m_pos steps over it. */
             if(cell == '+' || cell == '-')
                 continue;
         }

         if(isNCNameStart(ch))
             return error();

         if(cell < '0' || cell > '9')
         {
             /* One dot is allowed. A second dot, or any other non-digit,
              * ends the literal. */
             if(cell == '.' && !hasDot)
                 hasDot = true;
             else
                 break;
         }
     }

     return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
 }

◆ tokenizeStringLiteral()

Tokenizer::Token QPatternist::XQueryTokenizer::tokenizeStringLiteral ( )

inlineprivate

Definition at line 623 of file qxquerytokenizer.cpp.

Referenced by nextToken().

 {
     const QChar delimiter(current());
     /* We cannot unfortunately just scan and then do mid(),
      * since we can encounter character references. */
     QString result;
 
     /* This is more likely than QString's default allocation. */
     result.reserve(8);
 
     CharacterSkips skipEOLNormalization;
 
     /* Advance over the initial quote character. */
     ++m_pos;
 
     for(; m_pos < m_length; ++m_pos)
     {
         const QChar c(current());
 
         if(c == QLatin1Char('&'))
         {
             const QString charRef(tokenizeCharacterReference());
 
             if(charRef.isNull())
                 return error();
             else
             {
                 skipEOLNormalization.insert(result.count());
                 result.append(charRef);
             }
 
         }
         else if(c == delimiter)
         {
             /* Maybe the escaping mechanism is used. For instance, "s""s"
              * has the value `s"s'. */
             ++m_pos;
 
             if(current() == delimiter) /* Double quote. */
                 result += delimiter;
             else
                 return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
         }
         else
             result += c;
     }
 
     return error();
 }

Properties

◆ m_charRefs

QHash<QString, QChar> QPatternist::XQueryTokenizer::m_charRefs

private

Definition at line 321 of file qxquerytokenizer_p.h.

Referenced by charForReference().

◆ m_columnOffset

int QPatternist::XQueryTokenizer::m_columnOffset

private

The offset into m_length for where the current column starts. So m_length - m_columnOffset is the current column.

The line number and column number both start at 1.

Definition at line 317 of file qxquerytokenizer_p.h.

Referenced by consumeComment(), consumeRawWhitespace(), consumeWhitespace(), and nextToken().

◆ m_data

const QString QPatternist::XQueryTokenizer::m_data

private

Definition at line 297 of file qxquerytokenizer_p.h.

Referenced by aheadEquals(), current(), nextToken(), peekAhead(), peekForColonColon(), scanUntil(), tokenizeCharacterReference(), tokenizeNCName(), tokenizeNCNameOrQName(), and tokenizeNumberLiteral().

◆ m_length

const int QPatternist::XQueryTokenizer::m_length

private

Definition at line 298 of file qxquerytokenizer_p.h.

Referenced by aheadEquals(), atEnd(), consumeComment(), consumeRawWhitespace(), consumeWhitespace(), current(), nextToken(), peekAhead(), peekForColonColon(), tokenizeNCName(), tokenizeNumberLiteral(), and tokenizeStringLiteral().

◆ m_line

int QPatternist::XQueryTokenizer::m_line

private

The current line number.

The line number and column number both start at 1.

Definition at line 308 of file qxquerytokenizer_p.h.

Referenced by consumeComment(), consumeRawWhitespace(), consumeWhitespace(), and nextToken().

◆ m_namePool

const NamePool::Ptr QPatternist::XQueryTokenizer::m_namePool

private

Definition at line 319 of file qxquerytokenizer_p.h.

◆ m_pos

int QPatternist::XQueryTokenizer::m_pos

private

◆ m_scanOnly

bool QPatternist::XQueryTokenizer::m_scanOnly

private

Definition at line 322 of file qxquerytokenizer_p.h.

Referenced by commenceScanOnly(), nextToken(), and resumeTokenizationFrom().

◆ m_state

State QPatternist::XQueryTokenizer::m_state

private

Definition at line 299 of file qxquerytokenizer_p.h.

Referenced by popState(), pushState(), setState(), and state().

◆ m_stateStack

QStack<State> QPatternist::XQueryTokenizer::m_stateStack

private

Definition at line 300 of file qxquerytokenizer_p.h.

Referenced by popState(), and pushState().

◆ m_tokenStack

QStack<Token> QPatternist::XQueryTokenizer::m_tokenStack

private

Definition at line 320 of file qxquerytokenizer_p.h.

Referenced by nextToken().

The documentation for this class was generated from the following files:

/src/xmlpatterns/parser/qxquerytokenizer_p.h
/src/xmlpatterns/parser/qxquerytokenizer.cpp

Public Types

Public Functions

Private Types

Private Functions

Static Private Functions

Properties

Additional Inherited Members

Detailed Description

Typedefs

◆ CharacterSkips

Enumerations

◆ State

Constructors and Destructors

◆ XQueryTokenizer()

Functions

◆ aheadEquals()

◆ atEnd()

◆ attributeAsRaw()

◆ charForReference()

◆ commenceScanOnly()

◆ consumeComment()

◆ consumeRawWhitespace()

◆ consumeWhitespace()

◆ current()

◆ error()

◆ isDigit()

◆ isNCNameBody()

◆ isNCNameStart()

◆ isOperatorKeyword()

◆ isPhraseKeyword()

◆ isTypeToken()

◆ lookupKeyword()

◆ nextToken() [1/2]

◆ nextToken() [2/2]

◆ normalizeEOL()

◆ peekAhead()

◆ peekCurrent()

◆ peekForColonColon()

◆ popState()

◆ pushState() [1/2]

◆ pushState() [2/2]

◆ resumeTokenizationFrom()

◆ scanUntil()

◆ setParserContext()

◆ setState()

◆ state()

◆ tokenAndAdvance()

◆ tokenAndChangeState() [1/2]

◆ tokenAndChangeState() [2/2]

◆ tokenizeCharacterReference()

◆ tokenizeNCName()

◆ tokenizeNCNameOrQName()

◆ tokenizeNumberLiteral()

◆ tokenizeStringLiteral()

Properties

◆ m_charRefs

◆ m_columnOffset

◆ m_data

◆ m_length

◆ m_line

◆ m_namePool

◆ m_pos

◆ m_scanOnly

◆ m_state

◆ m_stateStack

◆ m_tokenStack