Qt 4.8
qxquerytokenizer.cpp
Go to the documentation of this file.
1 /****************************************************************************
2 **
3 ** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/legal
5 **
6 ** This file is part of the QtXmlPatterns module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and Digia. For licensing terms and
14 ** conditions see http://qt.digia.com/licensing. For further information
15 ** use the contact form at http://qt.digia.com/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 2.1 requirements
23 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
24 **
25 ** In addition, as a special exception, Digia gives you certain additional
26 ** rights. These rights are described in the Digia Qt LGPL Exception
27 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
28 **
29 ** GNU General Public License Usage
30 ** Alternatively, this file may be used under the terms of the GNU
31 ** General Public License version 3.0 as published by the Free Software
32 ** Foundation and appearing in the file LICENSE.GPL included in the
33 ** packaging of this file. Please review the following information to
34 ** ensure the GNU General Public License version 3.0 requirements will be
35 ** met: http://www.gnu.org/copyleft/gpl.html.
36 **
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41 
42 #include <QByteArray>
43 
45 
46 #include "qxquerytokenizer_p.h"
47 
48 #include "qtokenlookup.cpp"
49 
51 
52 namespace QPatternist
53 {
54 
/* Calls consumeWhitespace() and, unless it reports SUCCESS, makes the
 * enclosing function return a Token built from the reported code.
 *
 * Expands to a plain brace block containing a return statement, so it can
 * only be used inside functions that return a Token. */
#define handleWhitespace()                                      \
{                                                               \
    const TokenType whitespaceResult = consumeWhitespace();     \
    if(SUCCESS != whitespaceResult)                             \
        return Token(whitespaceResult);                         \
}
61 
63  const QUrl &location,
64  const State startingState) : Tokenizer(location)
65  , m_data(query)
66  , m_length(query.length())
67  , m_state(startingState)
68  , m_pos(0)
69  , m_line(1)
70  , m_columnOffset(0)
71  , m_scanOnly(false)
72 {
73  Q_ASSERT(location.isValid() || location.isEmpty());
74 }
75 
77 {
78  if(m_pos < m_length)
79  return m_data.at(m_pos);
80  else
81  return QChar();
82 }
83 
85 {
86  return current().toAscii();
87 }
88 
90 {
91  /* Note, we don't modify m_pos in this function, so we need to do offset
92  * calculations. */
93  int pos = m_pos;
94 
95  while(pos < m_length)
96  {
97  switch(m_data.at(pos).toAscii())
98  {
99  /* Fallthrough these four. */
100  case ' ':
101  case '\t':
102  case '\n':
103  case '\r':
104  break;
105  case ':':
106  {
107  if(peekAhead((pos - m_pos) + 1) == ':')
108  return pos - m_pos;
109  /* Fallthrough. */
110  }
111  default:
112  return -1;
113  }
114  ++pos;
115  }
116 
117  return -1;
118 }
119 
121  const State s,
122  const int advance)
123 {
124  Q_ASSERT(advance >= 0);
125  m_pos += advance;
126  setState(s);
127  return Token(code);
128 }
129 
131  const QString &value,
132  const State s)
133 {
134  setState(s);
135  return Token(code, value);
136 }
137 
139  const int advance)
140 {
141  Q_ASSERT(advance >= 0);
142  m_pos += advance;
143  return Token(code);
144 }
145 
147  const CharacterSkips &characterSkips)
148 {
149  const int len = input.count();
150  QString result;
151 
152  /* The likelihood is rather high that it'll be the same content. */
153  result.reserve(len);
154 
155  for(int i = 0; i < len; ++i)
156  {
157  const QChar &at = input.at(i);
158 
159  if(characterSkips.contains(i))
160  {
161  result.append(at);
162  continue;
163  }
164  switch(input.at(i).unicode())
165  {
166  case '\r':
167  {
168  if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
169  ++i;
170 
171  /* Else, fallthrough. */
172  }
173  case '\n':
174  {
175  result.append(QLatin1Char('\n'));
176  continue;
177  }
178  default:
179  {
180  result.append(at);
181  }
182  }
183  }
184 
185  return result;
186 }
187 
189 {
190  /* Below, we return ERROR instead of END_OF_FILE such that the parser
191  * sees an invalid comment. */
192  while(m_pos < m_length)
193  {
194  switch(peekCurrent())
195  {
196  case ':':
197  {
198  ++m_pos; /* Consume ':' */
199  if(atEnd())
200  return ERROR;
201 
202  if(peekCurrent() == ')')
203  {
204  ++m_pos; /* Consume ')' */
205  return SUCCESS; /* The comment closed nicely. */
206  }
207  continue; /* We don't want to increment m_pos twice. */
208  }
209  case '(':
210  { /* It looks like the start of a comment. */
211  ++m_pos;
212 
213  if(atEnd())
214  return END_OF_FILE;
215  else if(peekCurrent() == ':')
216  {
217  /* And it is a nested comment -- parse it. */
218  const TokenType retval = consumeComment();
219  if(retval == SUCCESS)
220  continue; /* Continue with our "own" comment. */
221  else
222  return retval; /* Return the error in the nested comment. */
223  }
224  break;
225  }
226  case '\n':
227  /* Fallthrough. */
228  case '\r':
229  {
230  /* We want to count \r\n as a single line break. */
231  if(peekAhead() == '\n')
232  ++m_pos;
233 
235  ++m_line;
236 
237  break;
238  }
239  }
240  ++m_pos;
241  }
242 
243  return ERROR; /* Error: we reached the end while inside a comment. */
244 }
245 
247 {
248  while(m_pos < m_length)
249  {
250  switch(peekCurrent())
251  {
252  case ' ':
253  case '\t':
254  break;
255  case '\n':
256  case '\r':
257  {
258  if(peekAhead() == '\n')
259  ++m_pos;
260 
262  ++m_line;
263 
264  break;
265  }
266  default:
267  return false;
268  }
269  ++m_pos;
270  }
271  return true;
272 }
273 
275 {
276  while(m_pos < m_length)
277  {
278  switch(peekCurrent())
279  {
280  case ' ':
281  case '\t':
282  break;
283  case '\n':
284  case '\r':
285  {
286  /* We want to count \r\n as a single line break. */
287  if(peekAhead() == '\n')
288  ++m_pos;
289 
291  ++m_line;
292 
293  break;
294  }
295  case '(':
296  {
297  if(peekAhead() == ':')
298  {
299  m_pos += 2; /* Consume "(:" */
300 
301  const TokenType comment = consumeComment();
302  if(comment == SUCCESS)
303  continue;
304  else
305  return comment;
306  }
307  }
308  default:
309  return SUCCESS;
310  }
311  ++m_pos;
312  }
313 
314  return END_OF_FILE;
315 }
316 
317 char XQueryTokenizer::peekAhead(const int length) const
318 {
319  if(m_pos + length < m_length)
320  return m_data.at(m_pos + length).toAscii();
321  else
322  return 0;
323 }
324 
326 {
327  return Token(ERROR);
328 }
329 
330 bool XQueryTokenizer::isDigit(const char ch)
331 {
332  return ch >= '0' && ch <= '9';
333 }
334 
335 /* TODO: Replace this with the equivalent function in QXmlUtils, and write test cases for it. */
337 {
338  if(ch == QLatin1Char('_'))
339  return true;
340 
341  switch(ch.category())
342  {
345  case QChar::Letter_Other:
348  return true;
349  default:
350  return false;
351  }
352 }
353 
355 {
356  switch(ch.unicode())
357  {
358  case '.':
359  case '_':
360  case '-':
361  return true;
362  }
363 
364  switch(ch.category())
365  {
368  case QChar::Letter_Other:
376  return true;
377  default:
378  return false;
379  }
380 }
381 
383 {
384  switch(code)
385  {
386  /* Fallthrough all these. */
387  case CASTABLE:
388  case CAST:
389  case COPY_NAMESPACES:
390  case DECLARE:
391  case EMPTY:
392  case MODULE:
393  case IMPORT:
394  case INSTANCE:
395  case ORDER:
396  case ORDERING:
397  case XQUERY:
398  case STABLE:
399  case TREAT:
400  return true;
401  default:
402  return false;
403  }
404 }
405 
407 {
408  switch(code)
409  {
410  /* Fallthrough all these. */
411  case AS:
412  case ASCENDING:
413  case AT:
414  case CASE:
415  case CAST:
416  case CASTABLE:
417  case EQ:
418  case EXTERNAL:
419  case GE:
420  case G_EQ:
421  case G_GT:
422  case G_LT:
423  case G_NE:
424  case GT:
425  case IN:
426  case INHERIT:
427  case INSTANCE:
428  case IS:
429  case ITEM:
430  case LE:
431  case LT:
432  case NE:
433  case NO_INHERIT:
434  case NO_PRESERVE:
435  case OF:
436  case PRESERVE:
437  case RETURN:
438  case STABLE:
439  case TO:
440  case TREAT:
441  return true;
442  default:
443  return false;
444  };
445 }
446 
448 {
449  switch(t)
450  {
451  /* Fallthrough all these. */
452  case ATTRIBUTE:
453  case COMMENT:
454  case DOCUMENT:
455  case DOCUMENT_NODE:
456  case ELEMENT:
457  case ITEM:
458  case NODE:
460  case SCHEMA_ATTRIBUTE:
461  case SCHEMA_ELEMENT:
462  case TEXT:
463  return true;
464  default:
465  return false;
466  }
467 }
468 
470 {
471  const int start = m_pos;
472 
473  const Token t1 = tokenizeNCName();
474  if(t1.hasError())
475  return t1;
476 
477  if(peekCurrent() != ':' || peekAhead() == '=')
478  return t1;
479 
480  ++m_pos;
481 
482  const Token t2 = tokenizeNCName();
483  if(t2.hasError())
484  return t2;
485  else
486  return Token(QNAME, m_data.mid(start, m_pos - start));
487 }
488 
490 {
492  const int startPos = m_pos;
493  bool hasDot = false;
494  bool isXPath20 = false;
495 
496  for(; m_pos < m_length; ++m_pos)
497  {
498  QChar ch(current());
499 
500  char cell = ch.cell();
501 
502  if(cell == 'e' || cell == 'E')
503  {
504  isXPath20 = true;
505  ++m_pos;
506  ch = current();
507 
508  if(ch.row() != 0)
509  break;
510 
511  cell = ch.cell();
512 
513  if(cell == '+' || cell == '-')
514  continue;
515  }
516 
517  if(isNCNameStart(ch))
518  return error();
519 
520  if(cell < '0' || cell > '9')
521  {
522  if(cell == '.' && !hasDot)
523  hasDot = true;
524  else
525  break;
526  }
527  }
528 
529  return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
530 }
531 
533 {
534  Q_ASSERT(peekCurrent() == '&');
535 
536  const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
537 
538  if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
539  return QString();
540 
541  QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
542  m_pos = theEnd;
543 
544  const QChar charRef(charForReference(content));
545 
546  if(!charRef.isNull())
547  return charRef;
548  else if(content.startsWith(QLatin1Char('#')))
549  {
550  int base;
551 
552  /* It is only '#' or '#x'. */
553  if(content.length() < 2)
554  return QString();
555 
556  /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
557  if(content.at(1) == QLatin1Char('x'))
558  {
559  base = 16;
560  content = content.mid(2); /* Remove "#x". */
561  }
562  else
563  {
564  base = 10;
565  content = content.mid(1); /* Remove "#". */
566  }
567 
568  bool conversionOK = false;
569  const int codepoint = content.toInt(&conversionOK, base);
570 
571  if(conversionOK)
572  {
573  const QChar ch(codepoint);
574 
575  if(ch.isNull())
576  {
577  /* We likely have something which requires a surrogate pair. */
578  QString result;
579  result += QChar(QChar::highSurrogate(codepoint));
580  result += QChar(QChar::lowSurrogate(codepoint));
581  return result;
582  }
583  else
584  return ch;
585  }
586  else
587  return QString();
588  }
589  else
590  return QString();
591 }
592 
593 int XQueryTokenizer::scanUntil(const char *const content)
594 {
595  const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
596 
597  if(end == -1)
598  return -1;
599  else
600  {
601  const int len = end - m_pos;
602  m_pos += len;
603  return len;
604  }
605 }
606 
608 {
609  if(m_charRefs.isEmpty())
610  {
611  /* Initialize. */
612  m_charRefs.reserve(5);
617  m_charRefs.insert(QLatin1String("apos"), QLatin1Char('\''));
618  }
619 
620  return m_charRefs.value(reference);
621 }
622 
624 {
625  const QChar delimiter(current());
626  /* We cannot unfortunately just scan and then do mid(),
627  * since we can encounter character references. */
628  QString result;
629 
630  /* This is more likely than QString's default allocation. */
631  result.reserve(8);
632 
633  CharacterSkips skipEOLNormalization;
634 
635  /* Advance over the initial quote character. */
636  ++m_pos;
637 
638  for(; m_pos < m_length; ++m_pos)
639  {
640  const QChar c(current());
641 
642  if(c == QLatin1Char('&'))
643  {
644  const QString charRef(tokenizeCharacterReference());
645 
646  if(charRef.isNull())
647  return error();
648  else
649  {
650  skipEOLNormalization.insert(result.count());
651  result.append(charRef);
652  }
653 
654  }
655  else if(c == delimiter)
656  {
657  /* Maybe the escaping mechanism is used. For instance, "s""s"
658  * has the value `s"s'. */
659  ++m_pos;
660 
661  if(current() == delimiter) /* Double quote. */
662  result += delimiter;
663  else
664  return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
665  }
666  else
667  result += c;
668  }
669 
670  return error();
671 }
672 
674 {
675  const int startPos = m_pos;
676 
677  if(m_pos < m_length && isNCNameStart(current()))
678  {
679  ++m_pos;
680 
681  for(; m_pos < m_length; ++m_pos)
682  {
683  if(!isNCNameBody(current()))
684  break;
685  }
686 
687  return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
688  }
689  else
690  return error();
691 }
692 
693 bool XQueryTokenizer::aheadEquals(const char *const chs,
694  const int len,
695  const int offset) const
696 {
697  Q_ASSERT(len > 0);
698  Q_ASSERT(qstrlen(chs) == uint(len));
699 
700  if(m_pos + len >= m_length)
701  return false;
702 
703  for(int i = offset; i < (len + offset); ++i)
704  {
705  if(m_data.at(m_pos + i).toAscii() != chs[i - offset])
706  return false;
707  }
708 
709  return true;
710 }
711 
713 {
714  return TokenLookup::value(keyword.toAscii().constData(), keyword.length());
715 }
716 
718 {
719  return m_state;
720 }
721 
723 {
724  m_state = s;
725 }
726 
728 {
729  m_stateStack.push(s);
730 }
731 
733 {
735 }
736 
738 {
739  /* QStack::pop() asserts if it's empty, so we need to check
740  * it, since we might receive unbalanced curlies. */
741  if(!m_stateStack.isEmpty())
743 }
744 
746 {
747  switch(state())
748  {
749  /* We want to skip or do special whitespace handling for these
750  * states. So fallthrough all of the following. */
752  case Axis:
753  case ElementContent:
754  case EndTag:
755  case Pragma:
756  case PragmaContent:
759  case StartTag:
760  case XMLComment:
761  break;
762  default:
764  }
765 
766  switch(state())
767  {
768  case XMLSpaceDecl:
769  /* Fallthrough. */
770  case NamespaceKeyword:
771  {
772  switch(peekCurrent())
773  {
774  case ',':
775  return tokenAndAdvance(COMMA);
776  case '"':
777  /* Fallthrough. */
778  case '\'':
779  {
781  return tokenizeStringLiteral();
782  }
783  }
784 
785  const Token id(tokenizeNCName());
786 
787  if(id.type != NCNAME)
788  return id;
789 
790  const TokenMap *const keyword = lookupKeyword(id.value);
791  if(keyword)
792  {
793  switch(keyword->token)
794  {
795  case INHERIT:
796  /* Fallthrough. */
797  case NO_INHERIT:
798  {
799  setState(Default);
800  break;
801  }
802  case NAMESPACE:
803  {
805  break;
806  }
807  case ORDERED:
808  /* Fallthrough. */
809  case UNORDERED:
810  /* Fallthrough. */
811  case STRIP:
812  {
813  setState(Default);
814  break;
815  }
816  case PRESERVE:
817  {
818  if(state() != NamespaceKeyword)
819  setState(Default);
820  }
821  default:
822  break;
823  }
824 
825  return Token(keyword->token);
826  }
827  else
828  return id;
829 
830  Q_ASSERT(false);
831  }
832  case NamespaceDecl:
833  {
834  switch(peekCurrent())
835  {
836  case '=':
837  return tokenAndAdvance(G_EQ);
838  case ';':
840  case '\'':
841  /* Fallthrough. */
842  case '\"':
843  return tokenizeStringLiteral();
844  }
845 
846  const Token nc(tokenizeNCName());
847 
849 
850  const char pc = peekCurrent();
851  const TokenMap* const t = lookupKeyword(nc.value);
852 
853  if(pc == '\'' || (pc == '"' && t))
854  return tokenAndChangeState(t->token, Default, 0);
855  else
856  return nc;
857 
858  Q_ASSERT(false);
859  }
860  case Axis:
861  {
862  if(peekCurrent() == ':')
863  {
864  Q_ASSERT(peekAhead() == ':');
865  m_pos += 2;
867  return Token(COLONCOLON);
868  }
869  /* Fallthrough. */
870  }
871  case AfterAxisSeparator:
872  /* Fallthrough. */
873  case Default:
874  /* State Operator and state Default have a lot of tokens in common except
875  * for minor differences. So we treat them the same way, and sprinkle logic
876  * here and there to handle the small differences. */
877  /* Fallthrough. */
878  case Operator:
879  {
880  switch(peekCurrent())
881  {
882  case '=':
884  case '-':
886  case '+':
888  case '[':
890  case ']':
892  case ',':
894  case ';':
896  case '$':
898  case '|':
900  case '?':
902  case ')':
904  case '@':
906  /* Fallthrough all these. */
907  case '1':
908  case '2':
909  case '3':
910  case '4':
911  case '5':
912  case '6':
913  case '7':
914  case '8':
915  case '9':
916  case '0':
917  return tokenizeNumberLiteral();
918  case '.':
919  {
920  const char next = peekAhead();
921  if(next == '.')
922  return tokenAndChangeState(DOTDOT, Operator, 2);
923  /* .5 is allowed, as short form for 0.5:
924  * <tt>[142] DecimalLiteral ::= ("." Digits) | (Digits "." [0-9]*)</tt>
925  */
926  else if(isDigit(next))
927  return tokenizeNumberLiteral();
928  else
930  }
931  case '\'':
932  /* Fallthrough. */
933  case '"':
934  {
936  return tokenizeStringLiteral();
937 
938  }
939  case '(':
940  {
941  if(peekAhead() == '#')
943  else
945  }
946  case '*':
947  {
948  if(peekAhead() == ':')
949  {
950  m_pos += 2; /* Consume *:. */
951  const Token nc = tokenizeNCName();
952 
953  if(nc.hasError())
954  return error();
955  else
957  }
958  else
960  }
961  case ':':
962  {
963  switch(peekAhead())
964  {
965  case '=':
966  return tokenAndChangeState(ASSIGN, Default, 2);
967  case ':':
969  default:
970  return error();
971  }
972  }
973  case '!':
974  {
975  if(peekAhead() == '=')
976  return tokenAndChangeState(G_NE, Default, 2);
977  else
978  return error();
979  }
980  case '<':
981  {
982  switch(peekAhead())
983  {
984  case '=':
985  return tokenAndChangeState(G_LE, Default, 2);
986  case '<':
988  case '?':
989  {
992  }
993  case '!':
994  {
995  if(aheadEquals("!--", 3))
996  {
997  m_pos += 3; /* Consume "!--". */
1000  }
1001  /* Fallthrough. It's a syntax error, and this is a good way to report it. */
1002  }
1003  default:
1004  {
1005  if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
1006  {
1007  /* We assume it's an element constructor. */
1009  }
1010 
1012  }
1013  }
1014  }
1015  case '>':
1016  {
1017  switch(peekAhead())
1018  {
1019  case '=':
1020  return tokenAndChangeState(G_GE, Default, 2);
1021  case '>':
1022  return tokenAndChangeState(FOLLOWS, Default, 2);
1023  default:
1024  return tokenAndChangeState(G_GT, Default);
1025  }
1026  }
1027  case '/':
1028  {
1029  if(peekAhead() == '/')
1031  else
1033  }
1034  case '{':
1035  {
1038  }
1039  case '}':
1040  {
1041  popState();
1042 
1043  return tokenAndAdvance(CURLY_RBRACE);
1044  }
1045  }
1046 
1047  /* Ok. We're in state Default or Operator, and it wasn't a simple
1048  * character. */
1049 
1050  const Token id(tokenizeNCName());
1051 
1052  if(id.type != NCNAME)
1053  return id;
1054 
1055  const TokenMap *const keyword = lookupKeyword(id.value);
1056 
1057  if(state() == Operator)
1058  {
1059  if(keyword)
1060  {
1061  if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING)
1062  setState(Operator);
1063  else if(keyword->token == RETURN)
1064  setState(Default);
1065  else if(isPhraseKeyword(keyword->token))
1066  {
1067  const TokenType ws = consumeWhitespace();
1068  if(ws == ERROR)
1069  return error();
1070 
1071  const Token id2(tokenizeNCName());
1072  const TokenMap *const keyword2 = lookupKeyword(id2.value);
1073 
1074  if(keyword2)
1075  {
1076  if(keyword->token == TREAT && keyword2->token == AS)
1077  setState(ItemType);
1078  else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY)
1079  setState(Default);
1080 
1081  m_tokenStack.push(Token(keyword2->token));
1082  }
1083  else
1084  m_tokenStack.push(id2);
1085 
1086  return Token(keyword->token);
1087  }
1088  else
1089  {
1090  /* Such that we tokenize the second token in "empty greatest". */
1091  if(keyword->token != EMPTY)
1092  setState(Default);
1093  }
1094 
1095  if(keyword->token == AS || keyword->token == CASE)
1096  setState(ItemType);
1097 
1098  return Token(keyword->token);
1099  }
1100  else
1101  return id;
1102  }
1103 
1104  Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
1105 
1106  /*
1107  * This is hard. Consider this:
1108  *
1109  * Valid: child ::nameTest
1110  * Valid: child:: nameTest
1111  * Syntax Error: child :localName
1112  * Syntax Error: child: localName
1113  *
1114  * Consider "child ::name". Right now, we're here:
1115  * ^
1116  * We don't know whether "child" is a prefix and hence the whitespace is invalid,
1117  * or whether it's an axis and hence skippable. */
1118  {
1119  const int wsLength = peekForColonColon();
1120  /* We cannot call handleWhitespace() because it returns on
1121  * END_OF_FILE, and we have parsed up keyword, and we need to
1122  * deal with that.
1123  *
1124  * If we have a colon colon, which means the whitespace is
1125  * allowed, we skip it. */
1126  if(wsLength != -1)
1127  m_pos += wsLength;
1128  }
1129 
1130  /* Handle name tests. */
1131  if(peekCurrent() == ':')
1132  {
1133  switch(peekAhead())
1134  {
1135  case '=':
1136  return id;
1137  case '*':
1138  {
1139  m_pos += 2;
1140  return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
1141  }
1142  case ':':
1143  {
1144  /* We have an axis. */
1145  setState(Axis);
1146  return keyword ? Token(keyword->token) : id;
1147  }
1148  default:
1149  {
1150  /* It's a QName. */
1151  ++m_pos; /* Consume the colon. */
1152 
1153  const Token id2(tokenizeNCName());
1154 
1155  if(id2.type != NCNAME)
1156  {
1157  --m_pos;
1158  return id;
1159  }
1160 
1161  setState(Operator);
1162  const int qNameLen = id.value.length() + id2.value.length() + 1;
1163  return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
1164  }
1165  }
1166  }
1167 
1168  if(!keyword || isOperatorKeyword(keyword->token))
1169  {
1170  setState(Operator);
1171  return id;
1172  }
1173 
1174  const TokenType ws = consumeWhitespace();
1175  if(ws == ERROR) // TODO this should test for success. Write test.
1176  return Token(ERROR);
1177 
1178  if(atEnd())
1179  {
1180  setState(Operator);
1181  return id;
1182  }
1183 
1184  /* Let the if-body apply for constructors, and node type tests. */
1185  if(isTypeToken(keyword->token) ||
1186  keyword->token == TYPESWITCH ||
1187  keyword->token == ORDERED ||
1188  keyword->token == UNORDERED ||
1189  keyword->token == IF)
1190  {
1191  switch(peekCurrent())
1192  {
1193  case '(':
1194  {
1195  // TODO See if we can remove DOCUMENT from isTypeToken.
1196  if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
1197  {
1199  ++m_pos; /* Consume '('. */
1201 
1202  if(keyword->token == PROCESSING_INSTRUCTION)
1204  else
1205  setState(KindTest);
1206 
1207  return Token(keyword->token);
1208  }
1209  else if(keyword->token == TYPESWITCH || keyword->token == IF)
1210  return Token(keyword->token);
1211  else /* It's a function call. */
1212  return id;
1213  }
1214  case '{':
1215  {
1217  ++m_pos; /* Consume '{'. */
1219  /* Stay in state Default. */
1220  return Token(keyword->token);
1221  }
1222  default:
1223  {
1224  /* We have read in a token which is for instance
1225  * "return", and now it can be an element
1226  * test("element") a node kind test("element()"), or a
1227  * computed element constructor("element name {...").
1228  * We need to do a two-token lookahead here, because
1229  * "element return" can be an element test followed by
1230  * the return keyword, but it can also be an element
1231  * constructor("element return {"). */
1232  if(isNCNameStart(current()))
1233  {
1234  const int currentPos = m_pos;
1235  const Token token2 = tokenizeNCNameOrQName();
1236 
1237  if(token2.hasError())
1238  return token2;
1239 
1240  handleWhitespace();
1241 
1242  if(peekCurrent() == '{')
1243  {
1244  /* An element constructor. */
1245  m_tokenStack.push(token2);
1246  return Token(keyword->token);
1247  }
1248 
1249  /* We jump back in the stream, we need to tokenize token2 according
1250  * to the state. */
1251  m_pos = currentPos;
1252  setState(Operator);
1253  return Token(NCNAME, QLatin1String(keyword->name));
1254  }
1255  }
1256  }
1257  }
1258 
1259  if(peekCurrent() == '$')
1260  {
1261  setState(VarName);
1262  return Token(keyword->token);
1263  }
1264 
1265  /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
1266  if(peekCurrent() == '(')
1267  return id;
1268  else if(peekCurrent() == '{' && keyword->token == VALIDATE)
1269  return Token(keyword->token);
1270 
1271  if(!isNCNameStart(current()))
1272  {
1273  setState(Operator);
1274  return id;
1275  }
1276 
1277  const Token id2(tokenizeNCName());
1278  const TokenMap *const keyword2 = lookupKeyword(id2.value);
1279 
1280  if(!keyword2)
1281  {
1282  /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
1283  setState(Operator);
1284  return id;
1285  }
1286 
1287  switch(keyword->token)
1288  {
1289  case DECLARE:
1290  {
1291  switch(keyword2->token)
1292  {
1293  case VARIABLE:
1294  /* Fallthrough. */
1295  case FUNCTION:
1296  {
1297  m_tokenStack.push(Token(keyword2->token));
1298  setState(Default);
1299  return Token(keyword->token);
1300  }
1301  case OPTION:
1302  {
1303  m_tokenStack.push(Token(keyword2->token));
1304  setState(Default);
1305  return Token(keyword->token);
1306  }
1307  case COPY_NAMESPACES:
1308  /* Fallthrough. */
1309  case ORDERING:
1310  {
1311  m_tokenStack.push(Token(keyword2->token));
1313  return Token(keyword->token);
1314  }
1315  case CONSTRUCTION:
1316  {
1317  // TODO identical to CONSTRUCTION?
1318  m_tokenStack.push(Token(keyword2->token));
1319  setState(Operator);
1320  return Token(keyword->token);
1321  }
1322  case NAMESPACE:
1323  /* Fallthrough. */
1324  case BASEURI:
1325  {
1326  m_tokenStack.push(Token(keyword2->token));
1328  return Token(keyword->token);
1329  }
1330  case BOUNDARY_SPACE:
1331  {
1332  m_tokenStack.push(Token(keyword2->token));
1334  return Token(keyword->token);
1335  }
1336  case DEFAULT:
1337  {
1338  m_tokenStack.push(Token(keyword2->token));
1339 
1340  const TokenType ws2 = consumeWhitespace();
1341  if(ws2 != SUCCESS)
1342  {
1343  m_tokenStack.prepend(Token(ws2));
1344  return Token(keyword->token);
1345  }
1346 
1347  const Token id3(tokenizeNCName());
1348 
1349  if(id3.type != NCNAME)
1350  {
1351  m_tokenStack.prepend(id3);
1352  return Token(keyword->token);
1353  }
1354 
1355  const TokenMap *const keyword3 = lookupKeyword(id3.value);
1356  if(!keyword3)
1357  {
1358  m_tokenStack.prepend(id3);
1359  return Token(keyword->token);
1360  }
1361  else
1362  {
1363  m_tokenStack.prepend(Token(keyword3->token));
1364 
1365  if(keyword3->token == ORDER)
1366  setState(Operator);
1367  else
1369  }
1370 
1371  return Token(keyword->token);
1372  }
1373  default:
1374  {
1375  m_tokenStack.push(Token(keyword2->token));
1376  setState(Default);
1377  return id;
1378  }
1379  }
1380  }
1381  case XQUERY:
1382  {
1383  m_tokenStack.push(Token(keyword2->token));
1384 
1385  if(keyword2->token == VERSION)
1386  {
1388  return Token(keyword->token);
1389  }
1390  else
1391  {
1392  setState(Operator);
1393  return id;
1394  }
1395  }
1396  case IMPORT:
1397  {
1398  m_tokenStack.push(Token(keyword2->token));
1399 
1400  switch(keyword2->token)
1401  {
1402  case SCHEMA:
1403  /* Fallthrough. */
1404  case MODULE:
1405  {
1407  return Token(keyword->token);
1408  }
1409  default:
1410  {
1411  setState(Operator);
1412  return id;
1413  }
1414  }
1415  }
1416  case VALIDATE:
1417  {
1418  m_tokenStack.push(Token(keyword2->token));
1419 
1420  switch(keyword2->token)
1421  {
1422  case LAX:
1423  case STRICT:
1424  {
1426  return Token(keyword->token);
1427  }
1428  default:
1429  {
1430  setState(Operator);
1431  return id;
1432  }
1433  }
1434  }
1435  default:
1436  {
1437  m_tokenStack.push(Token(keyword2->token));
1438  setState(Operator);
1439  return id;
1440  }
1441  }
1442 
1443  Q_ASSERT(false);
1444 
1445  }
1446  case VarName:
1447  {
1448  if(peekCurrent() == '$')
1449  return tokenAndAdvance(DOLLAR);
1450 
1451  setState(Operator);
1452  return tokenizeNCNameOrQName();
1453  Q_ASSERT(false);
1454  }
1455  case ItemType:
1456  {
1457  switch(peekCurrent())
1458  {
1459  case '(':
1461  case '$':
1463  }
1464 
1465  const Token name(tokenizeNCNameOrQName());
1466 
1467  if(name.hasError())
1468  return error();
1469 
1470  else if(name.type == QNAME)
1471  {
1473  return name;
1474  }
1475  else
1476  {
1477  const TokenMap *const keyword = lookupKeyword(name.value);
1478 
1479  if(keyword)
1480  {
1482  return Token(keyword->token);
1483  }
1484  else
1485  {
1486  setState(Default);
1487  return name;
1488  }
1489  }
1490  Q_ASSERT(false);
1491  }
1492  case KindTest:
1493  {
1494  switch(peekCurrent())
1495  {
1496  case ')':
1497  {
1498  popState();
1499  return tokenAndAdvance(RPAREN);
1500  }
1501  case '(':
1502  return tokenAndAdvance(LPAREN);
1503  case ',':
1504  return tokenAndAdvance(COMMA);
1505  case '*':
1506  return tokenAndAdvance(STAR);
1507  case '?':
1508  return tokenAndAdvance(QUESTION);
1509  case '\'':
1510  /* Fallthrough. */
1511  case '"':
1512  return tokenizeStringLiteral();
1513  }
1514 
1515  const Token nc(tokenizeNCNameOrQName());
1516  if(nc.hasError())
1517  return nc;
1518 
1519  const TokenType ws = consumeWhitespace();
1520  if(ws == ERROR)
1521  return error();
1522 
1523  if(peekCurrent() == '(')
1524  {
1525  const TokenMap *const keyword = lookupKeyword(nc.value);
1526  if(keyword)
1527  {
1529  return Token(keyword->token);
1530  }
1531  else
1532  return nc;
1533  }
1534  else
1535  return nc;
1536  Q_ASSERT(false);
1537  }
1538  case KindTestForPI:
1539  {
1540  switch(peekCurrent())
1541  {
1542  case ')':
1543  {
1544  popState();
1545  return tokenAndAdvance(RPAREN);
1546  }
1547  case '\'':
1548  /* Fallthrough. */
1549  case '"':
1550  return tokenizeStringLiteral();
1551  default:
1552  return tokenizeNCName();
1553  }
1554  Q_ASSERT(false);
1555  }
1556  case OccurrenceIndicator:
1557  {
1558  switch(peekCurrent())
1559  {
1560  case '?':
1562  case '*':
1564  case '+':
1566  default:
1567  {
1568  setState(Operator);
1569  return nextToken();
1570  }
1571  }
1572  Q_ASSERT(false);
1573  }
1574  case XQueryVersion:
1575  {
1576  switch(peekCurrent())
1577  {
1578  case '\'':
1579  /* Fallthrough. */
1580  case '"':
1581  return tokenizeStringLiteral();
1582  case ';':
1584  }
1585 
1586  const Token id(tokenizeNCName());
1587 
1588  if(id.type != NCNAME)
1589  return id;
1590 
1591  const TokenMap *const keyword = lookupKeyword(id.value);
1592  if(keyword)
1593  return tokenAndChangeState(keyword->token, Default);
1594  else
1595  return id;
1596  Q_ASSERT(false);
1597  }
1598  case StartTag:
1599  {
1600  if(peekAhead(-1) == '<')
1601  {
1602  if(current().isSpace())
1603  return Token(ERROR);
1604  }
1605  else
1606  {
1607  if(consumeRawWhitespace())
1608  return Token(END_OF_FILE);
1609  }
1610 
1611  switch(peekCurrent())
1612  {
1613  case '/':
1614  {
1615  if(peekAhead() == '>')
1616  {
1617  m_pos += 2;
1618 
1619  if(m_scanOnly)
1620  return Token(POSITION_SET);
1621  else
1622  {
1623  popState();
1624  return Token(QUICK_TAG_END);
1625  }
1626  }
1627  else
1628  return error();
1629  }
1630  case '>':
1631  {
1632  if(m_scanOnly)
1634  else
1636  }
1637  case '=':
1638  return tokenAndAdvance(G_EQ);
1639  case '\'':
1641  case '"':
1643  default:
1644  return tokenizeNCNameOrQName();
1645  }
1646  Q_ASSERT(false);
1647  }
1648  case AposAttributeContent:
1649  /* Fallthrough. */
1650  case QuotAttributeContent:
1651  {
1652  const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
1653  QString result;
1654  result.reserve(20);
1655 
1656  if(m_scanOnly)
1657  {
1658  int stack = 0;
1659  return attributeAsRaw(sep, stack, m_pos, true, result);
1660  }
1661 
1662  Q_ASSERT(!m_scanOnly);
1663  while(true)
1664  {
1665  if(atEnd())
1666  {
1667  /* In the case that the XSL-T tokenizer invokes us with
1668  * default state QuotAttributeContent, we need to be able
1669  * to return a single string, in case that is all we have
1670  * accumulated. */
1671  if(result.isEmpty())
1672  return Token(END_OF_FILE);
1673  else
1674  return Token(STRING_LITERAL, result);
1675  }
1676 
1677  const QChar curr(current());
1678 
1679  if(curr == sep)
1680  {
1681  if(m_pos + 1 == m_length)
1682  return Token(END_OF_FILE);
1683 
1684  if(m_data.at(m_pos + 1) == sep)
1685  {
1686  /* The quoting mechanism was used. */
1687  m_pos += 2;
1688  result.append(sep);
1689  continue;
1690  }
1691 
1692  const QChar next(m_data.at(m_pos + 1));
1693  if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
1694  return Token(ERROR); // i18n Space must separate attributes
1695  else if(result.isEmpty())
1696  {
1698  StartTag, 1);
1699  }
1700  else
1701  {
1702  /* Don't consume the sep, but leave it so we next time return a token for it. */
1703  return Token(STRING_LITERAL, result);
1704  }
1705 
1706  ++m_pos;
1707  continue;
1708  }
1709  else if(curr == QLatin1Char('{'))
1710  {
1711  if(m_pos + 1 == m_length)
1712  return Token(END_OF_FILE);
1713  else if(peekAhead() == '{')
1714  {
1715  ++m_pos;
1716  result.append(QLatin1Char('{'));
1717  }
1718  else
1719  {
1720  if(result.isEmpty())
1721  {
1722  /* The Attribute Value Template appeared directly in the attribute. */
1723  pushState();
1725  }
1726  else
1727  {
1728  /* We don't advance, keep '{' as next token. */
1729  return Token(STRING_LITERAL, result);
1730  }
1731  }
1732  }
1733  else if(curr == QLatin1Char('}'))
1734  {
1735  if(m_pos + 1 == m_length)
1736  return Token(END_OF_FILE);
1737  else if(peekAhead() == '}')
1738  {
1739  ++m_pos;
1740  result.append(QLatin1Char('}'));
1741  }
1742  else
1743  return Token(ERROR);
1744  }
1745  else if(curr == QLatin1Char('&'))
1746  {
1747  const QString ret(tokenizeCharacterReference());
1748  if(ret.isNull())
1749  return Token(ERROR);
1750  else
1751  result.append(ret);
1752  }
1753  else if(curr == QLatin1Char('<'))
1754  return Token(STRING_LITERAL, result);
1755  else
1756  {
1757  /* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
1758  * 3.3.3 Attribute-Value Normalization.
1759  *
1760  * However, it is complicated a bit by that AVN is defined on top of
1761  * EOL normalization and we do those two in one go here. */
1762  switch(curr.unicode())
1763  {
1764  case 0xD:
1765  {
1766  if(peekAhead() == '\n')
1767  {
1768  result.append(QLatin1Char(' '));
1769  ++m_pos;
1770  break;
1771  }
1772  }
1773  case 0xA:
1774  /* Fallthrough. */
1775  case 0x9:
1776  {
1777  result.append(QLatin1Char(' '));
1778  break;
1779  }
1780  default:
1781  result.append(curr);
1782  }
1783  }
1784 
1785  ++m_pos;
1786  }
1787  Q_ASSERT(false);
1788  }
1789  case ElementContent:
1790  {
1791  QString result;
1792  result.reserve(20);
1793 
1794  /* Whether the text node, result, may be whitespace only. Character references
1795  * and CDATA sections disables that. */
1796  bool mayBeWS = true;
1797 
1798  CharacterSkips skipEOLNormalization;
1799 
1800  while(true)
1801  {
1802  if(atEnd())
1803  return Token(END_OF_FILE);
1804 
1805  switch(peekCurrent())
1806  {
1807  case '<':
1808  {
1809  if(!result.isEmpty() && peekAhead(2) != '[')
1810  {
1811  /* We encountered the end, and it was not a CDATA section. */
1812  /* We don't advance. Next time we'll handle the <... stuff. */
1813  return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1814  }
1815 
1816  ++m_pos;
1817  if(atEnd())
1818  return Token(END_OF_FILE);
1819 
1820  const QChar ahead(current());
1821  if(ahead.isSpace())
1822  return error();
1823  else if(ahead == QLatin1Char('/'))
1824  {
1825  if(m_pos + 1 == m_length)
1826  return Token(END_OF_FILE);
1827  else if(m_data.at(m_pos + 1).isSpace())
1828  return error();
1829  else
1831  }
1832  else if(isNCNameStart(ahead))
1833  {
1834  pushState();
1835  return tokenAndChangeState(G_LT, StartTag, 0);
1836  }
1837  else if(aheadEquals("!--", 3, 0))
1838  {
1839  pushState();
1840  m_pos += 3;
1842  }
1843  else if(aheadEquals("![CDATA[", 8, 0))
1844  {
1845  mayBeWS = false;
1846  m_pos += 8;
1847  const int start = m_pos;
1848  const int len = scanUntil("]]>");
1849 
1850  if(len == -1)
1851  return Token(END_OF_FILE);
1852 
1853  m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
1854  result.append(m_data.mid(start, len));
1855  break;
1856  }
1857  else if(ahead == QLatin1Char('?'))
1858  {
1859  pushState();
1861  }
1862  else
1863  return Token(G_LT);
1864  }
1865  case '&':
1866  {
1867  const QString ret(tokenizeCharacterReference());
1868  if(ret.isNull())
1869  return Token(ERROR);
1870  else
1871  {
1872  skipEOLNormalization.insert(result.count());
1873  result.append(ret);
1874  mayBeWS = false;
1875  break;
1876  }
1877  }
1878  case '{':
1879  {
1880  // TODO remove this check, also below.
1881  if(m_pos + 1 == m_length)
1882  return Token(END_OF_FILE);
1883  else if(peekAhead() == '{')
1884  {
1885  ++m_pos;
1886  result.append(QLatin1Char('{'));
1887  }
1888  else
1889  {
1890  if(result.isEmpty())
1891  {
1892  pushState();
1894  }
1895  else
1896  {
1897  /* We don't advance here. */
1898  return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1899  }
1900  }
1901  break;
1902  }
1903  case '}':
1904  {
1905  if(m_pos + 1 == m_length)
1906  return Token(END_OF_FILE);
1907  else if(peekAhead() == '}')
1908  {
1909  ++m_pos;
1910  result.append(QLatin1Char('}'));
1911  }
1912  else
1913  {
1914  /* This is a parse error, and the grammar won't be able
1915  * to reduce this CURLY_RBRACE. */
1917  }
1918  break;
1919  }
1920  case '\n':
1921  {
1922  /* We want to translate \r\n into \n. */
1923  if(peekAhead(-1) == '\r')
1924  break;
1925  /* else, fallthrough. */
1926  }
1927  case '\r':
1928  {
1929  result.append(QLatin1Char('\n'));
1930  break;
1931  }
1932  default:
1933  {
1934  result.append(current());
1935  break;
1936  }
1937  }
1938  ++m_pos;
1939  }
1940  Q_ASSERT(false);
1941  }
1943  {
1944  const int start = m_pos;
1945 
1946  while(true)
1947  {
1948  ++m_pos;
1949  if(m_pos >= m_length)
1950  return Token(END_OF_FILE);
1951 
1952  const QChar next(current());
1953  if(next.isSpace() || next == QLatin1Char('?'))
1954  {
1955  return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
1957  }
1958  }
1959  Q_ASSERT(false);
1960  }
1962  {
1963  /* Consume whitespace between the name and the content. */
1964  if(consumeRawWhitespace())
1965  return Token(END_OF_FILE);
1966 
1967  const int start = m_pos;
1968  const int len = scanUntil("?>");
1969 
1970  if(len == -1)
1971  return Token(END_OF_FILE);
1972  else
1973  {
1974  m_pos += 2; /* Consume "?>" */
1975  popState();
1976  return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
1977  }
1978  Q_ASSERT(false);
1979  }
1980  case EndTag:
1981  {
1982  if(consumeRawWhitespace())
1983  return END_OF_FILE;
1984 
1985  if(peekCurrent() == '>')
1986  {
1987  popState();
1988  return tokenAndAdvance(G_GT);
1989  }
1990  else
1991  return tokenizeNCNameOrQName();
1992  Q_ASSERT(false);
1993  }
1994  case XMLComment:
1995  {
1996  const int start = m_pos;
1997  const int len = scanUntil("--");
1998 
1999  if(len == -1)
2000  return END_OF_FILE;
2001  else
2002  {
2003  m_pos += 2; /* Consume "--". */
2004  popState();
2005 
2006  if(peekCurrent() == '>')
2007  {
2008  ++m_pos;
2009  return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
2010  }
2011  else
2012  return error();
2013  }
2014  Q_ASSERT(false);
2015  }
2016  case Pragma:
2017  {
2018  /* Consume whitespace. */
2019  if(consumeRawWhitespace())
2020  return Token(END_OF_FILE);
2021 
2023  return tokenizeNCNameOrQName();
2024  }
2025  case PragmaContent:
2026  {
2027  QString result;
2028  result.reserve(20);
2029 
2030  const bool hasWS = m_pos < m_length && current().isSpace();
2031 
2032  /* Consume all whitespace up to the pragma content(if any). */
2033  if(consumeRawWhitespace())
2034  return Token(END_OF_FILE);
2035 
2036  if(peekCurrent() == '#' && peekAhead() == ')')
2037  {
2038  /* We reached the end, and there's no pragma content. */
2040  }
2041  else if(!hasWS)
2042  {
2043  /* A separating space is required if there's pragma content. */
2044  return error(); /* i18n */
2045  }
2046 
2047  const int start = m_pos;
2048  const int len = scanUntil("#)");
2049  if(len == -1)
2050  return Token(END_OF_FILE);
2051 
2052  return Token(STRING_LITERAL, m_data.mid(start, len));
2053  Q_ASSERT(false);
2054  }
2055  }
2056 
2057  Q_ASSERT(false);
2058  return error();
2059 }
2060 
2062  int &sepStack,
2063  const int startPos,
2064  const bool aInLiteral,
2065  QString &result)
2066 {
      /* Scans raw attribute content from m_pos, appending what it reads to
       * result, until one of the following can be handed back:
       *  - END_OF_FILE when the input runs out;
       *  - QUOTE or APOS, after switching to the StartTag state, when an
       *    undoubled sep sits exactly at startPos;
       *  - STRING_LITERAL carrying result when an undoubled sep is met
       *    elsewhere while sepStack is zero (the separator stays unconsumed);
       *  - ERROR when tokenizeCharacterReference() yields a null string;
       *  - SUCCESS when a closing curly brace is consumed, other than a
       *    doubled one seen while inLiteral is set.
       *
       * sep        - the separator character being scanned for.
       * sepStack   - in/out counter: incremented before each nested scan that
       *              a single opening curly brace starts, decremented when a
       *              closing curly brace ends an invocation.
       * startPos   - compared against m_pos to detect a separator that is the
       *              very first character.
       * aInLiteral - initial value of the local inLiteral flag.
       * result     - buffer that the scanned characters are appended to. */
2067  bool inLiteral = aInLiteral;
      /* The opposite quote character: an apostrophe when sep is a double
       * quote, otherwise a double quote. */
2068  const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
2069 
2070  while(true)
2071  {
2072  if(atEnd())
2073  return END_OF_FILE;
2074 
2075  if(peekCurrent() == sep.unicode())
2076  {
      /* Every occurrence of sep flips inLiteral, and this happens before
       * the check for a doubled separator below. */
2077  if(inLiteral)
2078  inLiteral = false;
2079  else
2080  inLiteral = true;
2081 
2082  if(peekAhead() == sep.unicode())
2083  {
2084  /* The quoting mechanism was used. */
      /* One copy of the separator is kept and both characters are skipped. */
2085  result.append(current());
2086  m_pos += 2;
2087  continue;
2088  }
2089  else
2090  {
2091  /* Don't consume the separator, such that we
2092  * return a token for it next time. */
      /* Exception: when nothing precedes the separator (m_pos is still
       * startPos) it is consumed right away and reported as QUOTE or
       * APOS, with the state switched to StartTag. */
2093  if(m_pos == startPos)
2094  {
2095  ++m_pos;
2096  setState(StartTag);
2097  return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
2098  }
2099 
2100 
      /* With no open curly braces the separator ends the content;
       * otherwise it is copied through like any other character. */
2101  if(sepStack == 0)
2102  {
2103  return Token(STRING_LITERAL, result);
2104  }
2105  else
2106  {
2107  result.append(current());
2108  ++m_pos;
2109  continue;
2110  }
2111  }
2112  }
2113  else if(peekCurrent() == '&')
2114  {
      /* Character reference: a null string signals failure, anything
       * else is appended as the replacement text.
       * NOTE(review): the extra increment presumably steps past the last
       * character of the reference; confirm against
       * tokenizeCharacterReference(). */
2115  const QString ret(tokenizeCharacterReference());
2116  if(ret.isNull())
2117  return Token(ERROR);
2118  else
2119  {
2120  result.append(ret);
2121  ++m_pos;
2122  continue;
2123  }
2124  }
2125  else if(peekCurrent() == otherSep)
2126  {
      /* The other quote character is copied through. An immediately
       * following second one is skipped, so a doubled pair contributes a
       * single character. inLiteral is flipped here as well. */
2127  result.append(current());
2128  ++m_pos;
2129 
2130  if(peekCurrent() == otherSep)
2131  ++m_pos;
2132 
2133  if(inLiteral)
2134  inLiteral = false;
2135  else
2136  inLiteral = true;
2137 
2138  continue;
2139  }
2140  else if(peekCurrent() == '{')
2141  {
      /* The opening curly brace is always appended. */
2142  result.append(current());
2143 
2144  if(peekAhead() == '{')
2145  {
      /* Doubled brace: skip the pair, only one copy is in result. */
2146  m_pos += 2;
2147  continue;
2148  }
2149  else
2150  {
      /* Single brace: scan the nested part recursively with the literal
       * flag cleared. Anything other than SUCCESS is passed straight
       * back to our caller; on SUCCESS the loop simply carries on. */
2151  ++m_pos;
2152  ++sepStack;
2153  const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
2154  if(t.type != SUCCESS)
2155  return t;
2156  }
2157 
2158  }
2159  else if(peekCurrent() == '}')
2160  {
2161  if(inLiteral && peekAhead() == '}')
2162  {
      /* Doubled closing brace while inLiteral is set: keep one copy. */
2163  result.append(current());
2164  m_pos += 2;
2165  continue;
2166  }
2167  else
2168  {
      /* Any other closing brace is consumed without being appended and
       * ends this invocation. */
2169  ++m_pos;
2170  --sepStack;
2171  return Token(SUCCESS); /* The return value is arbitrary. */
2172  }
2173  }
2174  else
2175  {
      /* Ordinary character: copy it and move on. */
2176  result.append(current());
2177  ++m_pos;
2178  }
2179  }
2180 }
2181 
2183 {
      /* Records the current position in sourceLocator and then produces the
       * next token. Tokens waiting on m_tokenStack take priority over fresh
       * tokenization; when one is popped, the tokenizer state is adjusted
       * according to its type before the token is returned. */
2184  sourceLocator->first_line = m_line;
2185  sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
2186 
2187  if(m_tokenStack.isEmpty())
2188  return nextToken();
2189  else
2190  {
2191  const Token retval(m_tokenStack.pop());
2192 
2193  switch(retval.type)
2194  {
2195  case MODULE:
2196  /* Fallthrough.*/
2197  case SCHEMA:
2198  /* Fallthrough.*/
2199  case COPY_NAMESPACES:
2200  {
      /* NOTE(review): the statement for these three cases (listing line
       * 2201) is not present in this listing. */
2202  break;
2203  }
2204  case VERSION:
2205  {
      /* NOTE(review): the statement for this case (listing line 2206) is
       * not present in this listing. */
2207  break;
2208  }
2209  case AS:
2210  /* Fallthrough. */
2211  case OF:
2212  {
      /* AS and OF both switch the tokenizer to the ItemType state. */
2213  setState(ItemType);
2214  break;
2215  }
2216  default:
2217  {
      /* Operator keywords reset the state to Default; every other token
       * type leaves the state untouched. */
2218  if(isOperatorKeyword(retval.type))
2219  setState(Default);
2220 
2221  break;
2222  }
2223  };
2224 
2225  return retval;
2226  }
2227 }
2228 
2230 {
      /* Switches the tokenizer into scan-only mode by setting m_scanOnly and
       * returns the current position m_pos.
       * NOTE(review): presumably the returned position is later handed back
       * to resume normal tokenization; confirm against the callers. */
2231  m_scanOnly = true;
2232  return m_pos;
2233 }
2234 
2236 {
      /* Leaves scan-only mode by clearing m_scanOnly and moves the current
       * position m_pos to pos. */
2237  m_scanOnly = false;
2238  m_pos = pos;
2239 }
2240 
2242 {
      /* Empty body: calling this function has no effect. */
2243 }
2244 
2245 #undef handleWhitespace
2246 
2247 } // namespace QPatternist
2248 
int scanUntil(const char *const content)
int type
Definition: qmetatype.cpp:239
bool isValid() const
Returns true if the URL is valid; otherwise returns false.
Definition: qurl.cpp:4303
unsigned char c[8]
Definition: qnumeric_p.h:62
#define QT_END_NAMESPACE
This macro expands to.
Definition: qglobal.h:90
Q_CORE_EXPORT QTextStream & ws(QTextStream &s)
const QChar at(int i) const
Returns the character at the given index position in the string.
Definition: qstring.h:698
int toInt(bool *ok=0, int base=10) const
Returns the string converted to an int using base base, which is 10 by default and must be between 2 ...
Definition: qstring.cpp:6090
ushort unicode() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: qchar.h:251
bool isNull() const
Returns true if the character is the Unicode character 0x0000 ('\0'); otherwise returns false...
Definition: qchar.h:262
Token tokenAndAdvance(const TokenType code, const int advance=1)
#define at(className, varName)
bool isEmpty() const
Returns true if the URL has no data; otherwise returns false.
Definition: qurl.cpp:4317
int length() const
Returns the number of characters in this string.
Definition: qstring.h:696
QLatin1String(DBUS_INTERFACE_DBUS))) Q_GLOBAL_STATIC_WITH_ARGS(QString
virtual void setParserContext(const ParserContext::Ptr &parseInfo)
The QUrl class provides a convenient interface for working with URLs.
Definition: qurl.h:61
The QString class provides a Unicode character string.
Definition: qstring.h:83
#define Q_ASSERT(cond)
Definition: qglobal.h:1823
static QString normalizeEOL(const QString &input, const CharacterSkips &characterSkips)
static const uint base
Definition: qurl.cpp:268
The QChar class provides a 16-bit Unicode character.
Definition: qchar.h:72
static const TokenMap * lookupKeyword(const QString &keyword)
const T value(const Key &key) const
Returns the value associated with the key.
Definition: qhash.h:606
bool isSpace() const
Returns true if the character is a separator character (Separator_* categories); otherwise returns fa...
Definition: qchar.cpp:609
Category category() const
Returns the character's category.
Definition: qchar.cpp:853
void reserve(int size)
Ensures that the QHash's internal hash table consists of at least size buckets.
Definition: qhash.h:846
iterator insert(const Key &key, const T &value)
Inserts a new item with the key and a value of value.
Definition: qhash.h:753
static ushort highSurrogate(uint ucs4)
Returns the high surrogate value of a ucs4 code point.
Definition: qchar.h:303
static ushort lowSurrogate(uint ucs4)
Returns the low surrogate value of a ucs4 code point.
Definition: qchar.h:306
T pop()
Removes the top item from the stack and returns it.
Definition: qstack.h:67
void reserve(int size)
Attempts to allocate memory for at least size characters.
Definition: qstring.h:881
#define QT_BEGIN_NAMESPACE
This macro expands to.
Definition: qglobal.h:89
const Tokenizer::TokenType token
bool contains(const T &value) const
Definition: qset.h:91
static bool isDigit(const char ch)
bool isEmpty() const
Returns true if the string has no characters; otherwise returns false.
Definition: qstring.h:704
const char * name
The namespace for the internal API of QtXmlPatterns.
bool isEmpty() const
Returns true if the hash contains no items; otherwise returns false.
Definition: qhash.h:297
const_iterator insert(const T &value)
Definition: qset.h:179
unsigned int uint
Definition: qglobal.h:996
int indexOf(QChar c, int from=0, Qt::CaseSensitivity cs=Qt::CaseSensitive) const
Definition: qstring.cpp:2838
virtual void resumeTokenizationFrom(const int position)
The State element defines configurations of objects and properties.
#define handleWhitespace()
Tokenizer::TokenType consumeComment()
Parses comments: (: comment content :). It recurses for parsing nested comments.
bool aheadEquals(const char *const chs, const int len, const int offset=1) const
QHash< QString, QChar > m_charRefs
void push(const T &t)
Adds element t to the top of the stack.
Definition: qstack.h:60
int count() const
Definition: qstring.h:103
char toAscii() const
Returns the character value of the QChar obtained using the current codec used to read C strings...
Definition: qchar.cpp:1490
static const struct TokenMap * value(const char *str, unsigned int len)
const char * constData() const
Returns a pointer to the data stored in the byte array.
Definition: qbytearray.h:433
bool isNull() const
Returns true if this string is null; otherwise returns false.
Definition: qstring.h:505
QString mid(int position, int n=-1) const Q_REQUIRED_RESULT
Returns a string that contains n characters of this string, starting at the specified position index...
Definition: qstring.cpp:3706
uint qstrlen(const char *str)
Definition: qbytearray.h:79
Base class for the XPath Data Model's type hierarchy.
Definition: qitemtype_p.h:82
XQueryTokenizer(const QString &query, const QUrl &location, const State startingState=Default)
QString & append(QChar c)
Definition: qstring.cpp:1777
static bool isTypeToken(const TokenType t)
char peekAhead(const int length=1) const
static QString fromLatin1(const char *, int size=-1)
Returns a QString initialized with the first size characters of the Latin-1 string str...
Definition: qstring.cpp:4188
static QTestResult::TestLocation location
Definition: qtestresult.cpp:63
QByteArray toAscii() const Q_REQUIRED_RESULT
Returns an 8-bit representation of the string as a QByteArray.
Definition: qstring.cpp:4014
uchar cell() const
Returns the cell (least significant byte) of the Unicode character.
Definition: qchar.h:283
char peekCurrent() const
Returns the character at the current position, converted to ASCII.
static bool isNCNameStart(const QChar ch)
static bool isOperatorKeyword(const TokenType)
bool isEmpty() const
Returns true if the vector has size 0; otherwise returns false.
Definition: qvector.h:139
Base class for all tokenizers.
Definition: qtokenizer_p.h:155
static const KeyPair *const end
void prepend(const T &t)
Inserts value at the beginning of the vector.
Definition: qvector.h:378
Token tokenAndChangeState(const TokenType code, const State state, const int advance=1)
static bool isNCNameBody(const QChar ch)
uchar row() const
Returns the row (most significant byte) of the Unicode character.
Definition: qchar.h:284
The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
Definition: qchar.h:55
static bool isPhraseKeyword(const TokenType code)
Token attributeAsRaw(const QChar separator, int &stack, const int startPos, const bool inLiteral, QString &result)
QChar charForReference(const QString &reference)