001: /*
002: * Sun Public License Notice
003: *
004: * The contents of this file are subject to the Sun Public License
005: * Version 1.0 (the "License"). You may not use this file except in
006: * compliance with the License. A copy of the License is available at
007: * http://www.sun.com/
008: *
009: * The Original Code is NetBeans. The Initial Developer of the Original
010: * Code is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
011: * Microsystems, Inc. All Rights Reserved.
012:
013: If you wish your version of this file to be governed by only the CDDL
014: or only the GPL Version 2, indicate your decision by adding
015: "[Contributor] elects to include this software in this distribution
016: under the [CDDL or GPL Version 2] license." If you do not indicate a
017: single choice of license, a recipient has the option to distribute
018: your version of this file under either the CDDL, the GPL Version 2 or
019: to extend the choice of license to its licensees as provided above.
020: However, if you add GPL Version 2 code and therefore, elected the GPL
021: Version 2 license, then the option applies only if the new code is
022: made subject to such option by the copyright holder.
023:
024: If you wish your version of this file to be governed by only the CDDL
025: or only the GPL Version 2, indicate your decision by adding
026: "[Contributor] elects to include this software in this distribution
027: under the [CDDL or GPL Version 2] license." If you do not indicate a
028: single choice of license, a recipient has the option to distribute
029: your version of this file under either the CDDL, the GPL Version 2 or
030: to extend the choice of license to its licensees as provided above.
031: However, if you add GPL Version 2 code and therefore, elected the GPL
032: Version 2 license, then the option applies only if the new code is
033: made subject to such option by the copyright holder.
034: */
035:
036: package org.netbeans.lib.xml.lexer;
037:
038: import org.netbeans.api.xml.lexer.XMLTokenId;
039: import org.netbeans.api.lexer.Token;
040: import org.netbeans.spi.lexer.Lexer;
041: import org.netbeans.spi.lexer.LexerInput;
042: import org.netbeans.spi.lexer.LexerRestartInfo;
043: import org.netbeans.spi.lexer.TokenFactory;
044:
045: /**
046: * Lexical analyzer for XML. Based on original XML lexer from xml/editor module.
047: *
048: * @author Petr Nejedly
049: * @author Miloslav Metelka
050: * @author Jan Lahoda
051: * @author Marek Fukala
052: * @author Tomasz Slota
053: * @version 1.00
054: */
055:
056: public class XMLLexer implements Lexer<XMLTokenId> {
057: private LexerInput input;
058:
059: private TokenFactory<XMLTokenId> tokenFactory;
060:
061: public Object state() {
062: Integer encoded = (subState << 020) + (this .state << 010)
063: + (subInternalDTD ? 1 : 0);
064: return encoded;
065: }
066:
067: private void loadState(final Object state) {
068: if (state == null) {
069: subState = INIT;
070: this .state = INIT;
071: subInternalDTD = false;
072: } else {
073: int encoded = ((Integer) state).intValue();
074:
075: subState = (encoded & 0xff0000) >> 020;
076: this .state = (encoded & 0xff00) >> 010;
077: subInternalDTD = encoded % 2 == 1;
078: }
079: }
080:
081: /**
082: * Internal state of the lexical analyzer before entering subanalyzer of
083: * character references. It is initially set to INIT, but before first
084: * usage, this will be overwritten with state, which originated
085: * ransition to charref subanalyzer.
086: */
087: protected int state = INIT;
088:
089: /**
090: * Internal state of the lexical analyzer before entering subanalyzer of
091: * character references. It is initially set to INIT, but before first
092: * usage, this will be overwritten with state, which originated
093: * ransition to charref subanalyzer.
094: */
095: protected int subState = INIT;
096:
097: /**
098: * Identifies internal DTD layer. Most of functionality is same
099: * as at document layer, however there are minor exceptions.
100: * @see isInternalDTD checks in code
101: */
102: protected boolean subInternalDTD = false;
103:
104: /** Initial internal state of the analyzer */
105: public static final int INIT = 0;
106:
107: // Internal states I = in state
108: // P = expected (char probed but not consumed)
109: // A = after (char probed and consumed)
110:
111: private static final int ISI_TEXT = 1; // Plain text between tags
112: private static final int ISI_ERROR = 2; // Syntax error in XML syntax
113: private static final int ISA_LT = 3; // After start of tag delimiter - "<"
114: private static final int ISA_SLASH = 4; // After ETAGO - "</"
115: private static final int ISI_ENDTAG = 5; // Inside endtag - "</[a..Z]+"
116: private static final int ISP_ENDTAG_X = 6; // X-switch after ENDTAG's name
117: private static final int ISP_ENDTAG_WS = 7; // In WS in ENDTAG - "</A_ _>"
118: private static final int ISI_TAG = 8; // Inside tag - "<[a..Z]+"
119: private static final int ISP_TAG_X = 9; // X-switch after TAG's name
120: private static final int ISP_TAG_WS = 10; // In WS in TAG - "<A_ _...>"
121: private static final int ISI_ARG = 11; // Inside tag's argument - "<A h_r_...>"
122: private static final int ISP_ARG_X = 12; // X-switch after ARGUMENT's name
123: private static final int ISP_ARG_WS = 13; // Inside WS after argument awaiting '='
124: private static final int ISP_EQ = 14; // X-switch after '=' in TAG's ARGUMENT
125: private static final int ISP_EQ_WS = 15; // In WS after '='
126: private static final int ISI_VAL_APOS = 17; // Single-quoted value - may contain " chars
127: private static final int ISI_VAL_QUOT = 18; // Double-quoted value - may contain ' chars
128: private static final int ISA_SGML_ESCAPE = 19; // After "<!"
129: private static final int ISA_SGML_DASH = 20; // After "<!-"
130: private static final int ISI_XML_COMMENT = 21; // Somewhere after "<!--"
131: private static final int ISA_XML_COMMENT_DASH = 22; // Dash in comment - maybe end of comment
132: private static final int ISI_XML_COMMENT_WS = 23; // After end of comment, awaiting end of comment declaration
133: private static final int ISI_SGML_DECL = 24;
134: private static final int ISA_SGML_DECL_DASH = 25;
135: // private static final int ISI_SGML_COMMENT = 26;
136: // private static final int ISA_SGML_COMMENT_DASH = 27;
137: private static final int ISA_REF = 28; // when comes to character reference, e.g. &, after &
138: private static final int ISI_REF_NAME = 29; // if the reference is symbolic - by predefined name
139: private static final int ISA_REF_HASH = 30; // for numeric references - after &#
140: private static final int ISI_REF_DEC = 31; // decimal character reference, e.g. ř
141: private static final int ISA_REF_X = 32; //
142: private static final int ISI_REF_HEX = 33; // hexadecimal reference, in 
.. of 	..
143:
144: private static final int ISI_PI = 35; //after <?...
145: private static final int ISI_PI_TARGET = 36; //in <?..|..
146: private static final int ISP_PI_TARGET_WS = 37; //after <?...|
147: private static final int ISI_PI_CONTENT = 38; //in PI content
148: private static final int ISA_PI_CONTENT_QMARK = 39; //after ? in content
149: private static final int ISP_PI_CONTENT_QMARK = 40; //spotet ? in content
150:
151: // CDATA section handler
152: private static final int ISA_LTEXBR = 41;
153: private static final int ISA_LTEXBRC = 42;
154: private static final int ISA_LTEXBRCD = 43;
155: private static final int ISA_LTEXBRCDA = 44;
156: private static final int ISA_LTEXBRCDAT = 45;
157: private static final int ISA_LTEXBRCDATA = 46;
158: private static final int ISI_CDATA = 47;
159: private static final int ISA_CDATA_BR = 48;
160: private static final int ISA_CDATA_BRBR = 49;
161:
162: // strings in declaration
163: private static final int ISI_DECL_CHARS = 50;
164: private static final int ISI_DECL_STRING = 51;
165: private static final int ISP_DECL_CHARS = 52;
166: private static final int ISP_DECL_STRING = 53;
167:
168: // internal DTD handling
169: private static final int ISA_INIT_BR = 54;
170:
171: public XMLLexer(LexerRestartInfo<XMLTokenId> info) {
172: this .input = info.input();
173: this .tokenFactory = info.tokenFactory();
174: loadState(info.state());
175: }
176:
177: private final boolean isAZ(int ch) {
178: return ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'));
179: }
180:
181: /**
182: * Resolves if given char is whitespace in terms of XML4.0 specs
183: * According to specs, following characters are treated as whitespace:
184: * Space - <CODE>'\u0020'</CODE>, Tab - <CODE>'\u0009'</CODE>,
185: * Formfeed - <CODE>'\u000C'</CODE>,Zero-width space - <CODE>'\u200B'</CODE>,
186: * Carriage return - <CODE>'
187: '</CODE> and Line feed - <CODE>'
188: '</CODE>
189: * CR's are included for completenes only, they should never appear in document
190: */
191:
192: private final boolean isWS(int ch) {
193: return Character.isWhitespace(ch);
194: // return ( ch == '\u0020' || ch == '\u0009' || ch == '\u000c'
195: // || ch == '\u200b' || ch == '\n' || ch == '\r' );
196: }
197:
198: private void enterInternalDTD() {
199: subInternalDTD = true;
200: }
201:
202: private void leaveInternalDTD() {
203: subInternalDTD = false;
204: }
205:
206: private boolean isInternalDTD() {
207: return subInternalDTD;
208: }
209:
210: public Token<XMLTokenId> nextToken() {
211:
212: int actChar;
213: while (true) {
214: actChar = input.read();
215:
216: if (actChar == LexerInput.EOF) {
217:
218: if (input.readLength() == 0) {
219: return null;
220: }
221:
222: input.backup(1);
223: break;
224: }
225:
226: switch (state) {
227: case INIT: // DONE
228: switch (actChar) {
229: case '<':
230: state = ISA_LT;
231: break;
232: case '&':
233: if (isInternalDTD() == false) {
234: state = ISA_REF;
235: subState = ISI_TEXT;
236: } else {
237: state = ISI_TEXT;
238: }
239: break;
240: case '%':
241: if (isInternalDTD()) {
242: state = ISA_REF;
243: subState = INIT;
244: } else {
245: state = ISI_TEXT;
246: }
247: break;
248: case ']':
249: if (isInternalDTD()) {
250: state = ISA_INIT_BR;
251: } else {
252: state = ISI_TEXT;
253: }
254: break;
255: default:
256: state = ISI_TEXT;
257: break;
258: }
259:
260: break;
261:
262: case ISI_TEXT: // DONE
263: switch (actChar) {
264: case '<':
265: state = INIT;
266: input.backup(1);
267: if (input.readLength() > 0) {
268: return token(XMLTokenId.TEXT);
269: }
270: break;
271: case '&':
272: if (isInternalDTD() == false) {
273: state = INIT;
274: input.backup(1);
275: if (input.readLength() > 0) {
276: return token(XMLTokenId.TEXT);
277: }
278: }
279: break;
280: case '%':
281: if (isInternalDTD()) {
282: state = INIT;
283: input.backup(1);
284: return token(XMLTokenId.TEXT);
285: }
286: break;
287: case ']':
288: if (isInternalDTD()) {
289: state = ISA_INIT_BR;
290: }
291: break;
292: }
293: break;
294:
295: case ISI_ERROR: // DONE
296: state = INIT;
297: return token(XMLTokenId.ERROR);
298:
299: case ISA_LT: // DONE
300:
301: if (UnicodeClasses.isXMLNameStartChar(actChar)
302: && isInternalDTD() == false) {
303: state = ISI_TAG;
304: break;
305: }
306: switch (actChar) {
307: case '/': // ETAGO - </
308: state = ISA_SLASH;
309: break;
310: case '!':
311: state = ISA_SGML_ESCAPE;
312: break;
313: case '?':
314: state = ISI_PI;
315: return token(XMLTokenId.PI_START);
316: default:
317: state = ISI_TEXT; //RELAXED to allow editing in the middle of document
318: continue; // don't eat the char, maybe its '&'
319: }
320: break;
321:
322: case ISI_PI:
323: if (UnicodeClasses.isXMLNameStartChar(actChar)) {
324: state = ISI_PI_TARGET;
325: break;
326: }
327: state = ISI_ERROR;
328: break;
329:
330: case ISI_PI_TARGET:
331: if (UnicodeClasses.isXMLNameChar(actChar))
332: break;
333: if (isWS(actChar)) {
334: state = ISP_PI_TARGET_WS;
335: input.backup(1);
336: return token(XMLTokenId.PI_TARGET);
337: }
338: state = ISI_ERROR;
339: break;
340:
341: case ISP_PI_TARGET_WS:
342: if (isWS(actChar))
343: break;
344: state = ISI_PI_CONTENT;
345: input.backup(1);
346: return token(XMLTokenId.WS);
347:
348: case ISI_PI_CONTENT:
349: if (actChar != '?')
350: break; // eat content
351: state = ISP_PI_CONTENT_QMARK;
352: input.backup(1);
353: return token(XMLTokenId.PI_CONTENT); // may do extra break
354:
355: case ISP_PI_CONTENT_QMARK:
356: if (actChar != '?')
357: throw new IllegalStateException(
358: "'?' expected in ISP_PI_CONTENT_QMARK");
359: state = ISA_PI_CONTENT_QMARK;
360: break;
361:
362: case ISA_PI_CONTENT_QMARK:
363: if (actChar != '>') {
364: state = ISI_PI_CONTENT;
365: break;
366: }
367: state = INIT;
368: return token(XMLTokenId.PI_END);
369:
370: case ISA_SLASH: // DONE
371:
372: if (UnicodeClasses.isXMLNameStartChar(actChar)) {
373: state = ISI_ENDTAG;
374: break;
375: }
376: switch (actChar) {
377: case ' ':
378: state = ISI_TEXT;
379: continue;
380: case '\n':
381: state = ISI_TEXT;
382: continue;
383: case '\r':
384: state = ISI_TEXT;
385: continue;
386: default: // Part of text, e.g. </3, </'\n', RELAXED
387: state = ISI_TEXT;
388: continue; // don'e eat the char
389: }
390: //break;
391:
392: case ISI_ENDTAG: // DONE
393: if (UnicodeClasses.isXMLNameChar(actChar)) {
394: break; // Still in endtag identifier, eat next char
395: }
396:
397: state = ISP_ENDTAG_X;
398: input.backup(1);
399: return token(XMLTokenId.TAG);
400:
401: case ISP_ENDTAG_X: // DONE
402: if (isWS(actChar)) {
403: state = ISP_ENDTAG_WS;
404: break;
405: }
406: switch (actChar) {
407: case '>': // Closing of endtag, e.g. </H6 _>_
408: state = INIT;
409: return token(XMLTokenId.TAG);
410: default:
411: state = ISI_ERROR;
412: continue; //don't eat
413: }
414: //break;
415:
416: case ISP_ENDTAG_WS: // DONE
417: if (isWS(actChar))
418: break; // eat all WS
419: state = ISP_ENDTAG_X;
420: input.backup(1);
421: return token(XMLTokenId.WS);
422:
423: case ISI_TAG: // DONE
424: if (UnicodeClasses.isXMLNameChar(actChar))
425: break; // Still in tag identifier, eat next char
426: state = ISP_TAG_X;
427: input.backup(1);
428: return token(XMLTokenId.TAG);
429:
430: case ISP_TAG_X: // DONE
431: if (isWS(actChar)) {
432: state = ISP_TAG_WS;
433: break;
434: }
435: if (UnicodeClasses.isXMLNameStartChar(actChar)) {
436: state = ISI_ARG;
437: break;
438: }
439: switch (actChar) {
440: case '/':
441: break;
442: case '?': //Prolog and PI's now similar to Tag
443: break;
444: case '>':
445: state = INIT;
446: return token(XMLTokenId.TAG);
447: default:
448: state = ISI_ERROR;
449: continue;
450: }
451: break;
452:
453: case ISP_TAG_WS: // DONE
454: //input.backup(1);
455: if (isWS(actChar))
456: break; // eat all WS
457: state = ISP_TAG_X;
458: input.backup(1);
459: return token(XMLTokenId.WS);
460:
461: case ISI_ARG: // DONE
462: if (UnicodeClasses.isXMLNameChar(actChar))
463: break; // eat next char
464: state = ISP_ARG_X;
465: input.backup(1);
466: return token(XMLTokenId.ARGUMENT);
467:
468: case ISP_ARG_X:
469: if (isWS(actChar)) {
470: state = ISP_ARG_WS;
471: break;
472: }
473: switch (actChar) {
474: case '=':
475: state = ISP_EQ;
476: return token(XMLTokenId.OPERATOR);
477: default:
478: state = ISI_ERROR;
479: continue;
480: }
481: //break;
482:
483: case ISP_ARG_WS:
484: if (isWS(actChar))
485: break; // Eat all WhiteSpace
486: state = ISP_ARG_X;
487: input.backup(1);
488: return token(XMLTokenId.WS);
489:
490: case ISP_EQ:
491: if (isWS(actChar)) {
492: state = ISP_EQ_WS;
493: break;
494: }
495: switch (actChar) {
496: case '\'':
497: state = ISI_VAL_APOS;
498: break;
499: case '"':
500: state = ISI_VAL_QUOT;
501: break;
502: default:
503: state = ISI_ERROR;
504: continue;
505: }
506: break;
507:
508: case ISP_EQ_WS:
509: if (isWS(actChar))
510: break; // Consume all WS
511: state = ISP_EQ;
512: input.backup(1);
513: return token(XMLTokenId.WS);
514:
515: case ISI_VAL_APOS:
516: switch (actChar) {
517: case '\'':
518: state = ISP_TAG_X;
519: return token(XMLTokenId.VALUE);
520: case '&':
521: if (input.readLength() == 1) {
522: subState = state;
523: state = ISA_REF;
524: break;
525: } else {
526: input.backup(1);
527: return token(XMLTokenId.VALUE);
528: }
529: }
530: break; // else simply consume next char of VALUE
531:
532: case ISI_VAL_QUOT:
533: switch (actChar) {
534: case '"':
535: state = ISP_TAG_X;
536: return token(XMLTokenId.VALUE);
537: case '&':
538: if (input.readLength() == 1) {
539: subState = state;
540: state = ISA_REF;
541: break;
542: } else {
543: input.backup(1);
544: return token(XMLTokenId.VALUE);
545: }
546: }
547: break; // else simply consume next char of VALUE
548:
549: case ISA_SGML_ESCAPE: // DONE
550: if (actChar == '[') {
551: state = ISA_LTEXBR;
552: break;
553: } else if (isAZ(actChar)) {
554: state = ISI_SGML_DECL;
555: break;
556: }
557: switch (actChar) {
558: case '-':
559: state = ISA_SGML_DASH;
560: break;
561: default:
562: state = ISI_TEXT;
563: continue;
564: }
565: break;
566:
567: case ISA_LTEXBR:
568: if (actChar == 'C') {
569: state = ISA_LTEXBRC;
570: break;
571: } else {
572: state = ISI_TEXT;
573: continue;
574: }
575:
576: case ISA_LTEXBRC:
577: if (actChar == 'D') {
578: state = ISA_LTEXBRCD;
579: break;
580: } else {
581: state = ISI_TEXT;
582: continue;
583: }
584:
585: case ISA_LTEXBRCD:
586: if (actChar == 'A') {
587: state = ISA_LTEXBRCDA;
588: break;
589: } else {
590: state = ISI_TEXT;
591: continue;
592: }
593:
594: case ISA_LTEXBRCDA:
595: if (actChar == 'T') {
596: state = ISA_LTEXBRCDAT;
597: break;
598: } else {
599: state = ISI_TEXT;
600: continue;
601: }
602:
603: case ISA_LTEXBRCDAT:
604: if (actChar == 'A') {
605: state = ISA_LTEXBRCDATA;
606: break;
607: } else {
608: state = ISI_TEXT;
609: continue;
610: }
611:
612: case ISA_LTEXBRCDATA:
613: if (actChar == '[') {
614: state = ISI_CDATA;
615: break;
616: } else {
617: state = ISI_TEXT;
618: continue;
619: }
620:
621: case ISI_CDATA:
622: if (actChar == ']') {
623: state = ISA_CDATA_BR;
624: break;
625: }
626:
627: case ISA_CDATA_BR:
628: if (actChar == ']') {
629: state = ISA_CDATA_BRBR;
630: break;
631: } else {
632: state = ISI_CDATA;
633: break;
634: }
635:
636: case ISA_CDATA_BRBR:
637: if (actChar == '>') {
638: state = ISI_TEXT; //It s allowed only in content
639: return token(XMLTokenId.CDATA_SECTION);
640: } else if (actChar == ']') {
641: // stay in the same state
642: break;
643: } else {
644: state = ISI_CDATA;
645: break;
646: }
647:
648: case ISA_SGML_DASH: // DONE
649: switch (actChar) {
650: case '-':
651: state = ISI_XML_COMMENT;
652: break;
653: default:
654: state = ISI_ERROR;
655: continue;
656: }
657: break;
658:
659: case ISI_XML_COMMENT: // DONE
660: switch (actChar) {
661: case '-':
662: state = ISA_XML_COMMENT_DASH;
663: break;
664: //create an XML comment token for each line of the comment - a workaround fix for performance bug #39446
665: //this also causes a SyntaxtElement to be created for each line of the comment - see XMLSyntaxSupport.createElement:277
666: //PENDING - this code can be removed after editor solve it somehow in their code
667: case '\n':
668: //leave the some state - we are still in an XML comment,
669: //we just need to create a token for each line.
670: return token(XMLTokenId.BLOCK_COMMENT);
671: }
672: break;
673:
674: case ISA_XML_COMMENT_DASH:
675: switch (actChar) {
676: case '-':
677: state = ISI_XML_COMMENT_WS;
678: break;
679: default:
680: state = ISI_XML_COMMENT;
681: continue;
682: }
683: break;
684:
685: case ISI_XML_COMMENT_WS: // DONE
686: if (isWS(actChar))
687: break; // Consume all WS
688: switch (actChar) {
689: case '>':
690: state = INIT;
691: return token(XMLTokenId.BLOCK_COMMENT);
692: default:
693: state = ISI_ERROR;
694: input.backup(1);
695: return token(XMLTokenId.BLOCK_COMMENT);
696: }
697:
698: case ISP_DECL_STRING:
699: if (actChar != '"')
700: throw new IllegalStateException("Unexpected "
701: + actChar);
702: state = ISI_DECL_STRING;
703: break;
704:
705: case ISI_DECL_STRING:
706: if (actChar == '"') {
707: state = ISI_SGML_DECL;
708: return token(XMLTokenId.VALUE);
709: }
710: break;
711:
712: case ISP_DECL_CHARS:
713: if (actChar != '\'')
714: throw new IllegalStateException("Unexpected "
715: + actChar);
716: state = ISI_DECL_CHARS;
717: break;
718:
719: case ISI_DECL_CHARS:
720: if (actChar == '\'') {
721: state = ISI_SGML_DECL;
722: return token(XMLTokenId.VALUE);
723: }
724: break;
725:
726: case ISI_SGML_DECL:
727: switch (actChar) {
728: case '"':
729: state = ISP_DECL_STRING;
730: input.backup(1);
731: return token(XMLTokenId.DECLARATION);
732: case '\'':
733: state = ISP_DECL_CHARS;
734: input.backup(1);
735: return token(XMLTokenId.DECLARATION);
736: case '[':
737: state = INIT;
738: enterInternalDTD();
739: return token(XMLTokenId.DECLARATION);
740: case '>':
741: state = INIT;
742: return token(XMLTokenId.DECLARATION);
743: }
744: break;
745:
746: case ISA_INIT_BR:
747: if (isWS(actChar))
748: break;
749: if (actChar == '>') {
750: state = INIT;
751: leaveInternalDTD();
752: return token(XMLTokenId.DECLARATION);
753: } else {
754: state = INIT;
755: input.backup(1);
756: return token(XMLTokenId.ERROR);
757: }
758:
759: case ISA_SGML_DECL_DASH:
760: if (actChar == '-') {
761: state = ISI_ERROR;
762: break;
763: } else {
764: if (isWS(actChar)) {
765: state = ISI_ERROR;
766: continue;
767: } else {
768: state = ISI_SGML_DECL;
769: continue;
770: }
771: }
772:
773: case ISA_REF:
774: if (UnicodeClasses.isXMLNameStartChar(actChar)) {
775: state = ISI_REF_NAME;
776: break;
777: }
778: if (actChar == '#') {
779: state = ISA_REF_HASH;
780: break;
781: }
782: state = subState;
783: continue;
784:
785: case ISI_REF_NAME:
786: if (UnicodeClasses.isXMLNameChar(actChar))
787: break;
788: if (actChar != ';')
789: input.backup(1);
790: state = subState;
791: return token(XMLTokenId.CHARACTER);
792:
793: case ISA_REF_HASH:
794: if (actChar >= '0' && actChar <= '9') {
795: state = ISI_REF_DEC;
796: break;
797: }
798: if (actChar == 'x' || actChar == 'X') {
799: state = ISA_REF_X;
800: break;
801: }
802: if (isAZ(actChar)) {
803: state = subState;
804: return token(XMLTokenId.ERROR);
805: }
806: state = subState;
807: continue;
808:
809: case ISI_REF_DEC:
810: if (actChar >= '0' && actChar <= '9')
811: break;
812: if (actChar != ';')
813: input.backup(1);
814: state = subState;
815: return token(XMLTokenId.CHARACTER);
816:
817: case ISA_REF_X:
818: if (isHex(actChar)) {
819: state = ISI_REF_HEX;
820: break;
821: }
822: state = subState;
823: input.backup(1);
824: return token(XMLTokenId.ERROR); // error on previous "&#x" sequence
825:
826: case ISI_REF_HEX:
827: if (isHex(actChar))
828: break;
829: if (actChar != ';')
830: input.backup(1);
831: state = subState;
832: return token(XMLTokenId.CHARACTER);
833: }
834: } // end of while(offset...)
835:
836: switch (state) {
837: case INIT:
838: case ISI_TEXT:
839: case ISA_LT:
840: case ISA_SLASH:
841: case ISA_SGML_ESCAPE:
842: case ISA_SGML_DASH:
843: return token(XMLTokenId.TEXT);
844:
845: case ISA_REF:
846: case ISA_REF_HASH:
847: if (subState == ISI_TEXT)
848: return token(XMLTokenId.TEXT);
849: else
850: return token(XMLTokenId.VALUE);
851:
852: case ISI_XML_COMMENT:
853: case ISA_XML_COMMENT_DASH:
854: case ISI_XML_COMMENT_WS:
855: return token(XMLTokenId.BLOCK_COMMENT);
856:
857: case ISI_TAG:
858: case ISI_ENDTAG:
859: return token(XMLTokenId.TAG);
860:
861: case ISI_ARG:
862: return token(XMLTokenId.ARGUMENT);
863:
864: case ISI_ERROR:
865: return token(XMLTokenId.ERROR);
866:
867: case ISP_ARG_WS:
868: case ISP_TAG_WS:
869: case ISP_ENDTAG_WS:
870: case ISP_EQ_WS:
871: return token(XMLTokenId.WS);
872:
873: case ISP_ARG_X:
874: case ISP_TAG_X:
875: case ISP_ENDTAG_X:
876: case ISP_EQ:
877: return token(XMLTokenId.WS);
878:
879: case ISI_VAL_APOS:
880: case ISI_VAL_QUOT:
881: case ISI_DECL_CHARS:
882: case ISI_DECL_STRING:
883: return token(XMLTokenId.VALUE);
884:
885: case ISI_SGML_DECL:
886: case ISA_SGML_DECL_DASH:
887: case ISP_DECL_STRING:
888: case ISP_DECL_CHARS:
889: return token(XMLTokenId.DECLARATION);
890:
891: case ISI_REF_NAME:
892: case ISI_REF_DEC:
893: case ISA_REF_X:
894: case ISI_REF_HEX:
895: return token(XMLTokenId.CHARACTER);
896:
897: case ISI_PI:
898: return token(XMLTokenId.PI_START);
899: case ISI_PI_TARGET:
900: return token(XMLTokenId.PI_TARGET);
901: case ISP_PI_TARGET_WS:
902: return token(XMLTokenId.WS);
903: case ISI_PI_CONTENT:
904: return token(XMLTokenId.PI_CONTENT);
905: case ISA_PI_CONTENT_QMARK:
906: case ISP_PI_CONTENT_QMARK:
907: // we are at end of the last buffer and expect that next char will be '>'
908: return token(XMLTokenId.PI_END);
909:
910: case ISA_LTEXBR:
911: case ISA_LTEXBRC:
912: case ISA_LTEXBRCD:
913: case ISA_LTEXBRCDA:
914: case ISA_LTEXBRCDAT:
915: case ISA_LTEXBRCDATA:
916: return token(XMLTokenId.TEXT);
917:
918: case ISI_CDATA:
919: case ISA_CDATA_BR:
920: case ISA_CDATA_BRBR:
921: return token(XMLTokenId.CDATA_SECTION);
922:
923: case ISA_INIT_BR:
924: return token(XMLTokenId.TEXT);
925:
926: default:
927: throw new IllegalStateException(
928: "Last buffer does not handle state " + state + "!"); //NOI18N
929: }
930:
931: }
932:
933: private Token<XMLTokenId> token(XMLTokenId id) {
934: // System.out.print("--- token(" + id + "; '" + input.readText().toString() + "')");
935: // if(input.readLength() == 0) {
936: // System.out.println("XMLLexer error - zero length token!");
937: // }
938: Token<XMLTokenId> t = tokenFactory.createToken(id);
939: // System.out.println(t.id() + "; " + t.length());
940: return t;
941: }
942:
943: private boolean isHex(int ch) {
944: return (ch >= '0' && ch <= '9') || isAF(ch);
945: }
946:
947: private boolean isAF(int ch) {
948: return ((ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'));
949: }
950:
951: public void release() {
952: }
953:
954: }
|