001: /*
002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
003: *
004: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
005: *
006: * The contents of this file are subject to the terms of either the GNU
007: * General Public License Version 2 only ("GPL") or the Common
008: * Development and Distribution License("CDDL") (collectively, the
009: * "License"). You may not use this file except in compliance with the
010: * License. You can obtain a copy of the License at
011: * http://www.netbeans.org/cddl-gplv2.html
012: * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
013: * specific language governing permissions and limitations under the
014: * License. When distributing the software, include this License Header
015: * Notice in each file and include the License file at
016: * nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
017: * particular file as subject to the "Classpath" exception as provided
018: * by Sun in the GPL Version 2 section of the License file that
019: * accompanied this code. If applicable, add the following below the
020: * License Header, with the fields enclosed by brackets [] replaced by
021: * your own identifying information:
022: * "Portions Copyrighted [year] [name of copyright owner]"
023: *
024: * Contributor(s):
025: *
026: * The Original Software is NetBeans. The Initial Developer of the Original
027: * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
028: * Microsystems, Inc. All Rights Reserved.
029: *
030: * If you wish your version of this file to be governed by only the CDDL
031: * or only the GPL Version 2, indicate your decision by adding
032: * "[Contributor] elects to include this software in this distribution
033: * under the [CDDL or GPL Version 2] license." If you do not indicate a
034: * single choice of license, a recipient has the option to distribute
035: * your version of this file under either the CDDL, the GPL Version 2 or
036: * to extend the choice of license to its licensees as provided above.
037: * However, if you add GPL Version 2 code and therefore, elected the GPL
038: * Version 2 license, then the option applies only if the new code is
039: * made subject to such option by the copyright holder.
040: */
041: package org.netbeans.modules.cnd.lexer;
042:
043: import org.netbeans.cnd.api.lexer.CndLexerUtilities;
044: import org.netbeans.cnd.api.lexer.CppTokenId;
045: import org.netbeans.api.lexer.PartType;
046: import org.netbeans.api.lexer.Token;
047: import org.netbeans.spi.lexer.Lexer;
048: import org.netbeans.spi.lexer.LexerInput;
049: import org.netbeans.spi.lexer.LexerRestartInfo;
050: import org.netbeans.spi.lexer.TokenFactory;
051:
052: /**
053: * Base of Lexical analyzers for C/C++ languages.
054: * <br/>
055: * It handles escaped lines and delegate identifier to keyword recognition
056: * to language-flavor specific filter
057: *
058: * @author Miloslav Metelka
059: * @version 1.00
060: */
061: public abstract class CndLexer implements Lexer<CppTokenId> {
062:
063: protected static final int EOF = LexerInput.EOF;
064: private final LexerInput input;
065: private final TokenFactory<CppTokenId> tokenFactory;
066: private int escapedEatenChars;
067: private int tokenSplittedByEscapedLine;
068: private int lastTokenEndedByEscapedLine;
069:
070: protected CndLexer(LexerRestartInfo<CppTokenId> info) {
071: this .input = info.input();
072: this .tokenFactory = info.tokenFactory();
073: }
074:
075: public Object state() {
076: return null; // always in default state
077: }
078:
079: protected final void backup(int n) {
080: input.backup(n + escapedEatenChars);
081: lastTokenEndedByEscapedLine = escapedEatenChars;
082: tokenSplittedByEscapedLine -= escapedEatenChars;
083: }
084:
085: @SuppressWarnings("fallthrough")
086: protected final int read(boolean skipEscapedLF) {
087: int c = input.read();
088: escapedEatenChars = 0;
089: if (skipEscapedLF) { // skip escaped LF
090: int next;
091: while (c == '\\') {
092: escapedEatenChars++;
093: switch (next = input.read()) {
094: case '\r':
095: if (consumeNewline()) {
096: escapedEatenChars++;
097: }
098: // nobreak
099: case '\n':
100: escapedEatenChars++;
101: next = input.read();
102: break;
103: default:
104: input.backup(1);
105: escapedEatenChars--;
106: assert c == '\\' : "must be backslash " + (char) c;
107: tokenSplittedByEscapedLine += escapedEatenChars;
108: return c; // normal backslash, not escaped LF
109: }
110: c = next;
111: }
112: tokenSplittedByEscapedLine += escapedEatenChars;
113: }
114: return c;
115: }
116:
117: protected final boolean consumeNewline() {
118: return input.consumeNewline();
119: }
120:
121: @SuppressWarnings("fallthrough")
122: public Token<CppTokenId> nextToken() {
123: while (true) {
124: // special handling for escaped lines
125: if (lastTokenEndedByEscapedLine > 0) {
126: int c = read(false);
127: lastTokenEndedByEscapedLine--;
128: assert c == '\\' : "there must be \\";
129: c = read(false);
130: assert c == '\n' || c == '\r' : "there must be \r or \n";
131: if (c == '\r') {
132: lastTokenEndedByEscapedLine--;
133: if (input.consumeNewline()) {
134: lastTokenEndedByEscapedLine--;
135: }
136: return token(CppTokenId.ESCAPED_LINE);
137: } else {
138: lastTokenEndedByEscapedLine--;
139: return token(CppTokenId.ESCAPED_LINE, "\\\n",
140: PartType.COMPLETE); // NOI18N
141: }
142: } else {
143: int c = read(true);
144: // if read of the first char caused skipping escaped line
145: // do we need to backup and create escaped lines first?
146: switch (c) {
147: case '"': {
148: Token<CppTokenId> out = finishDblQuote();
149: assert out != null : "not handled dobule quote";
150: return out;
151: }
152: case '\'': // char literal
153: while (true) {
154: switch (read(true)) {
155: case '\'': // NOI18N
156: return token(CppTokenId.CHAR_LITERAL);
157: case '\\':
158: read(false); // read escaped char
159: break;
160: case '\r':
161: // consumeNewline();
162: case '\n':
163: backup(1); // leave new line for the own token
164: case EOF:
165: return tokenPart(CppTokenId.CHAR_LITERAL,
166: PartType.START);
167: }
168: }
169:
170: case '#': {
171: Token<CppTokenId> out = finishSharp();
172: assert out != null : "not handled #";
173: return out;
174: }
175:
176: case '/':
177: switch (read(true)) {
178: case '/': // in single-line comment
179: while (true) {
180: switch (read(true)) {
181: case '\r':
182: case '\n':
183: case EOF:
184: backup(1);
185: return token(CppTokenId.LINE_COMMENT);
186: }
187: }
188: case '=': // found /=
189: return token(CppTokenId.SLASHEQ);
190: case '*': // in multi-line or doxygen comment
191: {
192: Token<CppTokenId> out = finishComment(true);
193: assert out != null : "not handled /*";
194: return out;
195: }
196: } // end of switch()
197: backup(1);
198: return token(CppTokenId.SLASH);
199:
200: case '=':
201: if (read(true) == '=') {
202: return token(CppTokenId.EQEQ);
203: }
204: backup(1);
205: return token(CppTokenId.EQ);
206:
207: case '>':
208: switch (read(true)) {
209: case '>': // >>
210: if (read(true) == '=') {
211: return token(CppTokenId.GTGTEQ);
212: }
213: backup(1);
214: return token(CppTokenId.GTGT);
215: case '=': // >=
216: return token(CppTokenId.GTEQ);
217: }
218: backup(1);
219: return token(CppTokenId.GT);
220:
221: case '<': {
222: Token<CppTokenId> out = finishLT();
223: assert out != null : "not handled '<'";
224: return out;
225: }
226:
227: case '+':
228: switch (read(true)) {
229: case '+':
230: return token(CppTokenId.PLUSPLUS);
231: case '=':
232: return token(CppTokenId.PLUSEQ);
233: }
234: backup(1);
235: return token(CppTokenId.PLUS);
236:
237: case '-':
238: switch (read(true)) {
239: case '-':
240: return token(CppTokenId.MINUSMINUS);
241: case '>':
242: if (read(true) == '*') {
243: return token(CppTokenId.ARROWMBR);
244: }
245: backup(1);
246: return token(CppTokenId.ARROW);
247: case '=':
248: return token(CppTokenId.MINUSEQ);
249: }
250: backup(1);
251: return token(CppTokenId.MINUS);
252:
253: case '*':
254: switch (read(true)) {
255: case '/': // invalid comment end - */ or int*/* */
256: if (read(true) == '*') {
257: backup(2);
258: return token(CppTokenId.STAR);
259: }
260: backup(1);
261: return token(CppTokenId.INVALID_COMMENT_END);
262: case '=':
263: return token(CppTokenId.STAREQ);
264: }
265: backup(1);
266: return token(CppTokenId.STAR);
267:
268: case '|':
269: switch (read(true)) {
270: case '|':
271: return token(CppTokenId.BARBAR);
272: case '=':
273: return token(CppTokenId.BAREQ);
274: }
275: backup(1);
276: return token(CppTokenId.BAR);
277:
278: case '&':
279: switch (read(true)) {
280: case '&':
281: return token(CppTokenId.AMPAMP);
282: case '=':
283: return token(CppTokenId.AMPEQ);
284: }
285: backup(1);
286: return token(CppTokenId.AMP);
287:
288: case '%':
289: if (read(true) == '=') {
290: return token(CppTokenId.PERCENTEQ);
291: }
292: backup(1);
293: return token(CppTokenId.PERCENT);
294:
295: case '^':
296: if (read(true) == '=') {
297: return token(CppTokenId.CARETEQ);
298: }
299: backup(1);
300: return token(CppTokenId.CARET);
301:
302: case '!':
303: if (read(true) == '=') {
304: return token(CppTokenId.NOTEQ);
305: }
306: backup(1);
307: return token(CppTokenId.NOT);
308:
309: case '.':
310: if ((c = read(true)) == '.') {
311: if (read(true) == '.') { // ellipsis ...
312: return token(CppTokenId.ELLIPSIS);
313: } else {
314: input.backup(2);
315: }
316: } else if ('0' <= c && c <= '9') { // float literal
317: return finishNumberLiteral(read(true), true);
318: } else if (c == '*') {
319: return token(CppTokenId.DOTMBR);
320: } else {
321: backup(1);
322: }
323: return token(CppTokenId.DOT);
324:
325: case ':':
326: if (read(true) == ':') {
327: return token(CppTokenId.SCOPE);
328: }
329: backup(1);
330: return token(CppTokenId.COLON);
331:
332: case '~':
333: return token(CppTokenId.TILDE);
334: case ',':
335: return token(CppTokenId.COMMA);
336: case ';':
337: return token(CppTokenId.SEMICOLON);
338:
339: case '?':
340: return token(CppTokenId.QUESTION);
341: case '(':
342: return token(CppTokenId.LPAREN);
343: case ')':
344: return token(CppTokenId.RPAREN);
345: case '[':
346: return token(CppTokenId.LBRACKET);
347: case ']':
348: return token(CppTokenId.RBRACKET);
349: case '{':
350: return token(CppTokenId.LBRACE);
351: case '}':
352: return token(CppTokenId.RBRACE);
353: case '@':
354: return token(CppTokenId.AT);
355:
356: case '0': // in a number literal
357: c = read(true);
358: if (c == 'x' || c == 'X') { // in hexadecimal (possibly floating-point) literal
359: boolean inFraction = false;
360: while (true) {
361: switch (read(true)) {
362: case '0':
363: case '1':
364: case '2':
365: case '3':
366: case '4':
367: case '5':
368: case '6':
369: case '7':
370: case '8':
371: case '9':
372: case 'a':
373: case 'b':
374: case 'c':
375: case 'd':
376: case 'e':
377: case 'f':
378: case 'A':
379: case 'B':
380: case 'C':
381: case 'D':
382: case 'E':
383: case 'F':
384: break;
385: case '.': // hex float literal
386: if (!inFraction) {
387: inFraction = true;
388: } else { // two dots in the float literal
389: return token(CppTokenId.FLOAT_LITERAL_INVALID);
390: }
391: break;
392: case 'p':
393: case 'P': // binary exponent
394: return finishFloatExponent();
395: default:
396: backup(1);
397: // if float then before mandatory binary exponent => invalid
398: return token(inFraction ? CppTokenId.FLOAT_LITERAL_INVALID
399: : CppTokenId.INT_LITERAL);
400: }
401: } // end of while(true)
402: }
403: return finishNumberLiteral(c, false);
404:
405: case '1':
406: case '2':
407: case '3':
408: case '4':
409: case '5':
410: case '6':
411: case '7':
412: case '8':
413: case '9':
414: return finishNumberLiteral(read(true), false);
415: case '\\':
416: return token(CppTokenId.BACK_SLASH);
417: case '$':
418: return token(CppTokenId.DOLLAR);
419:
420: case '\r':
421: consumeNewline();
422: return token(CppTokenId.NEW_LINE);
423: case '\n':
424: return token(CppTokenId.NEW_LINE, "\n",
425: PartType.COMPLETE); // NOI18N
426: // All Character.isWhitespace(c) below 0x80 follow
427: // ['\t' - '\f'] and [0x1c - ' ']
428: case '\t':
429: case 0x0b:
430: case '\f':
431: case 0x1c:
432: case 0x1d:
433: case 0x1e:
434: case 0x1f:
435: return finishWhitespace();
436: case ' ':
437: c = read(true);
438: if (c == EOF || !Character.isWhitespace(c)
439: || c == '\n' || c == '\r') { // Return single space as flyweight token
440: backup(1);
441: return token(CppTokenId.WHITESPACE, " ",
442: PartType.COMPLETE); // NOI18N
443:
444: }
445: return finishWhitespace();
446:
447: case EOF:
448: return null;
449:
450: default:
451: c = translateSurrogates(c);
452: if (CndLexerUtilities.isCppIdentifierStart(c)) {
453: return keywordOrIdentifier(c);
454: }
455: if (Character.isWhitespace(c)) {
456: return finishWhitespace();
457: }
458:
459: // Invalid char
460: return token(CppTokenId.ERROR);
461: }
462: } // end of switch (c)
463: } // end of while(true)
464: }
465:
466: protected abstract CppTokenId getKeywordOrIdentifierID(
467: CharSequence text);
468:
469: protected final Token<CppTokenId> finishComment(boolean createToken) {
470: int c = read(true);
471: if (c == '*') { // either doxygen comment or empty multi-line comment /**/
472: c = read(true);
473: if (c == '/') {
474: return !createToken ? null
475: : token(CppTokenId.BLOCK_COMMENT);
476: }
477: while (true) { // in doxygen comment
478: while (c == '*') {
479: c = read(true);
480: if (c == '/') {
481: return !createToken ? null
482: : token(CppTokenId.DOXYGEN_COMMENT);
483: } else if (c == EOF) {
484: return !createToken ? null : tokenPart(
485: CppTokenId.DOXYGEN_COMMENT,
486: PartType.START);
487: }
488: }
489: if (c == EOF) {
490: return !createToken ? null : tokenPart(
491: CppTokenId.DOXYGEN_COMMENT, PartType.START);
492: }
493: c = read(true);
494: }
495: } else { // in multi-line comment (and not after '*')
496: while (true) {
497: c = read(true);
498: while (c == '*') {
499: c = read(true);
500: if (c == '/') {
501: return !createToken ? null
502: : token(CppTokenId.BLOCK_COMMENT);
503: } else if (c == EOF) {
504: return !createToken ? null : tokenPart(
505: CppTokenId.BLOCK_COMMENT,
506: PartType.START);
507: }
508: }
509: if (c == EOF) {
510: return !createToken ? null : tokenPart(
511: CppTokenId.BLOCK_COMMENT, PartType.START);
512: }
513: }
514: }
515: }
516:
517: private int translateSurrogates(int c) {
518: if (Character.isHighSurrogate((char) c)) {
519: int lowSurr = read(true);
520: if (lowSurr != EOF
521: && Character.isLowSurrogate((char) lowSurr)) {
522: // c and lowSurr form the integer unicode char.
523: c = Character.toCodePoint((char) c, (char) lowSurr);
524: } else {
525: // Otherwise it's error: Low surrogate does not follow the high one.
526: // Leave the original character unchanged.
527: // As the surrogates do not belong to any
528: // specific unicode category the lexer should finally
529: // categorize them as a lexical error.
530: backup(1);
531: }
532: }
533: return c;
534: }
535:
536: private Token<CppTokenId> finishWhitespace() {
537: while (true) {
538: int c = read(true);
539: // There should be no surrogates possible for whitespace
540: // so do not call translateSurrogates()
541: if (c == EOF || !Character.isWhitespace(c) || c == '\n'
542: || c == '\r') {
543: backup(1);
544: return token(CppTokenId.WHITESPACE);
545: }
546: }
547: }
548:
549: private Token<CppTokenId> keywordOrIdentifier(int c) {
550: StringBuilder idText = new StringBuilder();
551: idText.append((char) c);
552: while (true) {
553: c = read(true);
554: if (c == EOF
555: || !CndLexerUtilities
556: .isCppIdentifierPart(c = translateSurrogates(c))) {
557: // For surrogate 2 chars must be backed up
558: backup((c >= Character.MIN_SUPPLEMENTARY_CODE_POINT) ? 2
559: : 1);
560: CppTokenId id = getKeywordOrIdentifierID(idText
561: .toString());
562: assert id != null : "must be valid id for " + idText;
563: return token(id);
564: } else {
565: idText.append((char) c);
566: }
567: }
568: }
569:
570: private Token<CppTokenId> finishNumberLiteral(int c,
571: boolean inFraction) {
572: while (true) {
573: switch (c) {
574: case '.':
575: if (!inFraction) {
576: inFraction = true;
577: } else { // two dots in the literal
578: return token(CppTokenId.FLOAT_LITERAL_INVALID);
579: }
580: break;
581: case 'l':
582: case 'L': // 0l or 0L
583: return token(CppTokenId.LONG_LITERAL);
584: // case 'd':
585: // case 'D':
586: // return token(CppTokenId.DOUBLE_LITERAL);
587: case 'f':
588: case 'F':
589: return token(CppTokenId.FLOAT_LITERAL);
590: case 'u':
591: case 'U':
592: return token(CppTokenId.UNSIGNED_LITERAL);
593: case '0':
594: case '1':
595: case '2':
596: case '3':
597: case '4':
598: case '5':
599: case '6':
600: case '7':
601: case '8':
602: case '9':
603: break;
604: case 'e':
605: case 'E': // exponent part
606: return finishFloatExponent();
607: default:
608: backup(1);
609: return token(inFraction ? CppTokenId.DOUBLE_LITERAL
610: : CppTokenId.INT_LITERAL);
611: }
612: c = read(true);
613: }
614: }
615:
616: private Token<CppTokenId> finishFloatExponent() {
617: int c = read(true);
618: if (c == '+' || c == '-') {
619: c = read(true);
620: }
621: if (c < '0' || '9' < c) {
622: return token(CppTokenId.FLOAT_LITERAL_INVALID);
623: }
624: do {
625: c = read(true);
626: } while ('0' <= c && c <= '9'); // reading exponent
627: switch (c) {
628: // case 'd':
629: // case 'D':
630: // return token(CppTokenId.DOUBLE_LITERAL);
631: case 'f':
632: case 'F':
633: return token(CppTokenId.FLOAT_LITERAL);
634: default:
635: backup(1);
636: return token(CppTokenId.DOUBLE_LITERAL);
637: }
638: }
639:
640: protected final Token<CppTokenId> token(CppTokenId id) {
641: return token(id, id.fixedText(), PartType.COMPLETE);
642: }
643:
644: protected final Token<CppTokenId> tokenPart(CppTokenId id,
645: PartType part) {
646: return token(id, null, part);
647: }
648:
649: private Token<CppTokenId> token(CppTokenId id, String fixedText,
650: PartType part) {
651: assert id != null : "id must be not null";
652: Token<CppTokenId> token = null;
653: if (fixedText != null && !isTokenSplittedByEscapedLine()) {
654: // create flyweight token
655: token = tokenFactory.getFlyweightToken(id, fixedText);
656: } else {
657: if (part != PartType.COMPLETE) {
658: token = tokenFactory.createToken(id,
659: input.readLength(), part);
660: } else {
661: token = tokenFactory.createToken(id);
662: }
663: }
664: tokenSplittedByEscapedLine = 0;
665: escapedEatenChars = 0;
666: assert token != null : "token must be created as result for "
667: + id;
668: postTokenCreate(id);
669: return token;
670: }
671:
672: protected Token<CppTokenId> finishSharp() {
673: if (read(true) == '#') {
674: return token(CppTokenId.DBL_SHARP);
675: }
676: backup(1);
677: return token(CppTokenId.SHARP);
678: }
679:
680: @SuppressWarnings("fallthrough")
681: protected Token<CppTokenId> finishDblQuote() {
682: while (true) { // string literal
683: switch (read(true)) {
684: case '"': // NOI18N
685: return token(CppTokenId.STRING_LITERAL);
686: case '\\':
687: read(false); // read escaped char
688: break;
689: case '\r':
690: // consumeNewline();
691: case '\n':
692: backup(1); // leave new line for the own token
693: case EOF:
694: return tokenPart(CppTokenId.STRING_LITERAL,
695: PartType.START);
696: }
697: }
698: }
699:
700: protected Token<CppTokenId> finishLT() {
701: switch (read(true)) {
702: case '<': // after <<
703: if (read(true) == '=') {
704: return token(CppTokenId.LTLTEQ);
705: }
706: backup(1);
707: return token(CppTokenId.LTLT);
708: case '=': // <=
709: return token(CppTokenId.LTEQ);
710: }
711: backup(1);
712: return token(CppTokenId.LT);
713: }
714:
715: protected void postTokenCreate(CppTokenId id) {
716:
717: }
718:
719: public void release() {
720: }
721:
722: private final boolean isTokenSplittedByEscapedLine() {
723: return tokenSplittedByEscapedLine > 0;
724: }
725: }
|