Source Code Cross Referenced for Tokenizer.java in » XML » XPath-Saxon » net » sf » saxon » expr » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » XML » XPath Saxon » net.sf.saxon.expr
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        package net.sf.saxon.expr;
0002:
0003:        import net.sf.saxon.functions.NormalizeSpace;
0004:        import net.sf.saxon.trans.StaticError;
0005:
0006:        import java.util.ArrayList;
0007:        import java.util.List;
0008:
0009:        /**
0010:         * Tokenizer for expressions and inputs.
0011:         *
0012:         * This code was originally derived from James Clark's xt, though it has been greatly modified since.
0013:         * See copyright notice at end of file.
0014:         */
0015:
0016:        public final class Tokenizer {
0017:
0018:            public int getState() {
0019:                return this.state; // the lexical state currently in force
0020:            }
0021:
0022:            public void setState(int state) {
0023:                this.state = state;
0024:                // Only the DEFAULT and OPERATOR states re-seed currentToken and
0025:                // precedingToken; every other state just records the new value.
0026:                if (state != DEFAULT_STATE && state != OPERATOR_STATE) {
0027:                    return;
0028:                }
0029:                // seeding with UNKNOWN forces the followsOperator() test to return true
0030:                final int seed = (state == DEFAULT_STATE) ? Token.UNKNOWN : Token.RPAR;
0031:                currentToken = precedingToken = seed;
0032:            }
0033:
0034:            private int state = DEFAULT_STATE;
0035:            // Current lexical state (compared against the ..._STATE constants below); we may need to make this a stack at some time
0036:
0037:            /**
0038:             * Initial default state of the Tokenizer; lookAhead() also reverts to it after reading ';'
0039:             */
0040:            public static final int DEFAULT_STATE = 0;
0041:
0042:            /**
0043:             * State in which a name is NOT to be merged with what comes next, for example "(": next() returns the NAME token unrefined
0044:             */
0045:            public static final int BARE_NAME_STATE = 1;
0046:
0047:            /**
0048:             * State in which the next thing to be read is a SequenceType; next() then does not merge a name with a following '{'
0049:             */
0050:            public static final int SEQUENCE_TYPE_STATE = 2;
0051:            /**
0052:             * State in which the next thing to be read is an operator; setState() seeds the current and preceding tokens with RPAR
0053:             */
0054:
0055:            public static final int OPERATOR_STATE = 3;
0056:
0057:            /**
0058:             * The starting line number (for XPath in XSLT, the line number in the stylesheet), as passed to tokenize()
0059:             */
0060:            public int startLineNumber;
0061:            /**
0062:             * The number identifying the most recently read token (a constant from the Token class)
0063:             */
0064:            public int currentToken = Token.EOF;
0065:            /**
0066:             * The string value of the most recently read token; next() replaces a missing value by the empty string
0067:             */
0068:            public String currentTokenValue = null;
0069:            /**
0070:             * The position in the input expression where the current token starts
0071:             */
0072:            public int currentTokenStartOffset = 0;
0073:            /**
0074:             * The number of the next token to be returned (the one-token look-ahead filled in by lookAhead())
0075:             */
0076:            private int nextToken = Token.EOF;
0077:            /**
0078:             * The string value of the next token to be returned; lookAhead() resets it to null before scanning
0079:             */
0080:            private String nextTokenValue = null;
0081:            /**
0082:             * The position in the expression of the start of the next token
0083:             */
0084:            private int nextTokenStartOffset = 0;
0085:            /**
0086:             * The string being parsed
0087:             */
0088:            public String input;
0089:            /**
0090:             * The current position within the input string (offset of the next character to be read)
0091:             */
0092:            public int inputOffset = 0;
0093:            /**
0094:             * The offset at which scanning stops: the length of the input string, or the end argument of tokenize() when that is not -1
0095:             */
0096:            private int inputLength;
0097:            /**
0098:             * The line number of the current token; tokenize() starts it at the supplied line number and next() copies it from nextLineNumber
0099:             */
0100:            private int lineNumber = 1;
0101:            /**
0102:             * The line number of the next token; tokenize() starts it at the supplied line number
0103:             */
0104:            private int nextLineNumber = 1;
0105:
0106:            /**
0107:             * List containing the positions (offsets in the input string) at which newline characters
0108:             * occur. Initially null. NOTE(review): raw List - element type not visible here, presumably Integer; confirm
0109:             */
0110:
0111:            private List newlineOffsets = null;
0112:
0113:            /**
0114:             * The token number of the token that preceded the current token (lookAhead() also overwrites it with the previous look-ahead token)
0115:             */
0116:            private int precedingToken = Token.UNKNOWN;
0117:
0118:            //public boolean recognizePragmas = false;
0119:            //public String lastPragma = null;
0120:
0121:            //
0122:            // Lexical analyser for expressions, queries, and XSLT patterns
0123:            //
0124:
0125:            /**
0126:             * Prepare a string for tokenization.
0127:             * The actual tokens are obtained by calls on next()
0128:             *
0129:             * @param input the string to be tokenized
0130:             * @param start start point within the string
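0131:             * @param end end point within the string (last character not read): -1 means end of string
0132:             * @param lineNumber the line number at which the string starts; line counting continues from this value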
0133:             * @exception net.sf.saxon.trans.StaticError if a lexical error occurs, e.g. unmatched
0134:             *     string quotes
0135:             */
0136:            public void tokenize(String input, int start, int end,
0137:                    int lineNumber) throws StaticError {
0138:                // Bind the text to be scanned and put the cursor on its first character.
0139:                this.input = input;
0140:                inputOffset = start;
0141:
0142:                // Empty the look-ahead slot.
0143:                nextToken = Token.EOF;
0144:                nextTokenValue = null;
0145:                nextTokenStartOffset = 0;
0146:
0147:                // All line counters start from the number supplied by the caller.
0148:                startLineNumber = lineNumber;
0149:                this.lineNumber = lineNumber;
0150:                nextLineNumber = lineNumber;
0151:
0152:                // An end of -1 means "scan as far as the end of the string".
0153:                inputLength = (end == -1) ? input.length() : end;
0154:
0155:                // The tokenizer works one token ahead: lookAhead() performs the raw lexical
0156:                // analysis, while next() refines names into operators, axes and functions by
0157:                // looking at the following token, and combines compound tokens such as "instance of".
0158:                lookAhead();
0159:                next();
0160:            }
0161:
0162:            //diagnostic version of next(): change real version to realnext()
0163:            //
0164:            //public void next() throws XPathException {
0165:            //    realnext();
0166:            //    System.err.println("Token: " + currentToken + "[" + tokens[currentToken] + "]");
0167:            //}
0168:
0169:            /**
0170:             * Get the next token from the input expression. The type of token is returned in the
0171:             * currentToken variable, the string value of the token in currentTokenValue.
0172:             *
0173:             * @exception net.sf.saxon.trans.StaticError if a lexical error is detected
0174:             */
0175:
0176:            public void next() throws StaticError {
0177:                precedingToken = currentToken; // shift the window: current -> preceding, look-ahead -> current
0178:                currentToken = nextToken;
0179:                currentTokenValue = nextTokenValue;
0180:                if (currentTokenValue == null) {
0181:                    currentTokenValue = ""; // a missing value is normalised to the empty string
0182:                }
0183:                currentTokenStartOffset = nextTokenStartOffset;
0184:                lineNumber = nextLineNumber;
0185:
0186:                // disambiguate the current token according to whether it follows an operator
0187:
0188:                switch (currentToken) {
0189:                case Token.NAME:
0190:                    int optype = getBinaryOp(currentTokenValue);
0191:                    if (optype != Token.UNKNOWN && !followsOperator()) {
0192:                        currentToken = optype;
0193:                    }
0194:                    break;
0195:                case Token.LT:
0196:                    if (followsOperator()) {
0197:                        currentToken = Token.TAG;
0198:                    }
0199:                    break;
0200:                case Token.STAR:
0201:                    if (!followsOperator()) {
0202:                        currentToken = Token.MULT;
0203:                    }
0204:                    break;
0205:                }
0206:
0207:                if (currentToken == Token.TAG || currentToken == Token.RCURLY) {
0208:                    // No lookahead after encountering "<" at the start of an XML-like tag.
0209:                    // After an RCURLY, the parser must do an explicit lookahead() to continue
0210:                    // tokenizing; otherwise it can continue with direct character reading
0211:                    return;
0212:                }
0213:
0214:                lookAhead();
0215:
0216:                if (currentToken == Token.NAME) {
0217:                    if (state == BARE_NAME_STATE) {
0218:                        return; // in this state a name is returned as-is, whatever follows it
0219:                    }
0220:                    switch (nextToken) {
0221:                    case Token.LPAR:
0222:                        int op = getBinaryOp(currentTokenValue);
0223:                        if (op == Token.UNKNOWN) {
0224:                            currentToken = getFunctionType(currentTokenValue);
0225:                            lookAhead(); // swallow the "("
0226:                        } else {
0227:                            currentToken = op;
0228:                        }
0229:                        break;
0230:
0231:                    case Token.LCURLY:
0232:                        if (!(state == SEQUENCE_TYPE_STATE)) {
0233:                            currentToken = Token.KEYWORD_CURLY;
0234:                            lookAhead(); // swallow the "{"
0235:                        }
0236:                        break;
0237:
0238:                    case Token.COLONCOLON:
0239:                        lookAhead();
0240:                        currentToken = Token.AXIS;
0241:                        break;
0242:
0243:                    case Token.COLONSTAR:
0244:                        lookAhead();
0245:                        currentToken = Token.PREFIX;
0246:                        break;
0247:                    // keywords are compared by value: == would only match if the name strings were interned
0248:                    case Token.DOLLAR:
0249:                        if (currentTokenValue.equals("for")) {
0250:                            currentToken = Token.FOR;
0251:                        } else if (currentTokenValue.equals("some")) {
0252:                            currentToken = Token.SOME;
0253:                        } else if (currentTokenValue.equals("every")) {
0254:                            currentToken = Token.EVERY;
0255:                        } else if (currentTokenValue.equals("let")) {
0256:                            currentToken = Token.LET;
0257:                        }
0258:                        break;
0259:
0260:                    case Token.NAME:
0261:                        int candidate = -1;
0262:                        if (currentTokenValue.equals("element")) {
0263:                            candidate = Token.ELEMENT_QNAME;
0264:                        } else if (currentTokenValue.equals("attribute")) {
0265:                            candidate = Token.ATTRIBUTE_QNAME;
0266:                        } else if (currentTokenValue
0267:                                .equals("processing-instruction")) {
0268:                            candidate = Token.PI_QNAME;
0269:                        }
0270:                        if (candidate != -1) {
0271:                            // <'element' QName '{'> constructor
0272:                            // <'attribute' QName '{'> constructor
0273:                            // <'processing-instruction' QName '{'> constructor
0274:
0275:                            String qname = nextTokenValue;
0276:                            String saveTokenValue = currentTokenValue;
0277:                            int savePosition = inputOffset;
0278:                            lookAhead();
0279:                            if (nextToken == Token.LCURLY) {
0280:                                currentToken = candidate;
0281:                                currentTokenValue = qname;
0282:                                lookAhead();
0283:                                return;
0284:                            } else {
0285:                                // backtrack (we don't have 2-token lookahead; this is the
0286:                                // only case where it's needed. So we backtrack instead.)
0287:                                currentToken = Token.NAME;
0288:                                currentTokenValue = saveTokenValue;
0289:                                inputOffset = savePosition;
0290:                                nextToken = Token.NAME;
0291:                                nextTokenValue = qname;
0292:                            }
0293:                            // NOTE(review): the backtrack does not restore nextTokenStartOffset - confirm this is harmless
0294:                        }
0295:                        String composite = currentTokenValue + ' '
0296:                                + nextTokenValue;
0297:                        Integer val = (Integer) Token.doubleKeywords
0298:                                .get(composite);
0299:                        if (val == null) {
0300:                            break;
0301:                        } else {
0302:                            currentToken = val.intValue();
0303:                            currentTokenValue = composite;
0304:                            lookAhead();
0305:                            return;
0306:                        }
0307:                    default:
0308:                        // no action needed
0309:                    }
0310:                }
0311:            }
0312:
0313:            /**
0314:             * Force the current token to be treated as an operator if possible
0315:             */
0316:
0317:            public void treatCurrentAsOperator() {
0318:                if (currentToken == Token.STAR) {
0319:                    // a STAR token read as an operator becomes MULT
0320:                    currentToken = Token.MULT;
0321:                } else if (currentToken == Token.NAME) {
0322:                    // a name is reclassified only when getBinaryOp() recognises it
0323:                    final int operator = getBinaryOp(currentTokenValue);
0324:                    if (operator != Token.UNKNOWN) {
0325:                        currentToken = operator;
0326:                    }
0327:                }
0328:                // any other token is left exactly as it was
0329:            }
0330:
0331:            /**
0332:             * Look ahead by one token. This method does the real tokenization work.
0333:             * The method is normally called internally, but the XQuery parser also
0334:             * calls it to resume normal tokenization after dealing with pseudo-XML
0335:             * syntax.
0336:             * @exception net.sf.saxon.trans.StaticError if a lexical error occurs
0337:             */
0338:            public void lookAhead() throws StaticError {
0339:                precedingToken = nextToken;
0340:                nextTokenValue = null;
0341:                nextTokenStartOffset = inputOffset;
0342:                for (;;) {
0343:                    if (inputOffset >= inputLength) {
0344:                        nextToken = Token.EOF;
0345:                        return;
0346:                    }
0347:                    char c = input.charAt(inputOffset++);
0348:                    switch (c) {
0349:                    case '/':
0350:                        if (inputOffset < inputLength
0351:                                && input.charAt(inputOffset) == '/') {
0352:                            inputOffset++;
0353:                            nextToken = Token.SLSL;
0354:                            return;
0355:                        }
0356:                        nextToken = Token.SLASH;
0357:                        return;
0358:                    case ':':
0359:                        if (inputOffset < inputLength) {
0360:                            if (input.charAt(inputOffset) == ':') {
0361:                                inputOffset++;
0362:                                nextToken = Token.COLONCOLON;
0363:                                return;
0364:                            } else if (input.charAt(inputOffset) == '=') {
0365:                                nextToken = Token.ASSIGN;
0366:                                inputOffset++;
0367:                                return;
0368:                            }
0369:                        }
0370:                        throw new StaticError(
0371:                                "Unexpected colon at start of token");
0372:                    case '@':
0373:                        nextToken = Token.AT;
0374:                        return;
0375:                    case '?':
0376:                        nextToken = Token.QMARK;
0377:                        return;
0378:                    case '[':
0379:                        nextToken = Token.LSQB;
0380:                        return;
0381:                    case ']':
0382:                        nextToken = Token.RSQB;
0383:                        return;
0384:                    case '{':
0385:                        nextToken = Token.LCURLY;
0386:                        return;
0387:                    case '}':
0388:                        nextToken = Token.RCURLY;
0389:                        return;
0390:                    case ';':
0391:                        nextToken = Token.SEMICOLON;
0392:                        state = DEFAULT_STATE;
0393:                        return;
0394:                    case '(':
0395:                        if (inputOffset < inputLength
0396:                                && input.charAt(inputOffset) == '#') {
0397:                            inputOffset++;
0398:                            int pragmaStart = inputOffset;
0399:                            int nestingDepth = 1;
0400:                            while (nestingDepth > 0
0401:                                    && inputOffset < (inputLength - 1)) {
0402:                                if (input.charAt(inputOffset) == '\n') {
0403:                                    incrementLineNumber();
0404:                                } else if (input.charAt(inputOffset) == '#'
0405:                                        && input.charAt(inputOffset + 1) == ')') {
0406:                                    nestingDepth--;
0407:                                    inputOffset++;
0408:                                } else if (input.charAt(inputOffset) == '('
0409:                                        && input.charAt(inputOffset + 1) == '#') {
0410:                                    nestingDepth++;
0411:                                    inputOffset++;
0412:                                }
0413:                                inputOffset++;
0414:                            }
0415:                            if (nestingDepth > 0) {
0416:                                throw new StaticError("Unclosed XQuery pragma");
0417:                            }
0418:                            nextToken = Token.PRAGMA;
0419:                            nextTokenValue = input.substring(pragmaStart,
0420:                                    inputOffset - 2);
0421:                            return;
0422:                        }
0423:                        if (inputOffset < inputLength
0424:                                && input.charAt(inputOffset) == ':') {
0425:                            // XPath comment syntax is (: .... :)
0426:                            // Comments may be nested, and may now be empty
0427:                            inputOffset++;
0428:                            int nestingDepth = 1;
0429:                            while (nestingDepth > 0
0430:                                    && inputOffset < (inputLength - 1)) {
0431:                                if (input.charAt(inputOffset) == '\n') {
0432:                                    incrementLineNumber();
0433:                                } else if (input.charAt(inputOffset) == ':'
0434:                                        && input.charAt(inputOffset + 1) == ')') {
0435:                                    //                            if (input.charAt(inputOffset-2) == '(' &&
0436:                                    //                                    input.charAt(inputOffset-1) == ':') {
0437:                                    //                                throw new StaticError("Empty XPath comments are not allowed");
0438:                                    //                            }
0439:                                    nestingDepth--;
0440:                                    inputOffset++;
0441:                                } else if (input.charAt(inputOffset) == '('
0442:                                        && input.charAt(inputOffset + 1) == ':') {
0443:                                    nestingDepth++;
0444:                                    inputOffset++;
0445:                                }
0446:                                inputOffset++;
0447:                            }
0448:                            if (nestingDepth > 0) {
0449:                                throw new StaticError("Unclosed XPath comment");
0450:                            }
0451:                            lookAhead();
0452:                        } else {
0453:                            nextToken = Token.LPAR;
0454:                        }
0455:                        return;
0456:                    case ')':
0457:                        nextToken = Token.RPAR;
0458:                        return;
0459:                    case '+':
0460:                        nextToken = Token.PLUS;
0461:                        return;
0462:                    case '-':
0463:                        nextToken = Token.MINUS; // not detected if part of a name
0464:                        return;
0465:                    case '=':
0466:                        nextToken = Token.EQUALS;
0467:                        return;
0468:                    case '!':
0469:                        if (inputOffset < inputLength
0470:                                && input.charAt(inputOffset) == '=') {
0471:                            inputOffset++;
0472:                            nextToken = Token.NE;
0473:                            return;
0474:                        }
0475:                        throw new StaticError("'!' without '='");
0476:                    case '*':
0477:                        // disambiguation of MULT and STAR is now done later
0478:                        //if (followsOperator()) {
0479:                        if (inputOffset < inputLength
0480:                                && input.charAt(inputOffset) == ':') {
0481:                            inputOffset++;
0482:                            nextToken = Token.SUFFIX;
0483:                            // we leave the parser to get the following name as a separate
0484:                            // token, but first check there's no intervening white space
0485:                            if (inputOffset < inputLength) {
0486:                                char ahead = input.charAt(inputOffset);
0487:                                if (" \r\t\n".indexOf(ahead) >= 0) {
0488:                                    throw new StaticError(
0489:                                            "Whitespace is not allowed after '*:'");
0490:                                }
0491:                            }
0492:                            return;
0493:                        }
0494:                        nextToken = Token.STAR;
0495:                        //} else {
0496:                        //    nextToken = MULT;
0497:                        //}
0498:                        return;
0499:                    case ',':
0500:                        nextToken = Token.COMMA;
0501:                        return;
0502:                    case '$':
0503:                        nextToken = Token.DOLLAR;
0504:                        return;
0505:                    case '|':
0506:                        nextToken = Token.UNION;
0507:                        return;
0508:                    case '<':
0509:                        if (inputOffset < inputLength
0510:                                && input.charAt(inputOffset) == '=') {
0511:                            inputOffset++;
0512:                            nextToken = Token.LE;
0513:                            return;
0514:                        }
0515:                        if (inputOffset < inputLength
0516:                                && input.charAt(inputOffset) == '<') {
0517:                            inputOffset++;
0518:                            nextToken = Token.PRECEDES;
0519:                            return;
0520:                        }
0521:                        nextToken = Token.LT;
0522:                        return;
0523:                    case '>':
0524:                        if (inputOffset < inputLength
0525:                                && input.charAt(inputOffset) == '=') {
0526:                            inputOffset++;
0527:                            nextToken = Token.GE;
0528:                            return;
0529:                        }
0530:                        if (inputOffset < inputLength
0531:                                && input.charAt(inputOffset) == '>') {
0532:                            inputOffset++;
0533:                            nextToken = Token.FOLLOWS;
0534:                            return;
0535:                        }
0536:                        nextToken = Token.GT;
0537:                        return;
0538:                    case '.':
0539:                        if (inputOffset < inputLength
0540:                                && input.charAt(inputOffset) == '.') {
0541:                            inputOffset++;
0542:                            nextToken = Token.DOTDOT;
0543:                            return;
0544:                        }
0545:                        if (inputOffset == inputLength
0546:                                || input.charAt(inputOffset) < '0'
0547:                                || input.charAt(inputOffset) > '9') {
0548:                            nextToken = Token.DOT;
0549:                            return;
0550:                        }
0551:                        // otherwise drop through: we have a number starting with a decimal point
0552:                    case '0':
0553:                    case '1':
0554:                    case '2':
0555:                    case '3':
0556:                    case '4':
0557:                    case '5':
0558:                    case '6':
0559:                    case '7':
0560:                    case '8':
0561:                    case '9':
0562:                        // The logic here can return some tokens that are not legitimate numbers,
0563:                        // for example "23e" or "1.0e+". However, this will only happen if the XPath
0564:                        // expression as a whole is syntactically incorrect.
0565:                        // These errors will be caught by the numeric constructor.
0566:                        boolean allowE = true;
0567:                        boolean allowSign = false;
0568:                        boolean allowDot = true;
0569:                        boolean endOfNum = false;
0570:                        numloop: while (!endOfNum) {
0571:                            switch (c) {
0572:                            case '0':
0573:                            case '1':
0574:                            case '2':
0575:                            case '3':
0576:                            case '4':
0577:                            case '5':
0578:                            case '6':
0579:                            case '7':
0580:                            case '8':
0581:                            case '9':
0582:                                allowSign = false;
0583:                                break;
0584:                            case '.':
0585:                                if (allowDot) {
0586:                                    allowDot = false;
0587:                                    allowSign = false;
0588:                                } else {
0589:                                    inputOffset--;
0590:                                    break numloop;
0591:                                }
0592:                                break;
0593:                            case 'E':
0594:                            case 'e':
0595:                                if (allowE) {
0596:                                    allowSign = true;
0597:                                    allowE = false;
0598:                                } else {
0599:                                    inputOffset--;
0600:                                    break numloop;
0601:                                }
0602:                                break;
0603:                            case '+':
0604:                            case '-':
0605:                                if (allowSign) {
0606:                                    allowSign = false;
0607:                                } else {
0608:                                    inputOffset--;
0609:                                    break numloop;
0610:                                }
0611:                                break;
0612:                            default:
0613:                                if (('a' <= c && c <= 'z') || c > 127) {
0614:                                    // this prevents the famous "10div 3"
0615:                                    throw new StaticError(
0616:                                            "Separator needed after numeric literal");
0617:                                }
0618:                                inputOffset--;
0619:                                break numloop;
0620:                            }
0621:                            if (inputOffset >= inputLength)
0622:                                break;
0623:                            c = input.charAt(inputOffset++);
0624:                        }
0625:                        nextTokenValue = input.substring(nextTokenStartOffset,
0626:                                inputOffset);
0627:                        nextToken = Token.NUMBER;
0628:                        return;
0629:                    case '"':
0630:                    case '\'':
0631:                        nextTokenValue = "";
0632:                        while (true) {
0633:                            inputOffset = input.indexOf(c, inputOffset);
0634:                            if (inputOffset < 0) {
0635:                                inputOffset = nextTokenStartOffset + 1;
0636:                                throw new StaticError(
0637:                                        "Unmatched quote in expression");
0638:                            }
0639:                            nextTokenValue += input.substring(
0640:                                    nextTokenStartOffset + 1, inputOffset++);
0641:                            // look for doubled delimiters
0642:                            if (inputOffset < inputLength
0643:                                    && input.charAt(inputOffset) == c) {
0644:                                nextTokenValue += c;
0645:                                nextTokenStartOffset = inputOffset;
0646:                                inputOffset++;
0647:                            } else {
0648:                                break;
0649:                            }
0650:                        }
0651:
0652:                        // maintain line number if there are newlines in the string
0653:                        if (nextTokenValue.indexOf('\n') >= 0) {
0654:                            for (int i = 0; i < nextTokenValue.length(); i++) {
0655:                                if (nextTokenValue.charAt(i) == '\n') {
0656:                                    lineNumber++;
0657:                                    if (newlineOffsets == null) {
0658:                                        newlineOffsets = new ArrayList(20);
0659:                                    }
0660:                                    newlineOffsets.add(new Integer(
0661:                                            nextTokenStartOffset + i));
0662:                                }
0663:                            }
0664:                        }
0665:                        nextTokenValue = nextTokenValue.intern();
0666:                        nextToken = Token.STRING_LITERAL;
0667:                        return;
0668:                    case '\n':
0669:                        incrementLineNumber();
0670:                        // drop through
0671:                    case ' ':
0672:                    case '\t':
0673:                    case '\r':
0674:                        nextTokenStartOffset = inputOffset;
0675:                        break;
0676:                    default:
0677:                        if (c < 0x80 && !Character.isLetter(c)) {
0678:                            throw new StaticError("Invalid character '" + c
0679:                                    + "' in expression");
0680:                        }
0681:                        /* fall through */
0682:                    case '_':
0683:                        loop: for (; inputOffset < inputLength; inputOffset++) {
0684:                            c = input.charAt(inputOffset);
0685:                            switch (c) {
0686:                            case ':':
0687:                                if (inputOffset + 1 < inputLength) {
0688:                                    char nc = input.charAt(inputOffset + 1);
0689:                                    if (nc == ':') {
0690:                                        nextTokenValue = input.substring(
0691:                                                nextTokenStartOffset,
0692:                                                inputOffset).intern();
0693:                                        nextToken = Token.AXIS;
0694:                                        inputOffset += 2;
0695:                                        return;
0696:                                    } else if (nc == '*') {
0697:                                        nextTokenValue = input.substring(
0698:                                                nextTokenStartOffset,
0699:                                                inputOffset).intern();
0700:                                        nextToken = Token.PREFIX;
0701:                                        inputOffset += 2;
0702:                                        return;
0703:                                    } else if (nc == '=') {
0704:                                        // as in "let $x:=2"
0705:                                        nextTokenValue = input.substring(
0706:                                                nextTokenStartOffset,
0707:                                                inputOffset).intern();
0708:                                        nextToken = Token.NAME;
0709:                                        return;
0710:                                    }
0711:                                }
0712:                                break;
0713:                            case '.':
0714:                            case '-':
0715:                            case '_':
0716:                                break;
0717:
0718:                            default:
0719:                                if (c < 0x80 && !Character.isLetterOrDigit(c))
0720:                                    break loop;
0721:                                break;
0722:                            }
0723:                        }
0724:                        nextTokenValue = input.substring(nextTokenStartOffset,
0725:                                inputOffset).intern();
0726:                        nextToken = Token.NAME;
0727:                        return;
0728:                    }
0729:                }
0730:            }
0731:
0732:            /**
0733:             * Identify a binary operator
0734:             *
0735:             * @param s String representation of the operator - must be interned
0736:             * @return the token number of the operator, or UNKNOWN if it is not a
0737:             *     known operator
0738:             */
0739:
0740:            private static int getBinaryOp(String s) {
0741:                switch (s.length()) {
0742:                case 2:
0743:                    if (s == "or")
0744:                        return Token.OR;
0745:                    if (s == "is")
0746:                        return Token.IS;
0747:                    if (s == "to")
0748:                        return Token.TO;
0749:                    if (s == "in")
0750:                        return Token.IN;
0751:                    if (s == "eq")
0752:                        return Token.FEQ;
0753:                    if (s == "ne")
0754:                        return Token.FNE;
0755:                    if (s == "gt")
0756:                        return Token.FGT;
0757:                    if (s == "ge")
0758:                        return Token.FGE;
0759:                    if (s == "lt")
0760:                        return Token.FLT;
0761:                    if (s == "le")
0762:                        return Token.FLE;
0763:                    break;
0764:                case 3:
0765:                    if (s == "and")
0766:                        return Token.AND;
0767:                    if (s == "div")
0768:                        return Token.DIV;
0769:                    if (s == "mod")
0770:                        return Token.MOD;
0771:                    break;
0772:                case 4:
0773:                    if (s == "idiv")
0774:                        return Token.IDIV;
0775:                    if (s == "then")
0776:                        return Token.THEN;
0777:                    if (s == "else")
0778:                        return Token.ELSE;
0779:                    if (s == "case")
0780:                        return Token.CASE;
0781:                    break;
0782:                case 5:
0783:                    if (s == "where")
0784:                        return Token.WHERE;
0785:                    if (s == "union")
0786:                        return Token.UNION;
0787:                    break;
0788:                case 6:
0789:                    if (s == "except")
0790:                        return Token.EXCEPT;
0791:                    if (s == "return")
0792:                        return Token.RETURN;
0793:                    break;
0794:                case 7:
0795:                    if (s == "default")
0796:                        return Token.DEFAULT;
0797:                case 9:
0798:                    if (s == "intersect")
0799:                        return Token.INTERSECT;
0800:                    if (s == "satisfies")
0801:                        return Token.SATISFIES;
0802:                    break;
0803:                }
0804:                return Token.UNKNOWN;
0805:            }
0806:
0807:            /**
0808:             * Distinguish nodekind names, "if", and function names, which are all
0809:             * followed by a "("
0810:             *
0811:             * @param s the name - must be interned
0812:             * @return the token number
0813:             */
0814:
0815:            private static int getFunctionType(String s) {
0816:                switch (s.length()) {
0817:                case 2:
0818:                    if (s == "if")
0819:                        return Token.IF;
0820:                    break;
0821:                case 4:
0822:                    if (s == "node")
0823:                        return Token.NODEKIND;
0824:                    if (s == "item")
0825:                        return Token.NODEKIND;
0826:                    if (s == "text")
0827:                        return Token.NODEKIND;
0828:                    break;
0829:                case 7:
0830:                    if (s == "element")
0831:                        return Token.NODEKIND;
0832:                    if (s == "comment")
0833:                        return Token.NODEKIND;
0834:                    break;
0835:                case 9:
0836:                    if (s == "attribute")
0837:                        return Token.NODEKIND;
0838:                    if (s == "namespace")
0839:                        return Token.NODEKIND;
0840:                    break;
0841:                case 10:
0842:                    if (s == "typeswitch")
0843:                        return Token.TYPESWITCH;
0844:                    break;
0845:                default:
0846:                    if (s == "document-node")
0847:                        return Token.NODEKIND;
0848:                    if (s == "empty-sequence")
0849:                        return Token.NODEKIND;
0850:                    if (s == "schema-element")
0851:                        return Token.NODEKIND;
0852:                    if (s == "schema-attribute")
0853:                        return Token.NODEKIND;
0854:                    if (s == "processing-instruction")
0855:                        return Token.NODEKIND;
0856:
0857:                    break;
0858:                }
0859:                return Token.FUNCTION;
0860:            }
0861:
0862:            /**
0863:             * Test whether the previous token is an operator
0864:             * @return true if the previous token is an operator token
0865:             */
0866:
0867:            private boolean followsOperator() {
0868:                return precedingToken <= Token.LAST_OPERATOR;
0869:            }
0870:
0871:            /**
0872:             * Read next character directly. Used by the XQuery parser when parsing pseudo-XML syntax
0873:             * @return the next character from the input
0874:             * @throws StringIndexOutOfBoundsException if an attempt is made to read beyond
0875:             * the end of the string. This will only occur in the event of a syntax error in the
0876:             * input.
0877:             */
0878:
0879:            public char nextChar() throws StringIndexOutOfBoundsException {
0880:                char c = input.charAt(inputOffset++);
0881:                //c = normalizeLineEnding(c);
0882:                if (c == '\n') {
0883:                    incrementLineNumber();
0884:                    lineNumber++;
0885:                }
0886:                return c;
0887:            }
0888:
0889:            /**
0890:             * Normalize line endings according to the rules in XML 1.1.
0891:             * @param c the most recently read character. The value of inputOffset must be the immediately following
0892:             * character
0893:             * @return c the current character after newline normalization
0894:             */
0895:
0896:            //    private char normalizeLineEnding(char c) throws StringIndexOutOfBoundsException {
0897:            //        switch (c)  {
0898:            //            case '\r':
0899:            //                if (input.charAt(inputOffset) == '\n' || input.charAt(inputOffset) == 0x85) {
0900:            //                    inputOffset++;
0901:            //                    return '\n';
0902:            //                } else {
0903:            //                    return '\n';
0904:            //                }
0905:            //            case 0x85:
0906:            //                return '\n';
0907:            //            case 0x2028:
0908:            //                return '\n';
0909:            //            default:
0910:            //                return c;
0911:            //        }
0912:            //    }
0913:            /**
0914:             * Increment the line number, making a record of where in the input string the newline character occurred.
0915:             */
0916:
0917:            private void incrementLineNumber() {
0918:                nextLineNumber++;
0919:                if (newlineOffsets == null) {
0920:                    newlineOffsets = new ArrayList(20);
0921:                }
0922:                newlineOffsets.add(new Integer(inputOffset - 1));
0923:            }
0924:
0925:            /**
0926:             * Step back one character. If this steps back to a previous line, adjust the line number.
0927:             */
0928:
0929:            public void unreadChar() {
0930:                if (input.charAt(--inputOffset) == '\n') {
0931:                    nextLineNumber--;
0932:                    lineNumber--;
0933:                    if (newlineOffsets != null) {
0934:                        newlineOffsets.remove(newlineOffsets.size() - 1);
0935:                    }
0936:                }
0937:            }
0938:
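            /**
             * Get the most recently read text (for use in an error message).
             * @return the input read so far if fewer than 34 characters have been consumed;
             * otherwise "..." followed by the last 30 characters read, passed through
             * NormalizeSpace.normalize
             */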
0942:
0943:            public String recentText() {
0944:                if (inputOffset > inputLength) {
0945:                    inputOffset = inputLength;
0946:                }
0947:                if (inputOffset < 34) {
0948:                    return input.substring(0, inputOffset);
0949:                } else {
0950:                    return NormalizeSpace.normalize(
0951:                            "..."
0952:                                    + input.substring(inputOffset - 30,
0953:                                            inputOffset)).toString();
0954:                }
0955:            }
0956:
0957:            /**
0958:             * Get the line number of the current token
0959:             */
0960:
0961:            public int getLineNumber() {
0962:                return lineNumber;
0963:            }
0964:
0965:            /**
0966:             * Get the column number of the current token
0967:             */
0968:
0969:            public int getColumnNumber() {
0970:                return (int) (getLineAndColumn(currentTokenStartOffset) & 0x7fffffff);
0971:            }
0972:
0973:            // --Commented out by Inspection START (16/12/04 14:40):
0974:            //    /**
0975:            //     * Get the line and column number of the current token,
0976:            //     * as a long value with the line number in the top half
0977:            //     * and the column number in the lower half
0978:            //     * @return the line and column number, packed together
0979:            //     */
0980:            //
0981:            //    public long getLineAndColumn() {
0982:            //        return ((long)getLineNumber()) << 32 | ((long)getColumnNumber());
0983:            //    }
0984:            // --Commented out by Inspection STOP (16/12/04 14:40)
0985:
            /**
             * Get the line and column number corresponding to a given offset in the input expression,
             * as a long value with the line number in the top half (upper 32 bits)
             * and the column number in the lower half (lower 32 bits).
             * @param offset the character offset within the input expression
             * @return the line and column number, packed together
             */
0992:
0993:            public long getLineAndColumn(int offset) {
0994:                if (newlineOffsets == null) {
0995:                    return ((long) startLineNumber) << 32 | (long) offset;
0996:                }
0997:                for (int line = newlineOffsets.size() - 1; line >= 0; line--) {
0998:                    int nloffset = ((Integer) newlineOffsets.get(line))
0999:                            .intValue();
1000:                    if (offset > nloffset) {
1001:                        return ((long) (line + startLineNumber + 1) << 32)
1002:                                | ((long) (offset - nloffset));
1003:                    }
1004:                }
1005:                return ((long) startLineNumber) << 32 | (long) (offset + 1);
1006:            }
1007:
1008:            public int getLineNumber(int offset) {
1009:                return (int) ((getLineAndColumn(offset)) >> 32);
1010:            }
1011:
1012:            public int getColumnNumber(int offset) {
1013:                return (int) ((getLineAndColumn(offset)) & 0x7fffffff);
1014:            }
1015:
1016:        }
1017:
1018:        /*
1019:
1020:         The following copyright notice is copied from the licence for xt, from which the
1021:         original version of this module was derived:
1022:         --------------------------------------------------------------------------------
1023:         Copyright (c) 1998, 1999 James Clark
1024:
1025:         Permission is hereby granted, free of charge, to any person obtaining
1026:         a copy of this software and associated documentation files (the
1027:         "Software"), to deal in the Software without restriction, including
1028:         without limitation the rights to use, copy, modify, merge, publish,
1029:         distribute, sublicense, and/or sell copies of the Software, and to
1030:         permit persons to whom the Software is furnished to do so, subject to
1031:         the following conditions:
1032:
1033:         The above copyright notice and this permission notice shall be included
1034:         in all copies or substantial portions of the Software.
1035:
1036:         THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
1037:         OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
1038:         MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
1039:         IN NO EVENT SHALL JAMES CLARK BE LIABLE FOR ANY CLAIM, DAMAGES OR
1040:         OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
1041:         ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
1042:         OTHER DEALINGS IN THE SOFTWARE.
1043:
1044:         Except as contained in this notice, the name of James Clark shall
1045:         not be used in advertising or otherwise to promote the sale, use or
1046:         other dealings in this Software without prior written authorization
1047:         from James Clark.
1048:         ---------------------------------------------------------------------------
1049:         */
1050:
1051:        //
1052:        // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
1053:        // you may not use this file except in compliance with the License. You may obtain a copy of the
1054:        // License at http://www.mozilla.org/MPL/
1055:        //
1056:        // Software distributed under the License is distributed on an "AS IS" basis,
1057:        // WITHOUT WARRANTY OF ANY KIND, either express or implied.
1058:        // See the License for the specific language governing rights and limitations under the License.
1059:        //
1060:        // The Original Code is: all this file, other than the parts developed by James Clark as part of xt.
1061:        //
1062:        // The Initial Developer of the Original Code is Michael H. Kay.
1063:        //
1064:        // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
1065:        //
1066:        // Contributor(s): none.
1067:        //
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.