001: /*
002: * Tokenizer.java February 2006
003: *
004: * Copyright (C) 2006, Niall Gallagher <niallg@users.sf.net>
005: *
006: * This library is free software; you can redistribute it and/or
007: * modify it under the terms of the GNU Lesser General Public
008: * License as published by the Free Software Foundation.
009: *
010: * This library is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser General Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser General
016: * Public License along with this library; if not, write to the
017: * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018: * Boston, MA 02111-1307 USA
019: */
020:
021: package simple.page.translate;
022:
023: /**
024: * The <code>Tokenizer</code> is used to extract valid tokens from
025: * the stream of bytes given to it for scanning. Identifying the
026: * tokens from the stream of input is done using delimiters to
027: * specify the start and end of a valid token. For example take
028: * the well known JSP syntax. A parsable segment typically opens
029: * using the following token <code><%</code> and closes with
030: * the <code>%></code>, as shown in the JSP text shown below.
031: * <pre>
032: *
033: * <%= new java.util.Date() %>
034: *
035: * </pre>
036: * This tokenizer can be used to extract HTML expressions and
037: * other such formats by specifying the starting and ending of
038: * the expression. For example the following HTML could be used
039: * to specify the opening and closure of a valid token.
040: * <pre>
041: *
042: * <script language='groovy'>
043: * java.util.Date();
044: * </script>
045: *
046: * </pre>
047: * The above token will be identified using a case insensitive
048: * match, and whitespace characters can be ignored, such that
049: * the HTML does not have to be formatted correctly in order
050: * for this tokenizer to extract the HTML as a valid token.
051: *
052: * @author Niall Gallagher
053: */
054: final class Tokenizer implements Lexer {
055:
056: /**
057: * Parses the tokens extracted from the bytes stream.
058: */
059: private Parser parser;
060:
061: /**
062: * Identifies what characters are considered special.
063: */
064: private char[] special;
065:
066: /**
067: * The sequence of characters that start a valid token.
068: */
069: private char[] start;
070:
071: /**
072: * The sequence of characters that finish a valid token.
073: */
074: private char[] finish;
075:
076: /**
077: * Buffers the scanned bytes so this can cascade easily.
078: */
079: private char[] buf;
080:
081: /**
082: * The number of characters the internal buffer has.
083: */
084: private int capacity;
085:
086: /**
087: * The current read offset within the active buffer.
088: */
089: private int off;
090:
091: /**
092: * The number of valid characters within the buffer.
093: */
094: private int count;
095:
096: /**
097: * This identifies the start of a valid token string.
098: */
099: private int mark;
100:
101: /**
102: * This specifies the number of characters in the token.
103: */
104: private int size;
105:
106: /**
107: * Constructor for the <code>Tokenizer</code> object. This
108: * is used to scan a stream of bytes and pass any extracted
109: * tokens from the stream to the <code>Parser</code>.
110: *
111: * @param parser the parser used to parse extracted tokens
112: */
113: public Tokenizer(Parser parser) {
114: this .parser = parser;
115: }
116:
117: /**
118: * This method tells the lexer how to extract the tokens
119: * from the source document. This is given the opening and
120: * closing tokens used to identify a segment. Typically
121: * with languages such as JSP and PHP code segments are
122: * opened with a delimiter like <code><%</code> for JSP
123: * and <code><?php</code> for PHP. This method allows
124: * the lexer to be configured to process such delimiters.
125: *
126: * @param start this is the opening token for a segment
127: * @param finish this is the closing token for a segment
128: */
129: public void match(String start, String finish) {
130: match(start, finish, "");
131: }
132:
133: /**
134: * This method tells the lexer how to extract the tokens
135: * from the source document. This is given the opening and
136: * closing tokens used to identify a segment. Typically
137: * with languages such as JSP and PHP code segments are
138: * opened with a delimiter like <code><%</code> for JSP
139: * and <code><?php</code> for PHP. This method allows
140: * the lexer to be configured to process such delimiters.
141: * <p>
142: * With this <code>match</code> method a collection of
143: * special characters can be specified. These characters
144: * tell the lexer what it should allow whitespace to
145: * surround. For example take the HTML expressions below.
146: * <pre>
147: *
148: * < script language ='groovy' >
149: * <script language='groovy'>
150: *
151: * </pre>
152: * The above two HTML expressions should be considered
153: * equal using the special characters <code><</code>,
154: * <code>></code>, and <code>=</code>.
155: *
156: * @param start this is the opening token for a segment
157: * @param finish this is the closing token for a segment
158: * @param special this is the set of special characters
159: */
160: public void match(String start, String finish, String special) {
161: this .special = special.toCharArray();
162: this .finish = finish.toCharArray();
163: this .start = start.toCharArray();
164: }
165:
166: /**
167: * The <code>skip</code> method is used to read the specified
168: * text from the bytes within the current buffer. This will
169: * perform a case insensitive comparison of the characters
170: * from both sources. Also, this makes use of the collection
171: * of special characters to ignore whitespace.
172: *
173: * @param text this is the text that is to be skipped
174: *
175: * @return this returns true if the text was fully skipped
176: */
177: private boolean skip(String text) {
178: return skip(text.toCharArray());
179: }
180:
181: /**
182: * The <code>skip</code> method is used to read the specified
183: * text from the bytes within the current buffer. This will
184: * perform a case insensitive comparison of the characters
185: * from both sources. Also, this makes use of the collection
186: * of special characters to ignore whitespace.
187: *
188: * @param text this is the text that is to be skipped
189: *
190: * @return this returns true if the text was fully skipped
191: */
192: private boolean skip(char[] text) {
193: int size = text.length;
194: int seek = off;
195: int scan = 0;
196:
197: if (off + size > count) {
198: return false;
199: }
200: for (int pos = 0; pos < size;) {
201: char peek = buf[seek++];
202:
203: if (special(text[pos++])) {
204: scan = pos;
205: }
206: if (pos > 1) {
207: if (special(text[pos - 2]))
208: scan = pos;
209: }
210: while (seek < count && scan > 0) {
211: peek = buf[seek - 1];
212:
213: if (equals(text[pos - 1], peek)) {
214: scan = 0;
215: } else {
216: if (space(peek)) {
217: seek++;
218: } else {
219: return false;
220: }
221: }
222: }
223: if (!equals(text[pos - 1], peek)) {
224: return false;
225: }
226: }
227: off = seek;
228: return true;
229: }
230:
231: /**
232: * The <code>peek</code> method is used to determine if the
233: * specified text can be fully or partially read from the
234: * current buffer. This performs the same comparative checks
235: * as the <code>skip</code> methods. However, the offset to
236: * the current buffer remains unchanged if the peek is
237: * successful. Also, not all of the text needs to be read
238: * if the end of the stream of bytes is reached first.
239: *
240: * @param text this is the text that is to be examined
241: *
242: * @return this returns true if the text was fully examined
243: */
244: private boolean peek(String text) {
245: return peek(text.toCharArray());
246: }
247:
248: /**
249: * The <code>peek</code> method is used to determine if the
250: * specified text can be fully or partially read from the
251: * current buffer. This performs the same comparative checks
252: * as the <code>skip</code> methods. However, the offset to
253: * the current buffer remains unchanged if the peek is
254: * successful. Also, not all of the text needs to be read
255: * if the end of the stream of bytes is reached first.
256: *
257: * @param text this is the text that is to be examined
258: *
259: * @return this returns true if the text was fully examined
260: */
261: private boolean peek(char[] text) {
262: int size = text.length;
263: int seek = off;
264: int scan = 0;
265:
266: for (int pos = 0; pos < size;) {
267: if (seek >= count) {
268: return true;
269: }
270: char peek = buf[seek++];
271:
272: if (special(text[pos++])) {
273: scan = pos;
274: }
275: if (pos > 1) {
276: if (special(text[pos - 2])) {
277: scan = pos;
278: }
279: }
280: while (seek < count && scan > 0) {
281: peek = buf[seek - 1];
282:
283: if (equals(text[pos - 1], peek)) {
284: scan = 0;
285: } else {
286: if (space(peek)) {
287: seek++;
288: } else {
289: return false;
290: }
291: }
292: }
293: if (!equals(text[pos - 1], peek)) {
294: return false;
295: }
296: }
297: return true;
298: }
299:
300: /**
301: * Emits a token to the <code>Parser</code> so that it can
302: * process it. This will write bytes from the internal buffer
303: * that have been marked out during the scanning phase.
304: */
305: private void emit() {
306: parser.parse(buf, mark, size);
307: }
308:
309: /**
310: * This will scan the provided bytes for tokens that should be
311: * emitted to the <code>Parser</code>. The tokens emitted to
312: * the parser object are either plain text tokens or valid
313: * segments that require further processing by the parser.
314: *
315: * @param text this is the buffer that contains the bytes
316: */
317: public void scan(char[] text) {
318: scan(text, 0, text.length);
319: }
320:
321: /**
322: * This will scan the provided bytes for tokens that should be
323: * emitted to the <code>Parser</code>. The tokens emitted to
324: * the parser object are either plain text tokens or valid
325: * segments that require further processing by the parser.
326: *
327: * @param text this is the buffer that contains the bytes
328: * @param pos this is the offset within the buffer to read
329: * @param len this is the number of bytes to use
330: */
331: public void scan(char[] text, int pos, int len) {
332: if (len + count > capacity) {
333: resize(len + count);
334: }
335: for (int i = 0; i < len; i++) {
336: buf[count++] = text[pos + i];
337: }
338: process();
339: reset();
340: }
341:
342: /**
343: * This will reset the offset pointer so that the next scan
344: * will account for any segments that were not fully processed.
345: * This also ensures that the size of the internal buffer does
346: * not get any larger than the largest segment scanned.
347: */
348: private void reset() {
349: if (mark < count) {
350: count -= mark;
351:
352: for (int i = 0; i < count; i++) {
353: buf[i] = buf[mark++];
354: }
355: off = mark = 0;
356: } else {
357: off = mark = count = 0;
358: }
359: }
360:
361: /**
362: * This is used to expand the capacity of the internal buffer.
363: * Because the size of the segments within the source text can
364: * vary, this is used to ensure that the maximum segment can
365: * be stored before it is emitted to the <code>Parser</code>.
366: *
367: * @param size this is the minimum size the buffer should be
368: */
369: private void resize(int size) {
370: if (capacity < size) {
371: char[] large = new char[size];
372:
373: for (int i = 0; i < capacity; i++) {
374: large[i] = buf[i];
375: }
376: capacity = size;
377: buf = large;
378: }
379: }
380:
381: /**
382: * Once the bytes have been scanned in they are processed for
383: * valid segments. This will determine if segments have been
384: * encountered and will appropriately emit them to the
385: * <code>Parser</code>. Data that is not contained within the
386: * specified delimiters will be emitted as plain text tokens.
387: */
388: private void process() {
389: while (off < count) {
390: if (peek(start)) {
391: if (segment()) {
392: emit();
393: } else {
394: break;
395: }
396: } else {
397: text();
398: emit();
399: }
400: mark = off;
401: }
402: }
403:
404: /**
405: * This will extract all plain text tokens from the source
406: * text. If at any stage the opening delimiter of a segment
407: * is encountered this will cease reading bytes and return.
408: */
409: private void text() {
410: for (size = 0; off < count;) {
411: if (peek(start)) {
412: break;
413: }
414: size++;
415: off++;
416: }
417: }
418:
419: /**
420: * This is used to extract the segment from the source text.
421: * This will first check to see if the opening token can
422: * be read from the source text, if it can then this will try
423: * to consume the contents of the segment until it reaches
424: * the closing delimiter. If the closing delimiter cannot be
425: * read from the contents of the buffer this returns false.
426: * <p>
427: * An added feature of this method is that Java style quoted
428: * strings and comments will not be scanned for the closing
429: * delimiter, which means that comments and strings do not
430: * need to be given special attention.
431: *
432: * @return this returns true if a segment has been read
433: */
434: private boolean segment() {
435: if (start()) {
436: if (body()) {
437: return finish();
438: }
439: }
440: return false;
441: }
442:
443: /**
444: * This will check to see if the starting delimeter for the
445: * segment can be read. If the segment start can is read this
446: * will copy the start segment to the start of the token so
447: * that unformatted HTML does not have to be interpreted by
448: * by the <code>Parser</code> implementation.
449: *
450: * @return this returns true if the start token is read
451: */
452: private boolean start() {
453: if (skip(start)) {
454: int len = start.length;
455:
456: for (mark = off; len-- > 0;) {
457: buf[--mark] = start[len];
458: }
459: return true;
460: }
461: return false;
462: }
463:
464: /**
465: * This will attempt to read the body of the segment. If the
466: * end delimiter for a segment can be read from the remaining
467: * bytes then this will return true. This will ensure that
468: * the end delimiter cannot be read from a Java style quoted
469: * string such as <code>" %> "</code> for JSP tags. Also,
470: * this will not read the end delimiter from a Java comment.
471: *
472: * @return this returns true if the end token is reached
473: */
474: private boolean body() {
475: while (off < count) {
476: if (comment()) {
477: continue;
478: } else if (quoted()) {
479: continue;
480: } else {
481: if (peek(finish)) {
482: return true;
483: }
484: off++;
485: }
486: }
487: return false;
488: }
489:
490: /**
491: * This provides a similar function to the <code>start</code>
492: * method which reads the opening delimiter for a segment.
493: * This will attempt to read the closing or end delimiter for
494: * a segment. If it is read it is copied to the end of the
495: * buffer so that the <code>Parser</code> implementation does
496: * not have to deal with unformatted HTML tags.
497: *
498: * @return this returns true if the end token has been read
499: */
500: private boolean finish() {
501: int len = off - mark;
502:
503: if (skip(finish)) {
504: int count = finish.length;
505: int pos = 0;
506:
507: for (size = len; pos < count; pos++) {
508: buf[mark + size++] = finish[pos];
509: }
510: return true;
511: }
512: return false;
513: }
514:
515: /**
516: * This method will read a Java style comment from the source
517: * text. This can read embedded comments that start with the
518: * <code>/*</code> token and end with <code>*/</code>.
519: * Also single line comments beginning with <code>//</code>
520: * are also accounted for. If the comment can not be fully
521: * read from the source text this will return false.
522: *
523: * @return this returns true if a comment has been read
524: */
525: private boolean comment() {
526: char peek = buf[off];
527:
528: if (count - off > 1 && peek == '/') {
529: if (buf[off + 1] == '*') {
530: while (off < count) {
531: peek = buf[off];
532:
533: if (count - off < 2) {
534: return false;
535: }
536: if (peek == '*') {
537: if (buf[off + 1] == '/') {
538: off += 2;
539: return true;
540: }
541: }
542: off++;
543: }
544: } else if (buf[off + 1] == '/') {
545: while (off < count) {
546: peek = buf[off++];
547:
548: if (peek == '\n' || peek == '\r') {
549: return true;
550: }
551: }
552: }
553: }
554: return false;
555: }
556:
557: /**
558: * This method is used to extract a Java style quoted string
559: * from the template. This will ensure that the quoted string
560: * can have escaped comments such that <code>\"</code> will
561: * not evaluate to the end of the quoted string.
562: *
563: * @return this returns true if a quoted string is read
564: */
565: private boolean quoted() {
566: if (off < count) {
567: if (buf[off++] == '"') {
568: for (int slash = 0; off < count;) {
569: if (buf[off++] == '\\') {
570: slash++;
571: } else {
572: if (buf[off - 1] == '"') {
573: if (slash % 2 < 1)
574: return true;
575: }
576: slash = 0;
577: }
578: }
579: } else {
580: off--;
581: }
582: }
583: return false;
584: }
585:
586: /**
587: * This method is used to determine if the two characters are
588: * equal in a case insensitive manner. This will make use of
589: * the <code>Character.toLowerCase</code> for UCS-2 characters.
590: *
591: * @param one this is a character to be examined
592: * @param two this is the second character to compare
593: */
594: private boolean equals(char one, char two) {
595: return toLower(one) == toLower(two);
596: }
597:
598: /**
599: * This is used to determine if the given character is a special
600: * character. The character is a special character is it was
601: * specified to this object with the <code>match</code> method.
602: *
603: * @param ch this to be checked against the special characters
604: *
605: * @return true if the character is a special character
606: */
607: private boolean special(char ch) {
608: int len = special.length;
609:
610: for (int pos = 0; pos < len; pos++) {
611: if (equals(special[pos], ch))
612: return true;
613: }
614: return false;
615: }
616:
617: /**
618: * This converts the provided character to lower case so that
619: * a comparison can be made in a case insensitive manner. This
620: * delegates to the <code>Character.toLowerCase</code> method.
621: *
622: * @param ch this is the character to convert to lower case
623: *
624: * @return the character equivalent in lower case format
625: */
626: private char toLower(char ch) {
627: return Character.toLowerCase(ch);
628: }
629:
630: /**
631: * This is used to determine if a given UCS-2 character is a
632: * space character. That is a whitespace character this sees
633: * the, space, carriage return and line feed characters as
634: * whitespace characters.
635: *
636: * @param ch the character that is being determined by this
637: *
638: * @return true if the character given is a space character
639: */
640: private boolean space(char ch) {
641: switch (ch) {
642: case ' ':
643: case '\t':
644: case '\n':
645: case '\r':
646: return true;
647: default:
648: return false;
649: }
650: }
651: }
|