001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.wicket.markup.parser;
018:
019: import java.io.BufferedInputStream;
020: import java.io.ByteArrayInputStream;
021: import java.io.IOException;
022: import java.io.InputStream;
023: import java.text.ParseException;
024:
025: import org.apache.wicket.markup.MarkupElement;
026: import org.apache.wicket.util.io.FullyBufferedReader;
027: import org.apache.wicket.util.io.XmlReader;
028: import org.apache.wicket.util.parse.metapattern.parsers.TagNameParser;
029: import org.apache.wicket.util.parse.metapattern.parsers.VariableAssignmentParser;
030: import org.apache.wicket.util.resource.ResourceStreamNotFoundException;
031:
032: /**
033: * A fairly shallow markup pull parser which parses a markup string of a given
034: * type of markup (for example, html, xml, vxml or wml) into ComponentTag and
035: * RawMarkup tokens.
036: *
037: * @author Jonathan Locke
038: * @author Juergen Donnerstag
039: */
040: public final class XmlPullParser extends AbstractMarkupFilter implements
041: IXmlPullParser {
042: /** next() must be called at least once for the Type to be valid */
043: public static final int NOT_INITIALIZED = 0;
044:
045: /** <name ...> */
046: public static final int TAG = 1;
047:
048: /** Tag body in between two tags */
049: public static final int BODY = 2;
050:
051: /** <!-- ... --> */
052: public static final int COMMENT = 3;
053:
054: /** <![CDATA[ .. ]]> */
055: public static final int CDATA = 4;
056:
057: /** <?...> */
058: public static final int PROCESSING_INSTRUCTION = 5;
059:
060: /** all other tags which look like <!.. > */
061: public static final int SPECIAL_TAG = 6;
062:
063: /**
064: * Reads the xml data from an input stream and converts the chars according
065: * to its encoding (<?xml ... encoding="..." ?>)
066: */
067: private XmlReader xmlReader;
068:
069: /**
070: * A XML independent reader which loads the whole source data into memory
071: * and which provides convinience methods to access the data.
072: */
073: private FullyBufferedReader input;
074:
075: /** temporary variable which will hold the name of the closing tag. */
076: private String skipUntilText;
077:
078: /** The last substring selected from the input */
079: private CharSequence lastText;
080:
081: /** The type of what is in lastText */
082: private int lastType = NOT_INITIALIZED;
083:
084: /** If lastType == TAG, than ... */
085: private XmlTag lastTag;
086:
087: /**
088: * Construct.
089: */
090: public XmlPullParser() {
091: }
092:
093: /**
094: *
095: * @see org.apache.wicket.markup.parser.IXmlPullParser#getEncoding()
096: */
097: public String getEncoding() {
098: return this .xmlReader.getEncoding();
099: }
100:
101: /**
102: *
103: * @see org.apache.wicket.markup.parser.IXmlPullParser#getXmlDeclaration()
104: */
105: public String getXmlDeclaration() {
106: return this .xmlReader.getXmlDeclaration();
107: }
108:
109: /**
110: *
111: * @see org.apache.wicket.markup.parser.IXmlPullParser#getInputFromPositionMarker(int)
112: */
113: public final CharSequence getInputFromPositionMarker(final int toPos) {
114: return this .input.getSubstring(toPos);
115: }
116:
117: /**
118: *
119: * @see org.apache.wicket.markup.parser.IXmlPullParser#getInput(int, int)
120: */
121: public final CharSequence getInput(final int fromPos,
122: final int toPos) {
123: return this .input.getSubstring(fromPos, toPos);
124: }
125:
126: /**
127: * Whatever will be in between the current index and the closing tag, will
128: * be ignored (and thus treated as raw markup (text). This is useful for
129: * tags like 'script'.
130: *
131: * @throws ParseException
132: */
133: private final void skipUntil() throws ParseException {
134: // this is a tag with non-XHTML text as body - skip this until the
135: // skipUntilText is found.
136: final int startIndex = this .input.getPosition();
137: final int tagNameLen = this .skipUntilText.length();
138:
139: int pos = this .input.getPosition() - 1;
140: String endTagText = null;
141: int lastPos = 0;
142: while (!skipUntilText.equalsIgnoreCase(endTagText)) {
143: pos = this .input.find("</", pos + 1);
144: if ((pos == -1)
145: || ((pos + (tagNameLen + 2)) >= this .input.size())) {
146: throw new ParseException(skipUntilText
147: + " tag not closed (line "
148: + this .input.getLineNumber() + ", column "
149: + this .input.getColumnNumber() + ")",
150: startIndex);
151: }
152:
153: lastPos = pos + 2;
154: endTagText = this .input.getSubstring(lastPos,
155: lastPos + tagNameLen).toString();
156: }
157:
158: this .input.setPosition(pos);
159: this .lastText = this .input.getSubstring(startIndex, pos);
160: this .lastType = BODY;
161:
162: // Check that the tag is properly closed
163: lastPos = this .input.find('>', lastPos + tagNameLen);
164: if (lastPos == -1) {
165: throw new ParseException("Script tag not closed (line "
166: + this .input.getLineNumber() + ", column "
167: + this .input.getColumnNumber() + ")", startIndex);
168: }
169:
170: // Reset the state variable
171: this .skipUntilText = null;
172: }
173:
174: /**
175: * Gets the next tag from the input string.
176: *
177: * @return The extracted tag (will always be of type XmlTag).
178: * @throws ParseException
179: */
180: public final boolean next() throws ParseException {
181: // Reached end of markup file?
182: if (this .input.getPosition() >= this .input.size()) {
183: return false;
184: }
185:
186: if (this .skipUntilText != null) {
187: skipUntil();
188: return true;
189: }
190:
191: // Any more tags in the markup?
192: final int openBracketIndex = this .input.find('<');
193:
194: // Tag or Body?
195: if (this .input.charAt(this .input.getPosition()) != '<') {
196: if (openBracketIndex == -1) {
197: // There is no next matching tag.
198: this .lastText = this .input.getSubstring(-1);
199: this .input.setPosition(this .input.size());
200: this .lastType = BODY;
201: return true;
202: }
203:
204: this .lastText = this .input.getSubstring(openBracketIndex);
205: this .input.setPosition(openBracketIndex);
206: this .lastType = BODY;
207: return true;
208: }
209:
210: // Determine the line number
211: this .input.countLinesTo(openBracketIndex);
212:
213: // Get index of closing tag and advance past the tag
214: int closeBracketIndex = this .input.find('>',
215: openBracketIndex + 1);
216: if (closeBracketIndex == -1) {
217: throw new ParseException(
218: "No matching close bracket at position "
219: + openBracketIndex, this .input
220: .getPosition());
221: }
222:
223: // Get the complete tag text
224: this .lastText = this .input.getSubstring(openBracketIndex,
225: closeBracketIndex + 1);
226:
227: // Get the tagtext between open and close brackets
228: String tagText = this .lastText.subSequence(1,
229: this .lastText.length() - 1).toString();
230: if (tagText.length() == 0) {
231: throw new ParseException(
232: "Found empty tag: '<>' at position "
233: + openBracketIndex, this .input
234: .getPosition());
235: }
236:
237: // Handle special tags like <!-- and <![CDATA ...
238: final char firstChar = tagText.charAt(0);
239: if ((firstChar == '!') || (firstChar == '?')) {
240: specialTagHandling(tagText, openBracketIndex,
241: closeBracketIndex);
242: return true;
243: }
244:
245: // Type of the tag, to be determined next
246: final XmlTag.Type type;
247:
248: // If the tag ends in '/', it's a "simple" tag like <foo/>
249: if (tagText.endsWith("/")) {
250: type = XmlTag.OPEN_CLOSE;
251: tagText = tagText.substring(0, tagText.length() - 1);
252: } else if (tagText.startsWith("/")) {
253: // The tag text starts with a '/', it's a simple close tag
254: type = XmlTag.CLOSE;
255: tagText = tagText.substring(1);
256: } else {
257: // It must be an open tag
258: type = XmlTag.OPEN;
259:
260: // If open tag and starts with "s" like "script" or "style", than
261: // ...
262: if ((tagText.length() > 5)
263: && ((tagText.charAt(0) == 's') || (tagText
264: .charAt(0) == 'S'))) {
265: final String lowerCase = tagText.substring(0, 6)
266: .toLowerCase();
267: if (lowerCase.startsWith("script")) {
268: // prepare to skip everything between the open and close tag
269: this .skipUntilText = "script";
270: } else if (lowerCase.startsWith("style")) {
271: // prepare to skip everything between the open and close tag
272: this .skipUntilText = "style";
273: }
274: }
275: }
276:
277: // Parse remaining tag text, obtaining a tag object or null
278: // if it's invalid
279: this .lastTag = parseTagText(tagText);
280: if (this .lastTag != null) {
281: // Populate tag fields
282: this .lastTag.type = type;
283: this .lastTag.pos = openBracketIndex;
284: this .lastTag.length = this .lastText.length();
285: this .lastTag.text = this .lastText;
286: this .lastTag.lineNumber = this .input.getLineNumber();
287: this .lastTag.columnNumber = this .input.getColumnNumber();
288:
289: // Move to position after the tag
290: this .input.setPosition(closeBracketIndex + 1);
291: this .lastType = TAG;
292: return true;
293: } else {
294: throw new ParseException("Malformed tag (line "
295: + this .input.getLineNumber() + ", column "
296: + this .input.getColumnNumber() + ")",
297: openBracketIndex);
298: }
299: }
300:
301: /**
302: * Handle special tags like <!-- --> or <![CDATA[..]]> or <?xml>
303: *
304: * @param tagText
305: * @param openBracketIndex
306: * @param closeBracketIndex
307: * @throws ParseException
308: */
309: private void specialTagHandling(String tagText,
310: final int openBracketIndex, int closeBracketIndex)
311: throws ParseException {
312: // Handle comments
313: if (tagText.startsWith("!--")) {
314: // Normal comment section.
315: // Skip ahead to "-->". Note that you can not simply test for
316: // tagText.endsWith("--") as the comment might contain a '>'
317: // inside.
318: int pos = this .input.find("-->", openBracketIndex + 1);
319: if (pos == -1) {
320: throw new ParseException(
321: "Unclosed comment beginning at line:"
322: + input.getLineNumber() + " column:"
323: + input.getColumnNumber(),
324: openBracketIndex);
325: }
326:
327: pos += 3;
328: this .lastText = this .input.getSubstring(openBracketIndex,
329: pos);
330: this .lastType = COMMENT;
331:
332: // Conditional comment? <!--[if ...]>..<![endif]-->
333: if (tagText.startsWith("!--[if ")
334: && tagText.endsWith("]")
335: && this .lastText.toString()
336: .endsWith("<![endif]-->")) {
337: // Actually it is no longer a comment. It is now
338: // up to the browser to select the section appropriate.
339: this .input.setPosition(closeBracketIndex + 1);
340: } else {
341: this .input.setPosition(pos);
342: }
343: return;
344: }
345:
346: // The closing tag of a conditional comment <!--[if IE]>...<![endif]-->
347: if (tagText.equals("![endif]--")) {
348: this .lastType = COMMENT;
349: this .input.setPosition(closeBracketIndex + 1);
350: return;
351: }
352:
353: // CDATA sections might contain "<" which is not part of an XML tag.
354: // Make sure escaped "<" are treated right
355: if (tagText.startsWith("![")) {
356: final String startText = (tagText.length() <= 8 ? tagText
357: : tagText.substring(0, 8));
358: if (startText.toUpperCase().equals("![CDATA[")) {
359: int pos1 = openBracketIndex;
360: do {
361: // Get index of closing tag and advance past the tag
362: closeBracketIndex = findChar('>', pos1);
363:
364: if (closeBracketIndex == -1) {
365: throw new ParseException(
366: "No matching close bracket at line:"
367: + input.getLineNumber()
368: + " column:"
369: + input.getColumnNumber(),
370: this .input.getPosition());
371: }
372:
373: // Get the tagtext between open and close brackets
374: tagText = this .input.getSubstring(
375: openBracketIndex + 1, closeBracketIndex)
376: .toString();
377:
378: pos1 = closeBracketIndex + 1;
379: } while (tagText.endsWith("]]") == false);
380:
381: // Move to position after the tag
382: this .input.setPosition(closeBracketIndex + 1);
383:
384: this .lastText = tagText;
385: this .lastType = CDATA;
386: return;
387: }
388: }
389:
390: if (tagText.charAt(0) == '?') {
391: this .lastType = PROCESSING_INSTRUCTION;
392:
393: // Move to position after the tag
394: this .input.setPosition(closeBracketIndex + 1);
395: return;
396: }
397:
398: // Move to position after the tag
399: this .lastType = SPECIAL_TAG;
400: this .input.setPosition(closeBracketIndex + 1);
401: }
402:
403: /**
404: * Gets the next tag from the input string.
405: *
406: * @return The extracted tag (will always be of type XmlTag).
407: * @throws ParseException
408: */
409: public final MarkupElement nextTag() throws ParseException {
410: while (next()) {
411: switch (this .lastType) {
412: case TAG:
413: return this .lastTag;
414:
415: case BODY:
416: break;
417:
418: case COMMENT:
419: break;
420:
421: case CDATA:
422: break;
423:
424: case PROCESSING_INSTRUCTION:
425: break;
426:
427: case SPECIAL_TAG:
428: break;
429: }
430: }
431:
432: return null;
433: }
434:
435: /**
436: * Find the char but ignore any text within ".." and '..'
437: *
438: * @param ch
439: * The character to search
440: * @param startIndex
441: * Start index
442: * @return -1 if not found, else the index
443: */
444: private int findChar(final char ch, int startIndex) {
445: char quote = 0;
446:
447: for (; startIndex < this .input.size(); startIndex++) {
448: final char charAt = this .input.charAt(startIndex);
449: if (quote != 0) {
450: if (quote == charAt) {
451: quote = 0;
452: }
453: } else if ((charAt == '"') || (charAt == '\'')) {
454: quote = charAt;
455: } else if (charAt == ch) {
456: return startIndex;
457: }
458: }
459:
460: return -1;
461: }
462:
463: /**
464: * Parse the given string.
465: * <p>
466: * Note: xml character encoding is NOT applied. It is assumed the input
467: * provided does have the correct encoding already.
468: *
469: * @param string
470: * The input string
471: * @throws IOException
472: * Error while reading the resource
473: * @throws ResourceStreamNotFoundException
474: * Resource not found
475: */
476: public void parse(final CharSequence string) throws IOException,
477: ResourceStreamNotFoundException {
478: parse(new ByteArrayInputStream(string.toString().getBytes()),
479: null);
480: }
481:
482: /**
483: * Reads and parses markup from an input stream, using UTF-8 encoding by
484: * default when not specified in XML declaration.
485: *
486: * @param in
487: * The input stream to read and parse
488: * @throws IOException
489: * @throws ResourceStreamNotFoundException
490: */
491: public void parse(final InputStream in) throws IOException,
492: ResourceStreamNotFoundException {
493: // When XML declaration does not specify encoding, it defaults to UTF-8
494: parse(in, "UTF-8");
495: }
496:
497: /**
498: * Reads and parses markup from an input stream
499: *
500: * @param inputStream
501: * The input stream to read and parse
502: * @param encoding
503: * The default character encoding of the input
504: * @throws IOException
505: * @throws ResourceStreamNotFoundException
506: */
507: public void parse(final InputStream inputStream,
508: final String encoding) throws IOException,
509: ResourceStreamNotFoundException {
510: try {
511: this .xmlReader = new XmlReader(new BufferedInputStream(
512: inputStream, 4000), encoding);
513: this .input = new FullyBufferedReader(this .xmlReader);
514: } finally {
515: inputStream.close();
516: if (this .xmlReader != null) {
517: this .xmlReader.close();
518: }
519: }
520: }
521:
522: /**
523: *
524: * @see org.apache.wicket.markup.parser.IXmlPullParser#setPositionMarker()
525: */
526: public final void setPositionMarker() {
527: this .input.setPositionMarker(this .input.getPosition());
528: }
529:
530: /**
531: *
532: * @see org.apache.wicket.markup.parser.IXmlPullParser#setPositionMarker(int)
533: */
534: public final void setPositionMarker(final int pos) {
535: this .input.setPositionMarker(pos);
536: }
537:
538: /**
539: *
540: * @see java.lang.Object#toString()
541: */
542: public String toString() {
543: return this .input.toString();
544: }
545:
546: /**
547: * Parses the text between tags. For example, "a href=foo.html".
548: *
549: * @param tagText
550: * The text between tags
551: * @return A new Tag object or null if the tag is invalid
552: * @throws ParseException
553: */
554: private XmlTag parseTagText(final String tagText)
555: throws ParseException {
556: // Get the length of the tagtext
557: final int tagTextLength = tagText.length();
558:
559: // If we match tagname pattern
560: final TagNameParser tagnameParser = new TagNameParser(tagText);
561: if (tagnameParser.matcher().lookingAt()) {
562: final XmlTag tag = new XmlTag();
563:
564: // Extract the tag from the pattern matcher
565: tag.name = tagnameParser.getName();
566: tag.namespace = tagnameParser.getNamespace();
567:
568: // Are we at the end? Then there are no attributes, so we just
569: // return the tag
570: int pos = tagnameParser.matcher().end(0);
571: if (pos == tagTextLength) {
572: return tag;
573: }
574:
575: // Extract attributes
576: final VariableAssignmentParser attributeParser = new VariableAssignmentParser(
577: tagText);
578: while (attributeParser.matcher().find(pos)) {
579: // Get key and value using attribute pattern
580: String value = attributeParser.getValue();
581:
582: // In case like <html xmlns:wicket> will the value be null
583: if (value == null) {
584: value = "";
585: }
586:
587: // Set new position to end of attribute
588: pos = attributeParser.matcher().end(0);
589:
590: // Chop off double quotes or single quotes
591: if (value.startsWith("\"") || value.startsWith("\'")) {
592: value = value.substring(1, value.length() - 1);
593: }
594:
595: // Trim trailing whitespace
596: value = value.trim();
597:
598: // Get key
599: final String key = attributeParser.getKey();
600:
601: // Put the attribute in the attributes hash
602: if (null != tag.put(key, value)) {
603: throw new ParseException(
604: "Same attribute found twice: " + key,
605: this .input.getPosition());
606: }
607:
608: // The input has to match exactly (no left over junk after
609: // attributes)
610: if (pos == tagTextLength) {
611: return tag;
612: }
613: }
614:
615: return tag;
616: }
617:
618: return null;
619: }
620: }
|