001: /*
002:
003: Licensed to the Apache Software Foundation (ASF) under one or more
004: contributor license agreements. See the NOTICE file distributed with
005: this work for additional information regarding copyright ownership.
006: The ASF licenses this file to You under the Apache License, Version 2.0
007: (the "License"); you may not use this file except in compliance with
008: the License. You may obtain a copy of the License at
009:
010: http://www.apache.org/licenses/LICENSE-2.0
011:
012: Unless required by applicable law or agreed to in writing, software
013: distributed under the License is distributed on an "AS IS" BASIS,
014: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: See the License for the specific language governing permissions and
016: limitations under the License.
017:
018: */
019:
020: package org.apache.batik.xml;
021:
022: import java.io.ByteArrayInputStream;
023: import java.io.IOException;
024: import java.io.InputStream;
025: import java.io.InputStreamReader;
026: import java.io.PushbackInputStream;
027: import java.io.Reader;
028:
029: import org.apache.batik.util.EncodingUtilities;
030:
031: /**
032: * A collection of utility functions for XML.
033: *
034: * @author <a href="mailto:stephane@hillion.org">Stephane Hillion</a>
035: * @version $Id: XMLUtilities.java 475685 2006-11-16 11:16:05Z cam $
036: */
037: public class XMLUtilities extends XMLCharacters {
038:
039: /**
040: * This class does not need to be instantiated.
041: */
042: protected XMLUtilities() {
043: }
044:
045: /**
046: * Tests whether the given character is a valid space.
047: */
048: public static boolean isXMLSpace(char c) {
049: return (c <= 0x0020)
050: && (((((1L << 0x0009) | (1L << 0x000A) | (1L << 0x000D) | (1L << 0x0020)) >> c) & 1L) != 0);
051: }
052:
053: /**
054: * Tests whether the given character is usable as the
055: * first character of an XML name.
056: */
057: public static boolean isXMLNameFirstCharacter(char c) {
058: return (NAME_FIRST_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
059: }
060:
061: /**
062: * Tests whether the given character is usable as the
063: * first character of an XML 1.1 name.
064: */
065: public static boolean isXML11NameFirstCharacter(char c) {
066: return (NAME11_FIRST_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
067: }
068:
069: /**
070: * Tests whether the given character is a valid XML name character.
071: */
072: public static boolean isXMLNameCharacter(char c) {
073: return (NAME_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
074: }
075:
076: /**
077: * Tests whether the given character is a valid XML 1.1 name character.
078: */
079: public static boolean isXML11NameCharacter(char c) {
080: return (NAME11_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
081: }
082:
083: /**
084: * Tests whether the given 32 bits character is valid in XML documents.
085: */
086: public static boolean isXMLCharacter(int c) {
087: return (c >= 0x10000 && c <= 0x10ffff)
088: || (XML_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
089: }
090:
091: /**
092: * Tests whether the given 32 bit character is a valid XML 1.1 character.
093: */
094: public static boolean isXML11Character(int c) {
095: return c >= 1 && c <= 0xd7ff || c >= 0xe000 && c <= 0xfffd
096: || c >= 0x10000 && c <= 0x10ffff;
097: }
098:
099: /**
100: * Tests whether the given character is a valid XML public ID character.
101: */
102: public static boolean isXMLPublicIdCharacter(char c) {
103: return (c < 128)
104: && (PUBLIC_ID_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
105: }
106:
107: /**
108: * Tests whether the given character is a valid XML version character.
109: */
110: public static boolean isXMLVersionCharacter(char c) {
111: return (c < 128)
112: && (VERSION_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
113: }
114:
115: /**
116: * Tests whether the given character is a valid aphabetic character.
117: */
118: public static boolean isXMLAlphabeticCharacter(char c) {
119: return (c < 128)
120: && (ALPHABETIC_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
121: }
122:
123: /**
124: * Creates a Reader initialized to scan the characters in the given
125: * XML document's InputStream.
126: * @param is The input stream positionned at the beginning of an
127: * XML document.
128: * @return a Reader positionned at the beginning of the XML document
129: * It is created from an encoding figured out from the first
130: * few bytes of the document. As a consequence the given
131: * input stream is not positionned anymore at the beginning
132: * of the document when this method returns.
133: */
134: public static Reader createXMLDocumentReader(InputStream is)
135: throws IOException {
136: PushbackInputStream pbis = new PushbackInputStream(is, 128);
137: byte[] buf = new byte[4];
138:
139: int len = pbis.read(buf);
140: if (len > 0) {
141: pbis.unread(buf, 0, len);
142: }
143:
144: if (len == 4) {
145: switch (buf[0] & 0x00FF) {
146: case 0:
147: if (buf[1] == 0x003c && buf[2] == 0x0000
148: && buf[3] == 0x003f) {
149: return new InputStreamReader(pbis, "UnicodeBig");
150: }
151: break;
152:
153: case '<':
154: switch (buf[1] & 0x00FF) {
155: case 0:
156: if (buf[2] == 0x003f && buf[3] == 0x0000) {
157: return new InputStreamReader(pbis,
158: "UnicodeLittle");
159: }
160: break;
161:
162: case '?':
163: if (buf[2] == 'x' && buf[3] == 'm') {
164: Reader r = createXMLDeclarationReader(pbis,
165: "UTF8");
166: String enc = getXMLDeclarationEncoding(r,
167: "UTF8");
168: return new InputStreamReader(pbis, enc);
169: }
170: }
171: break;
172:
173: case 0x004C:
174: if (buf[1] == 0x006f && (buf[2] & 0x00FF) == 0x00a7
175: && (buf[3] & 0x00FF) == 0x0094) {
176: Reader r = createXMLDeclarationReader(pbis, "CP037");
177: String enc = getXMLDeclarationEncoding(r, "CP037");
178: return new InputStreamReader(pbis, enc);
179: }
180: break;
181:
182: case 0x00FE:
183: if ((buf[1] & 0x00FF) == 0x00FF) {
184: return new InputStreamReader(pbis, "Unicode");
185: }
186: break;
187:
188: case 0x00FF:
189: if ((buf[1] & 0x00FF) == 0x00FE) {
190: return new InputStreamReader(pbis, "Unicode");
191: }
192: }
193: }
194:
195: return new InputStreamReader(pbis, "UTF8");
196: }
197:
198: /**
199: * Creates a reader from the given input stream and encoding.
200: * This method assumes the input stream working buffer is at least
201: * 128 byte long. The input stream is restored before this method
202: * returns. The 4 first bytes are skipped before creating the reader.
203: */
204: protected static Reader createXMLDeclarationReader(
205: PushbackInputStream pbis, String enc) throws IOException {
206: byte[] buf = new byte[128];
207: int len = pbis.read(buf);
208:
209: if (len > 0) {
210: pbis.unread(buf, 0, len);
211: }
212:
213: return new InputStreamReader(new ByteArrayInputStream(buf, 4,
214: len), enc);
215: }
216:
217: /**
218: * Reads an XML declaration to get the encoding declaration value.
219: * @param r a reader positioned just after '<?xm'.
220: * @param e the encoding to return by default or on error.
221: */
222: protected static String getXMLDeclarationEncoding(Reader r, String e)
223: throws IOException {
224: int c;
225:
226: if ((c = r.read()) != 'l') {
227: return e;
228: }
229:
230: if (!isXMLSpace((char) (c = r.read()))) {
231: return e;
232: }
233:
234: while (isXMLSpace((char) (c = r.read())))
235: ;
236:
237: if (c != 'v') {
238: return e;
239: }
240: if ((c = r.read()) != 'e') {
241: return e;
242: }
243: if ((c = r.read()) != 'r') {
244: return e;
245: }
246: if ((c = r.read()) != 's') {
247: return e;
248: }
249: if ((c = r.read()) != 'i') {
250: return e;
251: }
252: if ((c = r.read()) != 'o') {
253: return e;
254: }
255: if ((c = r.read()) != 'n') {
256: return e;
257: }
258:
259: c = r.read();
260: while (isXMLSpace((char) c)) {
261: c = r.read();
262: }
263:
264: if (c != '=') {
265: return e;
266: }
267:
268: while (isXMLSpace((char) (c = r.read())))
269: ;
270:
271: if (c != '"' && c != '\'') {
272: return e;
273: }
274: char sc = (char) c;
275:
276: for (;;) {
277: c = r.read();
278: if (c == sc) {
279: break;
280: }
281: if (!isXMLVersionCharacter((char) c)) {
282: return e;
283: }
284: }
285:
286: if (!isXMLSpace((char) (c = r.read()))) {
287: return e;
288: }
289: while (isXMLSpace((char) (c = r.read())))
290: ;
291:
292: if (c != 'e') {
293: return e;
294: }
295: if ((c = r.read()) != 'n') {
296: return e;
297: }
298: if ((c = r.read()) != 'c') {
299: return e;
300: }
301: if ((c = r.read()) != 'o') {
302: return e;
303: }
304: if ((c = r.read()) != 'd') {
305: return e;
306: }
307: if ((c = r.read()) != 'i') {
308: return e;
309: }
310: if ((c = r.read()) != 'n') {
311: return e;
312: }
313: if ((c = r.read()) != 'g') {
314: return e;
315: }
316:
317: c = r.read();
318: while (isXMLSpace((char) c)) {
319: c = r.read();
320: }
321:
322: if (c != '=') {
323: return e;
324: }
325:
326: while (isXMLSpace((char) (c = r.read())))
327: ;
328:
329: if (c != '"' && c != '\'') {
330: return e;
331: }
332: sc = (char) c;
333:
334: StringBuffer enc = new StringBuffer();
335: for (;;) {
336: c = r.read();
337: if (c == -1) {
338: return e;
339: }
340: if (c == sc) {
341: return encodingToJavaEncoding(enc.toString(), e);
342: }
343: enc.append((char) c);
344: }
345: }
346:
347: /**
348: * Converts the given standard encoding representation to the
349: * corresponding Java encoding string.
350: * @param e the encoding string to convert.
351: * @param de the encoding string if no corresponding encoding was found.
352: */
353: public static String encodingToJavaEncoding(String e, String de) {
354: String result = EncodingUtilities.javaEncoding(e);
355: return (result == null) ? de : result;
356: }
357: }
|