001: /*
002:
003: Licensed to the Apache Software Foundation (ASF) under one or more
004: contributor license agreements. See the NOTICE file distributed with
005: this work for additional information regarding copyright ownership.
006: The ASF licenses this file to You under the Apache License, Version 2.0
007: (the "License"); you may not use this file except in compliance with
008: the License. You may obtain a copy of the License at
009:
010: http://www.apache.org/licenses/LICENSE-2.0
011:
012: Unless required by applicable law or agreed to in writing, software
013: distributed under the License is distributed on an "AS IS" BASIS,
014: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: See the License for the specific language governing permissions and
016: limitations under the License.
017:
018: */
019: package org.apache.batik.util;
020:
021: import java.io.BufferedInputStream;
022: import java.io.IOException;
023: import java.io.InputStream;
024: import java.net.HttpURLConnection;
025: import java.net.MalformedURLException;
026: import java.net.URL;
027: import java.net.URLConnection;
028: import java.util.Iterator;
029: import java.util.LinkedList;
030: import java.util.List;
031: import java.util.zip.GZIPInputStream;
032: import java.util.zip.InflaterInputStream;
033: import java.util.zip.ZipException;
034:
/**
 * Holds the parsed components of a URL (protocol, host, port, path and
 * fragment reference) together with the state needed to open a stream
 * on it and to report the content type and encoding of that stream.
 *
 * @author <a href="mailto:deweese@apache.org">Thomas DeWeese</a>
 * @version $Id: ParsedURLData.java 501495 2007-01-30 18:00:36Z dvholten $
 */
public class ParsedURLData {

    protected static final String HTTP_USER_AGENT_HEADER = "User-Agent";

    protected static final String HTTP_ACCEPT_HEADER = "Accept";
    protected static final String HTTP_ACCEPT_LANGUAGE_HEADER = "Accept-Language";
    protected static final String HTTP_ACCEPT_ENCODING_HEADER = "Accept-Encoding";

    /**
     * Content encodings advertised in the Accept-Encoding header by
     * {@link #openStream(String, Iterator)}. Elements are strings.
     */
    protected static List acceptedEncodings = new LinkedList();
    static {
        acceptedEncodings.add("gzip");
    }

    /**
     * GZIP header magic number bytes, as found at the start of gzipped
     * files, which are encoded in Intel format (i.e. little endian).
     */
    public static final byte[] GZIP_MAGIC = { (byte) 0x1f, (byte) 0x8b };

    /**
     * Size of the input buffer used by the zlib probe in
     * {@link #checkGZIP(InputStream)}. The same value is used as the mark
     * limit so that one buffer fill by the probe stays within the mark.
     */
    private static final int ZLIB_PROBE_SIZE = 512;

    /**
     * Utility function that checks whether <code>is</code> is a GZIP or
     * zlib (deflate) stream. If so, it returns a stream that decodes the
     * contents; otherwise it returns <code>is</code> (or a buffered
     * version of it) positioned at its original start.
     *
     * @param is Stream that may potentially be a GZIP or zlib stream.
     * @return a decoding stream, or the (possibly buffered) input stream.
     * @throws IOException if the stream cannot be reset, or if probing a
     *         zlib-looking stream fails for a reason other than invalid
     *         compressed data.
     */
    public static InputStream checkGZIP(InputStream is)
        throws IOException {

        if (!is.markSupported()) {
            is = new BufferedInputStream(is);
        }

        byte[] data = new byte[2];
        int count;
        try {
            is.mark(data.length);
            count = readUpTo(is, data);
            is.reset();
        } catch (Exception ex) {
            is.reset();
            return is;
        }

        // Fewer than two bytes can be neither a GZIP nor a zlib header.
        if (count < data.length) {
            return is;
        }

        if ((data[0] == GZIP_MAGIC[0]) && (data[1] == GZIP_MAGIC[1])) {
            return new GZIPInputStream(is);
        }

        int cmf = data[0] & 0xFF;
        int flg = data[1] & 0xFF;
        boolean looksLikeZlib = ((cmf & 0x0F) == 8)
            && ((cmf >>> 4) <= 7)
            && (((cmf * 256 + flg) % 31) == 0);
        if (!looksLikeZlib) {
            return is;
        }

        // The header check alone is not very selective, so force the
        // inflater to decode the beginning of the stream before
        // committing to it.
        is.mark(ZLIB_PROBE_SIZE);
        java.util.zip.Inflater probe = new java.util.zip.Inflater();
        try {
            InputStream decoded = new BufferedInputStream(
                new InflaterInputStream(is, probe, ZLIB_PROBE_SIZE));
            decoded.read(data);
            is.reset();
            return new InflaterInputStream(is);
        } catch (ZipException ze) {
            // Not valid deflate data after all: hand back the raw stream.
            is.reset();
            return is;
        } finally {
            // The probe inflater is never used again; release it now
            // instead of waiting for garbage collection.
            probe.end();
        }
    }

    /**
     * Reads from <code>in</code> until <code>buf</code> is full or the
     * end of the stream is reached.
     *
     * @return the number of bytes actually stored in <code>buf</code>.
     */
    private static int readUpTo(InputStream in, byte[] buf)
        throws IOException {
        int total = 0;
        while (total < buf.length) {
            int n = in.read(buf, total, buf.length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    /*
     * Since the Data instance is 'hidden' in the ParsedURL instance we
     * make all our fields public. This makes it easy for the various
     * Protocol Handlers to update an instance as parsing proceeds.
     */
    public String protocol = null;
    public String host = null;
    public int port = -1;
    public String path = null;
    public String ref = null;
    public String contentType = null;
    public String contentEncoding = null;

    public InputStream stream = null;
    public boolean hasBeenOpened = false;

    /**
     * The extracted type/subtype from the Content-Type header.
     */
    protected String contentTypeMediaType;

    /**
     * The extracted charset parameter from the Content-Type header.
     */
    protected String contentTypeCharset;

    /**
     * Void constructor
     */
    public ParsedURLData() {
    }

    /**
     * Build from an existing URL. Empty components are stored as
     * <code>null</code>.
     */
    public ParsedURLData(URL url) {
        protocol = emptyToNull(url.getProtocol());
        host = emptyToNull(url.getHost());
        port = url.getPort();
        path = emptyToNull(url.getFile());
        ref = emptyToNull(url.getRef());
    }

    /**
     * Returns <code>null</code> for a null or zero-length string and the
     * string itself otherwise.
     */
    private static String emptyToNull(String s) {
        return ((s != null) && (s.length() == 0)) ? null : s;
    }

    /**
     * Attempts to build a normal java.net.URL instance from this
     * URL. When both protocol and host are set the URL is built from
     * protocol, host, port and path only (the fragment reference is
     * not included); otherwise it is parsed from {@link #toString()}.
     *
     * @throws MalformedURLException if no valid URL can be built.
     */
    protected URL buildURL() throws MalformedURLException {
        if ((protocol != null) && (host != null)) {
            String file = (path != null) ? path : "";
            if (port == -1) {
                return new URL(protocol, host, file);
            }
            return new URL(protocol, host, port, file);
        }

        return new URL(toString());
    }

    /**
     * Implement Object.hashCode.
     */
    public int hashCode() {
        int hc = port;
        if (protocol != null) {
            hc ^= protocol.hashCode();
        }
        if (host != null) {
            hc ^= host.hashCode();
        }

        // For some URLs path and ref can get fairly long
        // and the most unique part is towards the end
        // so we grab that part for HC purposes
        if (path != null) {
            hc ^= tailHash(path);
        }
        if (ref != null) {
            hc ^= tailHash(ref);
        }

        return hc;
    }

    /**
     * Hash of the last 20 characters of <code>s</code>, or of all of
     * <code>s</code> when it is not longer than that.
     */
    private static int tailHash(String s) {
        int len = s.length();
        return (len > 20) ? s.substring(len - 20).hashCode() : s.hashCode();
    }

    /**
     * Implement Object.equals for ParsedURLData. Two instances are equal
     * when port, protocol, host, ref and path are all equal.
     */
    public boolean equals(Object obj) {
        // instanceof is false for null, so no separate null check is needed.
        if (!(obj instanceof ParsedURLData)) {
            return false;
        }

        ParsedURLData ud = (ParsedURLData) obj;
        return (ud.port == port)
            && sameString(ud.protocol, protocol)
            && sameString(ud.host, host)
            && sameString(ud.ref, ref)
            && sameString(ud.path, path);
    }

    /**
     * Null-safe string equality: true when both are null or both are
     * equal strings.
     */
    private static boolean sameString(String s1, String s2) {
        return (s1 == null) ? (s2 == null) : s1.equals(s2);
    }

    /**
     * Returns the content type if available. This is only available
     * for some protocols. If the stream has not been opened yet this
     * attempts to open it; a failure to open is ignored and the current
     * (possibly null) value is returned.
     */
    public String getContentType(String userAgent) {
        if (contentType != null) {
            return contentType;
        }

        if (!hasBeenOpened) {
            try {
                openStreamInternal(userAgent, null, null);
            } catch (IOException ioe) {
                // Best effort only: the content type stays unknown.
            }
        }

        return contentType;
    }

    /**
     * Returns the content type's type/subtype, if available. This is
     * only available for some protocols.
     */
    public String getContentTypeMediaType(String userAgent) {
        if (contentTypeMediaType != null) {
            return contentTypeMediaType;
        }

        extractContentTypeParts(userAgent);

        return contentTypeMediaType;
    }

    /**
     * Returns the content type's charset parameter, if available. This is
     * only available for some protocols.
     */
    public String getContentTypeCharset(String userAgent) {
        // A non-null media type means the Content-Type header has already
        // been split, so the cached charset (possibly null) is final.
        if (contentTypeMediaType != null) {
            return contentTypeCharset;
        }

        extractContentTypeParts(userAgent);

        return contentTypeCharset;
    }

    /**
     * Returns whether the Content-Type header has the given parameter
     * (with a non-empty value). The parameter name is matched exactly.
     */
    public boolean hasContentTypeParameter(String userAgent,
                                           String param) {
        getContentType(userAgent);
        if (contentType == null) {
            return false;
        }
        // Populate the cached media type together with the charset, so
        // that a later getContentTypeCharset() call does not see a
        // half-initialized cache.
        if (contentTypeMediaType == null) {
            extractContentTypeParts(userAgent);
        }
        return findParameterValue(contentType, param) != -1;
    }

    /**
     * Extracts the type/subtype and charset parameter from the Content-Type
     * header.
     */
    protected void extractContentTypeParts(String userAgent) {
        getContentType(userAgent);
        String header = contentType;
        if (header == null) {
            return;
        }

        contentTypeMediaType = header.substring(0, mediaTypeEnd(header));

        int start = findParameterValue(header, "charset");
        if (start == -1) {
            return;
        }
        int end = start;
        int len = header.length();
        while (end < len) {
            char c = header.charAt(end);
            if ((c == ' ') || (c == ';')) {
                break;
            }
            end++;
        }
        contentTypeCharset = header.substring(start, end);
    }

    /**
     * Returns the index of the first space or semicolon in
     * <code>header</code>, or its length if there is none. This is where
     * the type/subtype part ends.
     */
    private static int mediaTypeEnd(String header) {
        int len = header.length();
        int i = 0;
        while (i < len) {
            char c = header.charAt(i);
            if ((c == ' ') || (c == ';')) {
                break;
            }
            i++;
        }
        return i;
    }

    /**
     * Scans the parameters of a Content-Type header for
     * <code>name=</code> followed by at least one character.
     *
     * @return the index of the first character of the value, or -1 if
     *         the parameter is absent.
     */
    private static int findParameterValue(String header, String name) {
        int len = header.length();
        int nameLen = name.length();
        int i = mediaTypeEnd(header);
        for (;;) {
            i = header.indexOf(';', i);
            if (i == -1) {
                return -1;
            }
            i++;
            while ((i < len) && (header.charAt(i) == ' ')) {
                i++;
            }
            // Need room for the name, the '=' and one value character.
            if (i >= len - nameLen - 1) {
                return -1;
            }
            if (header.regionMatches(i, name, 0, nameLen)
                && (header.charAt(i + nameLen) == '=')) {
                return i + nameLen + 1;
            }
            // No match: resume at the parameter start so that a ';'
            // inside the compared region is not skipped.
        }
    }

    /**
     * Returns the content encoding if available. This is only available
     * for some protocols. If the stream has not been opened yet this
     * attempts to open it; a failure to open is ignored.
     */
    public String getContentEncoding(String userAgent) {
        if (contentEncoding != null) {
            return contentEncoding;
        }

        if (!hasBeenOpened) {
            try {
                openStreamInternal(userAgent, null, null);
            } catch (IOException ioe) {
                // Best effort only: the content encoding stays unknown.
            }
        }

        return contentEncoding;
    }

    /**
     * Returns true if the URL looks well formed and complete.
     * This does not guarantee that the stream can be opened but
     * is a good indication that things aren't totally messed up.
     */
    public boolean complete() {
        try {
            buildURL();
        } catch (MalformedURLException mue) {
            return false;
        }
        return true;
    }

    /**
     * Open the stream and check for common compression types.  If
     * the stream is found to be compressed with a standard
     * compression type it is automatically decompressed.
     * @param userAgent The user agent opening the stream (may be null).
     * @param mimeTypes The expected mime types of the content
     *        in the returned InputStream (mapped to Http accept
     *        header among other possibilities).  The elements of
     *        the iterator must be strings (may be null)
     */
    public InputStream openStream(String userAgent, Iterator mimeTypes)
        throws IOException {
        InputStream raw = openStreamInternal(userAgent, mimeTypes,
                                             acceptedEncodings.iterator());
        if (raw == null) {
            return null;
        }
        // Ownership of the stream passes to the caller; forget it so
        // the next open creates a fresh one.
        stream = null;

        return checkGZIP(raw);
    }

    /**
     * Open the stream and returns it.  No checks are made to see
     * if the stream is compressed or encoded in any way.
     * @param userAgent The user agent opening the stream (may be null).
     * @param mimeTypes The expected mime types of the content
     *        in the returned InputStream (mapped to Http accept
     *        header among other possibilities).  The elements of
     *        the iterator must be strings (may be null)
     */
    public InputStream openStreamRaw(String userAgent,
                                     Iterator mimeTypes) throws IOException {

        InputStream ret = openStreamInternal(userAgent, mimeTypes, null);
        stream = null;
        return ret;
    }

    /**
     * Opens a connection to the URL and returns its input stream,
     * remembering it in {@link #stream}. If a stream is already
     * remembered it is returned as is. For HTTP connections the
     * User-Agent, Accept and Accept-Encoding request headers are set
     * from the arguments and the response's content type and content
     * encoding are recorded.
     *
     * @param userAgent value for the User-Agent header (may be null).
     * @param mimeTypes values for the Accept header (may be null).
     * @param encodingTypes values for the Accept-Encoding header
     *        (may be null).
     * @throws IOException if the URL is malformed (the
     *         MalformedURLException is attached as the cause) or the
     *         connection cannot be opened.
     */
    protected InputStream openStreamInternal(String userAgent,
                                             Iterator mimeTypes,
                                             Iterator encodingTypes)
        throws IOException {
        if (stream != null) {
            return stream;
        }

        hasBeenOpened = true;

        URL url = null;
        try {
            url = buildURL();
        } catch (MalformedURLException mue) {
            IOException ioe = new IOException(
                "Unable to make sense of URL for connection");
            ioe.initCause(mue);
            throw ioe;
        }

        if (url == null) {
            return null;
        }

        URLConnection urlC = url.openConnection();
        if (urlC instanceof HttpURLConnection) {
            if (userAgent != null) {
                urlC.setRequestProperty(HTTP_USER_AGENT_HEADER,
                                        userAgent);
            }

            if (mimeTypes != null) {
                urlC.setRequestProperty(HTTP_ACCEPT_HEADER,
                                        joinWithCommas(mimeTypes));
            }

            if (encodingTypes != null) {
                urlC.setRequestProperty(HTTP_ACCEPT_ENCODING_HEADER,
                                        joinWithCommas(encodingTypes));
            }

            contentType = urlC.getContentType();
            contentEncoding = urlC.getContentEncoding();
        }

        return (stream = urlC.getInputStream());
    }

    /**
     * Concatenates the remaining elements of <code>values</code>,
     * separated by commas.
     */
    private static String joinWithCommas(Iterator values) {
        StringBuffer joined = new StringBuffer();
        while (values.hasNext()) {
            joined.append(values.next());
            if (values.hasNext()) {
                joined.append(',');
            }
        }
        return joined.toString();
    }

    /**
     * Returns the URL up to and including the port number on
     * the host.  Does not include the path or fragment pieces.
     */
    public String getPortStr() {
        String portStr = "";
        if (protocol != null) {
            portStr += protocol + ":";
        }

        if ((host != null) || (port != -1)) {
            portStr += "//";
            if (host != null) {
                portStr += host;
            }
            if (port != -1) {
                portStr += ":" + port;
            }
        }

        return portStr;
    }

    /**
     * Returns true if <code>other</code> matches this instance in
     * everything but the 'ref' (port, path, host and protocol).
     * Returns false for a null argument.
     */
    protected boolean sameFile(ParsedURLData other) {
        if (this == other) {
            return true;
        }
        if (other == null) {
            return false;
        }

        return (port == other.port)
            && sameString(path, other.path)
            && sameString(host, other.host)
            && sameString(protocol, other.protocol);
    }

    /**
     * Return a string representation of the data.
     */
    public String toString() {
        String ret = getPortStr();
        if (path != null) {
            ret += path;
        }

        if (ref != null) {
            ret += "#" + ref;
        }

        return ret;
    }
}
|