001: /*
002: * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
003: * PROPRIETARY/CONFIDENTIAL. Use of this product is subject to license terms.
004: */
005: package com.sun.portal.rproxy.rewriter.util.http;
006:
007: import com.sun.portal.rewriter.engines.LanguageConstants;
008: import com.sun.portal.log.common.PortalLogger;
009: import com.sun.portal.rewriter.util.StringHelper;
010: import com.sun.portal.rewriter.util.xml.Tag;
011: import com.sun.portal.rewriter.util.xml.TagParser;
012:
013: import java.io.IOException;
014:
015: public final class MIMEAndEncodingParser {
016: private static final String CONTENT_TYPE_PATTERN = "Content-type:";
017: private static final String CONTENT_PATTERN = "Content";
018: private static final String CHARSET_PATTERN = "charset";
019: private static final String CHARSET_EQUALS_PATTERN = CHARSET_PATTERN
020: + "=";
021: private static boolean charsetDetectionEnabled = false;
022:
023: static {
024:
025: /* Check if charecter detection is enabled. It is optional since it is a performance issue.
026: Right now we will just check if the chardet.jar is in classpath. If it is there then
027: set the flag to true.
028: */
029: try {
030: Class.forName("org.mozilla.intl.chardet.nsDetector");
031: charsetDetectionEnabled = true;
032: } catch (ClassNotFoundException e) {
033: // Character detection jar file not found.
034: }
035:
036: }
037:
038: /**
039: * Content type can contain the encoding seperated with ';'
040: * eg. text/html;charset=UTF-8
041: */
042: private static String parseMIME(final String aContentType,
043: final String aContentTypeLowerCase) {
044: int i = aContentTypeLowerCase.indexOf(';');
045: if (i == -1) {
046: return StringHelper.trimQuotes(aContentType);
047: } else {
048: return StringHelper
049: .trimQuotes(aContentType.substring(0, i));
050: }
051: }//parseMIME()
052:
053: /**
054: * Content type can contain the encoding seperated with ';'
055: * eg. text/html;charset=UTF-8
056: */
057: private static final String parseEncoding(
058: final String aContentType,
059: final String aContentTypeLowerCase)
060:
061: {
062: String lEncoding = null;
063: int start = aContentTypeLowerCase
064: .indexOf(CHARSET_EQUALS_PATTERN);
065:
066: if (start != -1) {
067: /*
068: BugNo:4711193
069: When the webserver encoding is set ISO-8859-1, and the page
070: had meta tag with charset=UTF-8, the headder sent was of
071: the below pattern:
072: Content-type: text/html;charset=UTF-8; charset=ISO-8859-1
073: Netscape and IE takes the first one as charset i.e UTF-8
074: */
075: int end = aContentTypeLowerCase.indexOf(';', start);
076: if (end == -1) {
077: end = aContentTypeLowerCase.length();
078: }
079:
080: if (end - start > CHARSET_EQUALS_PATTERN.length()) {
081: lEncoding = aContentType.substring(start
082: + CHARSET_EQUALS_PATTERN.length(), end);
083: }
084: }
085:
086: return validateEncoding(lEncoding);
087: }//parseEncoding()
088:
089: private static String validateEncoding(String aPrabableEncoding) {
090: if (aPrabableEncoding == null) {
091: return null;
092: }
093:
094: if (aPrabableEncoding.length() > 0) {
095: //BugNo:4750041
096: aPrabableEncoding = StringHelper
097: .trimQuotes(aPrabableEncoding);
098: //Bug No:4865959
099: //We found one charset within a <meta> tag
100: //the charset value can only contain letter, digit,
101: //charcter '-' or '_'
102: //Another important thing is we need to elimated such kind of stmt
103: //written in DHTML..
104: char ch;
105: int endCE = -1;
106: while (++endCE < aPrabableEncoding.length()) {
107: ch = aPrabableEncoding.charAt(endCE);
108: if (!(Character.isLetterOrDigit(ch) || (ch == '-') || (ch == '_'))) {
109: break;
110: }
111: }//while loop
112:
113: if (endCE > 0 && endCE <= aPrabableEncoding.length()) {
114: return aPrabableEncoding.substring(0, endCE);
115: }
116: }
117: return null;
118: }//validateEncoding()
119:
120: private static void parseMIMEAndEncoding(
121: final ContentInfo aContentTypeInfo,
122: final String aContentType) {
123: String lContentType = StringHelper.trimQuotes(aContentType);
124:
125: String lContentTypeLowerCase = lContentType.toLowerCase();
126:
127: //may mime is present in protocol headder and not in meta tag
128: //or wise versa
129: aContentTypeInfo.setMIME(parseMIME(lContentType,
130: lContentTypeLowerCase));
131:
132: //may be encoding is present in protocol headder and not in meta tag
133: //or wise versa
134: aContentTypeInfo.setEncoding(parseEncoding(lContentType,
135: lContentTypeLowerCase));
136:
137: }//parseMIMEAndEncoding()
138:
139: /**
140: * This method gets the character encoding from the response
141: * hearer. If not found, it trys to go through the response
142: * aContent to figure out the character encoding setting
143: * possiblely specified in the html <meta> tag.
144: */
145: public static final ContentInfo parse(HTTPData aHTTPData)
146: throws IOException {
147: parseHeadder(aHTTPData);
148: if (!isNeeds2ScanBody(aHTTPData)) {
149: return aHTTPData.getContentInfo();
150: }
151:
152: /**
153: * The character encoding info was not found in the response
154: * header. We have to parse through the aContent portion to
155: * figure it out. It may be specified in the html <meta> tag
156: * as the following;
157: *
158: * <html>
159: * <head>
160: * <meta http-equiv="xyz" type="yes">
161: * <meta http-equiv="Content-Type" Content="text/html; charset=gb2312">
162: * </head>
163: * ...
164: * Make sure your logic would take of multiple meta tags and one of
165: * the tag would have content type details
166: */
167: //initialize the RawXXX vars
168: String lContent = aHTTPData.getRawEncodedString();
169: String lContentLowerCase = aHTTPData
170: .getRawEncodedStringLowerCase();
171:
172: int bMetaTagStartIndex;
173: int bMetaTagEndIndex;
174: int cursorIndex = 0;
175:
176: int bCharsetStartIndex;
177: while (true) {
178: bMetaTagStartIndex = lContentLowerCase.indexOf("<meta",
179: cursorIndex);
180:
181: if (bMetaTagStartIndex == -1) {
182: //There are no meta tags found
183: break;
184: }
185:
186: bMetaTagEndIndex = lContentLowerCase.indexOf(">",
187: bMetaTagStartIndex);
188: if (bMetaTagEndIndex == -1) {
189: break;
190: }
191:
192: bCharsetStartIndex = lContentLowerCase.indexOf(
193: CHARSET_EQUALS_PATTERN, bMetaTagStartIndex);
194: if (bCharsetStartIndex == -1) {
195: break;
196: }
197:
198: if (bMetaTagEndIndex < bCharsetStartIndex) {
199: //charset index is out side end index so igore this meta tag
200: cursorIndex = bMetaTagEndIndex;
201: continue;
202: } else {
203: String bMetaTag = lContent.substring(
204: bMetaTagStartIndex, bMetaTagEndIndex - 1);
205: Tag bTag = TagParser.parse(bMetaTag);
206: String bContentValue = bTag.get(CONTENT_PATTERN);
207: parseMIMEAndEncoding(aHTTPData.getContentInfo(),
208: bContentValue);
209: if (aHTTPData.getContentInfo().isInvalidEncoding()) {
210: //Handle case of kind
211: // <meta http-equiv="content-type" content="text/html" charset="windows-1251" />
212: aHTTPData.getContentInfo()
213: .setEncoding(
214: validateEncoding(bTag
215: .get(CHARSET_PATTERN)));
216: }
217: break;
218: }
219: }//while loop
220:
221: // If mozilla character detection module is installed, use it to check for the encoding.
222: if ((aHTTPData.getContentInfo().isInvalidEncoding())) {
223: if (charsetDetectionEnabled) {
224: CharsetDetector cdet = new CharsetDetector();
225: String aEncoding = cdet.detectCharset(aHTTPData);
226: aHTTPData.getContentInfo().setEncoding(aEncoding);
227: }
228: }
229:
230: return aHTTPData.getContentInfo();
231: }//parse()
232:
233: public static void parseHeadder(HTTPData aHTTPData) {
234: String headder = aHTTPData.getContentType();
235: /**
236: * At first, try the Content-Type header in the response
237: * header portion.
238: */
239: if (headder != null && headder.length() > 0)
240:
241: {
242: int index = headder.indexOf(CONTENT_TYPE_PATTERN);
243: String bContentType;
244: if (index == -1) {
245: bContentType = headder;
246: } else {
247: bContentType = headder.substring(index
248: + CONTENT_TYPE_PATTERN.length());
249: }
250:
251: parseMIMEAndEncoding(aHTTPData.getContentInfo(),
252: bContentType);
253: }
254: }//parseHeadder()
255:
256: private static boolean isNeeds2ScanBody(HTTPData aHTTPData)
257: throws IOException {
258: ContentInfo lContentInfo = aHTTPData.getContentInfo();
259: //for images and other binary formats encoding has no meaning..
260: if (lContentInfo.isInvalidMIME()
261: || (lContentInfo.isInvalidEncoding() && (lContentInfo
262: .getMIME().equalsIgnoreCase(
263: LanguageConstants.HTML_MIME) || lContentInfo
264: .getMIME().equalsIgnoreCase("text/htm")))) {
265: if ((aHTTPData.getContentBytes() == null)
266: || (aHTTPData.getContentBytes().length == 0)) {
267: return false;
268: }
269:
270: return true;
271: }
272:
273: return false;
274: }//isNeeds2ScanBody()
275:
276: }//class MIMEAndEncodingParser
|