001: /*
002: * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
003: * PROPRIETARY/CONFIDENTIAL. Use of this product is subject to license terms.
004: */
005: package com.sun.portal.rproxy.rewriter.util.http;
006:
007: import java.io.*;
008: import com.sun.portal.log.common.PortalLogger;
009: import java.util.ArrayList;
010:
011: import org.mozilla.intl.chardet.nsDetector;
012: import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
013: import org.mozilla.intl.chardet.nsPSMDetector;
014:
015: public class CharsetDetector {
016: private static boolean charsetDetect = true;
017: private String languageHint = System.getProperty("file.encoding");
018: private boolean found = false;
019: private String encoding;
020:
021: private static ArrayList chinese = new ArrayList();
022: private static ArrayList japanese = new ArrayList();
023: private static ArrayList korean = new ArrayList();
024: static {
025: chinese.add("GB2312");
026: chinese.add("GB18030");
027: chinese.add("Big5");
028: chinese.add("ISO-2022-CN");
029: chinese.add("HZ-GB-2312");
030: chinese.add("x-euc-tw");
031:
032: korean.add("EUC-KR");
033: korean.add("ISO-2022-KR");
034:
035: japanese.add("Shift_JIS");
036: japanese.add("EUC-JP");
037: japanese.add("ISO-2022-JP");
038: }
039:
040: public CharsetDetector() {
041: }
042:
043: public static boolean charsetDetectEnabled() {
044: return charsetDetect;
045: }
046:
047: public int getLanguageHint() {
048: return getLanguageHint(languageHint);
049: }
050:
051: public int getLanguageHint(String languageHint) {
052: int hint = nsPSMDetector.ALL;
053: if (chinese.contains(languageHint))
054: hint = nsPSMDetector.CHINESE;
055: else if (korean.contains(languageHint))
056: hint = nsPSMDetector.KOREAN;
057: else if (japanese.contains(languageHint))
058: hint = nsPSMDetector.JAPANESE;
059: return hint;
060: }
061:
062: public String detectCharset(HTTPData aHTTPData) {
063: // Initalize the nsDetector() ;
064: int lang = getLanguageHint();
065: nsDetector det = new nsDetector(lang);
066:
067: // Set an observer...
068: // The Notify() will be called when a matching charset is found.
069:
070: det.Init(new nsICharsetDetectionObserver() {
071: public void Notify(String charset) {
072: found = true;
073: encoding = charset;
074: }
075: });
076: try {
077: BufferedInputStream bis = new BufferedInputStream(
078: new ByteArrayInputStream(aHTTPData
079: .getContentBytes()));
080: byte[] buf = new byte[1024];
081: int len;
082: boolean done = false;
083: boolean isAscii = true;
084:
085: while ((len = bis.read(buf, 0, buf.length)) != -1) {
086:
087: // Check if the stream is only ascii.
088: if (isAscii)
089: isAscii = det.isAscii(buf, len);
090:
091: // DoIt if non-ascii and not done yet.
092: if (!isAscii && !done)
093: done = det.DoIt(buf, len, false);
094: }
095: det.DataEnd();
096:
097: if (isAscii) {
098: found = true;
099: encoding = "ASCII";
100: }
101:
102: if (!found) {
103: det.getProbableCharsets();
104: String[] probableEncoding = det.getProbableCharsets();
105: if (probableEncoding.length > 0) {
106: if (!(encoding
107: .equalsIgnoreCase(probableEncoding[0])))
108: encoding = new String(probableEncoding[0]);
109: }
110: }
111: } catch (java.io.IOException ioe) {
112: // Probably return without checking.
113: }
114: return encoding;
115: }
116: }
|