001: /*
002: * Copyright 1997-2006 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025:
026: package com.sun.activation.registries;
027:
/**
 * A tokenizer for strings in the form of "foo/bar; prop1=val1; ... ".
 * Useful for parsing MIME content types and mailcap entries.
 *
 * <p>Typical use: construct the tokenizer, call {@link #nextToken()}
 * repeatedly until it returns {@link #EOI_TOKEN}, and read the text of
 * each token with {@link #getCurrentTokenValue()}.
 */
public class MailcapTokenizer {

    /** Token type for a single character that fits no other category. */
    public static final int UNKNOWN_TOKEN = 0;
    /** Token type reported before the first call to {@link #nextToken()}. */
    public static final int START_TOKEN = 1;
    /** Token type for a run of string characters (or an auto-quoted string). */
    public static final int STRING_TOKEN = 2;
    /** Token type reported once the end of the input has been reached. */
    public static final int EOI_TOKEN = 5;
    /** Token type for '/'; its value is the character code itself. */
    public static final int SLASH_TOKEN = '/';
    /** Token type for ';'; its value is the character code itself. */
    public static final int SEMICOLON_TOKEN = ';';
    /** Token type for '='; its value is the character code itself. */
    public static final int EQUALS_TOKEN = '=';

    /** The string being tokenized. */
    private final String data;
    /** Cached length of {@link #data}. */
    private final int dataLength;
    /** Character that terminates an auto-quoted token. */
    private final char autoquoteChar;

    /** Index of the next unread character in {@link #data}. */
    private int dataIndex;
    /** Type of the most recently scanned token. */
    private int currentToken;
    /** Text of the most recently scanned token; null at end of input. */
    private String currentTokenValue;
    /** Whether the next token is scanned in auto-quoting mode. */
    private boolean isAutoquoting;

    /**
     * Constructor.
     *
     * @param inputString the string to tokenize
     * @throws NullPointerException if inputString is null
     */
    public MailcapTokenizer(String inputString) {
        data = inputString;
        dataIndex = 0;
        dataLength = inputString.length();

        currentToken = START_TOKEN;
        currentTokenValue = "";

        isAutoquoting = false;
        autoquoteChar = ';';
    }

    /**
     * Set whether auto-quoting is on or off.
     *
     * <p>With auto-quoting on, leading whitespace is skipped and then,
     * unless the first character is ';' or '=' (which are returned as
     * their own tokens), all characters up to the next unescaped
     * auto-quote terminator character (';') or the end of input are
     * returned as a single string token. A backslash escapes the
     * character that follows it, so "\;" yields a literal ';' inside the
     * token instead of ending it; escape sequences are resolved in the
     * returned token value.
     *
     * <p>NOTE: whitespace immediately preceding the terminator is kept
     * as part of the token; it is not trimmed by this class.
     *
     * <p>This is required for handling command strings in a mailcap entry.
     *
     * @param value true to turn auto-quoting on, false to turn it off
     */
    public void setIsAutoquoting(boolean value) {
        isAutoquoting = value;
    }

    /**
     * Retrieve current token.
     *
     * @return the type of the current token, i.e. the value last
     *         returned by {@link #nextToken()}, or {@link #START_TOKEN}
     *         if nextToken has not been called yet
     */
    public int getCurrentToken() {
        return currentToken;
    }

    /**
     * Get a String that describes the given token.
     *
     * @param token a token type, normally one of the *_TOKEN constants
     * @return a short name for the token type, or "really unknown" if
     *         the value is not one of the *_TOKEN constants
     */
    public static String nameForToken(int token) {
        switch (token) {
        case UNKNOWN_TOKEN:
            return "unknown";
        case START_TOKEN:
            return "start";
        case STRING_TOKEN:
            return "string";
        case EOI_TOKEN:
            return "EOI";
        case SLASH_TOKEN:
            return "'/'";
        case SEMICOLON_TOKEN:
            return "';'";
        case EQUALS_TOKEN:
            return "'='";
        default:
            return "really unknown";
        }
    }

    /**
     * Retrieve current token value.
     *
     * @return a String containing the text of the current token; the
     *         empty string before the first call to {@link #nextToken()}
     *         and null once {@link #EOI_TOKEN} has been returned
     */
    public String getCurrentTokenValue() {
        return currentTokenValue;
    }

    /**
     * Process the next token.
     *
     * <p>Skips leading whitespace, then scans one token starting at the
     * current position. The token's text is afterwards available from
     * {@link #getCurrentTokenValue()}.
     *
     * @return the type of the token that was scanned;
     *         {@link #EOI_TOKEN} when no input remains
     */
    public int nextToken() {
        // skip white space
        while (dataIndex < dataLength
                && isWhiteSpaceChar(data.charAt(dataIndex))) {
            ++dataIndex;
        }

        if (dataIndex >= dataLength) {
            currentToken = EOI_TOKEN;
            currentTokenValue = null;
            return currentToken;
        }

        // examine the current character and see what kind of token we have
        char c = data.charAt(dataIndex);
        if (isAutoquoting) {
            if (c == ';' || c == '=') {
                processSingleCharToken(c, c);
            } else {
                processAutoquoteToken();
            }
        } else if (isStringTokenChar(c)) {
            processStringToken();
        } else if (c == '/' || c == ';' || c == '=') {
            processSingleCharToken(c, c);
        } else {
            processSingleCharToken(UNKNOWN_TOKEN, c);
        }

        return currentToken;
    }

    /*
     * Consume the single character c and record it as a token of the
     * given type.
     */
    private void processSingleCharToken(int tokenType, char c) {
        currentToken = tokenType;
        currentTokenValue = String.valueOf(c);
        ++dataIndex;
    }

    /*
     * Consume a maximal run of string-token characters starting at the
     * current position and record it as a STRING_TOKEN.
     */
    private void processStringToken() {
        // capture the initial index
        int initialIndex = dataIndex;

        // skip to 1st non string token character
        while (dataIndex < dataLength
                && isStringTokenChar(data.charAt(dataIndex))) {
            ++dataIndex;
        }

        currentToken = STRING_TOKEN;
        currentTokenValue = data.substring(initialIndex, dataIndex);
    }

    /*
     * Consume everything up to (but not including) the first non-escaped
     * autoquote termination character, or up to the end of input, and
     * record it as a STRING_TOKEN with escape sequences resolved.
     */
    private void processAutoquoteToken() {
        // capture the initial index
        int initialIndex = dataIndex;

        while (dataIndex < dataLength) {
            char c = data.charAt(dataIndex);
            if (c == '\\' && dataIndex + 1 < dataLength) {
                // A backslash escapes the next character, so step over
                // both; this keeps an escaped terminator ("\;") inside
                // the token, in agreement with fixEscapeSequences.
                dataIndex += 2;
            } else if (c == autoquoteChar) {
                // unescaped terminator: leave it for the next token
                break;
            } else {
                ++dataIndex;
            }
        }

        currentToken = STRING_TOKEN;
        currentTokenValue = fixEscapeSequences(
                data.substring(initialIndex, dataIndex));
    }

    /*
     * Tell whether c is one of the punctuation characters that may not
     * appear inside a string token.
     */
    private static boolean isSpecialChar(char c) {
        switch (c) {
        case '(':
        case ')':
        case '<':
        case '>':
        case '@':
        case ',':
        case ';':
        case ':':
        case '\\':
        case '"':
        case '/':
        case '[':
        case ']':
        case '?':
        case '=':
            return true;
        default:
            return false;
        }
    }

    private static boolean isControlChar(char c) {
        return Character.isISOControl(c);
    }

    private static boolean isWhiteSpaceChar(char c) {
        return Character.isWhitespace(c);
    }

    /*
     * A string-token character is anything that is not special, not a
     * control character and not whitespace.
     */
    private static boolean isStringTokenChar(char c) {
        return !isSpecialChar(c) && !isControlChar(c)
                && !isWhiteSpaceChar(c);
    }

    /*
     * Return a copy of inputString in which every backslash-escaped
     * character is replaced by the character itself. A lone backslash at
     * the very end of the string is kept as is.
     */
    private static String fixEscapeSequences(String inputString) {
        int inputLength = inputString.length();
        StringBuffer buffer = new StringBuffer(inputLength);

        for (int i = 0; i < inputLength; ++i) {
            char currentChar = inputString.charAt(i);
            if (currentChar == '\\' && i < inputLength - 1) {
                // emit the escaped character and skip over it
                ++i;
                buffer.append(inputString.charAt(i));
            } else {
                buffer.append(currentChar);
            }
        }

        return buffer.toString();
    }
}
|