001: /*
002: * Portions Copyright 2003-2005 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025:
026: /*
027: *******************************************************************************
028: * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
029: * *
030: * The original version of this source code and documentation is copyrighted *
031: * and owned by IBM, These materials are provided under terms of a License *
032: * Agreement between IBM and Sun. This technology is protected by multiple *
033: * US and International patents. This notice and attribution to IBM may not *
034: * to removed. *
035: *******************************************************************************
036: */
037:
038: package sun.text.normalizer;
039:
040: import java.io.InputStream;
041: import java.io.DataInputStream;
042: import java.io.IOException;
043: import java.util.Arrays;
044:
045: /**
046: * <p>A trie is a kind of compressed, serializable table of values
047: * associated with Unicode code points (0..0x10ffff).</p>
048: * <p>This class defines the basic structure of a trie and provides methods
049: * to <b>retrieve the offsets to the actual data</b>.</p>
050: * <p>Data will be the form of an array of basic types, char or int.</p>
051: * <p>The actual data format will have to be specified by the user in the
052: * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
053: * <p>This trie implementation is optimized for getting offset while walking
054: * forward through a UTF-16 string.
055: * Therefore, the simplest and fastest access macros are the
056: * fromLead() and fromOffsetTrail() methods.
057: * The fromBMP() method are a little more complicated; they get offsets even
058: * for lead surrogate codepoints, while the fromLead() method get special
059: * "folded" offsets for lead surrogate code units if there is relevant data
060: * associated with them.
061: * From such a folded offsets, an offset needs to be extracted to supply
062: * to the fromOffsetTrail() methods.
063: * To handle such supplementary codepoints, some offset information are kept
064: * in the data.</p>
065: * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
066: * that offset from the folded value for the lead surrogate unit.</p>
067: * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
068: * com.ibm.icu.impl.IntTrie.</p>
069: * @author synwee
070: * @see com.ibm.icu.impl.CharTrie
071: * @see com.ibm.icu.impl.IntTrie
072: * @since release 2.1, Jan 01 2002
073: */
074: public abstract class Trie {
075: // public class declaration ----------------------------------------
076:
077: /**
078: * Character data in com.ibm.impl.Trie have different user-specified format
079: * for different purposes.
080: * This interface specifies methods to be implemented in order for
081: * com.ibm.impl.Trie, to surrogate offset information encapsulated within
082: * the data.
083: * @draft 2.1
084: */
085: public static interface DataManipulate {
086: /**
087: * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
088: * data
089: * the index array offset of the indexes for that lead surrogate.
090: * @param value data value for a surrogate from the trie, including the
091: * folding offset
092: * @return data offset or 0 if there is no data for the lead surrogate
093: * @draft 2.1
094: */
095: public int getFoldingOffset(int value);
096: }
097:
098: // protected constructor -------------------------------------------
099:
100: /**
101: * Trie constructor for CharTrie use.
102: * @param inputStream ICU data file input stream which contains the
103: * trie
104: * @param dataManipulate object containing the information to parse the
105: * trie data
106: * @throws IOException thrown when input stream does not have the
107: * right header.
108: * @draft 2.1
109: */
110: protected Trie(InputStream inputStream,
111: DataManipulate dataManipulate) throws IOException {
112: DataInputStream input = new DataInputStream(inputStream);
113: // Magic number to authenticate the data.
114: int signature = input.readInt();
115: m_options_ = input.readInt();
116:
117: if (!checkHeader(signature)) {
118: throw new IllegalArgumentException(
119: "ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
120: }
121:
122: m_dataManipulate_ = dataManipulate;
123: m_isLatin1Linear_ = (m_options_ & HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
124: m_dataOffset_ = input.readInt();
125: m_dataLength_ = input.readInt();
126: unserialize(inputStream);
127: }
128:
129: /**
130: * Trie constructor
131: * @param index array to be used for index
132: * @param options used by the trie
133: * @param dataManipulate object containing the information to parse the
134: * trie data
135: * @draft 2.2
136: */
137: protected Trie(char index[], int options,
138: DataManipulate dataManipulate) {
139: m_options_ = options;
140: m_dataManipulate_ = dataManipulate;
141: m_isLatin1Linear_ = (m_options_ & HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
142: m_index_ = index;
143: m_dataOffset_ = m_index_.length;
144: }
145:
146: // protected data members ------------------------------------------
147:
148: /**
149: * Lead surrogate code points' index displacement in the index array.
150: * 0x10000-0xd800=0x2800
151: * 0x2800 >> INDEX_STAGE_1_SHIFT_
152: */
153: protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
154: /**
155: * Shift size for shifting right the input index. 1..9
156: * @draft 2.1
157: */
158: protected static final int INDEX_STAGE_1_SHIFT_ = 5;
159: /**
160: * Shift size for shifting left the index array values.
161: * Increases possible data size with 16-bit index values at the cost
162: * of compactability.
163: * This requires blocks of stage 2 data to be aligned by
164: * DATA_GRANULARITY.
165: * 0..INDEX_STAGE_1_SHIFT
166: * @draft 2.1
167: */
168: protected static final int INDEX_STAGE_2_SHIFT_ = 2;
169: /**
170: * Mask for getting the lower bits from the input index.
171: * DATA_BLOCK_LENGTH_ - 1.
172: * @draft 2.1
173: */
174: protected static final int INDEX_STAGE_3_MASK_ = (1 << INDEX_STAGE_1_SHIFT_) - 1;
175: /**
176: * Surrogate mask to use when shifting offset to retrieve supplementary
177: * values
178: * @draft 2.1
179: */
180: protected static final int SURROGATE_MASK_ = 0x3FF;
181: /**
182: * Index or UTF16 characters
183: * @draft 2.1
184: */
185: protected char m_index_[];
186: /**
187: * Internal TrieValue which handles the parsing of the data value.
188: * This class is to be implemented by the user
189: * @draft 2.1
190: */
191: protected DataManipulate m_dataManipulate_;
192: /**
193: * Start index of the data portion of the trie. CharTrie combines
194: * index and data into a char array, so this is used to indicate the
195: * initial offset to the data portion.
196: * Note this index always points to the initial value.
197: * @draft 2.1
198: */
199: protected int m_dataOffset_;
200: /**
201: * Length of the data array
202: */
203: protected int m_dataLength_;
204:
205: // protected methods -----------------------------------------------
206:
207: /**
208: * Gets the offset to the data which the surrogate pair points to.
209: * @param lead lead surrogate
210: * @param trail trailing surrogate
211: * @return offset to data
212: * @draft 2.1
213: */
214: protected abstract int getSurrogateOffset(char lead, char trail);
215:
216: /**
217: * Gets the value at the argument index
218: * @param index value at index will be retrieved
219: * @return 32 bit value
220: * @draft 2.1
221: */
222: protected abstract int getValue(int index);
223:
224: /**
225: * Gets the default initial value
226: * @return 32 bit value
227: * @draft 2.1
228: */
229: protected abstract int getInitialValue();
230:
231: /**
232: * Gets the offset to the data which the index ch after variable offset
233: * points to.
234: * Note for locating a non-supplementary character data offset, calling
235: * <p>
236: * getRawOffset(0, ch);
237: * </p>
238: * will do. Otherwise if it is a supplementary character formed by
239: * surrogates lead and trail. Then we would have to call getRawOffset()
240: * with getFoldingIndexOffset(). See getSurrogateOffset().
241: * @param offset index offset which ch is to start from
242: * @param ch index to be used after offset
243: * @return offset to the data
244: * @draft 2.1
245: */
246: protected final int getRawOffset(int offset, char ch) {
247: return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] << INDEX_STAGE_2_SHIFT_)
248: + (ch & INDEX_STAGE_3_MASK_);
249: }
250:
251: /**
252: * Gets the offset to data which the BMP character points to
253: * Treats a lead surrogate as a normal code point.
254: * @param ch BMP character
255: * @return offset to data
256: * @draft 2.1
257: */
258: protected final int getBMPOffset(char ch) {
259: return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) ? getRawOffset(
260: LEAD_INDEX_OFFSET_, ch)
261: : getRawOffset(0, ch);
262: // using a getRawOffset(ch) makes no diff
263: }
264:
265: /**
266: * Gets the offset to the data which this lead surrogate character points
267: * to.
268: * Data at the returned offset may contain folding offset information for
269: * the next trailing surrogate character.
270: * @param ch lead surrogate character
271: * @return offset to data
272: * @draft 2.1
273: */
274: protected final int getLeadOffset(char ch) {
275: return getRawOffset(0, ch);
276: }
277:
278: /**
279: * Internal trie getter from a code point.
280: * Could be faster(?) but longer with
281: * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
282: * Gets the offset to data which the codepoint points to
283: * @param ch codepoint
284: * @return offset to data
285: * @draft 2.1
286: */
287: protected final int getCodePointOffset(int ch) {
288: // if ((ch >> 16) == 0) slower
289: if (ch >= UTF16.CODEPOINT_MIN_VALUE
290: && ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
291: // BMP codepoint
292: return getBMPOffset((char) ch);
293: }
294: // for optimization
295: if (ch >= UTF16.CODEPOINT_MIN_VALUE
296: && ch <= UCharacter.MAX_VALUE) {
297: // look at the construction of supplementary characters
298: // trail forms the ends of it.
299: return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
300: (char) (ch & SURROGATE_MASK_));
301: }
302: // return -1 if there is an error, in this case we return
303: return -1;
304: }
305:
306: /**
307: * <p>Parses the inputstream and creates the trie index with it.</p>
308: * <p>This is overwritten by the child classes.
309: * @param inputStream input stream containing the trie information
310: * @exception IOException thrown when data reading fails.
311: * @draft 2.1
312: */
313: protected void unserialize(InputStream inputStream)
314: throws IOException {
315: //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
316: m_index_ = new char[m_dataOffset_];
317: DataInputStream input = new DataInputStream(inputStream);
318: for (int i = 0; i < m_dataOffset_; i++) {
319: m_index_[i] = input.readChar();
320: }
321: }
322:
323: /**
324: * Determines if this is a 32 bit trie
325: * @return true if options specifies this is a 32 bit trie
326: * @draft 2.1
327: */
328: protected final boolean isIntTrie() {
329: return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
330: }
331:
332: /**
333: * Determines if this is a 16 bit trie
334: * @return true if this is a 16 bit trie
335: * @draft 2.1
336: */
337: protected final boolean isCharTrie() {
338: return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
339: }
340:
341: // private data members --------------------------------------------
342:
343: /**
344: * Signature index
345: */
346: private static final int HEADER_SIGNATURE_INDEX_ = 0;
347: /**
348: * Options index
349: */
350: private static final int HEADER_OPTIONS_INDEX_ = 1 << 1;
351: /**
352: * Index length index
353: */
354: private static final int HEADER_INDEX_LENGTH_INDEX_ = 2 << 1;
355: /**
356: * Data length index
357: */
358: private static final int HEADER_DATA_LENGTH_INDEX_ = 3 << 1;
359: /**
360: * Size of header
361: */
362: private static final int HEADER_LENGTH_ = 4 << 1;
363: /**
364: * Latin 1 option mask
365: */
366: private static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
367: /**
368: * Constant number to authenticate the byte block
369: */
370: private static final int HEADER_SIGNATURE_ = 0x54726965;
371: /**
372: * Header option formatting
373: */
374: private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
375: private static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
376: private static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
377:
378: /**
379: * Flag indicator for Latin quick access data block
380: */
381: private boolean m_isLatin1Linear_;
382:
383: /**
384: * <p>Trie options field.</p>
385: * <p>options bit field:<br>
386: * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
387: * 8 0 = 16-bit data, 1=32-bit data<br>
388: * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
389: * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
390: */
391: private int m_options_;
392:
393: // private methods ---------------------------------------------------
394:
395: /**
396: * Authenticates raw data header.
397: * Checking the header information, signature and options.
398: * @param rawdata array of char data to be checked
399: * @return true if the header is authenticated valid
400: * @draft 2.1
401: */
402: private final boolean checkHeader(int signature) {
403: // check the signature
404: // Trie in big-endian US-ASCII (0x54726965).
405: // Magic number to authenticate the data.
406: if (signature != HEADER_SIGNATURE_) {
407: return false;
408: }
409:
410: if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != INDEX_STAGE_1_SHIFT_
411: || ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & HEADER_OPTIONS_SHIFT_MASK_) != INDEX_STAGE_2_SHIFT_) {
412: return false;
413: }
414: return true;
415: }
416: }
|