001: /*
002: * Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025:
026: /*
027: *******************************************************************************
028: * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
029: * *
030: * The original version of this source code and documentation is copyrighted *
031: * and owned by IBM, These materials are provided under terms of a License *
032: * Agreement between IBM and Sun. This technology is protected by multiple *
033: * US and International patents. This notice and attribution to IBM may not *
034: * to removed. *
035: *******************************************************************************
036: */
037:
038: package sun.text.normalizer;
039:
040: import java.io.BufferedInputStream;
041: import java.io.InputStream;
042: import java.io.IOException;
043: import java.text.BreakIterator;
044: import java.util.Locale;
045:
046: /**
047: * <p>Internal class used for Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
049: * It does not have the capability to parse the data into more high-level
050: * information. It only returns bytes of information when required.</p>
051: * <p>Due to the form most commonly used for retrieval, array of char is used
052: * to store the binary data.</p>
053: * <p>UCharacterPropertyDB also contains information on accessing indexes to
054: * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into more meaningful form lies on
056: * <a href=UCharacter.html>UCharacter</a>.</p>
057: * @author Syn Wee Quek
058: * @since release 2.1, february 1st 2002
059: * @draft 2.1
060: */
061:
062: public final class UCharacterProperty implements Trie.DataManipulate {
063: // public data members -----------------------------------------------
064:
065: /**
066: * Trie data
067: */
068: public CharTrie m_trie_;
069: /**
070: * Optimization
071: * CharTrie index array
072: */
073: public char[] m_trieIndex_;
074: /**
075: * Optimization
076: * CharTrie data array
077: */
078: public char[] m_trieData_;
079: /**
080: * Optimization
081: * CharTrie data offset
082: */
083: public int m_trieInitialValue_;
084: /**
085: * Character property table
086: */
087: public int m_property_[];
088: /**
089: * Unicode version
090: */
091: public VersionInfo m_unicodeVersion_;
092: /**
093: * Exception indicator for uppercase type
094: */
095: public static final int EXC_UPPERCASE_ = 0;
096: /**
097: * Exception indicator for lowercase type
098: */
099: public static final int EXC_LOWERCASE_ = 1;
100: /**
101: * Exception indicator for titlecase type
102: */
103: public static final int EXC_TITLECASE_ = 2;
104: /**
105: * Exception indicator for digit type
106: */
107: public static final int EXC_UNUSED_ = 3;
108: /**
109: * Exception indicator for numeric type
110: */
111: public static final int EXC_NUMERIC_VALUE_ = 4;
112: /**
113: * Exception indicator for denominator type
114: */
115: public static final int EXC_DENOMINATOR_VALUE_ = 5;
116: /**
117: * Exception indicator for mirror type
118: */
119: public static final int EXC_MIRROR_MAPPING_ = 6;
120: /**
121: * Exception indicator for special casing type
122: */
123: public static final int EXC_SPECIAL_CASING_ = 7;
124: /**
125: * Exception indicator for case folding type
126: */
127: public static final int EXC_CASE_FOLDING_ = 8;
128: /**
129: * EXC_COMBINING_CLASS_ is not found in ICU.
130: * Used to retrieve the combining class of the character in the exception
131: * value
132: */
133: public static final int EXC_COMBINING_CLASS_ = 9;
134:
135: /**
136: * Latin lowercase i
137: */
138: public static final char LATIN_SMALL_LETTER_I_ = 0x69;
139: /**
140: * Character type mask
141: */
142: public static final int TYPE_MASK = 0x1F;
143: /**
144: * Exception test mask
145: */
146: public static final int EXCEPTION_MASK = 0x20;
147:
148: // public methods ----------------------------------------------------
149:
150: /**
151: * Java friends implementation
152: */
153: public void setIndexData(CharTrie.FriendAgent friendagent) {
154: m_trieIndex_ = friendagent.getPrivateIndex();
155: m_trieData_ = friendagent.getPrivateData();
156: m_trieInitialValue_ = friendagent.getPrivateInitialValue();
157: }
158:
159: /**
160: * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
161: * data the index array offset of the indexes for that lead surrogate.
162: * @param value data value for a surrogate from the trie, including the
163: * folding offset
164: * @return data offset or 0 if there is no data for the lead surrogate
165: */
166: public int getFoldingOffset(int value) {
167: if ((value & SUPPLEMENTARY_FOLD_INDICATOR_MASK_) != 0) {
168: return (value & SUPPLEMENTARY_FOLD_OFFSET_MASK_);
169: } else {
170: return 0;
171: }
172: }
173:
174: /**
175: * Gets the property value at the index.
176: * This is optimized.
177: * Note this is alittle different from CharTrie the index m_trieData_
178: * is never negative.
179: * @param ch code point whose property value is to be retrieved
180: * @return property value of code point
181: */
182: public int getProperty(int ch) {
183: if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
184: || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
185: // BMP codepoint
186: // optimized
187: try {
188: return m_property_[m_trieData_[(m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_)
189: + (ch & Trie.INDEX_STAGE_3_MASK_)]];
190: } catch (ArrayIndexOutOfBoundsException e) {
191: return m_property_[m_trieInitialValue_];
192: }
193: }
194: if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
195: return m_property_[m_trieData_[(m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
196: + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] << Trie.INDEX_STAGE_2_SHIFT_)
197: + (ch & Trie.INDEX_STAGE_3_MASK_)]];
198: }
199: // for optimization
200: if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
201: // look at the construction of supplementary characters
202: // trail forms the ends of it.
203: return m_property_[m_trie_.getSurrogateValue(UTF16
204: .getLeadSurrogate(ch),
205: (char) (ch & Trie.SURROGATE_MASK_))];
206: }
207: // return m_dataOffset_ if there is an error, in this case we return
208: // the default value: m_initialValue_
209: // we cannot assume that m_initialValue_ is at offset 0
210: // this is for optimization.
211: return m_property_[m_trieInitialValue_];
212: // return m_property_[m_trie_.getCodePointValue(ch)];
213: }
214:
215: /**
216: * Getting the signed numeric value of a character embedded in the property
217: * argument
218: * @param prop the character
219: * @return signed numberic value
220: */
221: public static int getSignedValue(int prop) {
222: return (prop >> VALUE_SHIFT_);
223: }
224:
225: /**
226: * Getting the exception index for argument property
227: * @param prop character property
228: * @return exception index
229: */
230: public static int getExceptionIndex(int prop) {
231: return (prop >> VALUE_SHIFT_)
232: & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
233: }
234:
235: /**
236: * Determines if the exception value passed in has the kind of information
237: * which the indicator wants, e.g if the exception value contains the digit
238: * value of the character
239: * @param index exception index
240: * @param indicator type indicator
241: * @return true if type value exist
242: */
243: public boolean hasExceptionValue(int index, int indicator) {
244: return (m_exception_[index] & (1 << indicator)) != 0;
245: }
246:
247: /**
248: * Gets the exception value at the index, assuming that data type is
249: * available. Result is undefined if data is not available. Use
250: * hasExceptionValue() to determine data's availability.
251: * @param index
252: * @param etype exception data type
253: * @return exception data type value at index
254: */
255: public int getException(int index, int etype) {
256: // contained in exception data
257: if (etype == EXC_COMBINING_CLASS_) {
258: return m_exception_[index];
259: }
260: // contained in the exception digit address
261: index = addExceptionOffset(m_exception_[index], etype, ++index);
262: return m_exception_[index];
263: }
264:
265: /**
266: * Gets the folded case value at the index
267: * @param index of the case value to be retrieved
268: * @param count number of characters to retrieve
269: * @param str string buffer to which to append the result
270: */
271: public void getFoldCase(int index, int count, StringBuffer str) {
272: // first 2 chars are for the simple mappings
273: index += 2;
274: while (count > 0) {
275: str.append(m_case_[index]);
276: index++;
277: count--;
278: }
279: }
280:
281: /**
282: * Gets the unicode additional properties.
283: * C version getUnicodeProperties.
284: * @param codepoint codepoint whose additional properties is to be
285: * retrieved
286: * @return unicode properties
287: */
288: public int getAdditional(int codepoint) {
289: return m_additionalVectors_[m_additionalTrie_
290: .getCodePointValue(codepoint)];
291: }
292:
293: /**
294: * <p>Get the "age" of the code point.</p>
295: * <p>The "age" is the Unicode version when the code point was first
296: * designated (as a non-character or for Private Use) or assigned a
297: * character.</p>
298: * <p>This can be useful to avoid emitting code points to receiving
299: * processes that do not accept newer characters.</p>
300: * <p>The data is from the UCD file DerivedAge.txt.</p>
301: * <p>This API does not check the validity of the codepoint.</p>
302: * @param codepoint The code point.
303: * @return the Unicode version number
304: * @draft ICU 2.1
305: */
306: public VersionInfo getAge(int codepoint) {
307: int version = getAdditional(codepoint) >> AGE_SHIFT_;
308: return VersionInfo.getInstance((version >> FIRST_NIBBLE_SHIFT_)
309: & LAST_NIBBLE_MASK_, version & LAST_NIBBLE_MASK_, 0, 0);
310: }
311:
312: /**
313: * Forms a supplementary code point from the argument character<br>
314: * Note this is for internal use hence no checks for the validity of the
315: * surrogate characters are done
316: * @param lead lead surrogate character
317: * @param trail trailing surrogate character
318: * @return code point of the supplementary character
319: */
320: public static int getRawSupplementary(char lead, char trail) {
321: return (lead << LEAD_SURROGATE_SHIFT_) + trail
322: + SURROGATE_OFFSET_;
323: }
324:
325: /**
326: * Loads the property data and initialize the UCharacterProperty instance.
327: * @throws RuntimeException when data is missing or data has been corrupted
328: */
329: public static UCharacterProperty getInstance()
330: throws RuntimeException {
331: if (INSTANCE_ == null) {
332: try {
333: INSTANCE_ = new UCharacterProperty();
334: } catch (Exception e) {
335: throw new RuntimeException(e.getMessage());
336: }
337: }
338: return INSTANCE_;
339: }
340:
341: /**
342: * Checks if the argument c is to be treated as a white space in ICU
343: * rules. Usually ICU rule white spaces are ignored unless quoted.
344: * @param c codepoint to check
345: * @return true if c is a ICU white space
346: */
347: public static boolean isRuleWhiteSpace(int c) {
348: /* "white space" in the sense of ICU rule parsers
349: This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
350: See UTR #31: http://www.unicode.org/reports/tr31/.
351: U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
352: */
353: return (c >= 0x0009 && c <= 0x2029 && (c <= 0x000D
354: || c == 0x0020 || c == 0x0085 || c == 0x200E
355: || c == 0x200F || c >= 0x2028));
356: }
357:
358: // protected variables -----------------------------------------------
359:
360: /**
361: * Case table
362: */
363: char m_case_[];
364:
365: /**
366: * Exception property table
367: */
368: int m_exception_[];
369: /**
370: * Extra property trie
371: */
372: CharTrie m_additionalTrie_;
373: /**
374: * Extra property vectors, 1st column for age and second for binary
375: * properties.
376: */
377: int m_additionalVectors_[];
378: /**
379: * Number of additional columns
380: */
381: int m_additionalColumnsCount_;
382: /**
383: * Maximum values for block, bits used as in vector word
384: * 0
385: */
386: int m_maxBlockScriptValue_;
387: /**
388: * Maximum values for script, bits used as in vector word
389: * 0
390: */
391: int m_maxJTGValue_;
392:
393: // private variables -------------------------------------------------
394:
395: /**
396: * UnicodeData.txt property object
397: */
398: private static UCharacterProperty INSTANCE_ = null;
399:
400: /**
401: * Default name of the datafile
402: */
403: private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
404:
405: /**
406: * Default buffer size of datafile
407: */
408: private static final int DATA_BUFFER_SIZE_ = 25000;
409:
410: /**
411: * This, from what i infer is the max size of the indicators used for the
412: * exception values.
413: * Number of bits in an 8-bit integer value
414: */
415: private static final int EXC_GROUP_ = 8;
416:
417: /**
418: * Mask to get the group
419: */
420: private static final int EXC_GROUP_MASK_ = 255;
421:
422: /**
423: * Mask to get the digit value in the exception result
424: */
425: private static final int EXC_DIGIT_MASK_ = 0xFFFF;
426:
427: /**
428: * Offset table for data in exception block.<br>
429: * Table formed by the number of bits used for the index, e.g. 0 = 0 bits,
430: * 1 = 1 bits.
431: */
432: private static final byte FLAGS_OFFSET_[] = { 0, 1, 1, 2, 1, 2, 2,
433: 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3,
434: 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4,
435: 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2,
436: 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4,
437: 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4,
438: 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6,
439: 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3,
440: 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4,
441: 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5,
442: 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
443: 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4,
444: 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6,
445: 7, 5, 6, 6, 7, 6, 7, 7, 8 };
446:
447: /**
448: * Numeric value shift
449: */
450: private static final int VALUE_SHIFT_ = 20;
451:
452: /**
453: * Mask to be applied after shifting to obtain an unsigned numeric value
454: */
455: private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0x7FF;
456:
457: /**
458: *
459: */
460: private static final int NUMERIC_TYPE_SHIFT = 12;
461:
462: /**
463: * Folding indicator mask
464: */
465: private static final int SUPPLEMENTARY_FOLD_INDICATOR_MASK_ = 0x8000;
466:
467: /**
468: * Folding offset mask
469: */
470: private static final int SUPPLEMENTARY_FOLD_OFFSET_MASK_ = 0x7FFF;
471:
472: /**
473: * Shift value for lead surrogate to form a supplementary character.
474: */
475: private static final int LEAD_SURROGATE_SHIFT_ = 10;
476:
477: /**
478: * Offset to add to combined surrogate pair to avoid msking.
479: */
480: private static final int SURROGATE_OFFSET_ = UTF16.SUPPLEMENTARY_MIN_VALUE
481: - (UTF16.SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
482: - UTF16.TRAIL_SURROGATE_MIN_VALUE;
483:
484: /**
485: * To get the last character out from a data type
486: */
487: private static final int LAST_CHAR_MASK_ = 0xFFFF;
488:
489: /**
490: * First nibble shift
491: */
492: private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
493:
494: /**
495: * Second nibble mask
496: */
497: private static final int LAST_NIBBLE_MASK_ = 0xF;
498: /**
499: * Age value shift
500: */
501: private static final int AGE_SHIFT_ = 24;
502:
503: // private constructors --------------------------------------------------
504:
505: /**
506: * Constructor
507: * @exception thrown when data reading fails or data corrupted
508: */
509: private UCharacterProperty() throws IOException {
510: // jar access
511: InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
512: BufferedInputStream b = new BufferedInputStream(is,
513: DATA_BUFFER_SIZE_);
514: UCharacterPropertyReader reader = new UCharacterPropertyReader(
515: b);
516: reader.read(this );
517: b.close();
518:
519: m_trie_.putIndexData(this );
520: }
521:
522: /* Is followed by {case-ignorable}* cased ? */
523: /**
524: * Getting the correct address for data in the exception value
525: * @param evalue exception value
526: * @param indicator type of data to retrieve
527: * @param address current address to move from
528: * @return the correct address
529: */
530: private int addExceptionOffset(int evalue, int indicator,
531: int address) {
532: int result = address;
533: if (indicator >= EXC_GROUP_) {
534: result += FLAGS_OFFSET_[evalue & EXC_GROUP_MASK_];
535: evalue >>= EXC_GROUP_;
536: indicator -= EXC_GROUP_;
537: }
538: int mask = (1 << indicator) - 1;
539: result += FLAGS_OFFSET_[evalue & mask];
540: return result;
541: }
542:
543: private static final int TAB = 0x0009;
544: private static final int LF = 0x000a;
545: private static final int FF = 0x000c;
546: private static final int CR = 0x000d;
547: private static final int U_A = 0x0041;
548: private static final int U_Z = 0x005a;
549: private static final int U_a = 0x0061;
550: private static final int U_z = 0x007a;
551: private static final int DEL = 0x007f;
552: private static final int NL = 0x0085;
553: private static final int NBSP = 0x00a0;
554: private static final int CGJ = 0x034f;
555: private static final int FIGURESP = 0x2007;
556: private static final int HAIRSP = 0x200a;
557: private static final int ZWNJ = 0x200c;
558: private static final int ZWJ = 0x200d;
559: private static final int RLM = 0x200f;
560: private static final int NNBSP = 0x202f;
561: private static final int WJ = 0x2060;
562: private static final int INHSWAP = 0x206a;
563: private static final int NOMDIG = 0x206f;
564: private static final int ZWNBSP = 0xfeff;
565:
566: public UnicodeSet addPropertyStarts(UnicodeSet set) {
567: int c;
568:
569: /* add the start code point of each same-value range of each trie */
570: //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
571: TrieIterator propsIter = new TrieIterator(m_trie_);
572: RangeValueIterator.Element propsResult = new RangeValueIterator.Element();
573: while (propsIter.next(propsResult)) {
574: set.add(propsResult.start);
575: }
576: //utrie_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, set);
577: TrieIterator propsVectorsIter = new TrieIterator(
578: m_additionalTrie_);
579: RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
580: while (propsVectorsIter.next(propsVectorsResult)) {
581: set.add(propsVectorsResult.start);
582: }
583:
584: /* add code points with hardcoded properties, plus the ones following them */
585:
586: /* add for IS_THAT_CONTROL_SPACE() */
587: set.add(TAB); /* range TAB..CR */
588: set.add(CR + 1);
589: set.add(0x1c);
590: set.add(0x1f + 1);
591: set.add(NL);
592: set.add(NL + 1);
593:
594: /* add for u_isIDIgnorable() what was not added above */
595: set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
596: set.add(HAIRSP);
597: set.add(RLM + 1);
598: set.add(INHSWAP);
599: set.add(NOMDIG + 1);
600: set.add(ZWNBSP);
601: set.add(ZWNBSP + 1);
602:
603: /* add no-break spaces for u_isWhitespace() what was not added above */
604: set.add(NBSP);
605: set.add(NBSP + 1);
606: set.add(FIGURESP);
607: set.add(FIGURESP + 1);
608: set.add(NNBSP);
609: set.add(NNBSP + 1);
610:
611: /* add for u_charDigitValue() */
612: set.add(0x3007);
613: set.add(0x3008);
614: set.add(0x4e00);
615: set.add(0x4e01);
616: set.add(0x4e8c);
617: set.add(0x4e8d);
618: set.add(0x4e09);
619: set.add(0x4e0a);
620: set.add(0x56db);
621: set.add(0x56dc);
622: set.add(0x4e94);
623: set.add(0x4e95);
624: set.add(0x516d);
625: set.add(0x516e);
626: set.add(0x4e03);
627: set.add(0x4e04);
628: set.add(0x516b);
629: set.add(0x516c);
630: set.add(0x4e5d);
631: set.add(0x4e5e);
632:
633: /* add for u_digit() */
634: set.add(U_a);
635: set.add(U_z + 1);
636: set.add(U_A);
637: set.add(U_Z + 1);
638:
639: /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
640: set.add(WJ); /* range WJ..NOMDIG */
641: set.add(0xfff0);
642: set.add(0xfffb + 1);
643: set.add(0xe0000);
644: set.add(0xe0fff + 1);
645:
646: /* add for UCHAR_GRAPHEME_BASE and others */
647: set.add(CGJ);
648: set.add(CGJ + 1);
649:
650: /* add for UCHAR_JOINING_TYPE */
651: set.add(ZWNJ); /* range ZWNJ..ZWJ */
652: set.add(ZWJ + 1);
653:
654: /* add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE */
655: set.add(0x1100);
656: int value = UCharacter.HangulSyllableType.LEADING_JAMO;
657: int value2;
658: for (c = 0x115a; c <= 0x115f; ++c) {
659: value2 = UCharacter.getIntPropertyValue(c,
660: UProperty.HANGUL_SYLLABLE_TYPE);
661: if (value != value2) {
662: value = value2;
663: set.add(c);
664: }
665: }
666:
667: set.add(0x1160);
668: value = UCharacter.HangulSyllableType.VOWEL_JAMO;
669: for (c = 0x11a3; c <= 0x11a7; ++c) {
670: value2 = UCharacter.getIntPropertyValue(c,
671: UProperty.HANGUL_SYLLABLE_TYPE);
672: if (value != value2) {
673: value = value2;
674: set.add(c);
675: }
676: }
677:
678: set.add(0x11a8);
679: value = UCharacter.HangulSyllableType.TRAILING_JAMO;
680: for (c = 0x11fa; c <= 0x11ff; ++c) {
681: value2 = UCharacter.getIntPropertyValue(c,
682: UProperty.HANGUL_SYLLABLE_TYPE);
683: if (value != value2) {
684: value = value2;
685: set.add(c);
686: }
687: }
688:
689: /*
690: * Omit code points for u_charCellWidth() because
691: * - it is deprecated and not a real Unicode property
692: * - they are probably already set from the trie enumeration
693: */
694:
695: /*
696: * Omit code points with hardcoded specialcasing properties
697: * because we do not build property UnicodeSets for them right now.
698: */
699: return set; // for chaining
700: }
701:
702: /*----------------------------------------------------------------
703: * Inclusions list
704: *----------------------------------------------------------------*/
705:
706: /*
707: * Return a set of characters for property enumeration.
708: * The set implicitly contains 0x110000 as well, which is one more than the highest
709: * Unicode code point.
710: *
711: * This set is used as an ordered list - its code points are ordered, and
712: * consecutive code points (in Unicode code point order) in the set define a range.
713: * For each two consecutive characters (start, limit) in the set,
714: * all of the UCD/normalization and related properties for
715: * all code points start..limit-1 are all the same,
716: * except for character names and ISO comments.
717: *
718: * All Unicode code points U+0000..U+10ffff are covered by these ranges.
719: * The ranges define a partition of the Unicode code space.
720: * ICU uses the inclusions set to enumerate properties for generating
721: * UnicodeSets containing all code points that have a certain property value.
722: *
723: * The Inclusion List is generated from the UCD. It is generated
724: * by enumerating the data tries, and code points for hardcoded properties
725: * are added as well.
726: *
727: * --------------------------------------------------------------------------
728: *
729: * The following are ideas for getting properties-unique code point ranges,
730: * with possible optimizations beyond the current implementation.
731: * These optimizations would require more code and be more fragile.
732: * The current implementation generates one single list (set) for all properties.
733: *
734: * To enumerate properties efficiently, one needs to know ranges of
735: * repetitive values, so that the value of only each start code point
736: * can be applied to the whole range.
737: * This information is in principle available in the uprops.icu/unorm.icu data.
738: *
739: * There are two obstacles:
740: *
741: * 1. Some properties are computed from multiple data structures,
742: * making it necessary to get repetitive ranges by intersecting
743: * ranges from multiple tries.
744: *
745: * 2. It is not economical to write code for getting repetitive ranges
746: * that are precise for each of some 50 properties.
747: *
748: * Compromise ideas:
749: *
750: * - Get ranges per trie, not per individual property.
751: * Each range contains the same values for a whole group of properties.
752: * This would generate currently five range sets, two for uprops.icu tries
753: * and three for unorm.icu tries.
754: *
755: * - Combine sets of ranges for multiple tries to get sufficient sets
756: * for properties, e.g., the uprops.icu main and auxiliary tries
757: * for all non-normalization properties.
758: *
759: * Ideas for representing ranges and combining them:
760: *
761: * - A UnicodeSet could hold just the start code points of ranges.
762: * Multiple sets are easily combined by or-ing them together.
763: *
764: * - Alternatively, a UnicodeSet could hold each even-numbered range.
765: * All ranges could be enumerated by using each start code point
766: * (for the even-numbered ranges) as well as each limit (end+1) code point
767: * (for the odd-numbered ranges).
768: * It should be possible to combine two such sets by xor-ing them,
769: * but no more than two.
770: *
771: * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
772: * but the first one is certainly simpler and applicable for combining more than
773: * two range sets.
774: *
775: * It is possible to combine all range sets for all uprops/unorm tries into one
776: * set that can be used for all properties.
777: * As an optimization, there could be less-combined range sets for certain
778: * groups of properties.
779: * The relationship of which less-combined range set to use for which property
780: * depends on the implementation of the properties and must be hardcoded
781: * - somewhat error-prone and higher maintenance but can be tested easily
782: * by building property sets "the simple way" in test code.
783: *
784: * ---
785: *
786: * Do not use a UnicodeSet pattern because that causes infinite recursion;
787: * UnicodeSet depends on the inclusions set.
788: */
789: public UnicodeSet getInclusions() {
790: UnicodeSet set = new UnicodeSet();
791: NormalizerImpl.addPropertyStarts(set);
792: addPropertyStarts(set);
793: return set;
794: }
795:
796: }
|