001: /*
002: * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025: /*
026: *******************************************************************************
027: * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
028: * *
029: * The original version of this source code and documentation is copyrighted *
030: * and owned by IBM, These materials are provided under terms of a License *
031: * Agreement between IBM and Sun. This technology is protected by multiple *
032: * US and International patents. This notice and attribution to IBM may not *
033: * to removed. *
034: *******************************************************************************
035: */
036:
037: package sun.text.normalizer;
038:
039: import java.lang.ref.SoftReference;
040: import java.util.HashMap;
041: import java.util.Locale;
042: import java.util.Map;
043:
044: /**
045: * <p>
046: * The UCharacter class provides extensions to the
047: * <a href=http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.html>
048: * java.lang.Character</a> class. These extensions provide support for
049: * Unicode 3.2 properties and together with the <a href=../text/UTF16.html>UTF16</a>
050: * class, provide support for supplementary characters (those with code
051: * points above U+FFFF).
052: * </p>
053: * <p>
054: * Code points are represented in these API using ints. While it would be
055: * more convenient in Java to have a separate primitive datatype for them,
056: * ints suffice in the meantime.
057: * </p>
058: * <p>
059: * To use this class please add the jar file name icu4j.jar to the
060: * class path, since it contains data files which supply the information used
061: * by this file.<br>
062: * E.g. In Windows <br>
063: * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
064: * Otherwise, another method would be to copy the files uprops.dat and
065: * unames.icu from the icu4j source subdirectory
066: * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
067: * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
068: * </p>
069: * <p>
070: * Aside from the additions for UTF-16 support, and the updated Unicode 3.1
071: * properties, the main differences between UCharacter and Character are:
072: * <ul>
* <li> UCharacter is not designed to be a char wrapper and does not have
* APIs that involve management of that single char.<br>
075: * These include:
076: * <ul>
077: * <li> char charValue(),
078: * <li> int compareTo(java.lang.Character, java.lang.Character), etc.
079: * </ul>
* <li> UCharacter does not include Character APIs that are deprecated, nor
* does it include the Java-specific character information, such as
* boolean isJavaIdentifierPart(char ch).
083: * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
084: * values '10' - '35'. UCharacter also does this in digit and
085: * getNumericValue, to adhere to the java semantics of these
086: * methods. New methods unicodeDigit, and
087: * getUnicodeNumericValue do not treat the above code points
088: * as having numeric values. This is a semantic change from ICU4J 1.3.1.
089: * </ul>
090: * <p>
091: * Further detail differences can be determined from the program
092: * <a href = http://oss.software.ibm.com/developerworks/opensource/cvs/icu4j/~checkout~/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java>
093: * com.ibm.icu.dev.test.lang.UCharacterCompare</a>
094: * </p>
095: * <p>
096: * This class is not subclassable
097: * </p>
098: * @author Syn Wee Quek
099: * @stable ICU 2.1
100: * @see com.ibm.icu.lang.UCharacterEnums
101: */
102:
103: public final class UCharacter {
104:
/**
 * Integer constants naming the numeric types a code point can have.
 * The values are compared against the numeric-type bits extracted from a
 * code point's property word.
 * @see UProperty#NUMERIC_TYPE
 * @stable ICU 2.4
 */
public interface NumericType {
    /**
     * No numeric type.
     * @stable ICU 2.4
     */
    int NONE = 0;

    /**
     * Decimal digit; the type {@code digit(int, int)} looks for.
     * @stable ICU 2.4
     */
    int DECIMAL = 1;

    /**
     * Digit.
     * @stable ICU 2.4
     */
    int DIGIT = 2;

    /**
     * Numeric.
     * @stable ICU 2.4
     */
    int NUMERIC = 3;

    /**
     * Number of numeric types; used as an exclusive upper bound when
     * validating a numeric type value.
     * @stable ICU 2.4
     */
    int COUNT = 4;
}
132:
/**
 * Integer constants naming the Hangul syllable types, as returned by
 * {@code getIntPropertyValue(ch, UProperty.HANGUL_SYLLABLE_TYPE)}.
 * The bracketed abbreviation after each name is the short alias carried
 * over from the original source.
 *
 * @see UProperty#HANGUL_SYLLABLE_TYPE
 * @stable ICU 2.6
 */
public interface HangulSyllableType {
    /**
     * Not applicable [NA].
     * @stable ICU 2.6
     */
    int NOT_APPLICABLE = 0;

    /**
     * Leading jamo [L].
     * @stable ICU 2.6
     */
    int LEADING_JAMO = 1;

    /**
     * Vowel jamo [V].
     * @stable ICU 2.6
     */
    int VOWEL_JAMO = 2;

    /**
     * Trailing jamo [T].
     * @stable ICU 2.6
     */
    int TRAILING_JAMO = 3;

    /**
     * LV syllable [LV].
     * @stable ICU 2.6
     */
    int LV_SYLLABLE = 4;

    /**
     * LVT syllable [LVT].
     * @stable ICU 2.6
     */
    int LVT_SYLLABLE = 5;

    /**
     * Number of Hangul syllable types.
     * @stable ICU 2.6
     */
    int COUNT = 6;
}
169:
/**
 * [Sun] This interface moved from UCharacterEnums.java.
 *
 * Subset of the general-category constants. They share their names
 * <b>but not their values</b> with the constants defined in
 * <code>java.lang.Character</code>. Only the categories referenced by
 * this class are declared here.
 * @see UCharacterCategory
 * @draft ICU 3.0
 * @deprecated This is a draft API and might change in a future release of ICU.
 */
public interface ECharacterCategory {
    /**
     * General category Lu (uppercase letter).
     * @stable ICU 2.1
     */
    int UPPERCASE_LETTER = 1;

    /**
     * General category Lt (titlecase letter).
     * @stable ICU 2.1
     */
    int TITLECASE_LETTER = 3;

    /**
     * General category Lo (other letter).
     * @stable ICU 2.1
     */
    int OTHER_LETTER = 5;
}
199:
200: // public data members -----------------------------------------------
201:
202: /**
203: * The lowest Unicode code point value.
204: * @stable ICU 2.1
205: */
206: public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
207:
208: /**
209: * The highest Unicode code point value (scalar value) according to the
210: * Unicode Standard.
211: * This is a 21-bit value (21 bits, rounded up).<br>
212: * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
213: * @stable ICU 2.1
214: */
215: public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
216:
217: /**
218: * The minimum value for Supplementary code points
219: * @stable ICU 2.1
220: */
221: public static final int SUPPLEMENTARY_MIN_VALUE = UTF16.SUPPLEMENTARY_MIN_VALUE;
222:
223: /**
224: * Special value that is returned by getUnicodeNumericValue(int) when no
225: * numeric value is defined for a code point.
226: * @stable ICU 2.4
227: * @see #getUnicodeNumericValue
228: */
229: public static final double NO_NUMERIC_VALUE = -123456789;
230:
231: // public methods ----------------------------------------------------
232:
233: /**
234: * Retrieves the numeric value of a decimal digit code point.
235: * <br>This method observes the semantics of
236: * <code>java.lang.Character.digit()</code>. Note that this
237: * will return positive values for code points for which isDigit
238: * returns false, just like java.lang.Character.
239: * <br><em>Semantic Change:</em> In release 1.3.1 and
240: * prior, this did not treat the European letters as having a
241: * digit value, and also treated numeric letters and other numbers as
242: * digits.
243: * This has been changed to conform to the java semantics.
244: * <br>A code point is a valid digit if and only if:
245: * <ul>
246: * <li>ch is a decimal digit or one of the european letters, and
247: * <li>the value of ch is less than the specified radix.
248: * </ul>
249: * @param ch the code point to query
250: * @param radix the radix
251: * @return the numeric value represented by the code point in the
252: * specified radix, or -1 if the code point is not a decimal digit
253: * or if its value is too large for the radix
254: * @stable ICU 2.1
255: */
256: public static int digit(int ch, int radix) {
257: // when ch is out of bounds getProperty == 0
258: int props = getProperty(ch);
259: if (getNumericType(props) != NumericType.DECIMAL) {
260: return (radix <= 10) ? -1 : getEuropeanDigit(ch);
261: }
262: // if props == 0, it will just fall through and return -1
263: if (isNotExceptionIndicator(props)) {
264: // not contained in exception data
265: // getSignedValue is just shifting so we can check for the sign
266: // first
267: // Optimization
268: // int result = UCharacterProperty.getSignedValue(props);
269: // if (result >= 0) {
270: // return result;
271: // }
272: if (props >= 0) {
273: return UCharacterProperty.getSignedValue(props);
274: }
275: } else {
276: int index = UCharacterProperty.getExceptionIndex(props);
277: if (PROPERTY_.hasExceptionValue(index,
278: UCharacterProperty.EXC_NUMERIC_VALUE_)) {
279: int result = PROPERTY_.getException(index,
280: UCharacterProperty.EXC_NUMERIC_VALUE_);
281: if (result >= 0) {
282: return result;
283: }
284: }
285: }
286:
287: if (radix > 10) {
288: int result = getEuropeanDigit(ch);
289: if (result >= 0 && result < radix) {
290: return result;
291: }
292: }
293: return -1;
294: }
295:
296: /**
297: * <p>Get the numeric value for a Unicode code point as defined in the
298: * Unicode Character Database.</p>
299: * <p>A "double" return type is necessary because some numeric values are
300: * fractions, negative, or too large for int.</p>
301: * <p>For characters without any numeric values in the Unicode Character
302: * Database, this function will return NO_NUMERIC_VALUE.</p>
303: * <p><em>API Change:</em> In release 2.2 and prior, this API has a
304: * return type int and returns -1 when the argument ch does not have a
305: * corresponding numeric value. This has been changed to synch with ICU4C
306: * </p>
307: * This corresponds to the ICU4C function u_getNumericValue.
308: * @param ch Code point to get the numeric value for.
309: * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined.
310: * @stable ICU 2.4
311: */
312: public static double getUnicodeNumericValue(int ch) {
313: // equivalent to c version double u_getNumericValue(UChar32 c)
314: int props = PROPERTY_.getProperty(ch);
315: int numericType = getNumericType(props);
316: if (numericType > NumericType.NONE
317: && numericType < NumericType.COUNT) {
318: if (isNotExceptionIndicator(props)) {
319: return UCharacterProperty.getSignedValue(props);
320: } else {
321: int index = UCharacterProperty.getExceptionIndex(props);
322: boolean nex = false;
323: boolean dex = false;
324: double numerator = 0;
325: if (PROPERTY_.hasExceptionValue(index,
326: UCharacterProperty.EXC_NUMERIC_VALUE_)) {
327: int num = PROPERTY_.getException(index,
328: UCharacterProperty.EXC_NUMERIC_VALUE_);
329: // There are special values for huge numbers that are
330: // powers of ten. genprops/store.c documents:
331: // if numericValue = 0x7fffff00 + x then
332: // numericValue = 10 ^ x
333: if (num >= NUMERATOR_POWER_LIMIT_) {
334: num &= 0xff;
335: // 10^x without math.h
336: numerator = Math.pow(10, num);
337: } else {
338: numerator = num;
339: }
340: nex = true;
341: }
342: double denominator = 0;
343: if (PROPERTY_.hasExceptionValue(index,
344: UCharacterProperty.EXC_DENOMINATOR_VALUE_)) {
345: denominator = PROPERTY_.getException(index,
346: UCharacterProperty.EXC_DENOMINATOR_VALUE_);
347: // faster path not in c
348: if (numerator != 0) {
349: return numerator / denominator;
350: }
351: dex = true;
352: }
353:
354: if (nex) {
355: if (dex) {
356: return numerator / denominator;
357: }
358: return numerator;
359: }
360: if (dex) {
361: return 1 / denominator;
362: }
363: }
364: }
365: return NO_NUMERIC_VALUE;
366: }
367:
368: /**
369: * Returns a value indicating a code point's Unicode category.
370: * Up-to-date Unicode implementation of java.lang.Character.getType()
371: * except for the above mentioned code points that had their category
372: * changed.<br>
373: * Return results are constants from the interface
374: * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
375: * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
376: * those returned by java.lang.Character.getType. UCharacterCategory values
377: * match the ones used in ICU4C, while java.lang.Character type
378: * values, though similar, skip the value 17.</p>
379: * @param ch code point whose type is to be determined
380: * @return category which is a value of UCharacterCategory
381: * @stable ICU 2.1
382: */
383: public static int getType(int ch) {
384: return getProperty(ch) & UCharacterProperty.TYPE_MASK;
385: }
386:
387: //// for StringPrep
388: /**
389: * Returns a code point corresponding to the two UTF16 characters.
390: * @param lead the lead char
391: * @param trail the trail char
392: * @return code point if surrogate characters are valid.
393: * @exception IllegalArgumentException thrown when argument characters do
394: * not form a valid codepoint
395: * @stable ICU 2.1
396: */
397: public static int getCodePoint(char lead, char trail) {
398: if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
399: && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE
400: && trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
401: && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
402: return UCharacterProperty.getRawSupplementary(lead, trail);
403: }
404: throw new IllegalArgumentException(
405: "Illegal surrogate characters");
406: }
407:
408: //// for StringPrep
409: /**
410: * Returns the Bidirection property of a code point.
411: * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
412: * property.<br>
413: * Result returned belongs to the interface
414: * <a href=UCharacterDirection.html>UCharacterDirection</a>
415: * @param ch the code point to be determined its direction
416: * @return direction constant from UCharacterDirection.
417: * @stable ICU 2.1
418: */
419: public static int getDirection(int ch) {
420: // when ch is out of bounds getProperty == 0
421: return (getProperty(ch) >> BIDI_SHIFT_)
422: & BIDI_MASK_AFTER_SHIFT_;
423: }
424:
425: /**
426: * The given string is mapped to its case folding equivalent according to
427: * UnicodeData.txt and CaseFolding.txt; if any character has no case
428: * folding equivalent, the character itself is returned.
429: * "Full", multiple-code point case folding mappings are returned here.
430: * For "simple" single-code point mappings use the API
431: * foldCase(int ch, boolean defaultmapping).
432: * @param str the String to be converted
433: * @param defaultmapping Indicates if all mappings defined in
434: * CaseFolding.txt is to be used, otherwise the
435: * mappings for dotted I and dotless i marked with
436: * 'I' in CaseFolding.txt will be skipped.
437: * @return the case folding equivalent of the character, if
438: * any; otherwise the character itself.
439: * @see #foldCase(int, boolean)
440: * @stable ICU 2.1
441: */
442: public static String foldCase(String str, boolean defaultmapping) {
443: int size = str.length();
444: StringBuffer result = new StringBuffer(size);
445: int offset = 0;
446: int ch;
447:
448: // case mapping loop
449: while (offset < size) {
450: ch = UTF16.charAt(str, offset);
451: offset += UTF16.getCharCount(ch);
452: int props = PROPERTY_.getProperty(ch);
453: if (isNotExceptionIndicator(props)) {
454: int type = UCharacterProperty.TYPE_MASK & props;
455: if (type == ECharacterCategory.UPPERCASE_LETTER
456: || type == ECharacterCategory.TITLECASE_LETTER) {
457: ch += UCharacterProperty.getSignedValue(props);
458: }
459: } else {
460: int index = UCharacterProperty.getExceptionIndex(props);
461: if (PROPERTY_.hasExceptionValue(index,
462: UCharacterProperty.EXC_CASE_FOLDING_)) {
463: int exception = PROPERTY_.getException(index,
464: UCharacterProperty.EXC_CASE_FOLDING_);
465: if (exception != 0) {
466: PROPERTY_.getFoldCase(exception
467: & LAST_CHAR_MASK_,
468: exception >> SHIFT_24_, result);
469: } else {
470: // special case folding mappings, hardcoded
471: if (ch != 0x49 && ch != 0x130) {
472: // return ch itself because there is no special
473: // mapping for it
474: UTF16.append(result, ch);
475: continue;
476: }
477: if (defaultmapping) {
478: // default mappings
479: if (ch == 0x49) {
480: // 0049; C; 0069; # LATIN CAPITAL LETTER I
481: result
482: .append(UCharacterProperty.LATIN_SMALL_LETTER_I_);
483: } else if (ch == 0x130) {
484: // 0130; F; 0069 0307;
485: // # LATIN CAPITAL LETTER I WITH DOT ABOVE
486: result
487: .append(UCharacterProperty.LATIN_SMALL_LETTER_I_);
488: result.append((char) 0x307);
489: }
490: } else {
491: // Turkic mappings
492: if (ch == 0x49) {
493: // 0049; T; 0131; # LATIN CAPITAL LETTER I
494: result.append((char) 0x131);
495: } else if (ch == 0x130) {
496: // 0130; T; 0069;
497: // # LATIN CAPITAL LETTER I WITH DOT ABOVE
498: result
499: .append(UCharacterProperty.LATIN_SMALL_LETTER_I_);
500: }
501: }
502: }
503: // do not fall through to the output of c
504: continue;
505: } else {
506: if (PROPERTY_.hasExceptionValue(index,
507: UCharacterProperty.EXC_LOWERCASE_)) {
508: ch = PROPERTY_.getException(index,
509: UCharacterProperty.EXC_LOWERCASE_);
510: }
511: }
512:
513: }
514:
515: // handle 1:1 code point mappings from UnicodeData.txt
516: UTF16.append(result, ch);
517: }
518:
519: return result.toString();
520: }
521:
522: /**
523: * <p>Get the "age" of the code point.</p>
524: * <p>The "age" is the Unicode version when the code point was first
525: * designated (as a non-character or for Private Use) or assigned a
526: * character.
527: * <p>This can be useful to avoid emitting code points to receiving
528: * processes that do not accept newer characters.</p>
529: * <p>The data is from the UCD file DerivedAge.txt.</p>
530: * @param ch The code point.
531: * @return the Unicode version number
532: * @stable ICU 2.6
533: */
534: public static VersionInfo getAge(int ch) {
535: if (ch < MIN_VALUE || ch > MAX_VALUE) {
536: throw new IllegalArgumentException(
537: "Codepoint out of bounds");
538: }
539: return PROPERTY_.getAge(ch);
540: }
541:
542: /**
543: * <p>Gets the property value for an Unicode property type of a code point.
544: * Also returns binary and mask property values.</p>
545: * <p>Unicode, especially in version 3.2, defines many more properties than
546: * the original set in UnicodeData.txt.</p>
547: * <p>The properties APIs are intended to reflect Unicode properties as
548: * defined in the Unicode Character Database (UCD) and Unicode Technical
549: * Reports (UTR). For details about the properties see
550: * http://www.unicode.org/.</p>
551: * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
552: * </p>
553: * <pre>
554: * Sample usage:
555: * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
556: * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
557: * boolean b = (ideo == 1) ? true : false;
558: * </pre>
559: * @param ch code point to test.
560: * @param type UProperty selector constant, identifies which binary
561: * property to check. Must be
562: * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
563: * UProperty.INT_START <= type < UProperty.INT_LIMIT or
564: * UProperty.MASK_START <= type < UProperty.MASK_LIMIT.
565: * @return numeric value that is directly the property value or,
566: * for enumerated properties, corresponds to the numeric value of
567: * the enumerated constant of the respective property value
568: * enumeration type (cast to enum type if necessary).
569: * Returns 0 or 1 (for false / true) for binary Unicode properties.
570: * Returns a bit-mask for mask properties.
571: * Returns 0 if 'type' is out of bounds or if the Unicode version
572: * does not have data for the property at all, or not for this code
573: * point.
574: * @see UProperty
575: * @see #hasBinaryProperty
576: * @see #getIntPropertyMinValue
577: * @see #getIntPropertyMaxValue
578: * @see #getUnicodeVersion
579: * @stable ICU 2.4
580: */
581: public static int getIntPropertyValue(int ch, int type) {
582: /*
583: * For Normalizer with Unicode 3.2, this method is called only for
584: * HANGUL_SYLLABLE_TYPE in UnicodeSet.addPropertyStarts().
585: */
586: if (type == UProperty.HANGUL_SYLLABLE_TYPE) {
587: /* purely algorithmic; hardcode known characters, check for assigned new ones */
588: if (ch < NormalizerImpl.JAMO_L_BASE) {
589: /* NA */
590: } else if (ch <= 0x11ff) {
591: /* Jamo range */
592: if (ch <= 0x115f) {
593: /* Jamo L range, HANGUL CHOSEONG ... */
594: if (ch == 0x115f
595: || ch <= 0x1159
596: || getType(ch) == ECharacterCategory.OTHER_LETTER) {
597: return HangulSyllableType.LEADING_JAMO;
598: }
599: } else if (ch <= 0x11a7) {
600: /* Jamo V range, HANGUL JUNGSEONG ... */
601: if (ch <= 0x11a2
602: || getType(ch) == ECharacterCategory.OTHER_LETTER) {
603: return HangulSyllableType.VOWEL_JAMO;
604: }
605: } else {
606: /* Jamo T range */
607: if (ch <= 0x11f9
608: || getType(ch) == ECharacterCategory.OTHER_LETTER) {
609: return HangulSyllableType.TRAILING_JAMO;
610: }
611: }
612: } else if ((ch -= NormalizerImpl.HANGUL_BASE) < 0) {
613: /* NA */
614: } else if (ch < NormalizerImpl.HANGUL_COUNT) {
615: /* Hangul syllable */
616: return ch % NormalizerImpl.JAMO_T_COUNT == 0 ? HangulSyllableType.LV_SYLLABLE
617: : HangulSyllableType.LVT_SYLLABLE;
618: }
619: }
620: return 0; /* NA */
621: }
622:
623: // private variables -------------------------------------------------
624:
/**
 * Database storing the sets of character property
 */
private static final UCharacterProperty PROPERTY_;

/**
 * Direct references to the tables held by PROPERTY_, cached so that
 * getProperty(int) can index them without going through the
 * UCharacterProperty instance.
 */
private static final char[] PROPERTY_TRIE_INDEX_;
private static final char[] PROPERTY_TRIE_DATA_;
private static final int[] PROPERTY_DATA_;

/**
 * Property word returned for code points that have no entry of their own;
 * read from PROPERTY_DATA_ at the trie's initial-value offset.
 */
private static final int PROPERTY_INITIAL_VALUE_;

// Loads the character property database once, when the class is
// initialised. A failure is rethrown unchecked with the original exception
// attached as the cause, so the root problem (for example missing or
// corrupt data) stays visible in the stack trace.
static {
    try {
        PROPERTY_ = UCharacterProperty.getInstance();
        PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
        PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
        PROPERTY_DATA_ = PROPERTY_.m_property_;
        PROPERTY_INITIAL_VALUE_ = PROPERTY_DATA_[PROPERTY_.m_trieInitialValue_];
    } catch (Exception e) {
        throw new RuntimeException(e.getMessage(), e);
    }
}
649:
650: /**
651: * To get the last character out from a data type
652: */
653: private static final int LAST_CHAR_MASK_ = 0xFFFF;
654:
655: /**
656: * To get the last byte out from a data type
657: */
658: // private static final int LAST_BYTE_MASK_ = 0xFF;
659: /**
660: * Shift 16 bits
661: */
662: // private static final int SHIFT_16_ = 16;
663: /**
664: * Shift 24 bits
665: */
666: private static final int SHIFT_24_ = 24;
667:
668: /**
669: * Shift to get numeric type
670: */
671: private static final int NUMERIC_TYPE_SHIFT_ = 12;
672: /**
673: * Mask to get numeric type
674: */
675: private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
676: /**
677: * Shift to get bidi bits
678: */
679: private static final int BIDI_SHIFT_ = 6;
680:
681: /**
682: * Mask to be applied after shifting to get bidi bits
683: */
684: private static final int BIDI_MASK_AFTER_SHIFT_ = 0x1F;
685:
686: /**
687: * <p>Numerator power limit.
688: * There are special values for huge numbers that are powers of ten.</p>
689: * <p>c version genprops/store.c documents:
690: * if numericValue = 0x7fffff00 + x then numericValue = 10 ^ x</p>
691: */
692: private static final int NUMERATOR_POWER_LIMIT_ = 0x7fffff00;
693: /**
694: * Integer properties mask and shift values for joining type.
695: * Equivalent to icu4c UPROPS_JT_MASK.
696: */
697: private static final int JOINING_TYPE_MASK_ = 0x00003800;
698: /**
699: * Integer properties mask and shift values for joining type.
700: * Equivalent to icu4c UPROPS_JT_SHIFT.
701: */
702: private static final int JOINING_TYPE_SHIFT_ = 11;
703: /**
704: * Integer properties mask and shift values for joining group.
705: * Equivalent to icu4c UPROPS_JG_MASK.
706: */
707: private static final int JOINING_GROUP_MASK_ = 0x000007e0;
708: /**
709: * Integer properties mask and shift values for joining group.
710: * Equivalent to icu4c UPROPS_JG_SHIFT.
711: */
712: private static final int JOINING_GROUP_SHIFT_ = 5;
713: /**
714: * Integer properties mask for decomposition type.
715: * Equivalent to icu4c UPROPS_DT_MASK.
716: */
717: private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
718: /**
719: * Integer properties mask and shift values for East Asian cell width.
720: * Equivalent to icu4c UPROPS_EA_MASK
721: */
722: private static final int EAST_ASIAN_MASK_ = 0x00038000;
723: /**
724: * Integer properties mask and shift values for East Asian cell width.
725: * Equivalent to icu4c UPROPS_EA_SHIFT
726: */
727: private static final int EAST_ASIAN_SHIFT_ = 15;
728:
729: /**
730: * Integer properties mask and shift values for line breaks.
731: * Equivalent to icu4c UPROPS_LB_MASK
732: */
733: private static final int LINE_BREAK_MASK_ = 0x007C0000;
734: /**
735: * Integer properties mask and shift values for line breaks.
736: * Equivalent to icu4c UPROPS_LB_SHIFT
737: */
738: private static final int LINE_BREAK_SHIFT_ = 18;
739: /**
740: * Integer properties mask and shift values for blocks.
741: * Equivalent to icu4c UPROPS_BLOCK_MASK
742: */
743: private static final int BLOCK_MASK_ = 0x00007f80;
744: /**
745: * Integer properties mask and shift values for blocks.
746: * Equivalent to icu4c UPROPS_BLOCK_SHIFT
747: */
748: private static final int BLOCK_SHIFT_ = 7;
749: /**
750: * Integer properties mask and shift values for scripts.
751: * Equivalent to icu4c UPROPS_SHIFT_MASK
752: */
753: private static final int SCRIPT_MASK_ = 0x0000007f;
754:
755: // private constructor -----------------------------------------------
756: ///CLOVER:OFF
/**
 * Not instantiable: this class only exposes static members, so the sole
 * constructor is private and does nothing.
 */
private UCharacter() {
    // intentionally empty
}
762:
763: ///CLOVER:ON
764: // private methods ---------------------------------------------------
765:
766: /**
767: * Getting the digit values of characters like 'A' - 'Z', normal,
768: * half-width and full-width. This method assumes that the other digit
769: * characters are checked by the calling method.
770: * @param ch character to test
771: * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
772: * its corresponding digit will be returned.
773: */
774: private static int getEuropeanDigit(int ch) {
775: if ((ch > 0x7a && ch < 0xff21) || ch < 0x41
776: || (ch > 0x5a && ch < 0x61) || ch > 0xff5a
777: || (ch > 0xff31 && ch < 0xff41)) {
778: return -1;
779: }
780: if (ch <= 0x7a) {
781: // ch >= 0x41 or ch < 0x61
782: return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
783: }
784: // ch >= 0xff21
785: if (ch <= 0xff3a) {
786: return ch + 10 - 0xff21;
787: }
788: // ch >= 0xff41 && ch <= 0xff5a
789: return ch + 10 - 0xff41;
790: }
791:
792: /**
793: * Gets the numeric type of the property argument
794: * @param props 32 bit property
795: * @return the numeric type
796: */
797: private static int getNumericType(int props) {
798: return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
799: }
800:
801: /**
802: * Checks if the property value has a exception indicator
803: * @param props 32 bit property value
804: * @return true if property does not have a exception indicator, false
805: * otherwise
806: */
807: private static boolean isNotExceptionIndicator(int props) {
808: return (props & UCharacterProperty.EXCEPTION_MASK) == 0;
809: }
810:
811: /**
812: * Gets the property value at the index.
813: * This is optimized.
814: * Note this is alittle different from CharTrie the index m_trieData_
815: * is never negative.
816: * This is a duplicate of UCharacterProperty.getProperty. For optimization
817: * purposes, this method calls the trie data directly instead of through
818: * UCharacterProperty.getProperty.
819: * @param ch code point whose property value is to be retrieved
820: * @return property value of code point
821: * @stable ICU 2.6
822: */
823: private static int getProperty(int ch) {
824: if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
825: || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
826: // BMP codepoint
827: try { // using try for < 0 ch is faster than using an if statement
828: return PROPERTY_DATA_[PROPERTY_TRIE_DATA_[(PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
829: + (ch & 0x1f)]];
830: } catch (ArrayIndexOutOfBoundsException e) {
831: return PROPERTY_INITIAL_VALUE_;
832: }
833: }
834: if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
835: // surrogate
836: return PROPERTY_DATA_[PROPERTY_TRIE_DATA_[(PROPERTY_TRIE_INDEX_[(0x2800 >> 5)
837: + (ch >> 5)] << 2)
838: + (ch & 0x1f)]];
839: }
840: // for optimization
841: if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
842: // look at the construction of supplementary characters
843: // trail forms the ends of it.
844: return PROPERTY_DATA_[PROPERTY_.m_trie_.getSurrogateValue(
845: UTF16.getLeadSurrogate(ch), (char) (ch & 0x3ff))];
846: }
847: // return m_dataOffset_ if there is an error, in this case we return
848: // the default value: m_initialValue_
849: // we cannot assume that m_initialValue_ is at offset 0
850: // this is for optimization.
851: return PROPERTY_INITIAL_VALUE_;
852: }
853: }
|