0001: /*
0002: * Portions Copyright 2001-2006 Sun Microsystems, Inc. All Rights Reserved.
0003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0004: *
0005: * This code is free software; you can redistribute it and/or modify it
0006: * under the terms of the GNU General Public License version 2 only, as
0007: * published by the Free Software Foundation. Sun designates this
0008: * particular file as subject to the "Classpath" exception as provided
0009: * by Sun in the LICENSE file that accompanied this code.
0010: *
0011: * This code is distributed in the hope that it will be useful, but WITHOUT
0012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
0014: * version 2 for more details (a copy is included in the LICENSE file that
0015: * accompanied this code).
0016: *
0017: * You should have received a copy of the GNU General Public License version
0018: * 2 along with this work; if not, write to the Free Software Foundation,
0019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0020: *
0021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
0022: * CA 95054 USA or visit www.sun.com if you need additional information or
0023: * have any questions.
0024: */
0025:
0026: /*
0027: *******************************************************************************
0028: * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
0029: * *
0030: * The original version of this source code and documentation is copyrighted *
0031: * and owned by IBM, These materials are provided under terms of a License *
0032: * Agreement between IBM and Sun. This technology is protected by multiple *
0033: * US and International patents. This notice and attribution to IBM may not *
0034: * to removed. *
0035: *******************************************************************************
0036: */
0037:
0038: package sun.text.normalizer;
0039:
0040: import java.text.CharacterIterator;
0041: import java.text.Normalizer;
0042:
0043: /**
0044: * Unicode Normalization
0045: *
0046: * <h2>Unicode normalization API</h2>
0047: *
0048: * <code>normalize</code> transforms Unicode text into an equivalent composed or
0049: * decomposed form, allowing for easier sorting and searching of text.
0050: * <code>normalize</code> supports the standard normalization forms described in
0051: * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
0052: * Unicode Standard Annex #15 — Unicode Normalization Forms</a>.
0053: *
0054: * Characters with accents or other adornments can be encoded in
0055: * several different ways in Unicode. For example, take the character A-acute.
0056: * In Unicode, this can be encoded as a single character (the
0057: * "composed" form):
0058: *
0059: * <p>
0060: * 00C1 LATIN CAPITAL LETTER A WITH ACUTE
0061: * </p>
0062: *
0063: * or as two separate characters (the "decomposed" form):
0064: *
0065: * <p>
0066: * 0041 LATIN CAPITAL LETTER A
0067: * 0301 COMBINING ACUTE ACCENT
0068: * </p>
0069: *
0070: * To a user of your program, however, both of these sequences should be
0071: * treated as the same "user-level" character "A with acute accent". When you
0072: * are searching or comparing text, you must ensure that these two sequences are
0073: * treated equivalently. In addition, you must handle characters with more than
0074: * one accent. Sometimes the order of a character's combining accents is
0075: * significant, while in other cases accent sequences in different orders are
0076: * really equivalent.
0077: *
0078: * Similarly, the string "ffi" can be encoded as three separate letters:
0079: *
0080: * <p>
0081: * 0066 LATIN SMALL LETTER F
0082: * 0066 LATIN SMALL LETTER F
0083: * 0069 LATIN SMALL LETTER I
0084: * </p>
0085: *
0086: * or as the single character
0087: *
0088: * <p>
0089: * FB03 LATIN SMALL LIGATURE FFI
0090: * </p>
0091: *
0092: * The ffi ligature is not a distinct semantic character, and strictly speaking
0093: * it shouldn't be in Unicode at all, but it was included for compatibility
0094: * with existing character sets that already provided it. The Unicode standard
0095: * identifies such characters by giving them "compatibility" decompositions
0096: * into the corresponding semantic characters. When sorting and searching, you
0097: * will often want to use these mappings.
0098: *
0099: * <code>normalize</code> helps solve these problems by transforming text into
0100: * the canonical composed and decomposed forms as shown in the first example
0101: * above. In addition, you can have it perform compatibility decompositions so
0102: * that you can treat compatibility characters the same as their equivalents.
0103: * Finally, <code>normalize</code> rearranges accents into the proper canonical
0104: * order, so that you do not have to worry about accent rearrangement on your
0105: * own.
0106: *
0107: * Form FCD, "Fast C or D", is also designed for collation.
0108: * It allows an algorithm that works under "canonical closure" (as collation
0109: * does), i.e., one that treats precomposed characters and their decomposed
0110: * equivalents the same, to operate on strings that are not necessarily
0111: * normalized.
0112: *
0113: * It is not a normalization form because it does not provide for uniqueness of
0114: * representation. Multiple strings may be canonically equivalent (their NFDs
0115: * are identical) and may all conform to FCD without being identical themselves.
0116: *
0117: * The form is defined such that the "raw decomposition", the recursive
0118: * canonical decomposition of each character, results in a string that is
0119: * canonically ordered. This means that precomposed characters are allowed
0120: * as long as their decompositions do not need canonical reordering.
0121: *
0122: * Its advantage for a process like collation is that all NFD and most NFC texts
0123: * - and many unnormalized texts - already conform to FCD and do not need to be
0124: * normalized (NFD) for such a process. The FCD quick check will return YES for
0125: * most strings in practice.
0126: *
0127: * normalize(FCD) may be implemented with NFD.
0128: *
0129: * For more details on FCD see the collation design document:
0130: * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
0131: *
0132: * ICU collation performs either NFD or FCD normalization automatically if
0133: * normalization is turned on for the collator object. Beyond collation and
0134: * string search, normalized strings may be useful for string equivalence
0135: * comparisons, transliteration/transcription, unique representations, etc.
0136: *
0137: * The W3C generally recommends exchanging text in NFC.
0138: * Note also that most legacy character encodings use only precomposed forms and
0139: * often do not encode any combining marks by themselves. For conversion to such
0140: * character encodings the Unicode text needs to be normalized to NFC.
0141: * For more usage examples, see the Unicode Standard Annex.
0142: * @stable ICU 2.8
0143: */
0144:
0145: public final class NormalizerBase implements Cloneable {
0146:
0147: //-------------------------------------------------------------------------
0148: // Private data
0149: //-------------------------------------------------------------------------
0150: private char[] buffer = new char[100];
0151: private int bufferStart = 0;
0152: private int bufferPos = 0;
0153: private int bufferLimit = 0;
0154:
0155: // The input text and our position in it
0156: private UCharacterIterator text;
0157: private Mode mode = NFC;
0158: private int options = 0;
0159: private int currentIndex;
0160: private int nextIndex;
0161:
0162: /**
0163: * Options bit set value to select Unicode 3.2 normalization
0164: * (except NormalizationCorrections).
0165: * At most one Unicode version can be selected at a time.
0166: * @stable ICU 2.6
0167: */
0168: public static final int UNICODE_3_2 = 0x20;
0169:
0170: /**
0171: * Constant indicating that the end of the iteration has been reached.
0172: * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
0173: * @stable ICU 2.8
0174: */
0175: public static final int DONE = UCharacterIterator.DONE;
0176:
0177: /**
0178: * Constants for normalization modes.
0179: * @stable ICU 2.8
0180: */
0181: public static class Mode {
0182: private int modeValue;
0183:
0184: private Mode(int value) {
0185: modeValue = value;
0186: }
0187:
0188: /**
0189: * This method is used for method dispatch
0190: * @stable ICU 2.6
0191: */
0192: protected int normalize(char[] src, int srcStart, int srcLimit,
0193: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0194: int srcLen = (srcLimit - srcStart);
0195: int destLen = (destLimit - destStart);
0196: if (srcLen > destLen) {
0197: return srcLen;
0198: }
0199: System.arraycopy(src, srcStart, dest, destStart, srcLen);
0200: return srcLen;
0201: }
0202:
0203: /**
0204: * This method is used for method dispatch
0205: * @stable ICU 2.6
0206: */
0207: protected int normalize(char[] src, int srcStart, int srcLimit,
0208: char[] dest, int destStart, int destLimit, int options) {
0209: return normalize(src, srcStart, srcLimit, dest, destStart,
0210: destLimit, NormalizerImpl.getNX(options));
0211: }
0212:
0213: /**
0214: * This method is used for method dispatch
0215: * @stable ICU 2.6
0216: */
0217: protected String normalize(String src, int options) {
0218: return src;
0219: }
0220:
0221: /**
0222: * This method is used for method dispatch
0223: * @stable ICU 2.8
0224: */
0225: protected int getMinC() {
0226: return -1;
0227: }
0228:
0229: /**
0230: * This method is used for method dispatch
0231: * @stable ICU 2.8
0232: */
0233: protected int getMask() {
0234: return -1;
0235: }
0236:
0237: /**
0238: * This method is used for method dispatch
0239: * @stable ICU 2.8
0240: */
0241: protected IsPrevBoundary getPrevBoundary() {
0242: return null;
0243: }
0244:
0245: /**
0246: * This method is used for method dispatch
0247: * @stable ICU 2.8
0248: */
0249: protected IsNextBoundary getNextBoundary() {
0250: return null;
0251: }
0252:
0253: /**
0254: * This method is used for method dispatch
0255: * @stable ICU 2.6
0256: */
0257: protected QuickCheckResult quickCheck(char[] src, int start,
0258: int limit, boolean allowMaybe, UnicodeSet nx) {
0259: if (allowMaybe) {
0260: return MAYBE;
0261: }
0262: return NO;
0263: }
0264:
0265: /**
0266: * This method is used for method dispatch
0267: * @stable ICU 2.8
0268: */
0269: protected boolean isNFSkippable(int c) {
0270: return true;
0271: }
0272: }
0273:
0274: /**
0275: * No decomposition/composition.
0276: * @stable ICU 2.8
0277: */
0278: public static final Mode NONE = new Mode(1);
0279:
/**
 * Canonical decomposition.
 * @stable ICU 2.8
 */
public static final Mode NFD = new NFDMode(2);

/**
 * Mode behind {@link #NFD}: every hook delegates to the decomposition
 * routines with the compatibility flag switched off.
 */
private static final class NFDMode extends Mode {
    private NFDMode(int value) {
        super(value);
    }

    /** Decomposes the source range via NormalizerImpl.decompose (compat = false). */
    protected int normalize(char[] src, int srcStart, int srcLimit,
            char[] dest, int destStart, int destLimit, UnicodeSet nx) {
        // Single-element array handed to decompose(); its content is not read here.
        final int[] trailCC = new int[1];
        return NormalizerImpl.decompose(src, srcStart, srcLimit,
                dest, destStart, destLimit, false, trailCC, nx);
    }

    /** Decomposes a whole string via NormalizerBase.decompose (compat = false). */
    protected String normalize(String src, int options) {
        return NormalizerBase.decompose(src, false, options);
    }

    /** Returns NormalizerImpl.MIN_WITH_LEAD_CC. */
    protected int getMinC() {
        return NormalizerImpl.MIN_WITH_LEAD_CC;
    }

    /** Returns CC_MASK combined with the NFD quick-check bit. */
    protected int getMask() {
        return (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFD);
    }

    /** Returns a fresh IsPrevNFDSafe boundary test. */
    protected IsPrevBoundary getPrevBoundary() {
        return new IsPrevNFDSafe();
    }

    /** Returns a fresh IsNextNFDSafe boundary test. */
    protected IsNextBoundary getNextBoundary() {
        return new IsNextNFDSafe();
    }

    /** Runs NormalizerImpl.quickCheck with the NFD threshold and mask, options 0. */
    protected QuickCheckResult quickCheck(char[] src, int start,
            int limit, boolean allowMaybe, UnicodeSet nx) {
        final int minNoMaybe = NormalizerImpl
                .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE);
        return NormalizerImpl.quickCheck(src, start, limit, minNoMaybe,
                NormalizerImpl.QC_NFD, 0, allowMaybe, nx);
    }

    /** Asks NormalizerImpl.isNFSkippable using CC_MASK | QC_NFD as the mask. */
    protected boolean isNFSkippable(int c) {
        return NormalizerImpl.isNFSkippable(c, this,
                (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFD));
    }
}
0335:
0336: /**
0337: * Compatibility decomposition.
0338: * @stable ICU 2.8
0339: */
0340: public static final Mode NFKD = new NFKDMode(3);
0341:
0342: private static final class NFKDMode extends Mode {
0343: private NFKDMode(int value) {
0344: super (value);
0345: }
0346:
0347: protected int normalize(char[] src, int srcStart, int srcLimit,
0348: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0349: int[] trailCC = new int[1];
0350: return NormalizerImpl.decompose(src, srcStart, srcLimit,
0351: dest, destStart, destLimit, true, trailCC, nx);
0352: }
0353:
0354: protected String normalize(String src, int options) {
0355: return decompose(src, true, options);
0356: }
0357:
0358: protected int getMinC() {
0359: return NormalizerImpl.MIN_WITH_LEAD_CC;
0360: }
0361:
0362: protected IsPrevBoundary getPrevBoundary() {
0363: return new IsPrevNFDSafe();
0364: }
0365:
0366: protected IsNextBoundary getNextBoundary() {
0367: return new IsNextNFDSafe();
0368: }
0369:
0370: protected int getMask() {
0371: return (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFKD);
0372: }
0373:
0374: protected QuickCheckResult quickCheck(char[] src, int start,
0375: int limit, boolean allowMaybe, UnicodeSet nx) {
0376: return NormalizerImpl
0377: .quickCheck(
0378: src,
0379: start,
0380: limit,
0381: NormalizerImpl
0382: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE),
0383: NormalizerImpl.QC_NFKD,
0384: NormalizerImpl.OPTIONS_COMPAT, allowMaybe,
0385: nx);
0386: }
0387:
0388: protected boolean isNFSkippable(int c) {
0389: return NormalizerImpl.isNFSkippable(c, this ,
0390: (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFKD));
0391: }
0392: }
0393:
0394: /**
0395: * Canonical decomposition followed by canonical composition.
0396: * @stable ICU 2.8
0397: */
0398: public static final Mode NFC = new NFCMode(4);
0399:
0400: private static final class NFCMode extends Mode {
0401: private NFCMode(int value) {
0402: super (value);
0403: }
0404:
0405: protected int normalize(char[] src, int srcStart, int srcLimit,
0406: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0407: return NormalizerImpl.compose(src, srcStart, srcLimit,
0408: dest, destStart, destLimit, 0, nx);
0409: }
0410:
0411: protected String normalize(String src, int options) {
0412: return compose(src, false, options);
0413: }
0414:
0415: protected int getMinC() {
0416: return NormalizerImpl
0417: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE);
0418: }
0419:
0420: protected IsPrevBoundary getPrevBoundary() {
0421: return new IsPrevTrueStarter();
0422: }
0423:
0424: protected IsNextBoundary getNextBoundary() {
0425: return new IsNextTrueStarter();
0426: }
0427:
0428: protected int getMask() {
0429: return (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFC);
0430: }
0431:
0432: protected QuickCheckResult quickCheck(char[] src, int start,
0433: int limit, boolean allowMaybe, UnicodeSet nx) {
0434: return NormalizerImpl
0435: .quickCheck(
0436: src,
0437: start,
0438: limit,
0439: NormalizerImpl
0440: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE),
0441: NormalizerImpl.QC_NFC, 0, allowMaybe, nx);
0442: }
0443:
0444: protected boolean isNFSkippable(int c) {
0445: return NormalizerImpl
0446: .isNFSkippable(
0447: c,
0448: this ,
0449: (NormalizerImpl.CC_MASK
0450: | NormalizerImpl.COMBINES_ANY | (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO)));
0451: }
0452: };
0453:
0454: /**
0455: * Compatibility decomposition followed by canonical composition.
0456: * @stable ICU 2.8
0457: */
0458: public static final Mode NFKC = new NFKCMode(5);
0459:
0460: private static final class NFKCMode extends Mode {
0461: private NFKCMode(int value) {
0462: super (value);
0463: }
0464:
0465: protected int normalize(char[] src, int srcStart, int srcLimit,
0466: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0467: return NormalizerImpl.compose(src, srcStart, srcLimit,
0468: dest, destStart, destLimit,
0469: NormalizerImpl.OPTIONS_COMPAT, nx);
0470: }
0471:
0472: protected String normalize(String src, int options) {
0473: return compose(src, true, options);
0474: }
0475:
0476: protected int getMinC() {
0477: return NormalizerImpl
0478: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE);
0479: }
0480:
0481: protected IsPrevBoundary getPrevBoundary() {
0482: return new IsPrevTrueStarter();
0483: }
0484:
0485: protected IsNextBoundary getNextBoundary() {
0486: return new IsNextTrueStarter();
0487: }
0488:
0489: protected int getMask() {
0490: return (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFKC);
0491: }
0492:
0493: protected QuickCheckResult quickCheck(char[] src, int start,
0494: int limit, boolean allowMaybe, UnicodeSet nx) {
0495: return NormalizerImpl
0496: .quickCheck(
0497: src,
0498: start,
0499: limit,
0500: NormalizerImpl
0501: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE),
0502: NormalizerImpl.QC_NFKC,
0503: NormalizerImpl.OPTIONS_COMPAT, allowMaybe,
0504: nx);
0505: }
0506:
0507: protected boolean isNFSkippable(int c) {
0508: return NormalizerImpl
0509: .isNFSkippable(
0510: c,
0511: this ,
0512: (NormalizerImpl.CC_MASK
0513: | NormalizerImpl.COMBINES_ANY | (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO)));
0514: }
0515: };
0516:
0517: /**
0518: * Result values for quickCheck().
0519: * For details see Unicode Technical Report 15.
0520: * @stable ICU 2.8
0521: */
0522: public static final class QuickCheckResult {
0523: private int resultValue;
0524:
0525: private QuickCheckResult(int value) {
0526: resultValue = value;
0527: }
0528: }
0529:
0530: /**
0531: * Indicates that string is not in the normalized format
0532: * @stable ICU 2.8
0533: */
0534: public static final QuickCheckResult NO = new QuickCheckResult(0);
0535:
0536: /**
0537: * Indicates that string is in the normalized format
0538: * @stable ICU 2.8
0539: */
0540: public static final QuickCheckResult YES = new QuickCheckResult(1);
0541:
0542: /**
0543: * Indicates it cannot be determined if string is in the normalized
0544: * format without further thorough checks.
0545: * @stable ICU 2.8
0546: */
0547: public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
0548:
0549: //-------------------------------------------------------------------------
0550: // Constructors
0551: //-------------------------------------------------------------------------
0552:
/**
 * Creates a new {@code Normalizer} object for iterating over the
 * normalized form of a given string.
 * <p>
 * The {@code opt} parameter specifies which optional
 * {@code Normalizer} features are to be enabled for this object.
 *
 * @param str  The string to be normalized. The normalization
 *             will start at the beginning of the string.
 *
 * @param mode The normalization mode.
 *
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 * @stable ICU 2.6
 */
public NormalizerBase(String str, Mode mode, int opt) {
    // The string is wrapped in an iterator rather than stored directly.
    text = UCharacterIterator.getInstance(str);
    options = opt;
    this.mode = mode;
}
0576:
/**
 * Creates a new {@code Normalizer} object for iterating over the
 * normalized form of the given text, using {@code UNICODE_LATEST} as
 * the options value.
 *
 * @param iter The input text to be normalized. The normalization
 *             will start at the beginning of the string.
 *
 * @param mode The normalization mode.
 */
public NormalizerBase(CharacterIterator iter, Mode mode) {
    this(iter, mode, UNICODE_LATEST);
}
0589:
0590: /**
0591: * Creates a new <tt>Normalizer</tt> object for iterating over the
0592: * normalized form of the given text.
0593: * <p>
0594: * @param iter The input text to be normalized. The normalization
0595: * will start at the beginning of the string.
0596: *
0597: * @param mode The normalization mode.
0598: *
0599: * @param opt Any optional features to be enabled.
0600: * Currently the only available option is {@link #UNICODE_3_2}.
0601: * If you want the default behavior corresponding to one of the
0602: * standard Unicode Normalization Forms, use 0 for this argument.
0603: * @stable ICU 2.6
0604: */
0605: public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
0606: this .text = UCharacterIterator
0607: .getInstance((CharacterIterator) iter.clone());
0608: this .mode = mode;
0609: this .options = opt;
0610: }
0611:
0612: /**
0613: * Clones this <tt>Normalizer</tt> object. All properties of this
0614: * object are duplicated in the new object, including the cloning of any
0615: * {@link CharacterIterator} that was passed in to the constructor
0616: * or to {@link #setText(CharacterIterator) setText}.
0617: * However, the text storage underlying
0618: * the <tt>CharacterIterator</tt> is not duplicated unless the
0619: * iterator's <tt>clone</tt> method does so.
0620: * @stable ICU 2.8
0621: */
0622: public Object clone() {
0623: try {
0624: NormalizerBase copy = (NormalizerBase) super .clone();
0625: copy.text = (UCharacterIterator) text.clone();
0626: //clone the internal buffer
0627: if (buffer != null) {
0628: copy.buffer = new char[buffer.length];
0629: System.arraycopy(buffer, 0, copy.buffer, 0,
0630: buffer.length);
0631: }
0632: return copy;
0633: } catch (CloneNotSupportedException e) {
0634: throw new InternalError(e.toString());
0635: }
0636: }
0637:
0638: //--------------------------------------------------------------------------
0639: // Static Utility methods
0640: //--------------------------------------------------------------------------
0641:
0642: /**
0643: * Compose a string.
0644: * The string will be composed to according the the specified mode.
0645: * @param str The string to compose.
0646: * @param compat If true the string will be composed accoding to
0647: * NFKC rules and if false will be composed according to
0648: * NFC rules.
0649: * @param options The only recognized option is UNICODE_3_2
0650: * @return String The composed string
0651: * @stable ICU 2.6
0652: */
0653: public static String compose(String str, boolean compat, int options) {
0654:
0655: char[] dest, src;
0656: if (options == UNICODE_3_2_0_ORIGINAL) {
0657: String mappedStr = NormalizerImpl.convert(str);
0658: dest = new char[mappedStr.length() * MAX_BUF_SIZE_COMPOSE];
0659: src = mappedStr.toCharArray();
0660: } else {
0661: dest = new char[str.length() * MAX_BUF_SIZE_COMPOSE];
0662: src = str.toCharArray();
0663: }
0664: int destSize = 0;
0665:
0666: UnicodeSet nx = NormalizerImpl.getNX(options);
0667:
0668: /* reset options bits that should only be set here or inside compose() */
0669: options &= ~(NormalizerImpl.OPTIONS_SETS_MASK
0670: | NormalizerImpl.OPTIONS_COMPAT | NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
0671:
0672: if (compat) {
0673: options |= NormalizerImpl.OPTIONS_COMPAT;
0674: }
0675:
0676: for (;;) {
0677: destSize = NormalizerImpl.compose(src, 0, src.length, dest,
0678: 0, dest.length, options, nx);
0679: if (destSize <= dest.length) {
0680: return new String(dest, 0, destSize);
0681: } else {
0682: dest = new char[destSize];
0683: }
0684: }
0685: }
0686:
0687: private static final int MAX_BUF_SIZE_COMPOSE = 2;
0688: private static final int MAX_BUF_SIZE_DECOMPOSE = 3;
0689:
0690: /**
0691: * Decompose a string.
0692: * The string will be decomposed to according the the specified mode.
0693: * @param str The string to decompose.
0694: * @param compat If true the string will be decomposed accoding to NFKD
0695: * rules and if false will be decomposed according to NFD
0696: * rules.
0697: * @return String The decomposed string
0698: * @stable ICU 2.8
0699: */
0700: public static String decompose(String str, boolean compat) {
0701: return decompose(str, compat, UNICODE_LATEST);
0702: }
0703:
0704: /**
0705: * Decompose a string.
0706: * The string will be decomposed to according the the specified mode.
0707: * @param str The string to decompose.
0708: * @param compat If true the string will be decomposed accoding to NFKD
0709: * rules and if false will be decomposed according to NFD
0710: * rules.
0711: * @param options The normalization options, ORed together (0 for no options).
0712: * @return String The decomposed string
0713: * @stable ICU 2.6
0714: */
0715: public static String decompose(String str, boolean compat,
0716: int options) {
0717:
0718: int[] trailCC = new int[1];
0719: int destSize = 0;
0720: UnicodeSet nx = NormalizerImpl.getNX(options);
0721: char[] dest;
0722:
0723: if (options == UNICODE_3_2_0_ORIGINAL) {
0724: String mappedStr = NormalizerImpl.convert(str);
0725: dest = new char[mappedStr.length() * MAX_BUF_SIZE_DECOMPOSE];
0726:
0727: for (;;) {
0728: destSize = NormalizerImpl.decompose(mappedStr
0729: .toCharArray(), 0, mappedStr.length(), dest, 0,
0730: dest.length, compat, trailCC, nx);
0731: if (destSize <= dest.length) {
0732: return new String(dest, 0, destSize);
0733: } else {
0734: dest = new char[destSize];
0735: }
0736: }
0737: } else {
0738: dest = new char[str.length() * MAX_BUF_SIZE_DECOMPOSE];
0739:
0740: for (;;) {
0741: destSize = NormalizerImpl.decompose(str.toCharArray(),
0742: 0, str.length(), dest, 0, dest.length, compat,
0743: trailCC, nx);
0744: if (destSize <= dest.length) {
0745: return new String(dest, 0, destSize);
0746: } else {
0747: dest = new char[destSize];
0748: }
0749: }
0750: }
0751: }
0752:
0753: /**
0754: * Normalize a string.
0755: * The string will be normalized according the the specified normalization
0756: * mode and options.
0757: * @param src The char array to compose.
0758: * @param srcStart Start index of the source
0759: * @param srcLimit Limit index of the source
0760: * @param dest The char buffer to fill in
0761: * @param destStart Start index of the destination buffer
0762: * @param destLimit End index of the destination buffer
0763: * @param mode The normalization mode; one of Normalizer.NONE,
0764: * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
0765: * Normalizer.NFKD, Normalizer.DEFAULT
0766: * @param options The normalization options, ORed together (0 for no options).
0767: * @return int The total buffer size needed;if greater than length of
0768: * result, the output was truncated.
0769: * @exception IndexOutOfBoundsException if the target capacity is
0770: * less than the required length
0771: * @stable ICU 2.6
0772: */
0773: public static int normalize(char[] src, int srcStart, int srcLimit,
0774: char[] dest, int destStart, int destLimit, Mode mode,
0775: int options) {
0776: int length = mode.normalize(src, srcStart, srcLimit, dest,
0777: destStart, destLimit, options);
0778:
0779: if (length <= (destLimit - destStart)) {
0780: return length;
0781: } else {
0782: throw new IndexOutOfBoundsException(Integer
0783: .toString(length));
0784: }
0785: }
0786:
0787: //-------------------------------------------------------------------------
0788: // Iteration API
0789: //-------------------------------------------------------------------------
0790:
0791: /**
0792: * Return the current character in the normalized text->
0793: * @return The codepoint as an int
0794: * @stable ICU 2.8
0795: */
0796: public int current() {
0797: if (bufferPos < bufferLimit || nextNormalize()) {
0798: return getCodePointAt(bufferPos);
0799: } else {
0800: return DONE;
0801: }
0802: }
0803:
0804: /**
0805: * Return the next character in the normalized text and advance
0806: * the iteration position by one. If the end
0807: * of the text has already been reached, {@link #DONE} is returned.
0808: * @return The codepoint as an int
0809: * @stable ICU 2.8
0810: */
0811: public int next() {
0812: if (bufferPos < bufferLimit || nextNormalize()) {
0813: int c = getCodePointAt(bufferPos);
0814: bufferPos += (c > 0xFFFF) ? 2 : 1;
0815: return c;
0816: } else {
0817: return DONE;
0818: }
0819: }
0820:
0821: /**
0822: * Return the previous character in the normalized text and decrement
0823: * the iteration position by one. If the beginning
0824: * of the text has already been reached, {@link #DONE} is returned.
0825: * @return The codepoint as an int
0826: * @stable ICU 2.8
0827: */
0828: public int previous() {
0829: if (bufferPos > 0 || previousNormalize()) {
0830: int c = getCodePointAt(bufferPos - 1);
0831: bufferPos -= (c > 0xFFFF) ? 2 : 1;
0832: return c;
0833: } else {
0834: return DONE;
0835: }
0836: }
0837:
0838: /**
0839: * Reset the index to the beginning of the text.
0840: * This is equivalent to setIndexOnly(startIndex)).
0841: * @stable ICU 2.8
0842: */
0843: public void reset() {
0844: text.setIndex(0);
0845: currentIndex = nextIndex = 0;
0846: clearBuffer();
0847: }
0848:
0849: /**
0850: * Set the iteration position in the input text that is being normalized,
0851: * without any immediate normalization.
0852: * After setIndexOnly(), getIndex() will return the same index that is
0853: * specified here.
0854: *
0855: * @param index the desired index in the input text.
0856: * @stable ICU 2.8
0857: */
0858: public void setIndexOnly(int index) {
0859: text.setIndex(index);
0860: currentIndex = nextIndex = index; // validates index
0861: clearBuffer();
0862: }
0863:
0864: /**
0865: * Set the iteration position in the input text that is being normalized
0866: * and return the first normalized character at that position.
0867: * <p>
0868: * <b>Note:</b> This method sets the position in the <em>input</em> text,
0869: * while {@link #next} and {@link #previous} iterate through characters
0870: * in the normalized <em>output</em>. This means that there is not
0871: * necessarily a one-to-one correspondence between characters returned
0872: * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
0873: * returned from <tt>setIndex</tt> and {@link #getIndex}.
0874: * <p>
0875: * @param index the desired index in the input text->
0876: *
0877: * @return the first normalized character that is the result of iterating
0878: * forward starting at the given index.
0879: *
0880: * @throws IllegalArgumentException if the given index is less than
0881: * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
0882: * @return The codepoint as an int
0883: * @deprecated ICU 3.2
0884: * @obsolete ICU 3.2
0885: */
0886: public int setIndex(int index) {
0887: setIndexOnly(index);
0888: return current();
0889: }
0890:
0891: /**
0892: * Retrieve the index of the start of the input text. This is the begin
0893: * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
0894: * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
0895: * @deprecated ICU 2.2. Use startIndex() instead.
0896: * @return The codepoint as an int
0897: * @see #startIndex
0898: */
0899: public int getBeginIndex() {
0900: return 0;
0901: }
0902:
0903: /**
0904: * Retrieve the index of the end of the input text. This is the end index
0905: * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
0906: * over which this <tt>Normalizer</tt> is iterating
0907: * @deprecated ICU 2.2. Use endIndex() instead.
0908: * @return The codepoint as an int
0909: * @see #endIndex
0910: */
0911: public int getEndIndex() {
0912: return endIndex();
0913: }
0914:
0915: /**
0916: * Retrieve the current iteration position in the input text that is
0917: * being normalized. This method is useful in applications such as
0918: * searching, where you need to be able to determine the position in
0919: * the input text that corresponds to a given normalized output character.
0920: * <p>
0921: * <b>Note:</b> This method sets the position in the <em>input</em>, while
0922: * {@link #next} and {@link #previous} iterate through characters in the
0923: * <em>output</em>. This means that there is not necessarily a one-to-one
0924: * correspondence between characters returned by <tt>next</tt> and
0925: * <tt>previous</tt> and the indices passed to and returned from
0926: * <tt>setIndex</tt> and {@link #getIndex}.
0927: * @return The current iteration position
0928: * @stable ICU 2.8
0929: */
0930: public int getIndex() {
0931: if (bufferPos < bufferLimit) {
0932: return currentIndex;
0933: } else {
0934: return nextIndex;
0935: }
0936: }
0937:
/**
 * Retrieve the index of the end of the input text. This is the end index
 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating.
 * @return the end index of the input text, i.e. the length reported by
 *         the underlying text iterator
 * @stable ICU 2.8
 */
public int endIndex() {
    return text.getLength();
}
0948:
0949: //-------------------------------------------------------------------------
0950: // Property access methods
0951: //-------------------------------------------------------------------------
/**
 * Set the normalization mode for this object.
 * <p>
 * <b>Note:</b> If the normalization mode is changed while iterating
 * over a string, calls to {@link #next} and {@link #previous} may
 * return previously buffered characters in the old normalization mode
 * until the iteration is able to re-sync at the next base character.
 * It is safest to call {@link #setText setText()}, {@link #first},
 * {@link #last}, etc. after calling <tt>setMode</tt>.
 * <p>
 * This method only stores the new mode; it does not touch the buffer or
 * the iteration position.
 * <p>
 * @param newMode the new mode for this <tt>Normalizer</tt>.
 * The supported modes are:
 * <ul>
 * <li>{@link #COMPOSE} - Unicode canonical decomposition
 * followed by canonical composition.
 * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
 * followed by canonical composition.
 * <li>{@link #DECOMP} - Unicode canonical decomposition
 * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition.
 * <li>{@link #NO_OP} - Do nothing but return characters
 * from the underlying input text.
 * </ul>
 *
 * @see #getMode
 * @stable ICU 2.8
 */
public void setMode(Mode newMode) {
    mode = newMode;
}
0981:
/**
 * Return the basic operation performed by this <tt>Normalizer</tt>.
 *
 * @return the mode currently stored in this object, i.e. the value most
 *         recently assigned to it (for example through {@link #setMode})
 * @see #setMode
 * @stable ICU 2.8
 */
public Mode getMode() {
    return mode;
}
0991:
0992: /**
0993: * Set the input text over which this <tt>Normalizer</tt> will iterate.
0994: * The iteration position is set to the beginning of the input text->
0995: * @param newText The new string to be normalized.
0996: * @stable ICU 2.8
0997: */
0998: public void setText(String newText) {
0999:
1000: UCharacterIterator newIter = UCharacterIterator
1001: .getInstance(newText);
1002: if (newIter == null) {
1003: throw new InternalError(
1004: "Could not create a new UCharacterIterator");
1005: }
1006: text = newIter;
1007: reset();
1008: }
1009:
1010: /**
1011: * Set the input text over which this <tt>Normalizer</tt> will iterate.
1012: * The iteration position is set to the beginning of the input text->
1013: * @param newText The new string to be normalized.
1014: * @stable ICU 2.8
1015: */
1016: public void setText(CharacterIterator newText) {
1017:
1018: UCharacterIterator newIter = UCharacterIterator
1019: .getInstance(newText);
1020: if (newIter == null) {
1021: throw new InternalError(
1022: "Could not create a new UCharacterIterator");
1023: }
1024: text = newIter;
1025: currentIndex = nextIndex = 0;
1026: clearBuffer();
1027: }
1028:
1029: //-------------------------------------------------------------------------
1030: // Private utility methods
1031: //-------------------------------------------------------------------------
1032:
1033: /* backward iteration --------------------------------------------------- */
1034:
1035: /*
1036: * read backwards and get norm32
1037: * return 0 if the character is <minC
1038: * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
1039: * surrogate but read second!)
1040: */
1041:
1042: private static long getPrevNorm32(UCharacterIterator src,
1043: int/*unsigned*/minC, int/*unsigned*/mask, char[] chars) {
1044: long norm32;
1045: int ch = 0;
1046: /* need src.hasPrevious() */
1047: if ((ch = src.previous()) == UCharacterIterator.DONE) {
1048: return 0;
1049: }
1050: chars[0] = (char) ch;
1051: chars[1] = 0;
1052:
1053: /* check for a surrogate before getting norm32 to see if we need to
1054: * predecrement further */
1055: if (chars[0] < minC) {
1056: return 0;
1057: } else if (!UTF16.isSurrogate(chars[0])) {
1058: return NormalizerImpl.getNorm32(chars[0]);
1059: } else if (UTF16.isLeadSurrogate(chars[0])
1060: || (src.getIndex() == 0)) {
1061: /* unpaired surrogate */
1062: chars[1] = (char) src.current();
1063: return 0;
1064: } else if (UTF16.isLeadSurrogate(chars[1] = (char) src
1065: .previous())) {
1066: norm32 = NormalizerImpl.getNorm32(chars[1]);
1067: if ((norm32 & mask) == 0) {
1068: /* all surrogate pairs with this lead surrogate have irrelevant
1069: * data */
1070: return 0;
1071: } else {
1072: /* norm32 must be a surrogate special */
1073: return NormalizerImpl.getNorm32FromSurrogatePair(
1074: norm32, chars[0]);
1075: }
1076: } else {
1077: /* unpaired second surrogate, undo the c2=src.previous() movement */
1078: src.moveIndex(1);
1079: return 0;
1080: }
1081: }
1082:
1083: private interface IsPrevBoundary {
1084: public boolean isPrevBoundary(UCharacterIterator src,
1085: int/*unsigned*/minC, int/*unsigned*/mask, char[] chars);
1086: }
1087:
1088: private static final class IsPrevNFDSafe implements IsPrevBoundary {
1089: /*
1090: * for NF*D:
1091: * read backwards and check if the lead combining class is 0
1092: * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
1093: * surrogate but read second!)
1094: */
1095: public boolean isPrevBoundary(UCharacterIterator src,
1096: int/*unsigned*/minC, int/*unsigned*/ccOrQCMask,
1097: char[] chars) {
1098:
1099: return NormalizerImpl.isNFDSafe(getPrevNorm32(src, minC,
1100: ccOrQCMask, chars), ccOrQCMask, ccOrQCMask
1101: & NormalizerImpl.QC_MASK);
1102: }
1103: }
1104:
1105: private static final class IsPrevTrueStarter implements
1106: IsPrevBoundary {
1107: /*
1108: * read backwards and check if the character is (or its decomposition
1109: * begins with) a "true starter" (cc==0 and NF*C_YES)
1110: * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
1111: * surrogate but read second!)
1112: */
1113: public boolean isPrevBoundary(UCharacterIterator src,
1114: int/*unsigned*/minC, int/*unsigned*/ccOrQCMask,
1115: char[] chars) {
1116: long norm32;
1117: int/*unsigned*/decompQCMask;
1118:
1119: decompQCMask = (ccOrQCMask << 2) & 0xf; /*decomposition quick check mask*/
1120: norm32 = getPrevNorm32(src, minC,
1121: ccOrQCMask | decompQCMask, chars);
1122: return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask,
1123: decompQCMask);
1124: }
1125: }
1126:
1127: private static int findPreviousIterationBoundary(
1128: UCharacterIterator src, IsPrevBoundary obj,
1129: int/*unsigned*/minC, int/*mask*/mask, char[] buffer,
1130: int[] startIndex) {
1131: char[] chars = new char[2];
1132: boolean isBoundary;
1133:
1134: /* fill the buffer from the end backwards */
1135: startIndex[0] = buffer.length;
1136: chars[0] = 0;
1137: while (src.getIndex() > 0
1138: && chars[0] != UCharacterIterator.DONE) {
1139: isBoundary = obj.isPrevBoundary(src, minC, mask, chars);
1140:
1141: /* always write this character to the front of the buffer */
1142: /* make sure there is enough space in the buffer */
1143: if (startIndex[0] < (chars[1] == 0 ? 1 : 2)) {
1144:
1145: // grow the buffer
1146: char[] newBuf = new char[buffer.length * 2];
1147: /* move the current buffer contents up */
1148: System
1149: .arraycopy(
1150: buffer,
1151: startIndex[0],
1152: newBuf,
1153: newBuf.length
1154: - (buffer.length - startIndex[0]),
1155: buffer.length - startIndex[0]);
1156: //adjust the startIndex
1157: startIndex[0] += newBuf.length - buffer.length;
1158:
1159: buffer = newBuf;
1160: newBuf = null;
1161:
1162: }
1163:
1164: buffer[--startIndex[0]] = chars[0];
1165: if (chars[1] != 0) {
1166: buffer[--startIndex[0]] = chars[1];
1167: }
1168:
1169: /* stop if this just-copied character is a boundary */
1170: if (isBoundary) {
1171: break;
1172: }
1173: }
1174:
1175: /* return the length of the buffer contents */
1176: return buffer.length - startIndex[0];
1177: }
1178:
1179: private static int previous(UCharacterIterator src, char[] dest,
1180: int destStart, int destLimit, Mode mode,
1181: boolean doNormalize, boolean[] pNeededToNormalize,
1182: int options) {
1183:
1184: IsPrevBoundary isPreviousBoundary;
1185: int destLength, bufferLength;
1186: int/*unsigned*/mask;
1187: int c, c2;
1188:
1189: char minC;
1190: int destCapacity = destLimit - destStart;
1191: destLength = 0;
1192:
1193: if (pNeededToNormalize != null) {
1194: pNeededToNormalize[0] = false;
1195: }
1196: minC = (char) mode.getMinC();
1197: mask = mode.getMask();
1198: isPreviousBoundary = mode.getPrevBoundary();
1199:
1200: if (isPreviousBoundary == null) {
1201: destLength = 0;
1202: if ((c = src.previous()) >= 0) {
1203: destLength = 1;
1204: if (UTF16.isTrailSurrogate((char) c)) {
1205: c2 = src.previous();
1206: if (c2 != UCharacterIterator.DONE) {
1207: if (UTF16.isLeadSurrogate((char) c2)) {
1208: if (destCapacity >= 2) {
1209: dest[1] = (char) c; // trail surrogate
1210: destLength = 2;
1211: }
1212: // lead surrogate to be written below
1213: c = c2;
1214: } else {
1215: src.moveIndex(1);
1216: }
1217: }
1218: }
1219:
1220: if (destCapacity > 0) {
1221: dest[0] = (char) c;
1222: }
1223: }
1224: return destLength;
1225: }
1226:
1227: char[] buffer = new char[100];
1228: int[] startIndex = new int[1];
1229: bufferLength = findPreviousIterationBoundary(src,
1230: isPreviousBoundary, minC, mask, buffer, startIndex);
1231: if (bufferLength > 0) {
1232: if (doNormalize) {
1233: destLength = NormalizerBase.normalize(buffer,
1234: startIndex[0], startIndex[0] + bufferLength,
1235: dest, destStart, destLimit, mode, options);
1236:
1237: if (pNeededToNormalize != null) {
1238: pNeededToNormalize[0] = (boolean) (destLength != bufferLength || Utility
1239: .arrayRegionMatches(buffer, 0, dest,
1240: destStart, destLimit));
1241: }
1242: } else {
1243: /* just copy the source characters */
1244: if (destCapacity > 0) {
1245: System
1246: .arraycopy(
1247: buffer,
1248: startIndex[0],
1249: dest,
1250: 0,
1251: (bufferLength < destCapacity) ? bufferLength
1252: : destCapacity);
1253: }
1254: }
1255: }
1256:
1257: return destLength;
1258: }
1259:
1260: /* forward iteration ---------------------------------------------------- */
/*
 * Strategy used while scanning forwards:
 * read forward and check if the character is a next-iteration boundary.
 * The code unit(s) read are stored in chars;
 * if chars[1]!=0 then (chars[0], chars[1]) is a surrogate pair.
 * minC and mask are handed to the implementation unchanged.
 */
private interface IsNextBoundary {
    boolean isNextBoundary(UCharacterIterator src,
    int/*unsigned*/minC, int/*unsigned*/mask, int[] chars);
}
1269:
1270: /*
1271: * read forward and get norm32
1272: * return 0 if the character is <minC
1273: * if c2!=0 then (c2, c) is a surrogate pair
1274: * always reads complete characters
1275: */
1276: private static long /*unsigned*/getNextNorm32(
1277: UCharacterIterator src, int/*unsigned*/minC,
1278: int/*unsigned*/mask, int[] chars) {
1279: long norm32;
1280:
1281: /* need src.hasNext() to be true */
1282: chars[0] = src.next();
1283: chars[1] = 0;
1284:
1285: if (chars[0] < minC) {
1286: return 0;
1287: }
1288:
1289: norm32 = NormalizerImpl.getNorm32((char) chars[0]);
1290: if (UTF16.isLeadSurrogate((char) chars[0])) {
1291: if (src.current() != UCharacterIterator.DONE
1292: && UTF16.isTrailSurrogate((char) (chars[1] = src
1293: .current()))) {
1294: src.moveIndex(1); /* skip the c2 surrogate */
1295: if ((norm32 & mask) == 0) {
1296: /* irrelevant data */
1297: return 0;
1298: } else {
1299: /* norm32 must be a surrogate special */
1300: return NormalizerImpl.getNorm32FromSurrogatePair(
1301: norm32, (char) chars[1]);
1302: }
1303: } else {
1304: /* unmatched surrogate */
1305: return 0;
1306: }
1307: }
1308: return norm32;
1309: }
1310:
1311: /*
1312: * for NF*D:
1313: * read forward and check if the lead combining class is 0
1314: * if c2!=0 then (c, c2) is a surrogate pair
1315: */
1316: private static final class IsNextNFDSafe implements IsNextBoundary {
1317: public boolean isNextBoundary(UCharacterIterator src,
1318: int/*unsigned*/minC, int/*unsigned*/ccOrQCMask,
1319: int[] chars) {
1320: return NormalizerImpl.isNFDSafe(getNextNorm32(src, minC,
1321: ccOrQCMask, chars), ccOrQCMask, ccOrQCMask
1322: & NormalizerImpl.QC_MASK);
1323: }
1324: }
1325:
1326: /*
1327: * for NF*C:
1328: * read forward and check if the character is (or its decomposition begins
1329: * with) a "true starter" (cc==0 and NF*C_YES)
1330: * if c2!=0 then (c, c2) is a surrogate pair
1331: */
1332: private static final class IsNextTrueStarter implements
1333: IsNextBoundary {
1334: public boolean isNextBoundary(UCharacterIterator src,
1335: int/*unsigned*/minC, int/*unsigned*/ccOrQCMask,
1336: int[] chars) {
1337: long norm32;
1338: int/*unsigned*/decompQCMask;
1339:
1340: decompQCMask = (ccOrQCMask << 2) & 0xf; /*decomposition quick check mask*/
1341: norm32 = getNextNorm32(src, minC,
1342: ccOrQCMask | decompQCMask, chars);
1343: return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask,
1344: decompQCMask);
1345: }
1346: }
1347:
1348: private static int findNextIterationBoundary(
1349: UCharacterIterator src, IsNextBoundary obj,
1350: int/*unsigned*/minC, int/*unsigned*/mask, char[] buffer) {
1351: if (src.current() == UCharacterIterator.DONE) {
1352: return 0;
1353: }
1354:
1355: /* get one character and ignore its properties */
1356: int[] chars = new int[2];
1357: chars[0] = src.next();
1358: buffer[0] = (char) chars[0];
1359: int bufferIndex = 1;
1360:
1361: if (UTF16.isLeadSurrogate((char) chars[0])
1362: && src.current() != UCharacterIterator.DONE) {
1363: if (UTF16.isTrailSurrogate((char) (chars[1] = src.next()))) {
1364: buffer[bufferIndex++] = (char) chars[1];
1365: } else {
1366: src.moveIndex(-1); /* back out the non-trail-surrogate */
1367: }
1368: }
1369:
1370: /* get all following characters until we see a boundary */
1371: /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff
1372: * is part of the string */
1373: while (src.current() != UCharacterIterator.DONE) {
1374: if (obj.isNextBoundary(src, minC, mask, chars)) {
1375: /* back out the latest movement to stop at the boundary */
1376: src.moveIndex(chars[1] == 0 ? -1 : -2);
1377: break;
1378: } else {
1379: if (bufferIndex + (chars[1] == 0 ? 1 : 2) <= buffer.length) {
1380: buffer[bufferIndex++] = (char) chars[0];
1381: if (chars[1] != 0) {
1382: buffer[bufferIndex++] = (char) chars[1];
1383: }
1384: } else {
1385: char[] newBuf = new char[buffer.length * 2];
1386: System.arraycopy(buffer, 0, newBuf, 0, bufferIndex);
1387: buffer = newBuf;
1388: buffer[bufferIndex++] = (char) chars[0];
1389: if (chars[1] != 0) {
1390: buffer[bufferIndex++] = (char) chars[1];
1391: }
1392: }
1393: }
1394: }
1395:
1396: /* return the length of the buffer contents */
1397: return bufferIndex;
1398: }
1399:
1400: private static int next(UCharacterIterator src, char[] dest,
1401: int destStart, int destLimit, NormalizerBase.Mode mode,
1402: boolean doNormalize, boolean[] pNeededToNormalize,
1403: int options) {
1404:
1405: IsNextBoundary isNextBoundary;
1406: int /*unsigned*/mask;
1407: int /*unsigned*/bufferLength;
1408: int c, c2;
1409: char minC;
1410: int destCapacity = destLimit - destStart;
1411: int destLength = 0;
1412: if (pNeededToNormalize != null) {
1413: pNeededToNormalize[0] = false;
1414: }
1415:
1416: minC = (char) mode.getMinC();
1417: mask = mode.getMask();
1418: isNextBoundary = mode.getNextBoundary();
1419:
1420: if (isNextBoundary == null) {
1421: destLength = 0;
1422: c = src.next();
1423: if (c != UCharacterIterator.DONE) {
1424: destLength = 1;
1425: if (UTF16.isLeadSurrogate((char) c)) {
1426: c2 = src.next();
1427: if (c2 != UCharacterIterator.DONE) {
1428: if (UTF16.isTrailSurrogate((char) c2)) {
1429: if (destCapacity >= 2) {
1430: dest[1] = (char) c2; // trail surrogate
1431: destLength = 2;
1432: }
1433: // lead surrogate to be written below
1434: } else {
1435: src.moveIndex(-1);
1436: }
1437: }
1438: }
1439:
1440: if (destCapacity > 0) {
1441: dest[0] = (char) c;
1442: }
1443: }
1444: return destLength;
1445: }
1446:
1447: char[] buffer = new char[100];
1448: int[] startIndex = new int[1];
1449: bufferLength = findNextIterationBoundary(src, isNextBoundary,
1450: minC, mask, buffer);
1451: if (bufferLength > 0) {
1452: if (doNormalize) {
1453: destLength = mode.normalize(buffer, startIndex[0],
1454: bufferLength, dest, destStart, destLimit,
1455: options);
1456:
1457: if (pNeededToNormalize != null) {
1458: pNeededToNormalize[0] = (boolean) (destLength != bufferLength || Utility
1459: .arrayRegionMatches(buffer, startIndex[0],
1460: dest, destStart, destLength));
1461: }
1462: } else {
1463: /* just copy the source characters */
1464: if (destCapacity > 0) {
1465: System.arraycopy(buffer, 0, dest, destStart, Math
1466: .min(bufferLength, destCapacity));
1467: }
1468:
1469: }
1470: }
1471: return destLength;
1472: }
1473:
1474: private void clearBuffer() {
1475: bufferLimit = bufferStart = bufferPos = 0;
1476: }
1477:
1478: private boolean nextNormalize() {
1479:
1480: clearBuffer();
1481: currentIndex = nextIndex;
1482: text.setIndex(nextIndex);
1483:
1484: bufferLimit = next(text, buffer, bufferStart, buffer.length,
1485: mode, true, null, options);
1486:
1487: nextIndex = text.getIndex();
1488: return (bufferLimit > 0);
1489: }
1490:
1491: private boolean previousNormalize() {
1492:
1493: clearBuffer();
1494: nextIndex = currentIndex;
1495: text.setIndex(currentIndex);
1496: bufferLimit = previous(text, buffer, bufferStart,
1497: buffer.length, mode, true, null, options);
1498:
1499: currentIndex = text.getIndex();
1500: bufferPos = bufferLimit;
1501: return bufferLimit > 0;
1502: }
1503:
1504: private int getCodePointAt(int index) {
1505: if (UTF16.isSurrogate(buffer[index])) {
1506: if (UTF16.isLeadSurrogate(buffer[index])) {
1507: if ((index + 1) < bufferLimit
1508: && UTF16.isTrailSurrogate(buffer[index + 1])) {
1509: return UCharacterProperty.getRawSupplementary(
1510: buffer[index], buffer[index + 1]);
1511: }
1512: } else if (UTF16.isTrailSurrogate(buffer[index])) {
1513: if (index > 0
1514: && UTF16.isLeadSurrogate(buffer[index - 1])) {
1515: return UCharacterProperty.getRawSupplementary(
1516: buffer[index - 1], buffer[index]);
1517: }
1518: }
1519: }
1520: return buffer[index];
1521:
1522: }
1523:
/**
 * Internal API. Forwards the question to the given mode.
 * @param c the character (as an int) handed to the mode
 * @param mode the mode whose <tt>isNFSkippable</tt> answer is returned
 * @return the result of <tt>mode.isNFSkippable(c)</tt>
 * @internal
 */
public static boolean isNFSkippable(int c, Mode mode) {
    return mode.isNFSkippable(c);
}
1531:
1532: //
1533: // Options
1534: //
1535:
/*
 * Default option for Unicode 3.2.0 normalization.
 * Corrigendum 4 was fixed in Unicode 3.2.0 but isn't supported in
 * IDNA/StringPrep.
 * The public review issue #29 was fixed in Unicode 4.1.0. Corrigendum 5
 * allowed Unicode 3.2 to 4.0.1 to apply the fix for PRI #29, but it isn't
 * supported by IDNA/StringPrep as well as Corrigendum 4.
 *
 * The value combines UNICODE_3_2 with the two NormalizerImpl flags that
 * switch those corrections off.
 */
public static final int UNICODE_3_2_0_ORIGINAL = UNICODE_3_2
        | NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS
        | NormalizerImpl.BEFORE_PRI_29;
1547:
/*
 * Default option for the latest Unicode normalization. This option is
 * provided mainly for testing.
 * The value zero (no option bits set) means that normalization is done
 * with the fixes for
 * - Corrigendum 4 (Five CJK Canonical Mapping Errors)
 * - Corrigendum 5 (Normalization Idempotency)
 */
public static final int UNICODE_LATEST = 0x00;
1556:
1557: //
1558: // public constructor and methods for java.text.Normalizer and
1559: // sun.text.Normalizer
1560: //
1561:
/**
 * Creates a new <tt>Normalizer</tt> object for iterating over the
 * normalized form of a given string. This is shorthand for the
 * three-argument constructor with {@link #UNICODE_LATEST} as options.
 *
 * @param str The string to be normalized. The normalization
 * will start at the beginning of the string.
 *
 * @param mode The normalization mode.
 */
public NormalizerBase(String str, Mode mode) {
    this (str, mode, UNICODE_LATEST);
}
1574:
/**
 * Normalizes a <code>String</code> using the given normalization form.
 * This is shorthand for the three-argument variant with
 * {@link #UNICODE_LATEST} as options.
 *
 * @param str the input string to be normalized.
 * @param form the normalization form
 * @return the normalized string
 */
public static String normalize(String str, Normalizer.Form form) {
    return normalize(str, form, UNICODE_LATEST);
}
1584:
1585: /**
1586: * Normalizes a <code>String</code> using the given normalization form.
1587: *
1588: * @param str the input string to be normalized.
1589: * @param form the normalization form
1590: * @param options the optional features to be enabled.
1591: */
1592: public static String normalize(String str, Normalizer.Form form,
1593: int options) {
1594: switch (form) {
1595: case NFC:
1596: return NFC.normalize(str, options);
1597: case NFD:
1598: return NFD.normalize(str, options);
1599: case NFKC:
1600: return NFKC.normalize(str, options);
1601: case NFKD:
1602: return NFKD.normalize(str, options);
1603: }
1604:
1605: throw new IllegalArgumentException(
1606: "Unexpected normalization form: " + form);
1607: }
1608:
/**
 * Test if a string is in a given normalization form.
 * This is semantically equivalent to source.equals(normalize(source, mode)).
 *
 * Unlike quickCheck(), this function returns a definitive result,
 * never a "maybe".
 * For NFD, NFKD, and FCD, both functions work exactly the same.
 * For NFC and NFKC where quickCheck may return "maybe", this function will
 * perform further tests to arrive at a true/false result.
 *
 * This is shorthand for the three-argument variant with
 * {@link #UNICODE_LATEST} as options.
 * @param str the input string to be checked to see if it is normalized
 * @param form the normalization form
 * @return the result of the three-argument variant for these arguments
 */
public static boolean isNormalized(String str, Normalizer.Form form) {
    return isNormalized(str, form, UNICODE_LATEST);
}
1625:
1626: /**
1627: * Test if a string is in a given normalization form.
1628: * This is semantically equivalent to source.equals(normalize(source, mode)).
1629: *
1630: * Unlike quickCheck(), this function returns a definitive result,
1631: * never a "maybe".
1632: * For NFD, NFKD, and FCD, both functions work exactly the same.
1633: * For NFC and NFKC where quickCheck may return "maybe", this function will
1634: * perform further tests to arrive at a true/false result.
1635: * @param str the input string to be checked to see if it is normalized
1636: * @param form the normalization form
1637: * @param options the optional features to be enabled.
1638: */
1639: public static boolean isNormalized(String str,
1640: Normalizer.Form form, int options) {
1641: switch (form) {
1642: case NFC:
1643: return (NFC.quickCheck(str.toCharArray(), 0, str.length(),
1644: false, NormalizerImpl.getNX(options)) == YES);
1645: case NFD:
1646: return (NFD.quickCheck(str.toCharArray(), 0, str.length(),
1647: false, NormalizerImpl.getNX(options)) == YES);
1648: case NFKC:
1649: return (NFKC.quickCheck(str.toCharArray(), 0, str.length(),
1650: false, NormalizerImpl.getNX(options)) == YES);
1651: case NFKD:
1652: return (NFKD.quickCheck(str.toCharArray(), 0, str.length(),
1653: false, NormalizerImpl.getNX(options)) == YES);
1654: }
1655:
1656: throw new IllegalArgumentException(
1657: "Unexpected normalization form: " + form);
1658: }
1659: }
|