001: /*
002: * Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025:
026: /*
027: *******************************************************************************
028: * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
029: * *
030: * The original version of this source code and documentation is copyrighted *
031: * and owned by IBM, These materials are provided under terms of a License *
032: * Agreement between IBM and Sun. This technology is protected by multiple *
033: * US and International patents. This notice and attribution to IBM may not *
034: * to removed. *
035: *******************************************************************************
036: */
037:
038: package sun.text.normalizer;
039:
/**
 * <p>Standalone utility class providing UTF16 character conversions and
 * indexing conversions.</p>
 * <p>Code that uses strings alone rarely needs modification.
 * By design, UTF-16 does not allow overlap, so searching for strings is a safe
 * operation. Similarly, concatenation is always safe. Substringing is safe if
 * the start and end are both on UTF-32 boundaries. In normal code, the values
 * for start and end are on those boundaries, since they arose from operations
 * like searching. If not, the nearest UTF-32 boundaries can be determined
 * using <code>bounds()</code> (not defined in this class).</p>
 * <strong>Examples:</strong>
 * <p>The following examples illustrate use of some of these methods.
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i &lt; s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i &gt;= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * int ch;
 * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 * <strong>Notes:</strong>
 * <ul>
 *   <li>
 *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
 *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
 *   sense of their ordering in a string. <code>offset16</code> and
 *   <code>offset32</code> are used to distinguish offsets to UTF-16
 *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
 *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
 *   which is a UTF-16 code unit.
 *   </li>
 *   <li>
 *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
 *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
 *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
 *   back if and only if <code>bounds(string, offset16) != TRAIL</code>
 *   (<code>bounds()</code> is not defined in this class).
 *   </li>
 *   <li>
 *   <strong>Exceptions:</strong> The error checking will throw an exception
 *   if indices are out of bounds. Other than that, all methods will
 *   behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
 *   values are present. <code>UCharacter.isLegal()</code> can be used to check
 *   for validity if desired.
 *   </li>
 *   <li>
 *   <strong>Unmatched Surrogates:</strong> If the string contains unmatched
 *   surrogates, then these are counted as one UTF-32 value. This matches
 *   their iteration behavior, which is vital. It also matches common display
 *   practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
 *   </li>
 *   <li>
 *   <strong>Optimization:</strong> The method implementations may need
 *   optimization if the compiler doesn't fold static final methods. Since
 *   surrogate pairs will form an exceedingly small percentage of all the text
 *   in the world, the singleton case should always be optimized for.
 *   </li>
 * </ul>
 * @author Mark Davis, with help from Markus Scherer
 * @stable ICU 2.1
 */

public final class UTF16 {
    // public variables ---------------------------------------------------

    /**
     * The lowest Unicode code point value.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MIN_VALUE = 0;
    /**
     * The highest Unicode code point value (scalar value) according to the
     * Unicode Standard.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
    /**
     * The minimum value for Supplementary code points
     * @stable ICU 2.1
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
    /**
     * Surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    // public method ------------------------------------------------------

    /**
     * Extract a single UTF-32 value from a string.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as for random access. If a
     * validity check is required, use <code>UCharacter.isLegal()</code> on
     * the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned, regardless of whether offset16 points at
     * the lead or the trail half. If a complete supplementary character is
     * not found, the unmatched surrogate itself is returned.
     * @param source string of UTF-16 code units
     * @param offset16 UTF-16 offset of the code unit to examine
     * @return UTF-32 value of the code point that contains the char at
     *         offset16
     * @throws StringIndexOutOfBoundsException if offset16 is negative or
     *         not less than <code>source.length()</code>
     * @stable ICU 2.1
     */
    public static int charAt(String source, int offset16) {
        if (offset16 < 0 || offset16 >= source.length()) {
            throw new StringIndexOutOfBoundsException(offset16);
        }

        char single = source.charAt(offset16);
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (isLeadSurrogate(single)) {
            int next = offset16 + 1;
            if (next < source.length()) {
                char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) {
                    // both halves were range-checked above
                    return Character.toCodePoint(single, trail);
                }
            }
        } else if (offset16 > 0) {
            // single is a trail surrogate, so look for a preceding lead
            char lead = source.charAt(offset16 - 1);
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Extract a single UTF-32 value from a subarray.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as for random access. If a
     * validity check is required, use <code>UCharacter.isLegal()</code> on
     * the return value.
     * If the char retrieved is part of a surrogate pair that lies entirely
     * within <code>[start, limit)</code>, its supplementary character will be
     * returned. Otherwise the unmatched surrogate itself is returned; chars
     * outside the subarray are never consulted.
     * @param source array of UTF-16 chars
     * @param start inclusive start index of the subarray within source
     * @param limit exclusive end index of the subarray within source
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value of the code point that contains the char at
     *         offset16
     * @throws ArrayIndexOutOfBoundsException if <code>start + offset16</code>
     *         is not within the range <code>[start, limit)</code>
     * @stable ICU 2.1
     */
    public static int charAt(char[] source, int start, int limit,
                             int offset16) {
        offset16 += start; // from here on an absolute index into source
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (isLeadSurrogate(single)) {
            int next = offset16 + 1;
            if (next < limit) {
                char trail = source[next];
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else if (offset16 > start) {
            // single is a trail surrogate, so look for a preceding lead
            char lead = source[offset16 - 1];
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     * @stable ICU 2.1
     */
    public static int getCharCount(int char32) {
        return (char32 < SUPPLEMENTARY_MIN_VALUE) ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Returns the lead surrogate.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     * @param char32 the input character.
     * @return lead surrogate if the getCharCount(ch) is 2; <br>
     *         and 0 otherwise (note: 0 is not a valid lead surrogate).
     * @stable ICU 2.1
     */
    public static char getLeadSurrogate(int char32) {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (LEAD_SURROGATE_OFFSET_
                    + (char32 >> LEAD_SURROGATE_SHIFT_));
        }

        return 0;
    }

    /**
     * Returns the trail surrogate.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     * @param char32 the input character.
     * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
     *         the character itself
     * @stable ICU 2.1
     */
    public static char getTrailSurrogate(int char32) {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (TRAIL_SURROGATE_MIN_VALUE
                    + (char32 & TRAIL_SURROGATE_MASK_));
        }

        return (char) char32;
    }

    /**
     * Convenience method corresponding to String.valueOf(char). Returns a one
     * or two char string containing the UTF-32 value in UTF16 format.
     * @param char32 the input character.
     * @return string value of char32 in UTF16 format
     * @throws IllegalArgumentException if char32 is less than
     *         {@link #CODEPOINT_MIN_VALUE} or greater than
     *         {@link #CODEPOINT_MAX_VALUE}
     * @stable ICU 2.1
     */
    public static String valueOf(int char32) {
        if (char32 < CODEPOINT_MIN_VALUE
                || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint");
        }
        return toString(char32);
    }

    /**
     * Append a single UTF-32 value to the end of a StringBuffer.
     * @param target the buffer to append to
     * @param char32 value to append.
     * @return the updated StringBuffer (the same instance as target)
     * @throws IllegalArgumentException when char32 does not lie
     *         within the range of the Unicode codepoints
     * @stable ICU 2.1
     */
    public static StringBuffer append(StringBuffer target, int char32) {
        // Check for irregular values
        if (char32 < CODEPOINT_MIN_VALUE
                || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint: "
                    + Integer.toHexString(char32));
        }

        // Write the UTF-16 values
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            target.append(getLeadSurrogate(char32));
            target.append(getTrailSurrogate(char32));
        } else {
            target.append((char) char32);
        }
        return target;
    }

    //// for StringPrep
    /**
     * Shifts offset16 by the argument number of codepoints within a subarray.
     * A positive shift32 moves forwards, a negative one backwards, and zero
     * returns offset16 unchanged. A lead/trail pair lying inside the subarray
     * counts as one codepoint; any other surrogate counts as one codepoint
     * by itself.
     * @param source char array
     * @param start inclusive start index of the subarray within source
     * @param limit exclusive end index of the subarray within source
     * @param offset16 UTF16 position to shift relative to start
     * @param shift32 number of codepoints to shift
     * @return new shifted offset16 relative to start
     * @throws StringIndexOutOfBoundsException if the subarray bounds are out
     *         of range for source, if offset16 lies outside the subarray, or
     *         if the subarray does not contain enough codepoints in the
     *         requested direction
     * @stable ICU 2.1
     */
    public static int moveCodePointOffset(char[] source, int start,
                                          int limit, int offset16,
                                          int shift32) {
        int size = source.length;
        int count;
        char ch;
        int result = offset16 + start; // absolute index into source
        if (start < 0 || limit < start) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (limit > size) {
            throw new StringIndexOutOfBoundsException(limit);
        }
        if (offset16 < 0 || result > limit) {
            throw new StringIndexOutOfBoundsException(offset16);
        }
        if (shift32 > 0) {
            // Cheap early rejection: each codepoint needs at least one char.
            if (shift32 + result > size) {
                throw new StringIndexOutOfBoundsException(result);
            }
            count = shift32;
            while (result < limit && count > 0) {
                ch = source[result];
                if (isLeadSurrogate(ch) && (result + 1 < limit)
                        && isTrailSurrogate(source[result + 1])) {
                    result++; // step over the trail half as well
                }
                count--;
                result++;
            }
        } else {
            // Cheap early rejection: each codepoint needs at least one char.
            if (result + shift32 < start) {
                throw new StringIndexOutOfBoundsException(result);
            }
            for (count = -shift32; count > 0; count--) {
                result--;
                if (result < start) {
                    break; // ran out of chars; count stays non-zero
                }
                ch = source[result];
                if (isTrailSurrogate(ch) && result > start
                        && isLeadSurrogate(source[result - 1])) {
                    result--; // step back over the lead half as well
                }
            }
        }
        // A non-zero count means the subarray was exhausted before the
        // requested number of codepoints had been traversed.
        if (count != 0) {
            throw new StringIndexOutOfBoundsException(shift32);
        }
        result -= start;
        return result;
    }

    // private data members -------------------------------------------------

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask to retrieve the significant value from a trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Value that all lead surrogate starts with
     */
    private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    // private methods ------------------------------------------------------

    /**
     * <p>Converts argument code point and returns a String object representing
     * the code point's value in UTF16 format.</p>
     * <p>This method does not check for the validity of the codepoint, the
     * results are not guaranteed if an invalid codepoint is passed as
     * argument.</p>
     * <p>The result is a string whose length is 1 for non-supplementary code
     * points, 2 otherwise.</p>
     * @param ch code point
     * @return string representation of the code point
     */
    private static String toString(int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            return String.valueOf((char) ch);
        }

        char[] pair = {getLeadSurrogate(ch), getTrailSurrogate(ch)};
        return new String(pair);
    }
}
|