Source Code Cross Referenced for DoubleMetaphone.java in  » Library » Apache-common-codec » org » apache » commons » codec » language » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
61. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Library » Apache common codec » org.apache.commons.codec.language 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /*
0002:         * Copyright 2001-2004 The Apache Software Foundation.
0003:         * 
0004:         * Licensed under the Apache License, Version 2.0 (the "License");
0005:         * you may not use this file except in compliance with the License.
0006:         * You may obtain a copy of the License at
0007:         * 
0008:         *      http://www.apache.org/licenses/LICENSE-2.0
0009:         * 
0010:         * Unless required by applicable law or agreed to in writing, software
0011:         * distributed under the License is distributed on an "AS IS" BASIS,
0012:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0013:         * See the License for the specific language governing permissions and
0014:         * limitations under the License.
0015:         */
0016:
0017:        package org.apache.commons.codec.language;
0018:
0019:        import org.apache.commons.codec.EncoderException;
0020:        import org.apache.commons.codec.StringEncoder;
0021:
0022:        /**
0023:         * Encodes a string into a double metaphone value.
0024:         * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
0025:         * <ul>
0026:         * <li>Original Article: <a 
0027:         * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
0028:         * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
0029:         * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
0030:         * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
0031:         * </ul>
0032:         * 
0033:         * @author Apache Software Foundation
0034:         * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
0035:         */
0036:        public class DoubleMetaphone implements  StringEncoder {
0037:
/**
 * "Vowels" to test for. Note that 'Y' is counted as a vowel here.
 * Not referenced directly in the visible handlers; presumably consulted
 * by the isVowel helper -- TODO confirm.
 */
private static final String VOWELS = "AEIOUY";

/**
 * Prefixes when present which are not pronounced.
 * The encoder starts at the second character when isSilentStart reports
 * true; presumably that helper checks the input against this list --
 * TODO confirm.
 */
private static final String[] SILENT_START = { "GN", "KN", "PN",
        "WR", "PS" };
/**
 * Single-character candidates (including a space). Not referenced by any
 * handler in the visible part of the file; presumably used by one of the
 * condition helpers further down -- TODO confirm.
 */
private static final String[] L_R_N_M_B_H_F_V_W_SPACE = { "L", "R",
        "N", "M", "B", "H", "F", "V", "W", " " };
/**
 * Two-letter sequences tested immediately after a word-initial 'G'
 * (see handleG); when one matches, the 'G' is encoded as 'K' / 'J'.
 */
private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = {
        "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI",
        "ER" };
/**
 * Letters tested immediately after a 'J' (see handleJ); when the next
 * letter is one of these, the final fallback branch emits no code for
 * the 'J'.
 */
private static final String[] L_T_K_S_N_M_B_Z = { "L", "T", "K",
        "S", "N", "M", "B", "Z" };

/**
 * Maximum length of an encoding, default is 4.
 * Read through getMaxCodeLen() when a new result buffer is created for
 * each call to doubleMetaphone(String, boolean).
 */
protected int maxCodeLen = 4;
0060:
0061:            /**
0062:             * Creates an instance of this DoubleMetaphone encoder
0063:             */
0064:            public DoubleMetaphone() {
0065:                super ();
0066:            }
0067:
0068:            /**
0069:             * Encode a value with Double Metaphone
0070:             *
0071:             * @param value String to encode
0072:             * @return an encoded string
0073:             */
0074:            public String doubleMetaphone(String value) {
0075:                return doubleMetaphone(value, false);
0076:            }
0077:
0078:            /**
0079:             * Encode a value with Double Metaphone, optionally using the alternate
0080:             * encoding.
0081:             *
0082:             * @param value String to encode
0083:             * @param alternate use alternate encode
0084:             * @return an encoded string
0085:             */
0086:            public String doubleMetaphone(String value, boolean alternate) {
0087:                value = cleanInput(value);
0088:                if (value == null) {
0089:                    return null;
0090:                }
0091:
0092:                boolean slavoGermanic = isSlavoGermanic(value);
0093:                int index = isSilentStart(value) ? 1 : 0;
0094:
0095:                DoubleMetaphoneResult result = new DoubleMetaphoneResult(this 
0096:                        .getMaxCodeLen());
0097:
0098:                while (!result.isComplete() && index <= value.length() - 1) {
0099:                    switch (value.charAt(index)) {
0100:                    case 'A':
0101:                    case 'E':
0102:                    case 'I':
0103:                    case 'O':
0104:                    case 'U':
0105:                    case 'Y':
0106:                        index = handleAEIOUY(value, result, index);
0107:                        break;
0108:                    case 'B':
0109:                        result.append('P');
0110:                        index = charAt(value, index + 1) == 'B' ? index + 2
0111:                                : index + 1;
0112:                        break;
0113:                    case '\u00C7':
0114:                        // A C with a Cedilla
0115:                        result.append('S');
0116:                        index++;
0117:                        break;
0118:                    case 'C':
0119:                        index = handleC(value, result, index);
0120:                        break;
0121:                    case 'D':
0122:                        index = handleD(value, result, index);
0123:                        break;
0124:                    case 'F':
0125:                        result.append('F');
0126:                        index = charAt(value, index + 1) == 'F' ? index + 2
0127:                                : index + 1;
0128:                        break;
0129:                    case 'G':
0130:                        index = handleG(value, result, index, slavoGermanic);
0131:                        break;
0132:                    case 'H':
0133:                        index = handleH(value, result, index);
0134:                        break;
0135:                    case 'J':
0136:                        index = handleJ(value, result, index, slavoGermanic);
0137:                        break;
0138:                    case 'K':
0139:                        result.append('K');
0140:                        index = charAt(value, index + 1) == 'K' ? index + 2
0141:                                : index + 1;
0142:                        break;
0143:                    case 'L':
0144:                        index = handleL(value, result, index);
0145:                        break;
0146:                    case 'M':
0147:                        result.append('M');
0148:                        index = conditionM0(value, index) ? index + 2
0149:                                : index + 1;
0150:                        break;
0151:                    case 'N':
0152:                        result.append('N');
0153:                        index = charAt(value, index + 1) == 'N' ? index + 2
0154:                                : index + 1;
0155:                        break;
0156:                    case '\u00D1':
0157:                        // N with a tilde (spanish ene)
0158:                        result.append('N');
0159:                        index++;
0160:                        break;
0161:                    case 'P':
0162:                        index = handleP(value, result, index);
0163:                        break;
0164:                    case 'Q':
0165:                        result.append('K');
0166:                        index = charAt(value, index + 1) == 'Q' ? index + 2
0167:                                : index + 1;
0168:                        break;
0169:                    case 'R':
0170:                        index = handleR(value, result, index, slavoGermanic);
0171:                        break;
0172:                    case 'S':
0173:                        index = handleS(value, result, index, slavoGermanic);
0174:                        break;
0175:                    case 'T':
0176:                        index = handleT(value, result, index);
0177:                        break;
0178:                    case 'V':
0179:                        result.append('F');
0180:                        index = charAt(value, index + 1) == 'V' ? index + 2
0181:                                : index + 1;
0182:                        break;
0183:                    case 'W':
0184:                        index = handleW(value, result, index);
0185:                        break;
0186:                    case 'X':
0187:                        index = handleX(value, result, index);
0188:                        break;
0189:                    case 'Z':
0190:                        index = handleZ(value, result, index, slavoGermanic);
0191:                        break;
0192:                    default:
0193:                        index++;
0194:                        break;
0195:                    }
0196:                }
0197:
0198:                return alternate ? result.getAlternate() : result.getPrimary();
0199:            }
0200:
0201:            /**
0202:             * Encode the value using DoubleMetaphone.  It will only work if 
0203:             * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
0204:             *
0205:             * @param obj Object to encode (should be of type String)
0206:             * @return An encoded Object (will be of type String)
0207:             * @throws EncoderException encode parameter is not of type String
0208:             */
0209:            public Object encode(Object obj) throws EncoderException {
0210:                if (!(obj instanceof  String)) {
0211:                    throw new EncoderException(
0212:                            "DoubleMetaphone encode parameter is not of type String");
0213:                }
0214:                return doubleMetaphone((String) obj);
0215:            }
0216:
0217:            /**
0218:             * Encode the value using DoubleMetaphone.
0219:             *
0220:             * @param value String to encode
0221:             * @return An encoded String
0222:             */
0223:            public String encode(String value) {
0224:                return doubleMetaphone(value);
0225:            }
0226:
0227:            /**
0228:             * Check if the Double Metaphone values of two <code>String</code> values
0229:             * are equal.
0230:             * 
0231:             * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
0232:             * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
0233:             * @return <code>true</code> if the encoded <code>String</code>s are equal;
0234:             *          <code>false</code> otherwise.
0235:             * @see #isDoubleMetaphoneEqual(String,String,boolean)
0236:             */
0237:            public boolean isDoubleMetaphoneEqual(String value1, String value2) {
0238:                return isDoubleMetaphoneEqual(value1, value2, false);
0239:            }
0240:
0241:            /**
0242:             * Check if the Double Metaphone values of two <code>String</code> values
0243:             * are equal, optionally using the alternate value.
0244:             * 
0245:             * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
0246:             * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
0247:             * @param alternate use the alternate value if <code>true</code>.
0248:             * @return <code>true</code> if the encoded <code>String</code>s are equal;
0249:             *          <code>false</code> otherwise.
0250:             */
0251:            public boolean isDoubleMetaphoneEqual(String value1, String value2,
0252:                    boolean alternate) {
0253:                return doubleMetaphone(value1, alternate).equals(
0254:                        doubleMetaphone(value2, alternate));
0255:            }
0256:
0257:            /**
0258:             * Returns the maxCodeLen.
0259:             * @return int
0260:             */
0261:            public int getMaxCodeLen() {
0262:                return this .maxCodeLen;
0263:            }
0264:
0265:            /**
0266:             * Sets the maxCodeLen.
0267:             * @param maxCodeLen The maxCodeLen to set
0268:             */
0269:            public void setMaxCodeLen(int maxCodeLen) {
0270:                this .maxCodeLen = maxCodeLen;
0271:            }
0272:
0273:            //-- BEGIN HANDLERS --//
0274:
0275:            /**
0276:             * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
0277:             */
0278:            private int handleAEIOUY(String value,
0279:                    DoubleMetaphoneResult result, int index) {
0280:                if (index == 0) {
0281:                    result.append('A');
0282:                }
0283:                return index + 1;
0284:            }
0285:
0286:            /**
0287:             * Handles 'C' cases
0288:             */
0289:            private int handleC(String value, DoubleMetaphoneResult result,
0290:                    int index) {
0291:                if (conditionC0(value, index)) { // very confusing, moved out
0292:                    result.append('K');
0293:                    index += 2;
0294:                } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
0295:                    result.append('S');
0296:                    index += 2;
0297:                } else if (contains(value, index, 2, "CH")) {
0298:                    index = handleCH(value, result, index);
0299:                } else if (contains(value, index, 2, "CZ")
0300:                        && !contains(value, index - 2, 4, "WICZ")) {
0301:                    //-- "Czerny" --//
0302:                    result.append('S', 'X');
0303:                    index += 2;
0304:                } else if (contains(value, index + 1, 3, "CIA")) {
0305:                    //-- "focaccia" --//
0306:                    result.append('X');
0307:                    index += 3;
0308:                } else if (contains(value, index, 2, "CC")
0309:                        && !(index == 1 && charAt(value, 0) == 'M')) {
0310:                    //-- double "cc" but not "McClelland" --//
0311:                    return handleCC(value, result, index);
0312:                } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
0313:                    result.append('K');
0314:                    index += 2;
0315:                } else if (contains(value, index, 2, "CI", "CE", "CY")) {
0316:                    //-- Italian vs. English --//
0317:                    if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
0318:                        result.append('S', 'X');
0319:                    } else {
0320:                        result.append('S');
0321:                    }
0322:                    index += 2;
0323:                } else {
0324:                    result.append('K');
0325:                    if (contains(value, index + 1, 2, " C", " Q", " G")) {
0326:                        //-- Mac Caffrey, Mac Gregor --//
0327:                        index += 3;
0328:                    } else if (contains(value, index + 1, 1, "C", "K", "Q")
0329:                            && !contains(value, index + 1, 2, "CE", "CI")) {
0330:                        index += 2;
0331:                    } else {
0332:                        index++;
0333:                    }
0334:                }
0335:
0336:                return index;
0337:            }
0338:
0339:            /**
0340:             * Handles 'CC' cases
0341:             */
0342:            private int handleCC(String value, DoubleMetaphoneResult result,
0343:                    int index) {
0344:                if (contains(value, index + 2, 1, "I", "E", "H")
0345:                        && !contains(value, index + 2, 2, "HU")) {
0346:                    //-- "bellocchio" but not "bacchus" --//
0347:                    if ((index == 1 && charAt(value, index - 1) == 'A')
0348:                            || contains(value, index - 1, 5, "UCCEE", "UCCES")) {
0349:                        //-- "accident", "accede", "succeed" --//
0350:                        result.append("KS");
0351:                    } else {
0352:                        //-- "bacci", "bertucci", other Italian --//
0353:                        result.append('X');
0354:                    }
0355:                    index += 3;
0356:                } else { // Pierce's rule
0357:                    result.append('K');
0358:                    index += 2;
0359:                }
0360:
0361:                return index;
0362:            }
0363:
0364:            /**
0365:             * Handles 'CH' cases
0366:             */
0367:            private int handleCH(String value, DoubleMetaphoneResult result,
0368:                    int index) {
0369:                if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael
0370:                    result.append('K', 'X');
0371:                    return index + 2;
0372:                } else if (conditionCH0(value, index)) {
0373:                    //-- Greek roots ("chemistry", "chorus", etc.) --//
0374:                    result.append('K');
0375:                    return index + 2;
0376:                } else if (conditionCH1(value, index)) {
0377:                    //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
0378:                    result.append('K');
0379:                    return index + 2;
0380:                } else {
0381:                    if (index > 0) {
0382:                        if (contains(value, 0, 2, "MC")) {
0383:                            result.append('K');
0384:                        } else {
0385:                            result.append('X', 'K');
0386:                        }
0387:                    } else {
0388:                        result.append('X');
0389:                    }
0390:                    return index + 2;
0391:                }
0392:            }
0393:
0394:            /**
0395:             * Handles 'D' cases
0396:             */
0397:            private int handleD(String value, DoubleMetaphoneResult result,
0398:                    int index) {
0399:                if (contains(value, index, 2, "DG")) {
0400:                    //-- "Edge" --//
0401:                    if (contains(value, index + 2, 1, "I", "E", "Y")) {
0402:                        result.append('J');
0403:                        index += 3;
0404:                        //-- "Edgar" --//
0405:                    } else {
0406:                        result.append("TK");
0407:                        index += 2;
0408:                    }
0409:                } else if (contains(value, index, 2, "DT", "DD")) {
0410:                    result.append('T');
0411:                    index += 2;
0412:                } else {
0413:                    result.append('T');
0414:                    index++;
0415:                }
0416:                return index;
0417:            }
0418:
0419:            /**
0420:             * Handles 'G' cases
0421:             */
0422:            private int handleG(String value, DoubleMetaphoneResult result,
0423:                    int index, boolean slavoGermanic) {
0424:                if (charAt(value, index + 1) == 'H') {
0425:                    index = handleGH(value, result, index);
0426:                } else if (charAt(value, index + 1) == 'N') {
0427:                    if (index == 1 && isVowel(charAt(value, 0))
0428:                            && !slavoGermanic) {
0429:                        result.append("KN", "N");
0430:                    } else if (!contains(value, index + 2, 2, "EY")
0431:                            && charAt(value, index + 1) != 'Y'
0432:                            && !slavoGermanic) {
0433:                        result.append("N", "KN");
0434:                    } else {
0435:                        result.append("KN");
0436:                    }
0437:                    index = index + 2;
0438:                } else if (contains(value, index + 1, 2, "LI")
0439:                        && !slavoGermanic) {
0440:                    result.append("KL", "L");
0441:                    index += 2;
0442:                } else if (index == 0
0443:                        && (charAt(value, index + 1) == 'Y' || contains(value,
0444:                                index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
0445:                    //-- -ges-, -gep-, -gel-, -gie- at beginning --//
0446:                    result.append('K', 'J');
0447:                    index += 2;
0448:                } else if ((contains(value, index + 1, 2, "ER") || charAt(
0449:                        value, index + 1) == 'Y')
0450:                        && !contains(value, 0, 6, "DANGER", "RANGER", "MANGER")
0451:                        && !contains(value, index - 1, 1, "E", "I")
0452:                        && !contains(value, index - 1, 3, "RGY", "OGY")) {
0453:                    //-- -ger-, -gy- --//
0454:                    result.append('K', 'J');
0455:                    index += 2;
0456:                } else if (contains(value, index + 1, 1, "E", "I", "Y")
0457:                        || contains(value, index - 1, 4, "AGGI", "OGGI")) {
0458:                    //-- Italian "biaggi" --//
0459:                    if ((contains(value, 0, 4, "VAN ", "VON ") || contains(
0460:                            value, 0, 3, "SCH"))
0461:                            || contains(value, index + 1, 2, "ET")) {
0462:                        //-- obvious germanic --//
0463:                        result.append('K');
0464:                    } else if (contains(value, index + 1, 4, "IER")) {
0465:                        result.append('J');
0466:                    } else {
0467:                        result.append('J', 'K');
0468:                    }
0469:                    index += 2;
0470:                } else if (charAt(value, index + 1) == 'G') {
0471:                    index += 2;
0472:                    result.append('K');
0473:                } else {
0474:                    index++;
0475:                    result.append('K');
0476:                }
0477:                return index;
0478:            }
0479:
0480:            /**
0481:             * Handles 'GH' cases
0482:             */
0483:            private int handleGH(String value, DoubleMetaphoneResult result,
0484:                    int index) {
0485:                if (index > 0 && !isVowel(charAt(value, index - 1))) {
0486:                    result.append('K');
0487:                    index += 2;
0488:                } else if (index == 0) {
0489:                    if (charAt(value, index + 2) == 'I') {
0490:                        result.append('J');
0491:                    } else {
0492:                        result.append('K');
0493:                    }
0494:                    index += 2;
0495:                } else if ((index > 1 && contains(value, index - 2, 1, "B",
0496:                        "H", "D"))
0497:                        || (index > 2 && contains(value, index - 3, 1, "B",
0498:                                "H", "D"))
0499:                        || (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
0500:                    //-- Parker's rule (with some further refinements) - "hugh"
0501:                    index += 2;
0502:                } else {
0503:                    if (index > 2
0504:                            && charAt(value, index - 1) == 'U'
0505:                            && contains(value, index - 3, 1, "C", "G", "L",
0506:                                    "R", "T")) {
0507:                        //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
0508:                        result.append('F');
0509:                    } else if (index > 0 && charAt(value, index - 1) != 'I') {
0510:                        result.append('K');
0511:                    }
0512:                    index += 2;
0513:                }
0514:                return index;
0515:            }
0516:
0517:            /**
0518:             * Handles 'H' cases
0519:             */
0520:            private int handleH(String value, DoubleMetaphoneResult result,
0521:                    int index) {
0522:                //-- only keep if first & before vowel or between 2 vowels --//
0523:                if ((index == 0 || isVowel(charAt(value, index - 1)))
0524:                        && isVowel(charAt(value, index + 1))) {
0525:                    result.append('H');
0526:                    index += 2;
0527:                    //-- also takes car of "HH" --//
0528:                } else {
0529:                    index++;
0530:                }
0531:                return index;
0532:            }
0533:
/**
 * Handles 'J' cases.
 * Two families of rules: the "obviously Spanish" context (the value has
 * "JOSE" at the current position, or starts with "SAN "), and everything
 * else. Two-argument append calls presumably take the primary code first
 * and the alternate code second -- TODO confirm against
 * DoubleMetaphoneResult.
 *
 * @param value the value being encoded
 * @param result accumulator for the encoding
 * @param index position of the 'J' in value
 * @param slavoGermanic whether the value was classified as Slavo-Germanic
 * @return the index of the next character to process
 */
private int handleJ(String value, DoubleMetaphoneResult result,
        int index, boolean slavoGermanic) {
    if (contains(value, index, 4, "JOSE")
            || contains(value, 0, 4, "SAN ")) {
        //-- obvious Spanish, "Jose", "San Jacinto" --//
        // Precedence note: && binds tighter than ||, so the first operand
        // reads as (index == 0 && next-after-"JOSE" is a space)
        // || value.length() == 4.
        if ((index == 0 && (charAt(value, index + 4) == ' ') || value
                .length() == 4)
                || contains(value, 0, 4, "SAN ")) {
            result.append('H');
        } else {
            result.append('J', 'H');
        }
        // Only the 'J' itself is consumed in this context.
        index++;
    } else {
        // The JOSE test is always true here: the outer condition already
        // failed for this position.
        if (index == 0 && !contains(value, index, 4, "JOSE")) {
            result.append('J', 'A');
        } else if (isVowel(charAt(value, index - 1))
                && !slavoGermanic
                && (charAt(value, index + 1) == 'A' || charAt(
                        value, index + 1) == 'O')) {
            // Between a vowel and 'A'/'O' in a non-Slavo-Germanic value.
            result.append('J', 'H');
        } else if (index == value.length() - 1) {
            // 'J' is the last character.
            // NOTE(review): this passes a literal space as the second
            // code; confirm whether the result class drops it or whether
            // a space can end up in the alternate encoding.
            result.append('J', ' ');
        } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z)
                && !contains(value, index - 1, 1, "S", "K", "L")) {
            result.append('J');
        }
        // If none of the branches above matched, nothing is appended for
        // this 'J'.

        // A doubled 'J' is consumed as one.
        if (charAt(value, index + 1) == 'J') {
            index += 2;
        } else {
            index++;
        }
    }
    return index;
}
0573:
0574:            /**
0575:             * Handles 'L' cases
0576:             */
0577:            private int handleL(String value, DoubleMetaphoneResult result,
0578:                    int index) {
0579:                result.append('L');
0580:                if (charAt(value, index + 1) == 'L') {
0581:                    if (conditionL0(value, index)) {
0582:                        result.appendAlternate(' ');
0583:                    }
0584:                    index += 2;
0585:                } else {
0586:                    index++;
0587:                }
0588:                return index;
0589:            }
0590:
0591:            /**
0592:             * Handles 'P' cases
0593:             */
0594:            private int handleP(String value, DoubleMetaphoneResult result,
0595:                    int index) {
0596:                if (charAt(value, index + 1) == 'H') {
0597:                    result.append('F');
0598:                    index += 2;
0599:                } else {
0600:                    result.append('P');
0601:                    index = contains(value, index + 1, 1, "P", "B") ? index + 2
0602:                            : index + 1;
0603:                }
0604:                return index;
0605:            }
0606:
0607:            /**
0608:             * Handles 'R' cases
0609:             */
0610:            private int handleR(String value, DoubleMetaphoneResult result,
0611:                    int index, boolean slavoGermanic) {
0612:                if (index == value.length() - 1 && !slavoGermanic
0613:                        && contains(value, index - 2, 2, "IE")
0614:                        && !contains(value, index - 4, 2, "ME", "MA")) {
0615:                    result.appendAlternate('R');
0616:                } else {
0617:                    result.append('R');
0618:                }
0619:                return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
0620:            }
0621:
0622:            /**
0623:             * Handles 'S' cases
0624:             */
0625:            private int handleS(String value, DoubleMetaphoneResult result,
0626:                    int index, boolean slavoGermanic) {
0627:                if (contains(value, index - 1, 3, "ISL", "YSL")) {
0628:                    //-- special cases "island", "isle", "carlisle", "carlysle" --//
0629:                    index++;
0630:                } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
0631:                    //-- special case "sugar-" --//
0632:                    result.append('X', 'S');
0633:                    index++;
0634:                } else if (contains(value, index, 2, "SH")) {
0635:                    if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM",
0636:                            "HOLZ")) {
0637:                        //-- germanic --//
0638:                        result.append('S');
0639:                    } else {
0640:                        result.append('X');
0641:                    }
0642:                    index += 2;
0643:                } else if (contains(value, index, 3, "SIO", "SIA")
0644:                        || contains(value, index, 4, "SIAN")) {
0645:                    //-- Italian and Armenian --//
0646:                    if (slavoGermanic) {
0647:                        result.append('S');
0648:                    } else {
0649:                        result.append('S', 'X');
0650:                    }
0651:                    index += 3;
0652:                } else if ((index == 0 && contains(value, index + 1, 1, "M",
0653:                        "N", "L", "W"))
0654:                        || contains(value, index + 1, 1, "Z")) {
0655:                    //-- german & anglicisations, e.g. "smith" match "schmidt" //
0656:                    // "snider" match "schneider" --//
0657:                    //-- also, -sz- in slavic language altho in hungarian it //
0658:                    //   is pronounced "s" --//
0659:                    result.append('S', 'X');
0660:                    index = contains(value, index + 1, 1, "Z") ? index + 2
0661:                            : index + 1;
0662:                } else if (contains(value, index, 2, "SC")) {
0663:                    index = handleSC(value, result, index);
0664:                } else {
0665:                    if (index == value.length() - 1
0666:                            && contains(value, index - 2, 2, "AI", "OI")) {
0667:                        //-- french e.g. "resnais", "artois" --//
0668:                        result.appendAlternate('S');
0669:                    } else {
0670:                        result.append('S');
0671:                    }
0672:                    index = contains(value, index + 1, 1, "S", "Z") ? index + 2
0673:                            : index + 1;
0674:                }
0675:                return index;
0676:            }
0677:
0678:            /**
0679:             * Handles 'SC' cases
0680:             */
0681:            private int handleSC(String value, DoubleMetaphoneResult result,
0682:                    int index) {
0683:                if (charAt(value, index + 2) == 'H') {
0684:                    //-- Schlesinger's rule --//
0685:                    if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY",
0686:                            "ED", "EM")) {
0687:                        //-- Dutch origin, e.g. "school", "schooner" --//
0688:                        if (contains(value, index + 3, 2, "ER", "EN")) {
0689:                            //-- "schermerhorn", "schenker" --//
0690:                            result.append("X", "SK");
0691:                        } else {
0692:                            result.append("SK");
0693:                        }
0694:                    } else {
0695:                        if (index == 0 && !isVowel(charAt(value, 3))
0696:                                && charAt(value, 3) != 'W') {
0697:                            result.append('X', 'S');
0698:                        } else {
0699:                            result.append('X');
0700:                        }
0701:                    }
0702:                } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
0703:                    result.append('S');
0704:                } else {
0705:                    result.append("SK");
0706:                }
0707:                return index + 3;
0708:            }
0709:
0710:            /**
0711:             * Handles 'T' cases
0712:             */
0713:            private int handleT(String value, DoubleMetaphoneResult result,
0714:                    int index) {
0715:                if (contains(value, index, 4, "TION")) {
0716:                    result.append('X');
0717:                    index += 3;
0718:                } else if (contains(value, index, 3, "TIA", "TCH")) {
0719:                    result.append('X');
0720:                    index += 3;
0721:                } else if (contains(value, index, 2, "TH")
0722:                        || contains(value, index, 3, "TTH")) {
0723:                    if (contains(value, index + 2, 2, "OM", "AM")
0724:                            ||
0725:                            //-- special case "thomas", "thames" or germanic --//
0726:                            contains(value, 0, 4, "VAN ", "VON ")
0727:                            || contains(value, 0, 3, "SCH")) {
0728:                        result.append('T');
0729:                    } else {
0730:                        result.append('0', 'T');
0731:                    }
0732:                    index += 2;
0733:                } else {
0734:                    result.append('T');
0735:                    index = contains(value, index + 1, 1, "T", "D") ? index + 2
0736:                            : index + 1;
0737:                }
0738:                return index;
0739:            }
0740:
0741:            /**
0742:             * Handles 'W' cases
0743:             */
0744:            private int handleW(String value, DoubleMetaphoneResult result,
0745:                    int index) {
0746:                if (contains(value, index, 2, "WR")) {
0747:                    //-- can also be in middle of word --//
0748:                    result.append('R');
0749:                    index += 2;
0750:                } else {
0751:                    if (index == 0
0752:                            && (isVowel(charAt(value, index + 1)) || contains(
0753:                                    value, index, 2, "WH"))) {
0754:                        if (isVowel(charAt(value, index + 1))) {
0755:                            //-- Wasserman should match Vasserman --//
0756:                            result.append('A', 'F');
0757:                        } else {
0758:                            //-- need Uomo to match Womo --//
0759:                            result.append('A');
0760:                        }
0761:                        index++;
0762:                    } else if ((index == value.length() - 1 && isVowel(charAt(
0763:                            value, index - 1)))
0764:                            || contains(value, index - 1, 5, "EWSKI", "EWSKY",
0765:                                    "OWSKI", "OWSKY")
0766:                            || contains(value, 0, 3, "SCH")) {
0767:                        //-- Arnow should match Arnoff --//
0768:                        result.appendAlternate('F');
0769:                        index++;
0770:                    } else if (contains(value, index, 4, "WICZ", "WITZ")) {
0771:                        //-- Polish e.g. "filipowicz" --//
0772:                        result.append("TS", "FX");
0773:                        index += 4;
0774:                    } else {
0775:                        index++;
0776:                    }
0777:                }
0778:                return index;
0779:            }
0780:
0781:            /**
0782:             * Handles 'X' cases
0783:             */
0784:            private int handleX(String value, DoubleMetaphoneResult result,
0785:                    int index) {
0786:                if (index == 0) {
0787:                    result.append('S');
0788:                    index++;
0789:                } else {
0790:                    if (!((index == value.length() - 1) && (contains(value,
0791:                            index - 3, 3, "IAU", "EAU") || contains(value,
0792:                            index - 2, 2, "AU", "OU")))) {
0793:                        //-- French e.g. breaux --//
0794:                        result.append("KS");
0795:                    }
0796:                    index = contains(value, index + 1, 1, "C", "X") ? index + 2
0797:                            : index + 1;
0798:                }
0799:                return index;
0800:            }
0801:
0802:            /**
0803:             * Handles 'Z' cases
0804:             */
0805:            private int handleZ(String value, DoubleMetaphoneResult result,
0806:                    int index, boolean slavoGermanic) {
0807:                if (charAt(value, index + 1) == 'H') {
0808:                    //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
0809:                    result.append('J');
0810:                    index += 2;
0811:                } else {
0812:                    if (contains(value, index + 1, 2, "ZO", "ZI", "ZA")
0813:                            || (slavoGermanic && (index > 0 && charAt(value,
0814:                                    index - 1) != 'T'))) {
0815:                        result.append("S", "TS");
0816:                    } else {
0817:                        result.append('S');
0818:                    }
0819:                    index = charAt(value, index + 1) == 'Z' ? index + 2
0820:                            : index + 1;
0821:                }
0822:                return index;
0823:            }
0824:
0825:            //-- BEGIN CONDITIONS --//
0826:
0827:            /**
0828:             * Complex condition 0 for 'C'
0829:             */
0830:            private boolean conditionC0(String value, int index) {
0831:                if (contains(value, index, 4, "CHIA")) {
0832:                    return true;
0833:                } else if (index <= 1) {
0834:                    return false;
0835:                } else if (isVowel(charAt(value, index - 2))) {
0836:                    return false;
0837:                } else if (!contains(value, index - 1, 3, "ACH")) {
0838:                    return false;
0839:                } else {
0840:                    char c = charAt(value, index + 2);
0841:                    return (c != 'I' && c != 'E')
0842:                            || contains(value, index - 2, 6, "BACHER", "MACHER");
0843:                }
0844:            }
0845:
0846:            /**
0847:             * Complex condition 0 for 'CH'
0848:             */
0849:            private boolean conditionCH0(String value, int index) {
0850:                if (index != 0) {
0851:                    return false;
0852:                } else if (!contains(value, index + 1, 5, "HARAC", "HARIS")
0853:                        && !contains(value, index + 1, 3, "HOR", "HYM", "HIA",
0854:                                "HEM")) {
0855:                    return false;
0856:                } else if (contains(value, 0, 5, "CHORE")) {
0857:                    return false;
0858:                } else {
0859:                    return true;
0860:                }
0861:            }
0862:
0863:            /**
0864:             * Complex condition 1 for 'CH'
0865:             */
0866:            private boolean conditionCH1(String value, int index) {
0867:                return ((contains(value, 0, 4, "VAN ", "VON ") || contains(
0868:                        value, 0, 3, "SCH"))
0869:                        || contains(value, index - 2, 6, "ORCHES", "ARCHIT",
0870:                                "ORCHID")
0871:                        || contains(value, index + 2, 1, "T", "S") || ((contains(
0872:                        value, index - 1, 1, "A", "O", "U", "E") || index == 0) && (contains(
0873:                        value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value
0874:                        .length() - 1)));
0875:            }
0876:
0877:            /**
0878:             * Complex condition 0 for 'L'
0879:             */
0880:            private boolean conditionL0(String value, int index) {
0881:                if (index == value.length() - 3
0882:                        && contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
0883:                    return true;
0884:                } else if ((contains(value, index - 1, 2, "AS", "OS") || contains(
0885:                        value, value.length() - 1, 1, "A", "O"))
0886:                        && contains(value, index - 1, 4, "ALLE")) {
0887:                    return true;
0888:                } else {
0889:                    return false;
0890:                }
0891:            }
0892:
0893:            /**
0894:             * Complex condition 0 for 'M'
0895:             */
0896:            private boolean conditionM0(String value, int index) {
0897:                if (charAt(value, index + 1) == 'M') {
0898:                    return true;
0899:                }
0900:                return contains(value, index - 1, 3, "UMB")
0901:                        && ((index + 1) == value.length() - 1 || contains(
0902:                                value, index + 2, 2, "ER"));
0903:            }
0904:
0905:            //-- BEGIN HELPER FUNCTIONS --//
0906:
0907:            /**
0908:             * Determines whether or not a value is of slavo-germanic orgin. A value is
0909:             * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
0910:             */
0911:            private boolean isSlavoGermanic(String value) {
0912:                return value.indexOf('W') > -1 || value.indexOf('K') > -1
0913:                        || value.indexOf("CZ") > -1
0914:                        || value.indexOf("WITZ") > -1;
0915:            }
0916:
0917:            /**
0918:             * Determines whether or not a character is a vowel or not
0919:             */
0920:            private boolean isVowel(char ch) {
0921:                return VOWELS.indexOf(ch) != -1;
0922:            }
0923:
0924:            /**
0925:             * Determines whether or not the value starts with a silent letter.  It will
0926:             * return <code>true</code> if the value starts with any of 'GN', 'KN',
0927:             * 'PN', 'WR' or 'PS'.
0928:             */
0929:            private boolean isSilentStart(String value) {
0930:                boolean result = false;
0931:                for (int i = 0; i < SILENT_START.length; i++) {
0932:                    if (value.startsWith(SILENT_START[i])) {
0933:                        result = true;
0934:                        break;
0935:                    }
0936:                }
0937:                return result;
0938:            }
0939:
0940:            /**
0941:             * Cleans the input
0942:             */
0943:            private String cleanInput(String input) {
0944:                if (input == null) {
0945:                    return null;
0946:                }
0947:                input = input.trim();
0948:                if (input.length() == 0) {
0949:                    return null;
0950:                }
0951:                return input.toUpperCase();
0952:            }
0953:
0954:            /**
0955:             * Gets the character at index <code>index</code> if available, otherwise
0956:             * it returns <code>Character.MIN_VALUE</code> so that there is some sort
0957:             * of a default
0958:             */
0959:            protected char charAt(String value, int index) {
0960:                if (index < 0 || index >= value.length()) {
0961:                    return Character.MIN_VALUE;
0962:                }
0963:                return value.charAt(index);
0964:            }
0965:
0966:            /**
0967:             * Shortcut method with 1 criteria
0968:             */
0969:            private static boolean contains(String value, int start,
0970:                    int length, String criteria) {
0971:                return contains(value, start, length, new String[] { criteria });
0972:            }
0973:
0974:            /**
0975:             * Shortcut method with 2 criteria
0976:             */
0977:            private static boolean contains(String value, int start,
0978:                    int length, String criteria1, String criteria2) {
0979:                return contains(value, start, length, new String[] { criteria1,
0980:                        criteria2 });
0981:            }
0982:
0983:            /**
0984:             * Shortcut method with 3 criteria
0985:             */
0986:            private static boolean contains(String value, int start,
0987:                    int length, String criteria1, String criteria2,
0988:                    String criteria3) {
0989:                return contains(value, start, length, new String[] { criteria1,
0990:                        criteria2, criteria3 });
0991:            }
0992:
0993:            /**
0994:             * Shortcut method with 4 criteria
0995:             */
0996:            private static boolean contains(String value, int start,
0997:                    int length, String criteria1, String criteria2,
0998:                    String criteria3, String criteria4) {
0999:                return contains(value, start, length, new String[] { criteria1,
1000:                        criteria2, criteria3, criteria4 });
1001:            }
1002:
1003:            /**
1004:             * Shortcut method with 5 criteria
1005:             */
1006:            private static boolean contains(String value, int start,
1007:                    int length, String criteria1, String criteria2,
1008:                    String criteria3, String criteria4, String criteria5) {
1009:                return contains(value, start, length, new String[] { criteria1,
1010:                        criteria2, criteria3, criteria4, criteria5 });
1011:            }
1012:
1013:            /**
1014:             * Shortcut method with 6 criteria
1015:             */
1016:            private static boolean contains(String value, int start,
1017:                    int length, String criteria1, String criteria2,
1018:                    String criteria3, String criteria4, String criteria5,
1019:                    String criteria6) {
1020:                return contains(value, start, length, new String[] { criteria1,
1021:                        criteria2, criteria3, criteria4, criteria5, criteria6 });
1022:            }
1023:
1024:            /**
1025:             * Determines whether <code>value</code> contains any of the criteria 
1026:             starting
1027:             * at index <code>start</code> and matching up to length <code>length</code>
1028:             */
1029:            protected static boolean contains(String value, int start,
1030:                    int length, String[] criteria) {
1031:                boolean result = false;
1032:                if (start >= 0 && start + length <= value.length()) {
1033:                    String target = value.substring(start, start + length);
1034:
1035:                    for (int i = 0; i < criteria.length; i++) {
1036:                        if (target.equals(criteria[i])) {
1037:                            result = true;
1038:                            break;
1039:                        }
1040:                    }
1041:                }
1042:                return result;
1043:            }
1044:
1045:            //-- BEGIN INNER CLASSES --//
1046:
1047:            /**
1048:             * Inner class for storing results, since there is the optional alternate
1049:             * encoding.
1050:             */
1051:            public class DoubleMetaphoneResult {
1052:
1053:                private StringBuffer primary = new StringBuffer(getMaxCodeLen());
1054:                private StringBuffer alternate = new StringBuffer(
1055:                        getMaxCodeLen());
1056:                private int maxLength;
1057:
1058:                public DoubleMetaphoneResult(int maxLength) {
1059:                    this .maxLength = maxLength;
1060:                }
1061:
1062:                public void append(char value) {
1063:                    appendPrimary(value);
1064:                    appendAlternate(value);
1065:                }
1066:
1067:                public void append(char primary, char alternate) {
1068:                    appendPrimary(primary);
1069:                    appendAlternate(alternate);
1070:                }
1071:
1072:                public void appendPrimary(char value) {
1073:                    if (this .primary.length() < this .maxLength) {
1074:                        this .primary.append(value);
1075:                    }
1076:                }
1077:
1078:                public void appendAlternate(char value) {
1079:                    if (this .alternate.length() < this .maxLength) {
1080:                        this .alternate.append(value);
1081:                    }
1082:                }
1083:
1084:                public void append(String value) {
1085:                    appendPrimary(value);
1086:                    appendAlternate(value);
1087:                }
1088:
1089:                public void append(String primary, String alternate) {
1090:                    appendPrimary(primary);
1091:                    appendAlternate(alternate);
1092:                }
1093:
1094:                public void appendPrimary(String value) {
1095:                    int addChars = this .maxLength - this .primary.length();
1096:                    if (value.length() <= addChars) {
1097:                        this .primary.append(value);
1098:                    } else {
1099:                        this .primary.append(value.substring(0, addChars));
1100:                    }
1101:                }
1102:
1103:                public void appendAlternate(String value) {
1104:                    int addChars = this .maxLength - this .alternate.length();
1105:                    if (value.length() <= addChars) {
1106:                        this .alternate.append(value);
1107:                    } else {
1108:                        this .alternate.append(value.substring(0, addChars));
1109:                    }
1110:                }
1111:
1112:                public String getPrimary() {
1113:                    return this .primary.toString();
1114:                }
1115:
1116:                public String getAlternate() {
1117:                    return this .alternate.toString();
1118:                }
1119:
1120:                public boolean isComplete() {
1121:                    return this.primary.length() >= this.maxLength
1122:                            && this.alternate.length() >= this.maxLength;
1123:                }
1124:            }
1125:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.