Source Code Cross Referenced for yacyURL.java in  » Search-Engine » yacy » de » anomic » yacy » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » yacy » de.anomic.yacy 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        // yacyURL.java
0002:        // (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
0003:        // first published 13.07.2006 on http://yacy.net
0004:        //
0005:        // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
0006:        // $LastChangedRevision: 1986 $
0007:        // $LastChangedBy: orbiter $
0008:        //
0009:        // LICENSE
0010:        // 
0011:        // This program is free software; you can redistribute it and/or modify
0012:        // it under the terms of the GNU General Public License as published by
0013:        // the Free Software Foundation; either version 2 of the License, or
0014:        // (at your option) any later version.
0015:        //
0016:        // This program is distributed in the hope that it will be useful,
0017:        // but WITHOUT ANY WARRANTY; without even the implied warranty of
0018:        // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0019:        // GNU General Public License for more details.
0020:        //
0021:        // You should have received a copy of the GNU General Public License
0022:        // along with this program; if not, write to the Free Software
0023:        // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
0024:
0025:        package de.anomic.yacy;
0026:
0027:        // this class exists to provide a system-wide normal form representation of URLs,
0028:        // and to avoid the DNS queries that java.net.URL usage would otherwise trigger inside java.net.
0029:
0030:        import java.io.File;
0031:        import java.net.MalformedURLException;
0032:        import java.util.HashMap;
0033:        import java.util.Iterator;
0034:        import java.util.TreeSet;
0035:        import java.util.regex.Matcher;
0036:        import java.util.regex.Pattern;
0037:
0038:        import de.anomic.kelondro.kelondroBase64Order;
0039:        import de.anomic.server.serverCodings;
0040:        import de.anomic.server.serverDomains;
0041:
0042:        public class yacyURL {
0043:
0044:            // TLD separation in political and cultural parts
0045:            // https://www.cia.gov/cia/publications/factbook/index.html
0046:            // http://en.wikipedia.org/wiki/List_of_countries_by_continent
0047:
0048:            private static final String[] TLD_NorthAmericaOceania = { // "TLD=Name" entries; registered under region id 4 by the static initializer
0049:                    // primarily English-speaking countries
0050:                    // English-speaking countries from Central America are also included
0051:                    // also includes Dutch and French colonies in the Caribbean Sea
0052:                    // and US/English/Australian military bases in Asia
0053:                    "EDU=US Educational",
0054:                    "GOV=US Government",
0055:                    "MIL=US Military",
0056:                    "NET=Network",
0057:                    "ORG=Non-Profit Organization",
0058:                    "AN=Netherlands Antilles",
0059:                    "AS=American Samoa",
0060:                    "AG=Antigua and Barbuda",
0061:                    "AI=Anguilla",
0062:                    "AU=Australia",
0063:                    "BB=Barbados",
0064:                    "BZ=Belize",
0065:                    "BM=Bermuda",
0066:                    "BS=Bahamas",
0067:                    "CA=Canada",
0068:                    "CC=Cocos (Keeling) Islands",
0069:                    "CK=Cook Islands",
0070:                    "CX=Christmas Island", // located in the Indian Ocean, but belongs to Australia
0071:                    "DM=Dominica",
0072:                    "FM=Micronesia",
0073:                    "FJ=Fiji",
0074:                    "GD=Grenada",
0075:                    "GP=Guadeloupe",
0076:                    "GS=South Georgia and the South Sandwich Islands", // south of South America, but administered by the British; has only a scientific base
0077:                    "GU=Guam", // strategic US base close to Japan
0078:                    "HM=Heard and McDonald Islands", // uninhabited, sub-Antarctic island, owned by Australia
0079:                    "HT=Haiti",
0080:                    "IO=British Indian Ocean Territory", // UK-US naval support facility in the Indian Ocean
0081:                    "KI=Kiribati", // 33 coral atolls in the Pacific, formerly owned by the UK
0082:                    "KN=Saint Kitts and Nevis", // islands in the Caribbean Sea
0083:                    "KY=Cayman Islands",
0084:                    "LC=Saint Lucia",
0085:                    "MH=Marshall Islands", // formerly US atomic bomb test site, now a key installation in the US missile defense network
0086:                    "MP=Northern Mariana Islands", // US strategic location in the western Pacific Ocean
0087:                    "NC=New Caledonia",
0088:                    "NF=Norfolk Island",
0089:                    "NR=Nauru", // independent UN island
0090:                    "NU=Niue", // one of the world's largest coral islands
0091:                    "NZ=New Zealand (Aotearoa)",
0092:                    "PG=Papua New Guinea",
0093:                    "PN=Pitcairn", // overseas territory of the UK
0094:                    "PR=Puerto Rico", // territory of the US with commonwealth status
0095:                    "PW=Palau", // was once governed by Micronesia
0096:                    "Sb=Solomon Islands", // mixed-case key is harmless: insertTLDProps lower-cases every key
0097:                    "TC=Turks and Caicos Islands", // overseas territory of the UK
0098:                    "TK=Tokelau", // group of three atolls in the South Pacific Ocean, British protectorate
0099:                    "TO=Tonga",
0100:                    "TT=Trinidad and Tobago",
0101:                    "TV=Tuvalu", //  nine coral atolls in the South Pacific Ocean; in 2000, Tuvalu leased its TLD ".tv" for $50 million over a 12-year period
0102:                    "UM=US Minor Outlying Islands", // nine insular United States possessions in the Pacific Ocean and the Caribbean Sea
0103:                    "US=United States", "VC=Saint Vincent and the Grenadines",
0104:                    "VG=Virgin Islands (British)", "VI=Virgin Islands (U.S.)",
0105:                    "VU=Vanuatu", "WF=Wallis and Futuna Islands", "WS=Samoa" };
0106:            private static final String[] TLD_MiddleSouthAmerica = { // "TLD=Name" entries; registered under region id 1 by the static initializer
0107:                    // primarily Spanish- and Portuguese-speaking countries
0108:                    "AR=Argentina", "AW=Aruba", "BR=Brazil", "BO=Bolivia",
0109:                    "CL=Chile", "CO=Colombia", "CR=Costa Rica", "CU=Cuba",
0110:                    "DO=Dominican Republic", "EC=Ecuador",
0111:                    "FK=Falkland Islands (Malvinas)", "GF=French Guiana",
0112:                    "GT=Guatemala", "GY=Guyana", "HN=Honduras", "JM=Jamaica",
0113:                    "MX=Mexico", "NI=Nicaragua", "PA=Panama", "PE=Peru",
0114:                    "PY=Paraguay", "SR=Suriname", "SV=El Salvador",
0115:                    "UY=Uruguay", "VE=Venezuela" };
0116:            private static final String[] TLD_EuropaRussia = { // "TLD=Name" entries; registered under region id 0 by the static initializer
0117:                    // also includes countries that are mainly French- or Dutch-speaking
0118:                    // and culturally close to Europe
0119:                    "AD=Andorra",
0120:                    "AL=Albania",
0121:                    "AQ=Antarctica",
0122:                    "AT=Austria",
0123:                    "BA=Bosnia and Herzegovina",
0124:                    "BE=Belgium",
0125:                    "BG=Bulgaria",
0126:                    "BV=Bouvet Island", // this island is uninhabited and covered by ice, south of Africa but governed by Norway
0127:                    "BY=Belarus",
0128:                    "CH=Switzerland",
0129:                    "CS=Czechoslovakia (former)",
0130:                    "CZ=Czech Republic",
0131:                    "CY=Cyprus",
0132:                    "DE=Germany",
0133:                    "DK=Denmark",
0134:                    "ES=Spain",
0135:                    "EE=Estonia",
0136:                    "FI=Finland",
0137:                    "FO=Faroe Islands", // Viking settlers
0138:                    "FR=France", "FX=France, Metropolitan",
0139:                    "GB=Great Britain (UK)", "GI=Gibraltar", "GL=Greenland",
0140:                    "GR=Greece", "HR=Croatia (Hrvatska)", "HU=Hungary",
0141:                    "IE=Ireland", "IS=Iceland",
0142:                    "IT=Italy",
0143:                    "LI=Liechtenstein",
0144:                    "LT=Lithuania",
0145:                    "LU=Luxembourg",
0146:                    "LV=Latvia",
0147:                    "MD=Moldova",
0148:                    "MC=Monaco",
0149:                    "MK=Macedonia",
0150:                    "MN=Mongolia",
0151:                    "MS=Montserrat", // British island in the Caribbean Sea, sparsely populated because of strong volcanic activity
0152:                    "MT=Malta",
0153:                    "MQ=Martinique", // island in the eastern Caribbean Sea, overseas department of France
0154:                    "NATO=Nato field",
0155:                    "NL=Netherlands",
0156:                    "NO=Norway",
0157:                    "PF=French Polynesia", // French-annexed Polynesian islands in the South Pacific, French atomic bomb test site
0158:                    "PL=Poland",
0159:                    "PM=St. Pierre and Miquelon", // French-administered territory close to Canada, belongs to France
0160:                    "PT=Portugal", "RO=Romania", "RU=Russia",
0161:                    "SE=Sweden",
0162:                    "SI=Slovenia",
0163:                    "SJ=Svalbard and Jan Mayen Islands", // part of Norway
0164:                    "SM=San Marino", "SK=Slovak Republic",
0165:                    "SU=USSR (former)",
0166:                    "TF=French Southern Territories", // sub-Antarctic islands in the southern Indian Ocean, no permanent inhabitants
0167:                    "UK=United Kingdom", "UA=Ukraine",
0168:                    "VA=Vatican City State (Holy See)", "YU=Yugoslavia" };
0169:
0170:            private static final String[] TLD_MiddleEastWestAsia = { // "TLD=Name" entries; registered under region id 3 by the static initializer
0171:                    // states that are influenced by Islamic culture and the Arabic language
0172:                    // also includes Eurasian states and those that were part of the former USSR and are close to Southwest Asia
0173:                    "AE=United Arab Emirates", "AF=Afghanistan", "AM=Armenia",
0174:                    "AZ=Azerbaijan", "BH=Bahrain", "GE=Georgia", "IL=Israel",
0175:                    "IQ=Iraq", "IR=Iran", "JO=Jordan", "KG=Kyrgyzstan",
0176:                    "KZ=Kazakhstan", "KW=Kuwait", "LB=Lebanon", "OM=Oman",
0177:                    "QA=Qatar", "SA=Saudi Arabia", "SY=Syria", "TJ=Tajikistan",
0178:                    "TM=Turkmenistan", "PK=Pakistan", "TR=Turkey",
0179:                    "UZ=Uzbekistan", "YE=Yemen" };
0180:            private static final String[] TLD_SouthEastAsia = { // "TLD=Name" entries; registered under region id 2 by the static initializer
0181:                    "BD=Bangladesh", "BN=Brunei Darussalam", "BT=Bhutan",
0182:                    "CN=China", "HK=Hong Kong", "ID=Indonesia", "IN=India",
0183:                    "LA=Laos", "NP=Nepal", "JP=Japan", "KH=Cambodia",
0184:                    "KP=Korea (North)", "KR=Korea (South)", "LK=Sri Lanka",
0185:                    "MY=Malaysia",
0186:                    "MM=Myanmar", // formerly known as Burma
0187:                    "MO=Macau", // Portuguese settlement, part of China, but has some autonomy
0188:                    "MV=Maldives", // group of atolls in the Indian Ocean
0189:                    "PH=Philippines", "SG=Singapore", "TP=East Timor",
0190:                    "TH=Thailand", "TW=Taiwan", "VN=Viet Nam" };
0191:            private static final String[] TLD_Africa = { "AO=Angola", // "TLD=Name" entries; registered under region id 5 by the static initializer
0192:                    "BF=Burkina Faso", "BI=Burundi", "BJ=Benin", "BW=Botswana",
0193:                    "CF=Central African Republic", "CG=Congo",
0194:                    "CI=Cote D'Ivoire (Ivory Coast)", "CM=Cameroon",
0195:                    "CV=Cape Verde", "DJ=Djibouti", "DZ=Algeria", "EG=Egypt",
0196:                    "EH=Western Sahara", "ER=Eritrea", "ET=Ethiopia",
0197:                    "GA=Gabon", "GH=Ghana", "GM=Gambia", "GN=Guinea",
0198:                    "GQ=Equatorial Guinea", "GW=Guinea-Bissau", "KE=Kenya",
0199:                    "KM=Comoros", "LR=Liberia", "LS=Lesotho", "LY=Libya",
0200:                    "MA=Morocco", "MG=Madagascar", "ML=Mali", "MR=Mauritania",
0201:                    "MU=Mauritius", "MW=Malawi", "MZ=Mozambique", "NA=Namibia",
0202:                    "NE=Niger", "NG=Nigeria", "RE=Reunion", "RW=Rwanda",
0203:                    "SC=Seychelles", "SD=Sudan", "SH=St. Helena",
0204:                    "SL=Sierra Leone", "SN=Senegal", "SO=Somalia",
0205:                    "ST=Sao Tome and Principe", "SZ=Swaziland", "TD=Chad",
0206:                    "TG=Togo", "TN=Tunisia", "TZ=Tanzania", "UG=Uganda",
0207:                    "ZA=South Africa", "ZM=Zambia", "ZR=Zaire", "ZW=Zimbabwe",
0208:                    "YT=Mayotte" };
0209:            private static final String[] TLD_Generic = { "COM=US Commercial", // "TLD=Name" entries; registered under region id 6 by the static initializer
0210:                    "AERO=", "BIZ=", "COOP=", "INFO=", "MUSEUM=", "NAME=", // entries with nothing after '=' get an empty name
0211:                    "PRO=", "ARPA=", "INT=International", "ARPA=Arpanet", // ARPA is listed twice; the later "Arpanet" entry overwrites the empty one in the map
0212:                    "NT=Neutral Zone" };
0213:
0214:            /*
0215:             * TLDs: aero, biz, com, coop, edu, gov, info, int, mil, museum, name, net,
0216:             * org, pro, arpa AC, AD, AE, AERO, AF, AG, AI, AL, AM, AN, AO, AQ, AR,
0217:             * ARPA, AS, AT, AU, AW, AZ, BA, BB, BD, BE, BF, BG, BH, BI, BIZ, BJ, BM,
0218:             * BN, BO, BR, BS, BT, BV, BW, BY, BZ, CA, CC, CD, CF, CG, CH, CI, CK, CL,
0219:             * CM, CN, CO, COM, COOP, CR, CU, CV, CX, CY, CZ, DE, DJ, DK, DM, DO, DZ,
0220:             * EC, EDU, EE, EG, ER, ES, ET, EU, FI, FJ, FK, FM, FO, FR, GA, GB, GD, GE,
0221:             * GF, GG, GH, GI, GL, GM, GN, GOV, GP, GQ, GR, GS, GT, GU, GW, GY, HK, HM,
0222:             * HN, HR, HT, HU, ID, IE, IL, IM, IN, INFO, INT, IO, IQ, IR, IS, IT, JE,
0223:             * JM, JO, JOBS, JP, KE, KG, KH, KI, KM, KN, KR, KW, KY, KZ, LA, LB, LC, LI,
0224:             * LK, LR, LS, LT, LU, LV, LY, MA, MC, MD, MG, MH, MIL, MK, ML, MM, MN, MO,
0225:             * MOBI, MP, MQ, MR, MS, MT, MU, MUSEUM, MV, MW, MX, MY, MZ, NA, NAME, NC,
0226:             * NE, NET, NF, NG, NI, NL, NO, NP, NR, NU, NZ, OM, ORG, PA, PE, PF, PG, PH,
0227:             * PK, PL, PM, PN, PR, PRO, PS, PT, PW, PY, QA, RE, RO, RU, RW, SA, SB, SC,
0228:             * SD, SE, SG, SH, SI, SJ, SK, SL, SM, SN, SO, SR, ST, SU, SV, SY, SZ, TC,
0229:             * TD, TF, TG, TH, TJ, TK, TL, TM, TN, TO, TP, TR, TRAVEL, TT, TV, TW, TZ,
0230:             * UA, UG, UK, UM, US, UY, UZ, VA, VC, VE, VG, VI, VN, VU, WF, WS, YE, YT,
0231:             * YU, ZA, ZM, ZW
0232:             */
0233:
0234:            public static String dummyHash;
0235:
0236:            private static HashMap<String, Integer> TLDID = new HashMap<String, Integer>();
0237:            private static HashMap<String, String> TLDName = new HashMap<String, String>();
0238:
0239:            private static void insertTLDProps(String[] TLDList, int id) {
0240:                int p;
0241:                String tld, name;
0242:                Integer ID = new Integer(id);
0243:                for (int i = 0; i < TLDList.length; i++) {
0244:                    p = TLDList[i].indexOf('=');
0245:                    if (p > 0) {
0246:                        tld = TLDList[i].substring(0, p).toLowerCase();
0247:                        name = TLDList[i].substring(p + 1);
0248:                        TLDID.put(tld, ID);
0249:                        TLDName.put(tld, name);
0250:                    }
0251:                }
0252:            }
0253:
0254:            static {
0255:                // create a dummy hash
0256:                dummyHash = "";
0257:                for (int i = 0; i < yacySeedDB.commonHashLength; i++)
0258:                    dummyHash += "-";
0259:
0260:                // assign TLD-ids and names
0261:                insertTLDProps(TLD_EuropaRussia, 0);
0262:                insertTLDProps(TLD_MiddleSouthAmerica, 1);
0263:                insertTLDProps(TLD_SouthEastAsia, 2);
0264:                insertTLDProps(TLD_MiddleEastWestAsia, 3);
0265:                insertTLDProps(TLD_NorthAmericaOceania, 4);
0266:                insertTLDProps(TLD_Africa, 5);
0267:                insertTLDProps(TLD_Generic, 6);
0268:                // the id=7 is used to flag local addresses
0269:            }
0270:
0271:            // class variables
0272:            private String protocol, host, userInfo, path, quest, ref, hash;
0273:            private int port;
0274:
0275:            public yacyURL(String url, String hash)
0276:                    throws MalformedURLException {
0277:                if (url == null)
0278:                    throw new MalformedURLException("url string is null");
0279:                parseURLString(url);
0280:                this .hash = hash;
0281:            }
0282:
0283:            private void parseURLString(String url)
0284:                    throws MalformedURLException {
0285:                // identify protocol
0286:                assert (url != null);
0287:                url = url.trim();
0288:                int p = url.indexOf(':');
0289:                if (p < 0) {
0290:                    if (url.startsWith("www.")) {
0291:                        url = "http://" + url;
0292:                        p = 4;
0293:                    } else {
0294:                        throw new MalformedURLException(
0295:                                "protocol is not given in '" + url + "'");
0296:                    }
0297:                }
0298:                this .protocol = url.substring(0, p).toLowerCase().trim();
0299:                if (url.length() < p + 4)
0300:                    throw new MalformedURLException("URL not parseable: '"
0301:                            + url + "'");
0302:                if (url.substring(p + 1, p + 3).equals("//")) {
0303:                    // identify host, userInfo and file for http and ftp protocol
0304:                    int q = url.indexOf('/', p + 3);
0305:                    int r;
0306:                    if (q < 0) {
0307:                        if ((r = url.indexOf('@', p + 3)) < 0) {
0308:                            host = url.substring(p + 3);
0309:                            userInfo = null;
0310:                        } else {
0311:                            host = url.substring(r + 1);
0312:                            userInfo = url.substring(p + 3, r);
0313:                        }
0314:                        path = "/";
0315:                    } else {
0316:                        host = url.substring(p + 3, q);
0317:                        if ((r = host.indexOf('@')) < 0) {
0318:                            userInfo = null;
0319:                        } else {
0320:                            userInfo = host.substring(0, r);
0321:                            host = host.substring(r + 1);
0322:                        }
0323:                        path = url.substring(q);
0324:                    }
0325:
0326:                    path = resolveBackpath(path);
0327:                    identPort(url, (protocol.equals("http") ? 80 : ((protocol
0328:                            .equals("https")) ? 443
0329:                            : ((protocol.equals("ftp")) ? 21 : -1))));
0330:                    identRef();
0331:                    identQuest();
0332:                    escape();
0333:                } else {
0334:                    // this is not a http or ftp url
0335:                    if (protocol.equals("mailto")) {
0336:                        // parse email url
0337:                        int q = url.indexOf('@', p + 3);
0338:                        if (q < 0) {
0339:                            throw new MalformedURLException(
0340:                                    "wrong email address: " + url);
0341:                        } else {
0342:                            userInfo = url.substring(p + 1, q);
0343:                            host = url.substring(q + 1);
0344:                            path = null;
0345:                            port = -1;
0346:                            quest = null;
0347:                            ref = null;
0348:                        }
0349:                    } else {
0350:                        throw new MalformedURLException("unknown protocol: "
0351:                                + url);
0352:                    }
0353:                }
0354:            }
0355:
/**
 * Creates a URL with protocol "file", an empty host, port -1 and the
 * absolute path of the given file as its path.
 *
 * @param file the file to reference; must not be null
 * @throws MalformedURLException declared by the delegated constructor
 */
public yacyURL(File file) throws MalformedURLException {
    this("file", "", -1, file.getAbsolutePath());
}
0359:
0360:            public static yacyURL newURL(String baseURL, String relPath)
0361:                    throws MalformedURLException {
0362:                if ((baseURL == null) || (relPath.startsWith("http://"))
0363:                        || (relPath.startsWith("https://"))
0364:                        || (relPath.startsWith("ftp://"))
0365:                        || (relPath.startsWith("file://"))
0366:                        || (relPath.startsWith("smb://"))) {
0367:                    return new yacyURL(relPath, null);
0368:                } else {
0369:                    return new yacyURL(new yacyURL(baseURL, null), relPath);
0370:                }
0371:            }
0372:
0373:            public static yacyURL newURL(yacyURL baseURL, String relPath)
0374:                    throws MalformedURLException {
0375:                if ((baseURL == null) || (relPath.startsWith("http://"))
0376:                        || (relPath.startsWith("https://"))
0377:                        || (relPath.startsWith("ftp://"))
0378:                        || (relPath.startsWith("file://"))
0379:                        || (relPath.startsWith("smb://"))) {
0380:                    return new yacyURL(relPath, null);
0381:                } else {
0382:                    return new yacyURL(baseURL, relPath);
0383:                }
0384:            }
0385:
0386:            private yacyURL(yacyURL baseURL, String relPath)
0387:                    throws MalformedURLException {
0388:                if (baseURL == null)
0389:                    throw new MalformedURLException("base URL is null");
0390:                if (relPath == null)
0391:                    throw new MalformedURLException("relPath is null");
0392:
0393:                this .hash = null;
0394:                this .protocol = baseURL.protocol;
0395:                this .host = baseURL.host;
0396:                this .port = baseURL.port;
0397:                this .userInfo = baseURL.userInfo;
0398:                if (relPath.toLowerCase().startsWith("javascript:")) {
0399:                    this .path = baseURL.path;
0400:                } else if ((relPath.startsWith("http://"))
0401:                        || (relPath.startsWith("https://"))
0402:                        || (relPath.startsWith("ftp://"))
0403:                        || (relPath.startsWith("file://"))
0404:                        || (relPath.startsWith("smb://"))) {
0405:                    this .path = baseURL.path;
0406:                } else if (relPath.startsWith("/")) {
0407:                    this .path = relPath;
0408:                } else if (baseURL.path.endsWith("/")) {
0409:                    if (relPath.startsWith("#") || relPath.startsWith("?")) {
0410:                        throw new MalformedURLException(
0411:                                "relative path malformed: " + relPath);
0412:                    } else {
0413:                        this .path = baseURL.path + relPath;
0414:                    }
0415:                } else {
0416:                    if (relPath.startsWith("#") || relPath.startsWith("?")) {
0417:                        this .path = baseURL.path + relPath;
0418:                    } else {
0419:                        int q = baseURL.path.lastIndexOf('/');
0420:                        if (q < 0) {
0421:                            this .path = relPath;
0422:                        } else {
0423:                            this .path = baseURL.path.substring(0, q + 1)
0424:                                    + relPath;
0425:                        }
0426:                    }
0427:                }
0428:                this .quest = baseURL.quest;
0429:                this .ref = baseURL.ref;
0430:
0431:                path = resolveBackpath(path);
0432:                identRef();
0433:                identQuest();
0434:                escape();
0435:            }
0436:
/**
 * Creates a URL from its individual components. The components are stored
 * as given, the hash is set to null, and identRef, identQuest and escape
 * are then applied.
 *
 * @param protocol the protocol; must not be null
 * @param host     the host
 * @param port     the port
 * @param path     the path
 * @throws MalformedURLException if protocol is null
 */
public yacyURL(String protocol, String host, int port, String path)
        throws MalformedURLException {
    if (protocol == null) {
        throw new MalformedURLException("protocol is null");
    }

    this.hash = null;
    this.path = path;
    this.port = port;
    this.host = host;
    this.protocol = protocol;

    identRef();
    identQuest();
    escape();
}
0450:
0451:            //  resolve '..'
0452:            String resolveBackpath(String path) /* throws MalformedURLException */{
0453:                /* original version by [MC]
0454:                int p;
0455:                while ((p = path.indexOf("/..")) >= 0) {
0456:                    String head = path.substring(0, p);
0457:                    int q = head.lastIndexOf('/');
0458:                    if (q < 0) throw new MalformedURLException("backpath cannot be resolved in path = " + path);
0459:                    path = head.substring(0, q) + path.substring(p + 3);
0460:                }*/
0461:
0462:                /* by [MT] */
0463:                if (path.length() == 0 || path.charAt(0) != '/') {
0464:                    path = "/" + path;
0465:                }
0466:
0467:                Pattern pathPattern = Pattern
0468:                        .compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)");
0469:                Matcher matcher = pathPattern.matcher(path);
0470:                while (matcher.find()) {
0471:                    path = matcher.replaceAll("");
0472:                    matcher.reset(path);
0473:                }
0474:
0475:                return path.equals("") ? "/" : path;
0476:            }
0477:
0478:            /**
0479:             * Escapes the following parts of the url, this object already contains:
0480:             * <ul>
0481:             * <li>path: see {@link #escape(String)}</li>
0482:             * <li>ref: same as above</li>
0483:             * <li>quest: same as above without the ampersand ("&amp;") and the equals symbol</li>
0484:             * </ul>
0485:             */
0486:            private void escape() {
0487:                if (path != null && path.indexOf('%') == -1)
0488:                    escapePath();
0489:                if (quest != null && quest.indexOf('%') == -1)
0490:                    escapeQuest();
0491:                if (ref != null && ref.indexOf('%') == -1)
0492:                    escapeRef();
0493:            }
0494:
0495:            private void escapePath() {
0496:                String[] pathp = path.split("/", -1);
0497:                String ptmp = "";
0498:                for (int i = 0; i < pathp.length; i++) {
0499:                    ptmp += "/" + escape(pathp[i]);
0500:                }
0501:                path = ptmp.substring((ptmp.length() > 0) ? 1 : 0);
0502:            }
0503:
0504:            private void escapeRef() {
0505:                ref = escape(ref);
0506:            }
0507:
0508:            private void escapeQuest() {
0509:                String[] questp = quest.split("&", -1);
0510:                String qtmp = "";
0511:                for (int i = 0; i < questp.length; i++) {
0512:                    if (questp[i].indexOf('=') != -1) {
0513:                        qtmp += "&"
0514:                                + escape(questp[i].substring(0, questp[i]
0515:                                        .indexOf('=')));
0516:                        qtmp += "="
0517:                                + escape(questp[i].substring(questp[i]
0518:                                        .indexOf('=') + 1));
0519:                    } else {
0520:                        qtmp += "&" + escape(questp[i]);
0521:                    }
0522:                }
0523:                quest = qtmp.substring((qtmp.length() > 0) ? 1 : 0);
0524:            }
0525:
0526:            private final static String[] hex = { "%00", "%01", "%02", "%03",
0527:                    "%04", "%05", "%06", "%07", "%08", "%09", "%0A", "%0B",
0528:                    "%0C", "%0D", "%0E", "%0F", "%10", "%11", "%12", "%13",
0529:                    "%14", "%15", "%16", "%17", "%18", "%19", "%1A", "%1B",
0530:                    "%1C", "%1D", "%1E", "%1F", "%20", "%21", "%22", "%23",
0531:                    "%24", "%25", "%26", "%27", "%28", "%29", "%2A", "%2B",
0532:                    "%2C", "%2D", "%2E", "%2F", "%30", "%31", "%32", "%33",
0533:                    "%34", "%35", "%36", "%37", "%38", "%39", "%3A", "%3B",
0534:                    "%3C", "%3D", "%3E", "%3F", "%40", "%41", "%42", "%43",
0535:                    "%44", "%45", "%46", "%47", "%48", "%49", "%4A", "%4B",
0536:                    "%4C", "%4D", "%4E", "%4F", "%50", "%51", "%52", "%53",
0537:                    "%54", "%55", "%56", "%57", "%58", "%59", "%5A", "%5B",
0538:                    "%5C", "%5D", "%5E", "%5F", "%60", "%61", "%62", "%63",
0539:                    "%64", "%65", "%66", "%67", "%68", "%69", "%6A", "%6B",
0540:                    "%6C", "%6D", "%6E", "%6F", "%70", "%71", "%72", "%73",
0541:                    "%74", "%75", "%76", "%77", "%78", "%79", "%7A", "%7B",
0542:                    "%7C", "%7D", "%7E", "%7F", "%80", "%81", "%82", "%83",
0543:                    "%84", "%85", "%86", "%87", "%88", "%89", "%8A", "%8B",
0544:                    "%8C", "%8D", "%8E", "%8F", "%90", "%91", "%92", "%93",
0545:                    "%94", "%95", "%96", "%97", "%98", "%99", "%9A", "%9B",
0546:                    "%9C", "%9D", "%9E", "%9F", "%A0", "%A1", "%A2", "%A3",
0547:                    "%A4", "%A5", "%A6", "%A7", "%A8", "%A9", "%AA", "%AB",
0548:                    "%AC", "%AD", "%AE", "%AF", "%B0", "%B1", "%B2", "%B3",
0549:                    "%B4", "%B5", "%B6", "%B7", "%B8", "%B9", "%BA", "%BB",
0550:                    "%BC", "%BD", "%BE", "%BF", "%C0", "%C1", "%C2", "%C3",
0551:                    "%C4", "%C5", "%C6", "%C7", "%C8", "%C9", "%CA", "%CB",
0552:                    "%CC", "%CD", "%CE", "%CF", "%D0", "%D1", "%D2", "%D3",
0553:                    "%D4", "%D5", "%D6", "%D7", "%D8", "%D9", "%DA", "%DB",
0554:                    "%DC", "%DD", "%DE", "%DF", "%E0", "%E1", "%E2", "%E3",
0555:                    "%E4", "%E5", "%E6", "%E7", "%E8", "%E9", "%EA", "%EB",
0556:                    "%EC", "%ED", "%EE", "%EF", "%F0", "%F1", "%F2", "%F3",
0557:                    "%F4", "%F5", "%F6", "%F7", "%F8", "%F9", "%FA", "%FB",
0558:                    "%FC", "%FD", "%FE", "%FF" };
0559:
0560:            /**
0561:             * Encode a string to the "x-www-form-urlencoded" form, enhanced
0562:             * with the UTF-8-in-URL proposal. This is what happens:
0563:             *
0564:             * <ul>
0565:             * <li>The ASCII characters 'a' through 'z', 'A' through 'Z',
0566:             *     and '0' through '9' remain the same.
0567:             *
0568:             * <li>The unreserved characters - _ . ! ~ * ' ( ) remain the same.
0569:             *
0570:             * <li>All other ASCII characters are converted into the
0571:             *     3-character string "%xy", where xy is
0572:             *     the two-digit hexadecimal representation of the character
0573:             *     code
0574:             *
0575:             * <li>All non-ASCII characters are encoded in two steps: first
0576:             *     to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
0577:             *     secondly each of these bytes is encoded as "%xx".
0578:             * </ul>
0579:             *
0580:             * @param s The string to be encoded
0581:             * @return The encoded string
0582:             */
0583:            // from: http://www.w3.org/International/URLUTF8Encoder.java
0584:            public static String escape(String s) {
0585:                StringBuffer sbuf = new StringBuffer();
0586:                int len = s.length();
0587:                for (int i = 0; i < len; i++) {
0588:                    int ch = s.charAt(i);
0589:                    if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
0590:                        sbuf.append((char) ch);
0591:                    } else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
0592:                        sbuf.append((char) ch);
0593:                    } else if ('0' <= ch && ch <= '9') { // '0'..'9'
0594:                        sbuf.append((char) ch);
0595:                    } else if (ch == ' ') { // space
0596:                        sbuf.append("%20");
0597:                    } else if (ch == '&'
0598:                            || ch == ':' // unreserved
0599:                            || ch == '-' || ch == '_' || ch == '.' || ch == '!'
0600:                            || ch == '~' || ch == '*' || ch == '\''
0601:                            || ch == '(' || ch == ')' || ch == ';') {
0602:                        sbuf.append((char) ch);
0603:                    } else if (ch <= 0x007f) { // other ASCII
0604:                        sbuf.append(hex[ch]);
0605:                    } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF
0606:                        sbuf.append(hex[0xc0 | (ch >> 6)]);
0607:                        sbuf.append(hex[0x80 | (ch & 0x3F)]);
0608:                    } else { // 0x7FF < ch <= 0xFFFF
0609:                        sbuf.append(hex[0xe0 | (ch >> 12)]);
0610:                        sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
0611:                        sbuf.append(hex[0x80 | (ch & 0x3F)]);
0612:                    }
0613:                }
0614:                return sbuf.toString();
0615:            }
0616:
0617:            // from: http://www.w3.org/International/unescape.java
0618:            public static String unescape(String s) {
0619:                StringBuffer sbuf = new StringBuffer();
0620:                int l = s.length();
0621:                int ch = -1;
0622:                int b, sumb = 0;
0623:                for (int i = 0, more = -1; i < l; i++) {
0624:                    /* Get next byte b from URL segment s */
0625:                    switch (ch = s.charAt(i)) {
0626:                    case '%':
0627:                        ch = s.charAt(++i);
0628:                        int hb = (Character.isDigit((char) ch) ? ch - '0'
0629:                                : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
0630:                        ch = s.charAt(++i);
0631:                        int lb = (Character.isDigit((char) ch) ? ch - '0'
0632:                                : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
0633:                        b = (hb << 4) | lb;
0634:                        break;
0635:                    case '+':
0636:                        b = ' ';
0637:                        break;
0638:                    default:
0639:                        b = ch;
0640:                    }
0641:                    /* Decode byte b as UTF-8, sumb collects incomplete chars */
0642:                    if ((b & 0xc0) == 0x80) { // 10xxxxxx (continuation byte)
0643:                        sumb = (sumb << 6) | (b & 0x3f); // Add 6 bits to sumb
0644:                        if (--more == 0)
0645:                            sbuf.append((char) sumb); // Add char to sbuf
0646:                    } else if ((b & 0x80) == 0x00) { // 0xxxxxxx (yields 7 bits)
0647:                        sbuf.append((char) b); // Store in sbuf
0648:                    } else if ((b & 0xe0) == 0xc0) { // 110xxxxx (yields 5 bits)
0649:                        sumb = b & 0x1f;
0650:                        more = 1; // Expect 1 more byte
0651:                    } else if ((b & 0xf0) == 0xe0) { // 1110xxxx (yields 4 bits)
0652:                        sumb = b & 0x0f;
0653:                        more = 2; // Expect 2 more bytes
0654:                    } else if ((b & 0xf8) == 0xf0) { // 11110xxx (yields 3 bits)
0655:                        sumb = b & 0x07;
0656:                        more = 3; // Expect 3 more bytes
0657:                    } else if ((b & 0xfc) == 0xf8) { // 111110xx (yields 2 bits)
0658:                        sumb = b & 0x03;
0659:                        more = 4; // Expect 4 more bytes
0660:                    } else /*if ((b & 0xfe) == 0xfc)*/{ // 1111110x (yields 1 bit)
0661:                        sumb = b & 0x01;
0662:                        more = 5; // Expect 5 more bytes
0663:                    }
0664:                    /* We don't test if the UTF-8 encoding is well-formed */
0665:                }
0666:                return sbuf.toString();
0667:            }
0668:
0669:            private void identPort(String inputURL, int dflt)
0670:                    throws MalformedURLException {
0671:                // identify ref in file
0672:                int r = this .host.indexOf(':');
0673:                if (r < 0) {
0674:                    this .port = dflt;
0675:                } else {
0676:                    try {
0677:                        String portStr = this .host.substring(r + 1);
0678:                        if (portStr.trim().length() > 0)
0679:                            this .port = Integer.parseInt(portStr);
0680:                        else
0681:                            this .port = -1;
0682:                        this .host = this .host.substring(0, r);
0683:                    } catch (NumberFormatException e) {
0684:                        throw new MalformedURLException(
0685:                                "wrong port in host fragment '" + this .host
0686:                                        + "' of input url '" + inputURL + "'");
0687:                    }
0688:                }
0689:            }
0690:
0691:            private void identRef() {
0692:                // identify ref in file
0693:                int r = path.indexOf('#');
0694:                if (r < 0) {
0695:                    this .ref = null;
0696:                } else {
0697:                    this .ref = path.substring(r + 1);
0698:                    this .path = path.substring(0, r);
0699:                }
0700:            }
0701:
0702:            private void identQuest() {
0703:                // identify quest in file
0704:                int r = path.indexOf('?');
0705:                if (r < 0) {
0706:                    this .quest = null;
0707:                } else {
0708:                    this .quest = path.substring(r + 1);
0709:                    this .path = path.substring(0, r);
0710:                }
0711:            }
0712:
0713:            public String getFile() {
0714:                return getFile(true);
0715:            }
0716:
0717:            public String getFile(boolean includeReference) {
0718:                // this is the path plus quest plus ref
0719:                // if there is no quest and no ref the result is identical to getPath
0720:                // this is defined according to http://java.sun.com/j2se/1.4.2/docs/api/java/net/URL.html#getFile()
0721:                if (quest != null)
0722:                    return ((includeReference) && (ref != null)) ? path + "?"
0723:                            + quest + "#" + ref : path + "?" + quest;
0724:                return ((includeReference) && (ref != null)) ? path + "#" + ref
0725:                        : path;
0726:            }
0727:
0728:            public String getFileName() {
0729:                // this is a method not defined in any sun api
0730:                // it returns the last portion of a path without any reference
0731:                int p = path.lastIndexOf('/');
0732:                if (p < 0)
0733:                    return path;
0734:                if (p == path.length() - 1)
0735:                    return ""; // no file name, this is a path to a directory
0736:                return path.substring(p + 1); // the 'real' file name
0737:            }
0738:
0739:            public String getPath() {
0740:                return path;
0741:            }
0742:
0743:            public String getAuthority() {
0744:                return ((port >= 0) && (host != null)) ? host + ":" + port
0745:                        : ((host != null) ? host : "");
0746:            }
0747:
0748:            public String getHost() {
0749:                return host;
0750:            }
0751:
0752:            public int getPort() {
0753:                return port;
0754:            }
0755:
0756:            public String getProtocol() {
0757:                return protocol;
0758:            }
0759:
0760:            public String getRef() {
0761:                return ref;
0762:            }
0763:
0764:            public String getUserInfo() {
0765:                return userInfo;
0766:            }
0767:
0768:            public String getQuery() {
0769:                return quest;
0770:            }
0771:
0772:            public String toString() {
0773:                return toNormalform(false, true);
0774:            }
0775:
0776:            public String toNormalform(boolean stripReference, boolean stripAmp) {
0777:                if (stripAmp)
0778:                    return toNormalform(!stripReference).replaceAll("&amp;",
0779:                            "&");
0780:                else
0781:                    return toNormalform(!stripReference);
0782:            }
0783:
0784:            private String toNormalform(boolean includeReference) {
0785:                // generates a normal form of the URL
0786:                boolean defaultPort = false;
0787:                if (this .protocol.equals("mailto")) {
0788:                    return this .protocol + ":" + this .userInfo + "@"
0789:                            + this .host;
0790:                } else if (this .protocol.equals("http")) {
0791:                    if (this .port < 0 || this .port == 80) {
0792:                        defaultPort = true;
0793:                    }
0794:                } else if (this .protocol.equals("ftp")) {
0795:                    if (this .port < 0 || this .port == 21) {
0796:                        defaultPort = true;
0797:                    }
0798:                } else if (this .protocol.equals("https")) {
0799:                    if (this .port < 0 || this .port == 443) {
0800:                        defaultPort = true;
0801:                    }
0802:                }
0803:                String path = resolveBackpath(this .getFile(includeReference));
0804:
0805:                if (defaultPort) {
0806:                    return this .protocol
0807:                            + "://"
0808:                            + ((this .userInfo != null) ? (this .userInfo + "@")
0809:                                    : ("")) + this .getHost().toLowerCase()
0810:                            + path;
0811:                }
0812:                return this .protocol
0813:                        + "://"
0814:                        + ((this .userInfo != null) ? (this .userInfo + "@")
0815:                                : ("")) + this .getHost().toLowerCase()
0816:                        + ((defaultPort) ? ("") : (":" + this .port)) + path;
0817:            }
0818:
0819:            public boolean equals(yacyURL other) {
0820:                return (((this .protocol == other.protocol) || (this .protocol
0821:                        .equals(other.protocol)))
0822:                        && ((this .host == other.host) || (this .host
0823:                                .equals(other.host)))
0824:                        && ((this .userInfo == other.userInfo) || (this .userInfo
0825:                                .equals(other.userInfo)))
0826:                        && ((this .path == other.path) || (this .path
0827:                                .equals(other.path)))
0828:                        && ((this .quest == other.quest) || (this .quest
0829:                                .equals(other.quest)))
0830:                        && ((this .ref == other.ref) || (this .ref
0831:                                .equals(other.ref))) && ((this .port == other.port)));
0832:            }
0833:
0834:            public int hashCode() {
0835:                return this .hash().hashCode();
0836:            }
0837:
0838:            public int compareTo(Object h) {
0839:                assert (h instanceof  yacyURL);
0840:                return this .toString().compareTo(((yacyURL) h).toString());
0841:            }
0842:
0843:            public boolean isPOST() {
0844:                return (this .quest != null) && (this .quest.length() > 0);
0845:            }
0846:
0847:            public boolean isCGI() {
0848:                String ls = path.toLowerCase();
0849:                return ((ls.indexOf(".cgi") >= 0) || (ls.indexOf(".exe") >= 0)
0850:                        || (ls.indexOf(";jsessionid=") >= 0)
0851:                        || (ls.indexOf("sessionid/") >= 0)
0852:                        || (ls.indexOf("phpsessid=") >= 0)
0853:                        || (ls.indexOf("search.php?sid=") >= 0) || (ls
0854:                        .indexOf("memberlist.php?sid=") >= 0));
0855:            }
0856:
0857:            // static methods from plasmaURL
0858:
0859:            public static final int flagTypeID(String hash) {
0860:                return (kelondroBase64Order.enhancedCoder.decodeByte(hash
0861:                        .charAt(11)) & 32) >> 5;
0862:            }
0863:
0864:            public static final int flagTLDID(String hash) {
0865:                return (kelondroBase64Order.enhancedCoder.decodeByte(hash
0866:                        .charAt(11)) & 28) >> 2;
0867:            }
0868:
0869:            public static final int flagLengthID(String hash) {
0870:                return (kelondroBase64Order.enhancedCoder.decodeByte(hash
0871:                        .charAt(11)) & 3);
0872:            }
0873:
0874:            public final String hash() {
0875:                // in case that the object was initialized without a known url hash, compute it now
0876:                if (this .hash == null)
0877:                    this .hash = urlHashComputation();
0878:                return this .hash;
0879:            }
0880:
0881:            private final String urlHashComputation() {
0882:                // the url hash computation needs a DNS lookup to check if the addresses domain is local
0883:                // that causes that this method may be very slow
0884:
0885:                assert this .hash == null; // should only be called if the hash was not computed bevore
0886:
0887:                int p = this .host.lastIndexOf('.');
0888:                String tld = "", dom = tld;
0889:                if (p > 0) {
0890:                    tld = host.substring(p + 1);
0891:                    dom = host.substring(0, p);
0892:                }
0893:                Integer ID = (serverDomains.isLocal(tld)) ? null
0894:                        : (Integer) TLDID.get(tld); // identify local addresses
0895:                int id = (ID == null) ? 7 : ID.intValue(); // local addresses are flagged with id=7
0896:                boolean isHTTP = this .protocol.equals("http");
0897:                p = dom.lastIndexOf('.'); // locate subdomain
0898:                String subdom = "";
0899:                if (p > 0) {
0900:                    subdom = dom.substring(0, p);
0901:                    dom = dom.substring(p + 1);
0902:                }
0903:
0904:                // find rootpath
0905:                String pathx = new String(this .path);
0906:                if (pathx.startsWith("/"))
0907:                    pathx = pathx.substring(1);
0908:                if (pathx.endsWith("/"))
0909:                    pathx = pathx.substring(0, pathx.length() - 1);
0910:                p = pathx.indexOf('/');
0911:                String rootpath = "";
0912:                if (p > 0) {
0913:                    rootpath = pathx.substring(0, p);
0914:                }
0915:
0916:                // we collected enough information to compute the fragments that are
0917:                // basis for hashes
0918:                int l = dom.length();
0919:                int domlengthKey = (l <= 8) ? 0 : (l <= 12) ? 1 : (l <= 16) ? 2
0920:                        : 3;
0921:                byte flagbyte = (byte) (((isHTTP) ? 0 : 32) | (id << 2) | domlengthKey);
0922:
0923:                // combine the attributes
0924:                StringBuffer hash = new StringBuffer(12);
0925:                // form the 'local' part of the hash
0926:                hash.append(kelondroBase64Order.enhancedCoder.encode(
0927:                        serverCodings.encodeMD5Raw(toNormalform(true, true)))
0928:                        .substring(0, 5)); // 5 chars
0929:                hash.append(subdomPortPath(subdom, port, rootpath)); // 1 char
0930:                // form the 'global' part of the hash
0931:                hash.append(protocolHostPort(this .protocol, host, port)); // 5 chars
0932:                hash.append(kelondroBase64Order.enhancedCoder
0933:                        .encodeByte(flagbyte)); // 1 char
0934:
0935:                // return result hash
0936:                return new String(hash);
0937:            }
0938:
0939:            private static char subdomPortPath(String subdom, int port,
0940:                    String rootpath) {
0941:                return kelondroBase64Order.enhancedCoder.encode(
0942:                        serverCodings.encodeMD5Raw(subdom + ":" + port + ":"
0943:                                + rootpath)).charAt(0);
0944:            }
0945:
0946:            private static final char rootURLFlag0 = subdomPortPath("", 80, "");
0947:            private static final char rootURLFlag1 = subdomPortPath("www", 80,
0948:                    "");
0949:
0950:            public static final boolean probablyRootURL(String urlHash) {
0951:                return (urlHash.charAt(5) == rootURLFlag0)
0952:                        || (urlHash.charAt(5) == rootURLFlag1);
0953:            }
0954:
0955:            private static String protocolHostPort(String protocol,
0956:                    String host, int port) {
0957:                return kelondroBase64Order.enhancedCoder.encode(
0958:                        serverCodings.encodeMD5Raw(protocol + ":" + host + ":"
0959:                                + port)).substring(0, 5);
0960:            }
0961:
0962:            private static String[] testTLDs = new String[] { "com", "net",
0963:                    "org", "uk", "fr", "de", "es", "it" };
0964:
0965:            public static final yacyURL probablyWordURL(String urlHash,
0966:                    TreeSet<String> words) {
0967:                Iterator<String> wi = words.iterator();
0968:                String word;
0969:                while (wi.hasNext()) {
0970:                    word = wi.next();
0971:                    if ((word == null) || (word.length() == 0))
0972:                        continue;
0973:                    String pattern = urlHash.substring(6, 11);
0974:                    for (int i = 0; i < testTLDs.length; i++) {
0975:                        if (pattern.equals(protocolHostPort("http", "www."
0976:                                + word.toLowerCase() + "." + testTLDs[i], 80)))
0977:                            try {
0978:                                return new yacyURL("http://www."
0979:                                        + word.toLowerCase() + "."
0980:                                        + testTLDs[i], null);
0981:                            } catch (MalformedURLException e) {
0982:                                return null;
0983:                            }
0984:                    }
0985:                }
0986:                return null;
0987:            }
0988:
0989:            public static final boolean isWordRootURL(String givenURLHash,
0990:                    TreeSet<String> words) {
0991:                if (!(probablyRootURL(givenURLHash)))
0992:                    return false;
0993:                yacyURL wordURL = probablyWordURL(givenURLHash, words);
0994:                if (wordURL == null)
0995:                    return false;
0996:                if (wordURL.hash().equals(givenURLHash))
0997:                    return true;
0998:                return false;
0999:            }
1000:
1001:            public static final int domLengthEstimation(String urlHash) {
1002:                // generates an estimation of the original domain length
1003:                assert (urlHash != null);
1004:                assert (urlHash.length() == 12) : "urlhash = " + urlHash;
1005:                int flagbyte = kelondroBase64Order.enhancedCoder
1006:                        .decodeByte(urlHash.charAt(11));
1007:                int domLengthKey = flagbyte & 3;
1008:                switch (domLengthKey) {
1009:                case 0:
1010:                    return 4;
1011:                case 1:
1012:                    return 10;
1013:                case 2:
1014:                    return 14;
1015:                case 3:
1016:                    return 20;
1017:                }
1018:                return 20;
1019:            }
1020:
1021:            public static int domLengthNormalized(String urlHash) {
1022:                return domLengthEstimation(urlHash) << 8 / 20;
1023:            }
1024:
1025:            public static final int domDomain(String urlHash) {
1026:                // returns the ID of the domain of the domain
1027:                assert (urlHash != null);
1028:                assert (urlHash.length() == 12) : "urlhash = " + urlHash;
1029:                int flagbyte = kelondroBase64Order.enhancedCoder
1030:                        .decodeByte(urlHash.charAt(11));
1031:                return (flagbyte & 12) >> 2;
1032:            }
1033:
1034:            public static boolean isGlobalDomain(String urlhash) {
1035:                return domDomain(urlhash) != 7;
1036:            }
1037:
1038:            // checks for local/global IP range and local IP
1039:            public boolean isLocal() {
1040:                return serverDomains.isLocal(this .host);
1041:            }
1042:
1043:            // language calculation
1044:            public static String language(yacyURL url) {
1045:                String language = "uk";
1046:                String host = url.getHost();
1047:                int pos = host.lastIndexOf(".");
1048:                if ((pos > 0) && (host.length() - pos == 3))
1049:                    language = host.substring(pos + 1).toLowerCase();
1050:                return language;
1051:            }
1052:
1053:            public static void main(String[] args) {
1054:                String[][] test = new String[][] {
1055:                        new String[] { null,
1056:                                "http://www.anomic.de/home/test?x=1#home" },
1057:                        new String[] { null,
1058:                                "http://www.anomic.de/home/test?x=1" },
1059:                        new String[] { null,
1060:                                "http://www.anomic.de/home/test#home" },
1061:                        new String[] { null,
1062:                                "ftp://ftp.anomic.de/home/test#home" },
1063:                        new String[] { null,
1064:                                "http://www.anomic.de/home/../abc/" },
1065:                        new String[] { null, "mailto:abcdefg@nomailnomail.com" },
1066:                        new String[] { "http://www.anomic.de/home", "test" },
1067:                        new String[] { "http://www.anomic.de/home", "test/" },
1068:                        new String[] { "http://www.anomic.de/home/", "test" },
1069:                        new String[] { "http://www.anomic.de/home/", "test/" },
1070:                        new String[] { "http://www.anomic.de/home/index.html",
1071:                                "test.htm" },
1072:                        new String[] { "http://www.anomic.de/home/index.html",
1073:                                "http://www.yacy.net/test" },
1074:                        new String[] { "http://www.anomic.de/home/index.html",
1075:                                "ftp://ftp.yacy.net/test" },
1076:                        new String[] { "http://www.anomic.de/home/index.html",
1077:                                "../test" },
1078:                        new String[] { "http://www.anomic.de/home/index.html",
1079:                                "mailto:abcdefg@nomailnomail.com" },
1080:                        new String[] { null, "news:de.test" },
1081:                        new String[] { "http://www.anomic.de/home",
1082:                                "news:de.test" },
1083:                        new String[] { "http://www.anomic.de/home",
1084:                                "ftp://ftp.anomic.de/src" },
1085:                        new String[] { null, "ftp://ftp.delegate.org/" },
1086:                        new String[] { "http://www.anomic.de/home",
1087:                                "ftp://ftp.delegate.org/" },
1088:                        new String[] { "http://www.anomic.de",
1089:                                "mailto:yacy@weltherrschaft.org" },
1090:                        new String[] { "http://www.anomic.de", "javascipt:temp" },
1091:                        new String[] {
1092:                                null,
1093:                                "http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history" },
1094:                        new String[] {
1095:                                null,
1096:                                "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585" },
1097:                        new String[] {
1098:                                null,
1099:                                "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&amp;showuser=23585" } };
1100:                String environment, url;
1101:                yacyURL aURL, aURL1;
1102:                java.net.URL jURL;
1103:                for (int i = 0; i < test.length; i++) {
1104:                    environment = test[i][0];
1105:                    url = test[i][1];
1106:                    try {
1107:                        aURL = yacyURL.newURL(environment, url);
1108:                    } catch (MalformedURLException e) {
1109:                        aURL = null;
1110:                    }
1111:                    if (environment == null) {
1112:                        try {
1113:                            jURL = new java.net.URL(url);
1114:                        } catch (MalformedURLException e) {
1115:                            jURL = null;
1116:                        }
1117:                    } else {
1118:                        try {
1119:                            jURL = new java.net.URL(new java.net.URL(
1120:                                    environment), url);
1121:                        } catch (MalformedURLException e) {
1122:                            jURL = null;
1123:                        }
1124:                    }
1125:
1126:                    // check equality to java.net.URL
1127:                    if (((aURL == null) && (jURL != null))
1128:                            || ((aURL != null) && (jURL == null))
1129:                            || ((aURL != null) && (jURL != null) && (!(jURL
1130:                                    .toString().equals(aURL.toString()))))) {
1131:                        System.out.println("Difference for environment="
1132:                                + environment + ", url=" + url + ":");
1133:                        System.out
1134:                                .println((jURL == null) ? "jURL rejected input"
1135:                                        : "jURL=" + jURL.toString());
1136:                        System.out
1137:                                .println((aURL == null) ? "aURL rejected input"
1138:                                        : "aURL=" + aURL.toString());
1139:                    }
1140:
1141:                    // check stability: the normalform of the normalform must be equal to the normalform
1142:                    if (aURL != null)
1143:                        try {
1144:                            aURL1 = new yacyURL(aURL.toNormalform(false, true),
1145:                                    null);
1146:                            if (!(aURL1.toNormalform(false, true).equals(aURL
1147:                                    .toNormalform(false, true)))) {
1148:                                System.out.println("no stability for url:");
1149:                                System.out.println("aURL0=" + aURL.toString());
1150:                                System.out.println("aURL1=" + aURL1.toString());
1151:                            }
1152:                        } catch (MalformedURLException e) {
1153:                            System.out.println("no stability for url:");
1154:                            System.out.println("aURL0=" + aURL.toString());
1155:                            System.out.println("aURL1 cannot be computed:"
1156:                                    + e.getMessage());
1157:                        }
1158:                }
1159:            }
1160:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.