001: // yacySearch.java
002: // -------------------------------------
003: // (C) by Michael Peter Christen; mc@anomic.de
004: // first published on http://www.anomic.de
005: // Frankfurt, Germany, 2004
006: //
007: // $LastChangedDate: 2008-01-24 23:58:18 +0000 (Do, 24 Jan 2008) $
008: // $LastChangedRevision: 4398 $
009: // $LastChangedBy: orbiter $
010: //
011: // This program is free software; you can redistribute it and/or modify
012: // it under the terms of the GNU General Public License as published by
013: // the Free Software Foundation; either version 2 of the License, or
014: // (at your option) any later version.
015: //
016: // This program is distributed in the hope that it will be useful,
017: // but WITHOUT ANY WARRANTY; without even the implied warranty of
018: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: // GNU General Public License for more details.
020: //
021: // You should have received a copy of the GNU General Public License
022: // along with this program; if not, write to the Free Software
023: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: // Using this software in any meaning (reading, learning, copying, compiling,
026: // running) means that you agree that the Author(s) is (are) not responsible
027: // for cost, loss of data or any harm that may be caused directly or indirectly
028: // by usage of this softare or this documentation. The usage of this software
029: // is on your own risk. The installation and usage (starting/running) of this
030: // software may allow other people or application to access your computer and
031: // any attached devices and is highly dependent on the configuration of the
032: // software which must be done by the user of the software; the author(s) is
033: // (are) also not responsible for proper configuration and usage of the
034: // software, even if provoked by documentation provided together with
035: // the software.
036: //
037: // Any changes to this file according to the GPL as documented in the file
038: // gpl.txt aside this file in the shipment you received can be done to the
039: // lines that follows this copyright notice here, but changes must not be
040: // done inside the copyright notice above. A re-distribution must contain
041: // the intact and unchanged copyright notice.
042: // Contributions and changes to the program code must be marked as such.
043:
044: package de.anomic.yacy;
045:
046: import java.util.ArrayList;
047: import java.util.HashMap;
048: import java.util.Iterator;
049: import java.util.Map;
050: import java.util.Set;
051: import java.util.TreeMap;
052:
053: import de.anomic.kelondro.kelondroBitfield;
054: import de.anomic.kelondro.kelondroMScoreCluster;
055: import de.anomic.plasma.plasmaSearchRankingProcess;
056: import de.anomic.plasma.plasmaSearchQuery;
057: import de.anomic.plasma.plasmaSearchRankingProfile;
058: import de.anomic.plasma.plasmaWordIndex;
059: import de.anomic.plasma.urlPattern.plasmaURLPattern;
060: import de.anomic.server.logging.serverLog;
061:
062: public class yacySearch extends Thread {
063:
064: final private String wordhashes, excludehashes, urlhashes;
065: final private boolean global;
066: final private int partitions;
067: final private plasmaWordIndex wordIndex;
068: final private plasmaSearchRankingProcess containerCache;
069: final private Map<String, TreeMap<String, String>> abstractCache;
070: final private plasmaURLPattern blacklist;
071: final private yacySeed targetPeer;
072: private String[] urls;
073: private int count, maxDistance;
074: final private plasmaSearchRankingProfile rankingProfile;
075: final private String prefer, filter;
076: final private kelondroBitfield constraint;
077:
078: public yacySearch(String wordhashes, String excludehashes,
079: String urlhashes, String prefer, String filter, int count,
080: int maxDistance, boolean global, int partitions,
081: yacySeed targetPeer, plasmaWordIndex wordIndex,
082: plasmaSearchRankingProcess containerCache,
083: Map<String, TreeMap<String, String>> abstractCache,
084: plasmaURLPattern blacklist,
085: plasmaSearchRankingProfile rankingProfile,
086: kelondroBitfield constraint) {
087: super ("yacySearch_" + targetPeer.getName());
088: //System.out.println("DEBUG - yacySearch thread " + this.getName() + " initialized " + ((urlhashes.length() == 0) ? "(primary)" : "(secondary)"));
089: this .wordhashes = wordhashes;
090: this .excludehashes = excludehashes;
091: this .urlhashes = urlhashes;
092: this .prefer = prefer;
093: this .filter = filter;
094: this .global = global;
095: this .partitions = partitions;
096: this .wordIndex = wordIndex;
097: this .containerCache = containerCache;
098: this .abstractCache = abstractCache;
099: this .blacklist = blacklist;
100: this .targetPeer = targetPeer;
101: this .urls = null;
102: this .count = count;
103: this .maxDistance = maxDistance;
104: this .rankingProfile = rankingProfile;
105: this .constraint = constraint;
106: }
107:
108: public void run() {
109: this .urls = yacyClient.search(wordhashes, excludehashes,
110: urlhashes, prefer, filter, count, maxDistance, global,
111: partitions, targetPeer, wordIndex, containerCache,
112: abstractCache, blacklist, rankingProfile, constraint);
113: if (urls != null) {
114: // urls is an array of url hashes. this is only used for log output
115: StringBuffer urllist = new StringBuffer(
116: this .urls.length * 13);
117: for (int i = 0; i < this .urls.length; i++)
118: urllist.append(this .urls[i]).append(' ');
119: yacyCore.log.logInfo("REMOTE SEARCH - remote peer "
120: + targetPeer.hash + ":" + targetPeer.getName()
121: + " contributed " + urls.length
122: + " links for word hash " + wordhashes + ": "
123: + new String(urllist));
124: yacyCore.seedDB.mySeed().incRI(urls.length);
125: yacyCore.seedDB.mySeed().incRU(urls.length);
126: } else {
127: yacyCore.log
128: .logInfo("REMOTE SEARCH - no answer from remote peer "
129: + targetPeer.hash
130: + ":"
131: + targetPeer.getName());
132: }
133: }
134:
135: public static String set2string(Set<String> hashes) {
136: String wh = "";
137: final Iterator<String> iter = hashes.iterator();
138: while (iter.hasNext()) {
139: wh = wh + (String) iter.next();
140: }
141: return wh;
142: }
143:
144: public int links() {
145: return this .urls.length;
146: }
147:
148: public int count() {
149: return this .count;
150: }
151:
152: public yacySeed target() {
153: return targetPeer;
154: }
155:
156: private static yacySeed[] selectClusterPeers(
157: TreeMap<String, String> peerhashes) {
158: Iterator<Map.Entry<String, String>> i = peerhashes.entrySet()
159: .iterator();
160: ArrayList<yacySeed> l = new ArrayList<yacySeed>();
161: Map.Entry<String, String> entry;
162: yacySeed s;
163: while (i.hasNext()) {
164: entry = i.next();
165: s = yacyCore.seedDB.get(entry.getKey()); // should be getConnected; get only during testing time
166: if (s != null) {
167: s.setAlternativeAddress(entry.getValue());
168: l.add(s);
169: }
170: }
171: yacySeed[] result = new yacySeed[l.size()];
172: for (int j = 0; j < l.size(); j++) {
173: result[j] = l.get(j);
174: }
175: return result;
176: //return (yacySeed[]) l.toArray();
177: }
178:
179: private static yacySeed[] selectSearchTargets(
180: Set<String> wordhashes, int seedcount) {
181: // find out a specific number of seeds, that would be relevant for the given word hash(es)
182: // the result is ordered by relevance: [0] is most relevant
183: // the seedcount is the maximum number of wanted results
184: if (yacyCore.seedDB == null) {
185: return null;
186: }
187: if ((seedcount >= yacyCore.seedDB.sizeConnected())
188: || (yacyCore.seedDB.noDHTActivity())) {
189: seedcount = yacyCore.seedDB.sizeConnected();
190: }
191:
192: // put in seeds according to dht
193: final kelondroMScoreCluster<String> ranking = new kelondroMScoreCluster<String>();
194: final HashMap<String, yacySeed> seeds = new HashMap<String, yacySeed>();
195: yacySeed seed;
196: Iterator<yacySeed> dhtEnum;
197: int c;
198: String wordhash;
199: double distance;
200: Iterator<String> iter = wordhashes.iterator();
201: while (iter.hasNext()) {
202: wordhash = iter.next();
203: dhtEnum = yacyCore.dhtAgent.getDHTSeeds(true, wordhash,
204: (float) 0.0);
205: c = seedcount;
206: while (dhtEnum.hasNext() && c > 0) {
207: seed = (yacySeed) dhtEnum.next();
208: if (seed == null)
209: continue;
210: distance = yacyDHTAction.dhtDistance(seed.hash,
211: wordhash);
212: if (distance > 0.2)
213: continue; // catch bug in peer selection
214: if (!seed.getFlagAcceptRemoteIndex())
215: continue; // probably a robinson peer
216: serverLog.logFine("PLASMA", "selectPeers/DHTorder: "
217: + seed.hash + ":" + seed.getName() + "/"
218: + distance + " for wordhash " + wordhash
219: + ", score " + c);
220: ranking.addScore(seed.hash, c--);
221: seeds.put(seed.hash, seed);
222: }
223: }
224:
225: // put in seeds according to size of peer
226: dhtEnum = yacyCore.seedDB.seedsSortedConnected(false,
227: yacySeed.ICOUNT);
228: c = seedcount;
229: int score;
230: if (c > yacyCore.seedDB.sizeConnected()) {
231: c = yacyCore.seedDB.sizeConnected();
232: }
233: while (dhtEnum.hasNext() && c > 0) {
234: seed = dhtEnum.next();
235: if (seed == null)
236: continue;
237: if (!seed.getFlagAcceptRemoteIndex())
238: continue; // probably a robinson peer
239: score = (int) Math.round(Math.random() * ((c / 3) + 3));
240: serverLog.logFine("PLASMA", "selectPeers/RWIcount: "
241: + seed.hash + ":" + seed.getName() + ", RWIcount="
242: + seed.get(yacySeed.ICOUNT, "") + ", score "
243: + score);
244: ranking.addScore(seed.hash, score);
245: seeds.put(seed.hash, seed);
246: c--;
247: }
248:
249: // put in seeds that are public robinson peers and where the peer tags match with query
250: // or seeds that are newbies to ensure that public demonstrations always work
251: dhtEnum = yacyCore.seedDB.seedsConnected(true, false, null,
252: (float) 0.50);
253: while (dhtEnum.hasNext()) {
254: seed = dhtEnum.next();
255: if (seed == null)
256: continue;
257: if (seed.matchPeerTags(wordhashes)) { // access robinson peers with matching tag
258: serverLog.logInfo("PLASMA", "selectPeers/PeerTags: "
259: + seed.hash + ":" + seed.getName()
260: + ", is specialized peer for "
261: + seed.getPeerTags().toString());
262: ranking.addScore(seed.hash, seedcount);
263: seeds.put(seed.hash, seed);
264: }
265: if (seed.getAge() < 1) { // the 'workshop feature'
266: serverLog.logInfo("PLASMA", "selectPeers/Age: "
267: + seed.hash + ":" + seed.getName()
268: + ", is newbie, age = " + seed.getAge());
269: ranking.addScore(seed.hash, seedcount);
270: seeds.put(seed.hash, seed);
271: }
272: }
273:
274: // evaluate the ranking score and select seeds
275: if (ranking.size() < seedcount) {
276: seedcount = ranking.size();
277: }
278: yacySeed[] result = new yacySeed[seedcount];
279: c = 0;
280: iter = ranking.scores(false); // higher are better
281: while (iter.hasNext() && c < result.length) {
282: seed = (yacySeed) seeds.get((String) iter.next());
283: seed.selectscore = c;
284: serverLog.logFine("PLASMA", "selectPeers/_lineup_: "
285: + seed.hash + ":" + seed.getName() + " is choice "
286: + c);
287: result[c++] = seed;
288: }
289:
290: // System.out.println("DEBUG yacySearch.selectPeers = " + seedcount + " seeds:"); for (int i = 0; i < seedcount; i++) System.out.println(" #" + i + ":" + result[i]); // debug
291: return result;
292: }
293:
294: public static yacySearch[] primaryRemoteSearches(String wordhashes,
295: String excludehashes, String urlhashes, String prefer,
296: String filter, int count, int maxDist,
297: plasmaWordIndex wordIndex,
298: plasmaSearchRankingProcess containerCache,
299: Map<String, TreeMap<String, String>> abstractCache,
300: int targets, plasmaURLPattern blacklist,
301: plasmaSearchRankingProfile rankingProfile,
302: kelondroBitfield constraint,
303: TreeMap<String, String> clusterselection) {
304: // check own peer status
305: if (yacyCore.seedDB.mySeed() == null
306: || yacyCore.seedDB.mySeed().getPublicAddress() == null) {
307: return null;
308: }
309:
310: // prepare seed targets and threads
311: final yacySeed[] targetPeers = (clusterselection == null) ? selectSearchTargets(
312: plasmaSearchQuery.hashes2Set(wordhashes), targets)
313: : selectClusterPeers(clusterselection);
314: if (targetPeers == null)
315: return new yacySearch[0];
316: targets = targetPeers.length;
317: if (targets == 0)
318: return new yacySearch[0];
319: yacySearch[] searchThreads = new yacySearch[targets];
320: for (int i = 0; i < targets; i++) {
321: searchThreads[i] = new yacySearch(wordhashes,
322: excludehashes, urlhashes, prefer, filter, count,
323: maxDist, true, targets, targetPeers[i], wordIndex,
324: containerCache, abstractCache, blacklist,
325: rankingProfile, constraint);
326: searchThreads[i].start();
327: //try {Thread.sleep(20);} catch (InterruptedException e) {}
328: }
329: return searchThreads;
330: }
331:
332: public static yacySearch secondaryRemoteSearch(String wordhashes,
333: String excludehashes, String urlhashes,
334: plasmaWordIndex wordIndex,
335: plasmaSearchRankingProcess containerCache,
336: String targethash, plasmaURLPattern blacklist,
337: plasmaSearchRankingProfile rankingProfile,
338: kelondroBitfield constraint,
339: TreeMap<String, String> clusterselection) {
340: // check own peer status
341: if (yacyCore.seedDB.mySeed() == null
342: || yacyCore.seedDB.mySeed().getPublicAddress() == null) {
343: return null;
344: }
345:
346: // prepare seed targets and threads
347: final yacySeed targetPeer = yacyCore.seedDB
348: .getConnected(targethash);
349: if (targetPeer == null)
350: return null;
351: if (clusterselection != null)
352: targetPeer.setAlternativeAddress((String) clusterselection
353: .get(targetPeer.hash));
354: yacySearch searchThread = new yacySearch(wordhashes,
355: excludehashes, urlhashes, "", "", 0, 9999, true, 0,
356: targetPeer, wordIndex, containerCache,
357: new TreeMap<String, TreeMap<String, String>>(),
358: blacklist, rankingProfile, constraint);
359: searchThread.start();
360: return searchThread;
361: }
362:
363: public static int remainingWaiting(yacySearch[] searchThreads) {
364: if (searchThreads == null)
365: return 0;
366: int alive = 0;
367: for (int i = 0; i < searchThreads.length; i++) {
368: if (searchThreads == null)
369: break; // may occur
370: if (searchThreads[i].isAlive())
371: alive++;
372: }
373: return alive;
374: }
375:
376: public static int collectedLinks(yacySearch[] searchThreads) {
377: int links = 0;
378: for (int i = 0; i < searchThreads.length; i++) {
379: if (!(searchThreads[i].isAlive()))
380: links += searchThreads[i].urls.length;
381: }
382: return links;
383: }
384:
385: public static void interruptAlive(yacySearch[] searchThreads) {
386: for (int i = 0; i < searchThreads.length; i++) {
387: if (searchThreads[i].isAlive())
388: searchThreads[i].interrupt();
389: }
390: }
391:
392: }
|