Source Code Cross Referenced for PartitionDocumentally.java in  » Search-Engine » mg4j » it » unimi » dsi » mg4j » tool » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Sevlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » mg4j » it.unimi.dsi.mg4j.tool 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        package it.unimi.dsi.mg4j.tool;
002:
003:        /*		 
004:         * MG4J: Managing Gigabytes for Java
005:         *
006:         * Copyright (C) 2006-2007 Sebastiano Vigna 
007:         *
008:         *  This library is free software; you can redistribute it and/or modify it
009:         *  under the terms of the GNU Lesser General Public License as published by the Free
010:         *  Software Foundation; either version 2.1 of the License, or (at your option)
011:         *  any later version.
012:         *
013:         *  This library is distributed in the hope that it will be useful, but
014:         *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015:         *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
016:         *  for more details.
017:         *
018:         *  You should have received a copy of the GNU Lesser General Public License
019:         *  along with this program; if not, write to the Free Software
020:         *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021:         *
022:         */
023:
024:        import it.unimi.dsi.fastutil.ints.IntList;
025:        import it.unimi.dsi.fastutil.io.BinIO;
026:        import it.unimi.dsi.mg4j.index.BitStreamIndex;
027:        import it.unimi.dsi.mg4j.index.CachingOutputBitStream;
028:        import it.unimi.dsi.mg4j.index.CompressionFlags;
029:        import it.unimi.dsi.mg4j.index.DiskBasedIndex;
030:        import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;
031:        import it.unimi.dsi.mg4j.index.Index;
032:        import it.unimi.dsi.mg4j.index.IndexIterator;
033:        import it.unimi.dsi.mg4j.index.IndexReader;
034:        import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
035:        import it.unimi.dsi.mg4j.index.IndexWriter;
036:        import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;
037:        import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
038:        import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
039:        import it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy;
040:        import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;
041:        import it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster;
042:        import it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster;
043:        import it.unimi.dsi.mg4j.index.cluster.DocumentalPartitioningStrategy;
044:        import it.unimi.dsi.mg4j.index.cluster.DocumentalStrategies;
045:        import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
046:        import it.unimi.dsi.mg4j.index.payload.Payload;
047:        import it.unimi.dsi.io.FastBufferedReader;
048:        import it.unimi.dsi.io.InputBitStream;
049:        import it.unimi.dsi.io.OutputBitStream;
050:        import it.unimi.dsi.Util;
051:        import it.unimi.dsi.lang.MutableString;
052:        import it.unimi.dsi.logging.ProgressLogger;
053:        import it.unimi.dsi.sux4j.util.ShiftAddXorSignedStringMap;
054:        import it.unimi.dsi.util.BloomFilter;
055:        import it.unimi.dsi.util.ImmutableExternalPrefixMap;
056:        import it.unimi.dsi.util.PrefixMap;
057:        import it.unimi.dsi.util.Properties;
058:        import it.unimi.dsi.util.StringMap;
059:
060:        import java.io.BufferedWriter;
061:        import java.io.File;
062:        import java.io.FileInputStream;
063:        import java.io.FileOutputStream;
064:        import java.io.IOException;
065:        import java.io.InputStreamReader;
066:        import java.io.OutputStreamWriter;
067:        import java.io.PrintWriter;
068:        import java.net.URISyntaxException;
069:        import java.util.Map;
070:
071:        import org.apache.commons.configuration.ConfigurationException;
072:        import org.apache.commons.configuration.ConfigurationMap;
073:        import org.apache.log4j.Logger;
074:
075:        import com.martiansoftware.jsap.FlaggedOption;
076:        import com.martiansoftware.jsap.JSAP;
077:        import com.martiansoftware.jsap.JSAPResult;
078:        import com.martiansoftware.jsap.Parameter;
079:        import com.martiansoftware.jsap.SimpleJSAP;
080:        import com.martiansoftware.jsap.Switch;
081:        import com.martiansoftware.jsap.UnflaggedOption;
082:
083:        /** Partitions an index documentally.
084:         *
085:         * <p>A global index is partitioned documentally by providing a {@link DocumentalPartitioningStrategy}
086:         * that specifies a destination local index for each document, and a local document pointer. The global index
087:         * is scanned, and the postings are partitioned among the local indices using the provided strategy. For instance,
088:         * a {@link ContiguousDocumentalStrategy} divides an index into blocks of contiguous documents.
089:         * 
090:         * <p>Since each local index contains a (proper) subset of the original set of documents, it contains in general a (proper)
091:         * subset of the terms in the global index. Thus, the local term numbers and the global term numbers will not in general coincide.
092:         * As a result, when a set of local indices is accessed transparently as a single index
093:         * using a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalCluster}, 
094:         * a call to {@link it.unimi.dsi.mg4j.index.Index#documents(int)} will throw an {@link java.lang.UnsupportedOperationException},
095:         * because there is no way to map the global term numbers to local term numbers.
096:         * 
097:         * <p>On the other hand, a call to {@link it.unimi.dsi.mg4j.index.Index#documents(CharSequence)} will be passed each local index to
098:         * build a global iterator. To speed up this phase for not-so-frequent terms, when partitioning an index you can require
099:         * the construction of {@linkplain BloomFilter Bloom filters} that will be used to try to avoid
100:         * inquiring indices that do not contain a term. The precision of the filters is settable.
101:         *
102:         * <p>The property file will use a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster} unless you provide
103:         * a {@link ContiguousDocumentalStrategy}, in which case a 
104:         * {@link it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster} will be used instead. Note that there might
105:         * be other cases in which the latter is appropriate, in which case you can manually edit the property file.
106:         * 
107:         * <strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps} 
108:         * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g.,
109:         * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}).
110:         * 
111:         * <h2>Write-once output and distributed index partitioning</h2>
112:         * 
113:         * Please see {@link it.unimi.dsi.mg4j.tool.PartitionLexically}&mdash;the same comments apply.
114:         * 
115:         * @author Alessandro Arrabito
116:         * @author Sebastiano Vigna
117:         * 
118:         * @since 1.0.1
119:         */
120:
121:        public class PartitionDocumentally {
122:            private final static Logger LOGGER = Util
123:                    .getLogger(PartitionDocumentally.class);
124:
125:            /**  The default buffer size for all involved indices. */
126:            public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;
127:
128:            /** The number of local indices. */
129:            private final int numIndices;
130:            /** The output basename. */
131:            private final String outputBasename;
132:            /** The array of local output basenames. */
133:            private final String[] localBasename;
134:            /** The input basename. */
135:            private final String inputBasename;
136:            /** The properties of the input index. */
137:            private final Properties inputProperties;
138:            /** The size of I/O buffers. */
139:            private final int bufferSize;
140:            /** The filename of the strategy used to partition the index. */
141:            private final String strategyFilename;
142:            /** The strategy used to perform the partitioning. */
143:            private final DocumentalPartitioningStrategy strategy;
144:            /** The additional local properties of each local index. */
145:            private final Properties[] strategyProperties;
146:            /** The logging interval. */
147:            private final long logInterval;
148:            /** The global index to be partitioned. */
149:            private final BitStreamIndex globalIndex;
150:            /** A reader on {@link #globalIndex}. */
151:            private final IndexReader indexReader;
152:            /** A reader for the terms of the global index. */
153:            private final FastBufferedReader terms;
154:            /** An index writer for each local index. */
155:            private final IndexWriter[] indexWriter;
156:            /** Whether each {@link #indexWriter} has counts. */
157:            private final boolean haveCounts;
158:            /** Whether each {@link #indexWriter} has positions. */
159:            private final boolean havePositions;
160:            /** Whether each {@link #indexWriter} has payloads. */
161:            private final boolean havePayloads;
162:            /** A bit output stream for global counts of each local index. */
163:            private final OutputBitStream[] localGlobCounts;
164:            /** A bit output stream for the frequencies of each local index. */
165:            private final OutputBitStream[] localFrequencies;
166:            /** A print writer for the terms of each local index. */
167:            private final PrintWriter[] localTerms;
168:            /** The maximum size of a document in each local index. */
169:            private final int[] maxDocSize;
170:            /** The maximum number of positions in each local index. */
171:            private final int[] maxDocPos;
172:            /** The number of terms in each local index. */
173:            private final int[] numTerms;
174:            /** The number of postings in each local index. */
175:            private final long[] numPostings;
176:            /** The number of occurrences in each local index. */
177:            private final long[] numOccurrences;
178:            /** The global count for each local index. */
179:            private final long[] globCount;
180:            /** The required precision for Bloom filters (0 means no filter). */
181:            private final int bloomFilterPrecision;
182:
183:            public PartitionDocumentally(final String inputBasename,
184:                    final String outputBasename,
185:                    final DocumentalPartitioningStrategy strategy,
186:                    final String strategyFilename,
187:                    final int bloomFilterPrecision, final int bufferSize,
188:                    final Map<Component, Coding> writerFlags,
189:                    boolean interleaved, final boolean skips,
190:                    final int quantum, final int height,
191:                    final int skipBufferSize, final long logInterval)
192:                    throws ConfigurationException, IOException,
193:                    ClassNotFoundException, SecurityException,
194:                    InstantiationException, IllegalAccessException {
                    /* Prepares a documental partitioning run.
                     *
                     * The constructor stores its arguments, opens the global index found at
                     * inputBasename, derives one local basename per local index
                     * (outputBasename + "-" + i), allocates the per-index statistics arrays,
                     * and opens, for each local index, an index writer plus the output
                     * streams for frequencies, terms and (when counts are requested)
                     * global counts. Finally it opens a reader on the global term file.
                     *
                     * Throws IllegalArgumentException if writerFlags requests payloads,
                     * counts or positions that the global index does not contain.
                     */
195:
196:                this .inputBasename = inputBasename;
197:                this .outputBasename = outputBasename;
198:                this .strategy = strategy;
199:                this .strategyFilename = strategyFilename;
200:                this .strategyProperties = strategy.properties();
201:                this .bufferSize = bufferSize;
202:                this .logInterval = logInterval;
203:                this .bloomFilterPrecision = bloomFilterPrecision;
204:
                    // The strategy dictates how many local indices will be produced.
205:                numIndices = strategy.numberOfLocalIndices();
206:
207:                final Coding positionCoding = writerFlags
208:                        .get(Component.POSITIONS);
209:
                    // Load the properties of the input index and open the index itself.
                    // NOTE(review): the fourth argument is true only when positions are to be
                    // written with Golomb or interpolative coding; presumably it asks for the
                    // document sizes to be loaded (run() reads globalIndex.sizes) -- confirm
                    // against DiskBasedIndex.getInstance().
210:                inputProperties = new Properties(inputBasename
211:                        + DiskBasedIndex.PROPERTIES_EXTENSION);
212:                globalIndex = DiskBasedIndex.getInstance(inputBasename,
213:                        inputProperties, false, positionCoding == Coding.GOLOMB
214:                                || positionCoding == Coding.INTERPOLATIVE,
215:                        false, null);
216:                indexReader = globalIndex.getReader();
217:
                    // Local index i is stored under the basename outputBasename + "-" + i.
218:                localBasename = new String[numIndices];
219:                for (int i = 0; i < numIndices; i++)
220:                    localBasename[i] = outputBasename + "-" + i;
221:
                    // Per-local-index output streams and statistics, all indexed by local index.
222:                localGlobCounts = new OutputBitStream[numIndices];
223:                localFrequencies = new OutputBitStream[numIndices];
224:                localTerms = new PrintWriter[numIndices];
225:                maxDocSize = new int[numIndices];
226:                maxDocPos = new int[numIndices];
227:                numTerms = new int[numIndices];
228:                globCount = new long[numIndices];
229:                numOccurrences = new long[numIndices];
230:                numPostings = new long[numIndices];
231:                indexWriter = new IndexWriter[numIndices];
232:
                    // Each final flag is assigned inside its own condition, so it is set
                    // before the corresponding consistency check can throw: a component may
                    // be requested for the output only if the global index contains it.
233:                if ((havePayloads = writerFlags.containsKey(Component.PAYLOADS))
234:                        && !globalIndex.hasPayloads)
235:                    throw new IllegalArgumentException(
236:                            "You requested payloads, but the global index does not contain them.");
237:                if ((haveCounts = writerFlags.containsKey(Component.COUNTS))
238:                        && !globalIndex.hasCounts)
239:                    throw new IllegalArgumentException(
240:                            "You requested counts, but the global index does not contain them.");
241:                if ((havePositions = writerFlags
242:                        .containsKey(Component.POSITIONS))
243:                        && !globalIndex.hasPositions)
244:                    throw new IllegalArgumentException(
245:                            "You requested positions, but the global index does not contain them.");
246:
                    // Force the interleaved format whenever positions are not requested or
                    // payloads are requested; the non-interleaved writer below is therefore
                    // used only for indices with positions and without payloads.
247:                interleaved |= !havePositions || havePayloads;
248:
249:                for (int i = 0; i < numIndices; i++) {
250:                    String name = localBasename[i];
                        // Writer selection: non-interleaved -> BitStreamHPIndexWriter;
                        // interleaved without skips -> BitStreamIndexWriter;
                        // interleaved with skips -> SkipBitStreamIndexWriter.
                        // Every writer is sized with the number of documents the strategy
                        // assigns to local index i.
251:                    if (!interleaved)
252:                        indexWriter[i] = new BitStreamHPIndexWriter(
253:                                localBasename[i],
254:                                strategy.numberOfDocuments(i), true,
255:                                skipBufferSize, writerFlags, quantum, height);
256:                    else if (!skips)
257:                        indexWriter[i] = new BitStreamIndexWriter(
258:                                localBasename[i],
259:                                strategy.numberOfDocuments(i), true,
260:                                writerFlags);
261:                    else
262:                        indexWriter[i] = new SkipBitStreamIndexWriter(
263:                                localBasename[i],
264:                                strategy.numberOfDocuments(i), true,
265:                                skipBufferSize, writerFlags, quantum, height);
266:
                        // The global-counts stream exists only when counts are written;
                        // otherwise localGlobCounts[i] stays null (run() checks for that).
267:                    if (haveCounts)
268:                        localGlobCounts[i] = new OutputBitStream(name
269:                                + DiskBasedIndex.GLOBCOUNTS_EXTENSION);
270:                    localFrequencies[i] = new OutputBitStream(name
271:                            + DiskBasedIndex.FREQUENCIES_EXTENSION);
                        // Local term lists are written as UTF-8 text.
272:                    localTerms[i] = new PrintWriter(new BufferedWriter(
273:                            new OutputStreamWriter(new FileOutputStream(
274:                                    localBasename[i]
275:                                            + DiskBasedIndex.TERMS_EXTENSION),
276:                                    "UTF-8")));
277:                }
278:
                    // Reader on the global term list (UTF-8), consumed one line per term by run().
279:                terms = new FastBufferedReader(new InputStreamReader(
280:                        new FileInputStream(inputBasename
281:                                + DiskBasedIndex.TERMS_EXTENSION), "UTF-8"));
282:            }
283:
284:            private void partitionSizes() throws IOException {
                    /* Splits the document-size file of the global index among the local indices.
                     *
                     * If the file inputBasename + SIZES_EXTENSION exists, one gamma-coded size
                     * is read for each global document and written to the sizes stream of the
                     * local index chosen by strategy.localIndex(); the largest size seen for
                     * each local index is recorded in maxDocSize. If the file does not exist,
                     * the method does nothing.
                     *
                     * Throws IOException if reading or writing any of the streams fails.
                     */
285:                final File sizesFile = new File(inputBasename
286:                        + DiskBasedIndex.SIZES_EXTENSION);
287:                if (sizesFile.exists()) {
288:                    LOGGER.info("Partitioning sizes...");
289:                    final InputBitStream sizes = new InputBitStream(sizesFile);
                        // One output sizes stream per local index.
290:                    final OutputBitStream localSizes[] = new OutputBitStream[numIndices];
291:                    for (int i = 0; i < numIndices; i++)
292:                        localSizes[i] = new OutputBitStream(localBasename[i]
293:                                + DiskBasedIndex.SIZES_EXTENSION);
294:
295:                    // ALERT: for the time being, we decide whether to "fill the gaps" in sizes using as sole indicator the equality between global and local number of documents.
296:                    int size, localIndex;
                        // Only local index 0 is compared with the global document count.
297:                    if (globalIndex.numberOfDocuments == strategy
298:                            .numberOfDocuments(0)) {
                            // Gap-filling branch: besides writing the real size to the owning
                            // local index, a zero size is written to every other local index,
                            // so each local sizes file gets one entry per global document.
299:                        for (int i = 0; i < globalIndex.numberOfDocuments; i++) {
300:                            localSizes[localIndex = strategy.localIndex(i)]
301:                                    .writeGamma(size = sizes.readGamma());
302:                            if (maxDocSize[localIndex] < size)
303:                                maxDocSize[localIndex] = size;
304:                            for (int l = numIndices; l-- != 0;)
305:                                if (l != localIndex)
306:                                    localSizes[l].writeGamma(0);
307:                        }
308:                    } else {
                            // Plain branch: each size goes only to the local index owning the document.
309:                        for (int i = 0; i < globalIndex.numberOfDocuments; i++) {
310:                            localSizes[localIndex = strategy.localIndex(i)]
311:                                    .writeGamma(size = sizes.readGamma());
312:                            if (maxDocSize[localIndex] < size)
313:                                maxDocSize[localIndex] = size;
314:                        }
315:                    }
316:
                        // NOTE(review): the streams are closed only on the normal path; an
                        // exception thrown while copying leaves them open.
317:                    sizes.close();
318:                    for (int i = 0; i < numIndices; i++)
319:                        localSizes[i].close();
320:                }
321:            }
322:
323:            public void run() throws Exception {
324:                final ProgressLogger pl = new ProgressLogger(LOGGER,
325:                        logInterval);
326:                final IntList sizeList = globalIndex.sizes;
327:                partitionSizes();
328:
329:                final int[] position = new int[globalIndex.maxCount];
330:                final int[] localFrequency = new int[numIndices];
331:                final int[] usedIndex = new int[numIndices];
332:                final InputBitStream[] direct = new InputBitStream[numIndices];
333:                final InputBitStream[] indirect = new InputBitStream[numIndices];
334:                final BloomFilter[] bloomFilter = bloomFilterPrecision != 0 ? new BloomFilter[numIndices]
335:                        : null;
336:                final File[] tempFile = new File[numIndices];
337:                final CachingOutputBitStream[] temp = new CachingOutputBitStream[numIndices];
338:                IndexIterator indexIterator;
339:
340:                for (int i = 0; i < numIndices; i++) {
341:                    tempFile[i] = new File(localBasename[i] + ".temp");
342:                    temp[i] = new CachingOutputBitStream(tempFile[i],
343:                            bufferSize);
344:                    direct[i] = new InputBitStream(temp[i].buffer());
345:                    indirect[i] = new InputBitStream(tempFile[i]);
346:                    if (bloomFilterPrecision != 0)
347:                        bloomFilter[i] = new BloomFilter(
348:                                globalIndex.numberOfTerms, bloomFilterPrecision);
349:                }
350:                int usedIndices;
351:                MutableString currentTerm = new MutableString();
352:                Payload payload = null;
353:                int frequency, globalPointer, localIndex, localPointer, count = -1;
354:
355:                pl.expectedUpdates = globalIndex.numberOfPostings;
356:                pl.itemsName = "postings";
357:                pl.logInterval = logInterval;
358:                pl.start("Partitioning index...");
359:
360:                for (int t = 0; t < globalIndex.numberOfTerms; t++) {
361:                    terms.readLine(currentTerm);
362:                    indexIterator = indexReader.nextIterator();
363:                    usedIndices = 0;
364:                    frequency = indexIterator.frequency();
365:
366:                    for (int j = 0; j < frequency; j++) {
367:                        globalPointer = indexIterator.nextDocument();
368:                        localIndex = strategy.localIndex(globalPointer);
369:
370:                        if (localFrequency[localIndex] == 0) {
371:                            // First time we see a document for this index.
372:                            currentTerm.println(localTerms[localIndex]);
373:                            numTerms[localIndex]++;
374:                            usedIndex[usedIndices++] = localIndex;
375:                            if (bloomFilterPrecision != 0)
376:                                bloomFilter[localIndex].add(currentTerm);
377:                        }
378:
379:                        /* Store temporarily posting data; note that we save the global pointer as we
380:                         * will have to access the size list. */
381:
382:                        localFrequency[localIndex]++;
383:                        numPostings[localIndex]++;
384:                        temp[localIndex].writeGamma(globalPointer);
385:
386:                        if (globalIndex.hasPayloads)
387:                            payload = indexIterator.payload();
388:                        if (havePayloads)
389:                            payload.write(temp[localIndex]);
390:
391:                        if (haveCounts) {
392:                            count = indexIterator.count();
393:                            temp[localIndex].writeGamma(count);
394:                            globCount[localIndex] += count;
395:                            if (maxDocPos[localIndex] < count)
396:                                maxDocPos[localIndex] = count;
397:                            if (havePositions) {
398:                                final int[] pos = indexIterator.positionArray();
399:                                // TODO: compress this stuff
400:                                for (int p = 0; p < count; p++)
401:                                    temp[localIndex].writeGamma(pos[p]);
402:                            }
403:                        }
404:                    }
405:
406:                    // We now run through the indices used by this term and copy from the temporary buffer.
407:
408:                    OutputBitStream obs;
409:
410:                    for (int k = 0; k < usedIndices; k++) {
411:                        final int i = usedIndex[k];
412:
413:                        localFrequencies[i].writeGamma(localFrequency[i]);
414:                        if (haveCounts)
415:                            numOccurrences[i] += globCount[i];
416:                        if (localGlobCounts[i] != null)
417:                            localGlobCounts[i].writeLongGamma(globCount[i]);
418:                        globCount[i] = 0;
419:
420:                        InputBitStream ibs;
421:                        indexWriter[i].newInvertedList();
422:
423:                        temp[i].align();
424:                        if (temp[i].buffer() != null)
425:                            ibs = direct[i];
426:                        else {
427:                            // We cannot read directly from the internal buffer.
428:                            ibs = indirect[i];
429:                            ibs.flush();
430:                            temp[i].flush();
431:                        }
432:
433:                        ibs.position(0);
434:
435:                        indexWriter[i].writeFrequency(localFrequency[i]);
436:                        for (int j = 0; j < localFrequency[i]; j++) {
437:                            obs = indexWriter[i].newDocumentRecord();
438:                            globalPointer = ibs.readGamma();
439:                            localPointer = strategy.localPointer(globalPointer);
440:                            indexWriter[i].writeDocumentPointer(obs,
441:                                    localPointer);
442:                            if (havePayloads) {
443:                                payload.read(ibs);
444:                                indexWriter[i].writePayload(obs, payload);
445:                            }
446:                            if (haveCounts)
447:                                indexWriter[i].writePositionCount(obs,
448:                                        count = ibs.readGamma());
449:                            if (havePositions) {
450:                                for (int p = 0; p < count; p++)
451:                                    position[p] = ibs.readGamma();
452:                                indexWriter[i].writeDocumentPositions(obs,
453:                                        position, 0, count,
454:                                        sizeList != null ? sizeList
455:                                                .getInt(globalPointer) : -1);
456:                            }
457:
458:                        }
459:                        temp[i].position(0);
460:                        temp[i].writtenBits(0);
461:                        localFrequency[i] = 0;
462:                    }
463:
464:                    usedIndices = 0;
465:                    pl.count += frequency - 1;
466:                    pl.update();
467:                }
468:
469:                pl.done();
470:
471:                Properties globalProperties = new Properties();
472:                globalProperties.setProperty(Index.PropertyKeys.FIELD,
473:                        inputProperties.getProperty(Index.PropertyKeys.FIELD));
474:                globalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
475:                        inputProperties
476:                                .getProperty(Index.PropertyKeys.TERMPROCESSOR));
477:
478:                for (int i = 0; i < numIndices; i++) {
479:                    localFrequencies[i].close();
480:                    if (localGlobCounts[i] != null)
481:                        localGlobCounts[i].close();
482:                    localTerms[i].close();
483:                    indexWriter[i].close();
484:                    if (bloomFilterPrecision != 0)
485:                        BinIO.storeObject(bloomFilter[i], localBasename[i]
486:                                + DocumentalCluster.BLOOM_EXTENSION);
487:                    temp[i].close();
488:                    tempFile[i].delete();
489:
490:                    Properties localProperties = indexWriter[i].properties();
491:                    localProperties.addAll(globalProperties);
492:                    localProperties.setProperty(Index.PropertyKeys.MAXCOUNT,
493:                            String.valueOf(maxDocPos[i]));
494:                    localProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
495:                            maxDocSize[i]);
496:                    localProperties.setProperty(Index.PropertyKeys.FIELD,
497:                            globalProperties
498:                                    .getProperty(Index.PropertyKeys.FIELD));
499:                    localProperties.setProperty(Index.PropertyKeys.OCCURRENCES,
500:                            haveCounts ? numOccurrences[i] : -1);
501:                    localProperties.setProperty(Index.PropertyKeys.POSTINGS,
502:                            numPostings[i]);
503:                    localProperties.setProperty(Index.PropertyKeys.TERMS,
504:                            numTerms[i]);
505:                    if (havePayloads)
506:                        localProperties.setProperty(
507:                                Index.PropertyKeys.PAYLOADCLASS, payload
508:                                        .getClass().getName());
509:                    if (strategyProperties[i] != null)
510:                        localProperties.addAll(strategyProperties[i]);
511:                    localProperties.save(localBasename[i]
512:                            + DiskBasedIndex.PROPERTIES_EXTENSION);
513:                }
514:
515:                if (strategyFilename != null)
516:                    globalProperties.setProperty(
517:                            IndexCluster.PropertyKeys.STRATEGY,
518:                            strategyFilename);
519:                for (int i = 0; i < numIndices; i++)
520:                    globalProperties.addProperty(
521:                            IndexCluster.PropertyKeys.LOCALINDEX,
522:                            localBasename[i]);
523:                globalProperties.setProperty(
524:                        DocumentalCluster.PropertyKeys.BLOOM,
525:                        bloomFilterPrecision != 0);
526:                // If we partition an index with a single term, by definition we have a flat cluster
527:                globalProperties.setProperty(
528:                        DocumentalCluster.PropertyKeys.FLAT, inputProperties
529:                                .getInt(Index.PropertyKeys.TERMS) <= 1);
530:                globalProperties.setProperty(Index.PropertyKeys.MAXCOUNT,
531:                        inputProperties
532:                                .getProperty(Index.PropertyKeys.MAXCOUNT));
533:                globalProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
534:                        inputProperties
535:                                .getProperty(Index.PropertyKeys.MAXDOCSIZE));
536:                globalProperties.setProperty(Index.PropertyKeys.POSTINGS,
537:                        inputProperties
538:                                .getProperty(Index.PropertyKeys.POSTINGS));
539:                globalProperties.setProperty(Index.PropertyKeys.OCCURRENCES,
540:                        inputProperties
541:                                .getProperty(Index.PropertyKeys.OCCURRENCES));
542:                globalProperties.setProperty(Index.PropertyKeys.DOCUMENTS,
543:                        inputProperties
544:                                .getProperty(Index.PropertyKeys.DOCUMENTS));
545:                globalProperties.setProperty(Index.PropertyKeys.TERMS,
546:                        inputProperties.getProperty(Index.PropertyKeys.TERMS));
547:                if (havePayloads)
548:                    globalProperties.setProperty(
549:                            Index.PropertyKeys.PAYLOADCLASS, payload.getClass()
550:                                    .getName());
551:
552:                /* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
553:                 * strategy we can optimise a bit. */
554:
555:                globalProperties
556:                        .setProperty(
557:                                Index.PropertyKeys.INDEXCLASS,
558:                                strategy instanceof  ContiguousDocumentalStrategy ? DocumentalConcatenatedCluster.class
559:                                        .getName()
560:                                        : DocumentalMergedCluster.class
561:                                                .getName());
562:
563:                globalProperties.save(outputBasename
564:                        + DiskBasedIndex.PROPERTIES_EXTENSION);
565:                LOGGER.debug("Properties for clustered index " + outputBasename
566:                        + ": " + new ConfigurationMap(globalProperties));
567:
568:            }
569:
570:            public static void main(final String arg[])
571:                    throws ConfigurationException, IOException,
572:                    URISyntaxException, ClassNotFoundException, Exception {
573:
574:                SimpleJSAP jsap = new SimpleJSAP(
575:                        PartitionDocumentally.class.getName(),
576:                        "Partitions an index documentally.",
577:                        new Parameter[] {
578:                                new FlaggedOption(
579:                                        "bufferSize",
580:                                        JSAP.INTSIZE_PARSER,
581:                                        Util
582:                                                .formatBinarySize(DEFAULT_BUFFER_SIZE),
583:                                        JSAP.NOT_REQUIRED, 'b', "buffer-size",
584:                                        "The size of an I/O buffer."),
585:                                new FlaggedOption(
586:                                        "logInterval",
587:                                        JSAP.LONG_PARSER,
588:                                        Long
589:                                                .toString(ProgressLogger.DEFAULT_LOG_INTERVAL),
590:                                        JSAP.NOT_REQUIRED, 'l', "log-interval",
591:                                        "The minimum time interval between activity logs in milliseconds."),
592:                                new FlaggedOption("strategy",
593:                                        JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
594:                                        JSAP.NOT_REQUIRED, 's', "strategy",
595:                                        "A serialised documental partitioning strategy."),
596:                                new FlaggedOption("uniformStrategy",
597:                                        JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
598:                                        JSAP.NOT_REQUIRED, 'u', "uniform",
599:                                        "Requires a uniform partitioning in the given number of parts."),
600:                                new FlaggedOption("bloom", JSAP.INTEGER_PARSER,
601:                                        "0", JSAP.NOT_REQUIRED, 'B', "bloom",
602:                                        "Generates Bloom filters with given precision."),
603:                                new FlaggedOption("comp", JSAP.STRING_PARSER,
604:                                        JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
605:                                        'c', "comp",
606:                                        "A compression flag for the index (may be specified several times).")
607:                                        .setAllowMultipleDeclarations(true),
608:                                new Switch(
609:                                        "skips",
610:                                        JSAP.NO_SHORTFLAG,
611:                                        "skips",
612:                                        "Requires skips (which however are present by default, unless you required an interleaved index)."),
613:                                new Switch("interleaved", JSAP.NO_SHORTFLAG,
614:                                        "interleaved",
615:                                        "Forces an interleaved index."),
616:                                new FlaggedOption("quantum",
617:                                        JSAP.INTSIZE_PARSER, "64",
618:                                        JSAP.NOT_REQUIRED, 'Q', "quantum",
619:                                        "The skip quantum."),
620:                                new FlaggedOption("height",
621:                                        JSAP.INTSIZE_PARSER, "8",
622:                                        JSAP.NOT_REQUIRED, 'H', "height",
623:                                        "The skip height."),
624:                                new FlaggedOption(
625:                                        "skipBufferSize",
626:                                        JSAP.INTSIZE_PARSER,
627:                                        Util
628:                                                .formatBinarySize(SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE),
629:                                        JSAP.NOT_REQUIRED,
630:                                        JSAP.NO_SHORTFLAG,
631:                                        "skip-buffer-size",
632:                                        "The size of the internal temporary buffer used while creating an index with skips."),
633:                                new UnflaggedOption("inputBasename",
634:                                        JSAP.STRING_PARSER, JSAP.REQUIRED,
635:                                        "The basename of the global index."),
636:                                new UnflaggedOption("outputBasename",
637:                                        JSAP.STRING_PARSER, JSAP.REQUIRED,
638:                                        "The basename of the local indices.") });
639:
640:                JSAPResult jsapResult = jsap.parse(arg);
641:                if (jsap.messagePrinted())
642:                    return;
643:                String inputBasename = jsapResult.getString("inputBasename");
644:                String outputBasename = jsapResult.getString("outputBasename");
645:                String strategyFilename = jsapResult.getString("strategy");
646:                DocumentalPartitioningStrategy strategy = null;
647:
648:                if (jsapResult.userSpecified("uniformStrategy")) {
649:                    strategy = DocumentalStrategies.uniform(jsapResult
650:                            .getInt("uniformStrategy"), Index
651:                            .getInstance(inputBasename).numberOfDocuments);
652:                    BinIO.storeObject(strategy,
653:                            strategyFilename = outputBasename
654:                                    + IndexCluster.STRATEGY_DEFAULT_EXTENSION);
655:                } else if (strategyFilename != null)
656:                    strategy = (DocumentalPartitioningStrategy) BinIO
657:                            .loadObject(strategyFilename);
658:                else
659:                    throw new IllegalArgumentException(
660:                            "You must specify a partitioning strategy");
661:
662:                final boolean skips = jsapResult.getBoolean("skips");
663:                final boolean interleaved = jsapResult
664:                        .getBoolean("interleaved");
665:                if (interleaved
666:                        && !skips
667:                        && (jsapResult.userSpecified("quantum") || jsapResult
668:                                .userSpecified("height"))) {
669:                    System.err
670:                            .println("You specified quantum or height, but did not turn on skips.");
671:                    return;
672:                }
673:
674:                new PartitionDocumentally(inputBasename, outputBasename,
675:                        strategy, strategyFilename, jsapResult.getInt("bloom"),
676:                        jsapResult.getInt("bufferSize"),
677:                        CompressionFlags.valueOf(jsapResult
678:                                .getStringArray("comp"),
679:                                CompressionFlags.DEFAULT_STANDARD_INDEX),
680:                        interleaved, skips, jsapResult.getInt("quantum"),
681:                        jsapResult.getInt("height"), jsapResult
682:                                .getInt("skipBufferSize"), jsapResult
683:                                .getLong("logInterval")).run();
684:            }
685:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.