Source Code Cross Referenced for Combine.java in » Search-Engine » mg4j » it » unimi » dsi » mg4j » tool » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » mg4j » it.unimi.dsi.mg4j.tool
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        package it.unimi.dsi.mg4j.tool;
002:
003:        /*		 
004:         * MG4J: Managing Gigabytes for Java
005:         *
006:         * Copyright (C) 2005-2007 Sebastiano Vigna 
007:         *
008:         *  This library is free software; you can redistribute it and/or modify it
009:         *  under the terms of the GNU Lesser General Public License as published by the Free
010:         *  Software Foundation; either version 2.1 of the License, or (at your option)
011:         *  any later version.
012:         *
013:         *  This library is distributed in the hope that it will be useful, but
014:         *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015:         *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
016:         *  for more details.
017:         *
018:         *  You should have received a copy of the GNU Lesser General Public License
019:         *  along with this program; if not, write to the Free Software
020:         *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021:         *
022:         */
023:
024:        import it.unimi.dsi.fastutil.ints.AbstractIntIterator;
025:        import it.unimi.dsi.fastutil.ints.IntIterator;
026:        import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;
027:        import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;
028:        import it.unimi.dsi.mg4j.index.BitStreamIndex;
029:        import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
030:        import it.unimi.dsi.mg4j.index.CompressionFlags;
031:        import it.unimi.dsi.mg4j.index.DiskBasedIndex;
032:        import it.unimi.dsi.mg4j.index.Index;
033:        import it.unimi.dsi.mg4j.index.IndexIterator;
034:        import it.unimi.dsi.mg4j.index.IndexReader;
035:        import it.unimi.dsi.mg4j.index.IndexWriter;
036:        import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;
037:        import it.unimi.dsi.mg4j.index.TermProcessor;
038:        import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
039:        import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
040:        import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
041:        import it.unimi.dsi.mg4j.index.payload.Payload;
042:        import it.unimi.dsi.io.FastBufferedReader;
043:        import it.unimi.dsi.io.InputBitStream;
044:        import it.unimi.dsi.io.OutputBitStream;
045:        import it.unimi.dsi.Util;
046:        import it.unimi.dsi.lang.MutableString;
047:        import it.unimi.dsi.lang.ObjectParser;
048:        import it.unimi.dsi.logging.ProgressLogger;
049:        import it.unimi.dsi.util.Properties;
050:
051:        import java.io.BufferedWriter;
052:        import java.io.Closeable;
053:        import java.io.File;
054:        import java.io.FileInputStream;
055:        import java.io.FileNotFoundException;
056:        import java.io.FileOutputStream;
057:        import java.io.IOException;
058:        import java.io.InputStreamReader;
059:        import java.io.OutputStreamWriter;
060:        import java.io.PrintStream;
061:        import java.io.PrintWriter;
062:        import java.lang.reflect.InvocationTargetException;
063:        import java.net.URISyntaxException;
064:        import java.util.Arrays;
065:        import java.util.Map;
066:
067:        import org.apache.commons.configuration.ConfigurationException;
068:        import org.apache.commons.configuration.ConfigurationMap;
069:        import org.apache.log4j.Logger;
070:
071:        import com.martiansoftware.jsap.FlaggedOption;
072:        import com.martiansoftware.jsap.JSAP;
073:        import com.martiansoftware.jsap.JSAPException;
074:        import com.martiansoftware.jsap.JSAPResult;
075:        import com.martiansoftware.jsap.Parameter;
076:        import com.martiansoftware.jsap.SimpleJSAP;
077:        import com.martiansoftware.jsap.Switch;
078:        import com.martiansoftware.jsap.UnflaggedOption;
079:        import com.martiansoftware.jsap.stringparsers.FileStringParser;
080:
081:        /** Combines several indices.
082:         * 
083:         * <p>Indices may be combined in several different ways. This abstract class
084:         * contains code that is common to classes such as {@link it.unimi.dsi.mg4j.tool.Merge}
085:         * or {@link it.unimi.dsi.mg4j.tool.Concatenate}: essentially, command line parsing,
086:         * index opening, and term list fusion are taken care of. Then, the template method
087:         * {@link #combine(int)} must write into {@link #indexWriter} the combined inverted
088:         * list, returning the resulting frequency.
089:         * 
090:         * <p>Note that by combining a single index into a new one you can recompress an index
091:         * with different compression parameters (which includes the possibility of eliminating
092:         * positions or counts).
093:         * 
094:         * <p>The subclasses of this class must implement {@link #combine(int)} so that indices
095:         * with different sets of features are combined keeping the largest set of features requested
096:         * by the user. For instance, combining an index with positions and an index with counts, but
097:         * no positions, should generate an index with counts but no positions. 
098:         *
099:         * <p><strong>Warning</strong>: a combination requires opening <em>three</em> files per input index,
100:         * plus a few more files for the output index. If the combination process is interrupted by
101:         * an exception claiming that there are too many open files, check how to increase the
102:         * number of files you can open (usually, for instance on UN*X, there is a global and a per-process limit,
103:         * so be sure to set both).
104:         * 
105:         * <h2>Read-once indices, readers, and distributed index combination</h2>
106:         * 
107:         * <p>If the {@linkplain it.unimi.dsi.mg4j.index.Index indices} and 
108:         * {@linkplain it.unimi.dsi.mg4j.index.BitStreamIndexReader bitstream index readers} involved in the
109:         * combination are <em>read-once</em> (i.e., opening an index and reading once its contents sequentially
110:         * causes each file composing the index to be read exactly once) 
111:         * <em>then also {@link it.unimi.dsi.mg4j.tool.Combine} implementations should be read-once</em> ({@link it.unimi.dsi.mg4j.tool.Concatenate},
112:         * {@link it.unimi.dsi.mg4j.tool.Merge} and {@link it.unimi.dsi.mg4j.tool.Paste} are).
113:         * 
114:         * <p>This means, in particular, that index combination can be performed from <em>pipes</em>, which in
115:         * turn can be filled, for instance, with data coming from the network. In other words, albeit this
116:         * class is theoretically based on a number of indices existing on a local disk, those indices can be
117:         * substituted with suitable pipes filled with remote data without affecting the combination process.
118:         * For instance, the following <samp>bash</samp> code creates three sets of pipes:
119:         * <pre style="margin: 1em 0">
120:         * for i in 0 1 2; do
121:         *   for e in frequencies globcounts index offsets properties sizes terms; do 
122:         *     mkfifo pipe$i.$e
123:         *   done
124:         * done
125:         * </pre> 
126:         * 
127:         * <p>Each pipe should be then filled with suitable data, for instance obtained from the net (assuming
128:         * you have indices <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp> on <samp>example.com</samp>):
129:         * <pre style="margin: 1em 0">
130:         * for i in 0 1 2; do 
131:         *   for e in frequencies globcounts index offsets properties sizes terms; do 
132:         *     (ssh -x example.com cat index$i.$e >pipe$i.$e &)
133:         *   done
134:         * done
135:         * </pre> 
136:         * <p>Now all pipes will be filled with data from the corresponding remote files, and
137:         * combining the indices <samp>pipe0</samp>, <samp>pipe1</samp> and <samp>pipe2</samp>
138:         * will give the same result as combining <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp>
139:         * on the remote system.
140:         * 
141:         * @author Sebastiano Vigna
142:         * @since 1.0
143:         */
144:
145:        public abstract class Combine {
146:            private static final Logger LOGGER = Util.getLogger(Combine.class);
147:            private final static boolean DEBUG = false;
148:
149:            /** The default buffer size. */
150:            public static final int DEFAULT_BUFFER_SIZE = 1024 * 1024;
151:
152:            /** The number of indices to be merged. */
153:            final protected int numIndices;
154:            /** The array of indices to be merged. */
155:            final protected BitStreamIndex[] index;
156:            /** An array of index readers parallel to {@link #index}. */
157:            final protected IndexReader[] indexReader;
158:            /** An array of index iterators parallel to {@link #index} (filled by concrete implementations). */
159:            final protected IndexIterator[] indexIterator;
160:            /** An array of input bit streams, returning the global counts for each index. */
161:            private final InputBitStream[] globCounts;
162:            /** Whether to output global counts. */
163:            private boolean writeGlobCounts;
164:            /** Whether to output sizes. */
165:            private boolean writeSizes;
166:            /** Compute only index metadata (sizes, terms and globcounts). */
167:            private final boolean metadataOnly;
168:            /** An array of mutable strings, containing the last term read for a given index. */
169:            private MutableString[] term;
170:            /** An array of fast buffered readers, used to read the terms of each index. */
171:            private FastBufferedReader[] termReader;
172:            /** The queue containing terms. */
173:            protected ObjectHeapSemiIndirectPriorityQueue<MutableString> termQueue;
174:            /** The overall number of documents. */
175:            protected final int numberOfDocuments;
176:            /** The overall number of occurrences. */
177:            protected long numberOfOccurrences;
178:            /** The maximum count in the merged index. */
179:            protected int maxCount;
180:            /** The array of input basenames. */
181:            protected final String[] inputBasename;
182:            /** The output basename. */
183:            private final String outputBasename;
184:            /** The size of I/O buffers. */
185:            private final int bufferSize;
186:            /** The logging interval. */
187:            private final long logInterval;
188:            /** The index writer for the merged index. */
189:            protected IndexWriter indexWriter;
190:            /** Whether {@link #indexWriter} has counts. */
191:            protected final boolean hasCounts;
192:            /** Whether {@link #indexWriter} has positions. */
193:            protected final boolean hasPositions;
194:            /** Whether {@link #indexWriter} has payloads. */
195:            protected final boolean hasPayloads;
196:            /** Additional properties for the merged index. */
197:            private Properties additionalProperties;
198:            /** An array partially filled with the indices (as offsets in {@link #index}) participating to the merge process for the current term. */
199:            protected int[] usedIndex;
200:            /** For each index, the frequency of the current term (given that it is present). */
201:            final protected int[] frequency;
202:            /** A cache for positions. */
203:            protected int[] position;
204:            /** The size of each document. */
205:            protected int[] size;
206:
207:            public Combine(final String outputBasename,
208:                    final String[] inputBasename, final boolean metadataOnly,
209:                    final int bufferSize,
210:                    final Map<Component, Coding> writerFlags,
211:                    boolean interleaved, final boolean skips,
212:                    final int quantum, final int height,
213:                    final int skipBufferSize, final long logInterval)
214:                    throws IOException, ConfigurationException,
215:                    URISyntaxException, ClassNotFoundException,
216:                    SecurityException, InstantiationException,
217:                    IllegalAccessException, InvocationTargetException,
218:                    NoSuchMethodException {
219:
220:                this .logInterval = logInterval;
221:
222:                LOGGER.debug("Combining indices "
223:                        + Arrays.toString(inputBasename) + " into "
224:                        + outputBasename);
225:
226:                this .inputBasename = inputBasename;
227:                this .outputBasename = outputBasename;
228:                this .metadataOnly = metadataOnly;
229:                this .bufferSize = bufferSize;
230:
231:                numIndices = inputBasename.length;
232:                index = new BitStreamIndex[numIndices];
233:                indexReader = new IndexReader[numIndices];
234:                indexIterator = new IndexIterator[numIndices];
235:                globCounts = new InputBitStream[numIndices];
236:                term = new MutableString[numIndices];
237:                termReader = new FastBufferedReader[numIndices];
238:                termQueue = new ObjectHeapSemiIndirectPriorityQueue<MutableString>(
239:                        term, numIndices);
240:
241:                // This will remain set if *all* indices to be merged agree
242:                boolean haveCounts = true, havePositions = true;
243:                /* This will be set if *all* indices to be merged agree. Moreover, if some
244:                 * indices disagree we will emit a warning. */
245:                TermProcessor termProcessor = null;
246:                /* This will be set if *all* indices to be merged agree. Moreover, if some
247:                 * indices disagree we will emit a warning. */
248:                Payload payload = null;
249:                String field = null;
250:                writeGlobCounts = writeSizes = true;
251:                boolean someGlobCounts = false, someSizes = false;
252:
253:                for (int i = 0; i < numIndices; i++) {
254:                    index[i] = getIndex(inputBasename[i]);
255:                    if (i == 0) {
256:                        termProcessor = index[0].termProcessor.copy();
257:                        payload = index[0].payload == null ? null
258:                                : index[0].payload.copy();
259:                    } else {
260:                        if (!termProcessor.equals(index[i].termProcessor))
261:                            throw new IllegalStateException(
262:                                    "The term processor of the first index ("
263:                                            + termProcessor
264:                                            + ") is different from the term processor of index "
265:                                            + i + " (" + index[i].termProcessor
266:                                            + ")");
267:                        if ((payload == null) != (index[i].payload == null)
268:                                || payload != null
269:                                && !payload.compatibleWith(index[i].payload))
270:                            throw new IllegalStateException(
271:                                    "The payload specification of index "
272:                                            + index[0]
273:                                            + " is not compatible with that of index "
274:                                            + index[i]);
275:                    }
276:
277:                    if (index[i].field != null) {
278:                        if (field == null) {
279:                            if (i != 0)
280:                                LOGGER
281:                                        .warn("Not all indices specify the field property");
282:                            field = index[i].field;
283:                        } else if (!field.equals(index[i].field))
284:                            LOGGER.warn("Index fields disagree: \"" + field
285:                                    + "\", \"" + index[i].field + "\"");
286:                    }
287:
288:                    haveCounts &= index[i].hasCounts;
289:                    havePositions &= index[i].hasPositions;
290:                    maxCount = Math.max(maxCount, index[i].maxCount);
291:                    if (!metadataOnly)
292:                        indexReader[i] = index[i].getReader(bufferSize);
293:                    if (index[i].properties
294:                            .getLong(Index.PropertyKeys.OCCURRENCES) == -1)
295:                        numberOfOccurrences = -1;
296:                    if (numberOfOccurrences != -1)
297:                        numberOfOccurrences += index[i].properties
298:                                .getLong(Index.PropertyKeys.OCCURRENCES);
299:                    final File globCountsFile = new File(inputBasename[i]
300:                            + DiskBasedIndex.GLOBCOUNTS_EXTENSION);
301:                    writeGlobCounts &= globCountsFile.exists();
302:                    someGlobCounts |= globCountsFile.exists();
303:                    if (writeGlobCounts)
304:                        globCounts[i] = new InputBitStream(globCountsFile);
305:
306:                    final File sizesFile = new File(inputBasename[i]
307:                            + DiskBasedIndex.SIZES_EXTENSION);
308:                    writeSizes &= sizesFile.exists();
309:                    someSizes |= sizesFile.exists();
310:
311:                    term[i] = new MutableString();
312:                    termReader[i] = new FastBufferedReader(
313:                            new InputStreamReader(new FileInputStream(
314:                                    inputBasename[i]
315:                                            + DiskBasedIndex.TERMS_EXTENSION),
316:                                    "UTF-8"));
317:                    if (termReader[i].readLine(term[i]) != null)
318:                        termQueue.enqueue(i); // If the term list is nonempty, we enqueue it
319:                }
320:
321:                if (writeGlobCounts != someGlobCounts)
322:                    LOGGER
323:                            .warn("Some (but not all) global-counts file missing");
324:                if (writeSizes != someSizes)
325:                    LOGGER.warn("Some (but not all) sizes file missing");
326:
327:                additionalProperties = new Properties();
328:                additionalProperties.setProperty(
329:                        Index.PropertyKeys.TERMPROCESSOR, ObjectParser
330:                                .toSpec(termProcessor));
331:                if (payload != null) {
332:                    additionalProperties.setProperty(
333:                            Index.PropertyKeys.PAYLOADCLASS, payload.getClass()
334:                                    .getName());
335:                    //writerFlags.put( Component.PAYLOADS, null );
336:                }
337:                additionalProperties.setProperty(Index.PropertyKeys.BATCHES,
338:                        inputBasename.length);
339:                if (field != null)
340:                    additionalProperties.setProperty(Index.PropertyKeys.FIELD,
341:                            field);
342:
343:                usedIndex = new int[numIndices];
344:                frequency = new int[numIndices];
345:                position = new int[maxCount];
346:                numberOfDocuments = combineNumberOfDocuments();
347:
348:                if ((hasCounts = writerFlags.containsKey(Component.COUNTS))
349:                        && !haveCounts)
350:                    throw new IllegalArgumentException(
351:                            "Some of the indices to be combined do not have counts.");
352:                if ((hasPositions = writerFlags
353:                        .containsKey(Component.POSITIONS))
354:                        && !havePositions)
355:                    throw new IllegalArgumentException(
356:                            "Some of the indices to be combined do not have positions.");
357:                if ((hasPayloads = writerFlags.containsKey(Component.PAYLOADS))
358:                        && payload == null)
359:                    throw new IllegalArgumentException(
360:                            "Indices to be combined do not have payloads.");
361:
362:                interleaved |= !hasPositions || hasPayloads;
363:
364:                if (!metadataOnly) {
365:                    if (!interleaved)
366:                        indexWriter = new BitStreamHPIndexWriter(
367:                                outputBasename, numberOfDocuments, true,
368:                                skipBufferSize, writerFlags, quantum, height);
369:                    else if (!skips)
370:                        indexWriter = new BitStreamIndexWriter(outputBasename,
371:                                numberOfDocuments, true, writerFlags);
372:                    else
373:                        indexWriter = new SkipBitStreamIndexWriter(
374:                                outputBasename, numberOfDocuments, true,
375:                                skipBufferSize, writerFlags, quantum, height);
376:                    //else indexWriter = new SqrtSkipIndexWriter( outputBasename, numberOfDocuments, true, writerFlags );
377:                }
378:            }
379:
380:            /** Return a index with given basename, loaded with options suitable to perform the combination.
381:             * 
382:             * <p>This basic implementation calls {@link it.unimi.dsi.mg4j.index.Index#getInstance(CharSequence, boolean, boolean)}
383:             * with all Boolean parameters set to false. Subclasses can override this
384:             * method to load more data.
385:             * 
386:             * @param basename an index basename.
387:             * @return an index loaded with the correct options for the combining strategy.
388:             */
389:            protected BitStreamIndex getIndex(final CharSequence basename)
390:                    throws ConfigurationException, IOException,
391:                    URISyntaxException, ClassNotFoundException,
392:                    SecurityException, InstantiationException,
393:                    IllegalAccessException, InvocationTargetException,
394:                    NoSuchMethodException {
395:                return (BitStreamIndex) Index.getInstance(basename, false,
396:                        false, false);
397:            }
398:
399:            /** Combines the number of documents.
400:             * 
401:             * @return the number of documents of the combined index.
402:             */
403:            protected abstract int combineNumberOfDocuments();
404:
405:            /** A partial {@link IntIterator} implementation based on &gamma;-coded integers.
406:             * 
407:             * <p>Instances of this class adapt an {@link InputBitStream} to an {@link IntIterator}
408:             * by reading &gamma;-coded integers. The implementation is partial because {@link #hasNext()}
409:             * always returns true&mdash;the user must know in advance how many times {@link #nextInt()}
410:             * may be safely called. 
411:             * 
412:             * @see #sizes(int)
413:             */
414:            protected static final class GammaCodedIntIterator extends
415:                    AbstractIntIterator implements  Closeable {
416:                final private InputBitStream inputBitStream;
417:
418:                public GammaCodedIntIterator(final InputBitStream inputBitStream) {
419:                    this .inputBitStream = inputBitStream;
420:                }
421:
422:                /** Returns true.
423:                 * @return true
424:                 */
425:                public boolean hasNext() {
426:                    return true;
427:                }
428:
429:                /** Returns the next &gamma;-coded integer in the underlying {@link InputBitStream}. 
430:                 * @return the result of {@link InputBitStream#readGamma()}.
431:                 */
432:                public int nextInt() {
433:                    try {
434:                        return inputBitStream.readGamma();
435:                    } catch (IOException e) {
436:                        throw new RuntimeException(e);
437:                    }
438:                }
439:
440:                /** Delegates to the underlying {@link InputBitStream}. */
441:                public void close() throws IOException {
442:                    inputBitStream.close();
443:                }
444:            }
445:
446:            /** Returns an iterator on sizes.
447:             * 
448:             * <p>The purpose of this method is to provide {@link #combineSizes()} implementations with
449:             * a way to access the size list from a disk file or from {@link BitStreamIndex#sizes} transparently.
450:             * This mechanism is essential to ensure that size files are read exactly once.
451:             * 
452:             * <p>The caller should check whether the returned object implements {@link Closeable},
453:             * and, in this case, invoke {@link Closeable#close()} after usage.
454:             *
455:             * @param numIndex the number of an index.
456:             * @return an iterator on the sizes of the index.
457:             */
458:
459:            protected IntIterator sizes(int numIndex)
460:                    throws FileNotFoundException {
461:                if (index[numIndex].sizes != null)
462:                    return index[numIndex].sizes.listIterator();
463:                LOGGER.debug("Reading sizes from " + inputBasename[numIndex]
464:                        + DiskBasedIndex.SIZES_EXTENSION);
465:                return new GammaCodedIntIterator(new InputBitStream(
466:                        inputBasename[numIndex]
467:                                + DiskBasedIndex.SIZES_EXTENSION));
468:            }
469:
470:            /** Combines size lists.
471:             * 
472:             * @return the maximum size of a document in the combined index.
473:             * @throws IOException
474:             */
475:            protected abstract int combineSizes() throws IOException;
476:
477:            /** Combines several indices.
478:             * 
479:             * <p>When this method is called, exactly <code>numUsedIndices</code> entries
480:             * of {@link #usedIndex} contain, in increasing order, the indices containing
481:             * inverted lists for the current term. Implementations of this method must
482:             * combine the inverted list, save the total global count for the current
483:             * term and return the resulting frequency.
484:             * 
485:             * @param numUsedIndices the number of valid entries in {@link #usedIndex}.
486:             * @return the frequency of the combined lists.
487:             */
488:
489:            protected abstract int combine(int numUsedIndices)
490:                    throws IOException;
491:
492:            public void run() throws ConfigurationException, IOException {
493:                final Logger logger = Util.getLogger(this .getClass());
494:                final ProgressLogger pl = new ProgressLogger(logger,
495:                        logInterval);
496:
497:                final int maxDocSize;
498:                long totalSize = 0;
499:
500:                if (writeSizes) {
501:                    size = new int[numberOfDocuments];
502:                    logger.info("Combining sizes...");
503:
504:                    maxDocSize = combineSizes();
505:
506:                    final OutputBitStream outputSizes = new OutputBitStream(
507:                            outputBasename + DiskBasedIndex.SIZES_EXTENSION,
508:                            bufferSize);
509:                    for (int i = 0; i < numberOfDocuments; i++) {
510:                        totalSize += size[i];
511:                        outputSizes.writeGamma(size[i]);
512:                    }
513:                    outputSizes.close();
514:
515:                    logger.info("Sizes combined.");
516:                } else
517:                    maxDocSize = -1;
518:
519:                // To write the global count of each term
520:                final OutputBitStream outputGlobCounts = writeGlobCounts ? new OutputBitStream(
521:                        outputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION)
522:                        : null;
523:                // To write the frequency of each term
524:                final OutputBitStream frequencies = metadataOnly ? null
525:                        : new OutputBitStream(outputBasename
526:                                + DiskBasedIndex.FREQUENCIES_EXTENSION);
527:                // To write the new term list
528:                final PrintWriter termFile = new PrintWriter(
529:                        new BufferedWriter(new OutputStreamWriter(
530:                                new FileOutputStream(outputBasename
531:                                        + DiskBasedIndex.TERMS_EXTENSION),
532:                                "UTF-8"), bufferSize));
533:
534:                // The current term
535:                MutableString currTerm;
536:
537:                // Total number of pointers and occurrences
538:                long numPointers = 0;
539:
540:                pl.expectedUpdates = writeGlobCounts ? numberOfOccurrences : -1;
541:                pl.itemsName = "occurrences";
542:                pl.logInterval = logInterval;
543:                pl.start("Combining lists...");
544:
545:                int totalFrequency, numTerms = 0, numUsedIndices, k;
546:                long totalGlobCount = 0;
547:
548:                // TODO: use the front of the queue?
549:                while (!termQueue.isEmpty()) {
550:                    numUsedIndices = 0;
551:                    // We read a new word from the queue, copy it and write it to the term file
552:                    currTerm = term[k = usedIndex[numUsedIndices++] = termQueue
553:                            .first()].copy();
554:
555:                    if (DEBUG)
556:                        System.err.println("Merging term " + currTerm);
557:
558:                    currTerm.println(termFile);
559:                    if (termReader[k].readLine(term[k]) == null)
560:                        termQueue.dequeue();
561:                    else
562:                        termQueue.changed();
563:
564:                    // Then, we extract all equal words from the queue, accumulating the set of indices in inIndex and currIndex
565:                    while (!termQueue.isEmpty()
566:                            && term[termQueue.first()].equals(currTerm)) {
567:                        k = usedIndex[numUsedIndices++] = termQueue.first();
568:                        if (termReader[k].readLine(term[k]) == null)
569:                            termQueue.dequeue();
570:                        else
571:                            termQueue.changed();
572:                    }
573:
574:                    if (numUsedIndices > 1)
575:                        Arrays.sort(usedIndex, 0, numUsedIndices);
576:
577:                    // Load index iterators
578:                    for (int i = numUsedIndices; i-- != 0;)
579:                        indexIterator[usedIndex[i]] = indexReader[usedIndex[i]]
580:                                .nextIterator();
581:
582:                    numTerms++;
583:
584:                    if (writeGlobCounts) {
585:                        // Compute and write the total global count. This works for all kind of indices.
586:                        totalGlobCount = 0;
587:                        for (int i = 0; i < numUsedIndices; i++)
588:                            totalGlobCount += globCounts[usedIndex[i]]
589:                                    .readGamma();
590:                        outputGlobCounts.writeLongGamma(totalGlobCount);
591:                    }
592:
593:                    if (!metadataOnly) {
594:                        totalFrequency = combine(numUsedIndices);
595:                        frequencies.writeGamma(totalFrequency);
596:                        numPointers += totalFrequency;
597:                    }
598:
599:                    /* A trick to get a correct prediction. */
600:                    if (writeGlobCounts)
601:                        pl.count += totalGlobCount - 1;
602:                    pl.update();
603:                }
604:                pl.done();
605:
606:                if (writeGlobCounts)
607:                    outputGlobCounts.close();
608:                termFile.close();
609:
610:                if (!metadataOnly) {
611:                    frequencies.close();
612:                    for (int i = numIndices; i-- != 0;) {
613:                        indexReader[i].close();
614:                        if (writeGlobCounts)
615:                            globCounts[i].close();
616:                        termReader[i].close();
617:                    }
618:                    final long indexSize = indexWriter.writtenBits();
619:                    indexWriter.close();
620:                    final Properties properties = indexWriter.properties();
621:                    additionalProperties.setProperty(Index.PropertyKeys.SIZE,
622:                            indexSize);
623:                    additionalProperties.setProperty(
624:                            Index.PropertyKeys.MAXDOCSIZE, maxDocSize);
625:                    additionalProperties
626:                            .setProperty(Index.PropertyKeys.OCCURRENCES,
627:                                    numberOfOccurrences);
628:                    properties.addAll(additionalProperties);
629:                    logger.debug("Post-merge properties: "
630:                            + new ConfigurationMap(properties));
631:                    properties.save(outputBasename
632:                            + DiskBasedIndex.PROPERTIES_EXTENSION);
633:                }
634:
635:                final PrintStream stats = new PrintStream(new FileOutputStream(
636:                        outputBasename + DiskBasedIndex.STATS_EXTENSION));
637:                if (writeSizes)
638:                    stats.println("Average document size: "
639:                            + Util.format((double) totalSize
640:                                    / numberOfDocuments));
641:                if (!metadataOnly)
642:                    indexWriter.printStats(stats);
643:                stats.close();
644:            }
645:
646:            public static void main(final String[] arg) throws JSAPException,
647:                    ConfigurationException, IOException, URISyntaxException,
648:                    ClassNotFoundException, SecurityException,
649:                    InstantiationException, IllegalAccessException,
650:                    InvocationTargetException, NoSuchMethodException {
651:                main(arg, null);
652:            }
653:
654:            public static void main(final String[] arg,
655:                    final Class<? extends Combine> combineClass)
656:                    throws JSAPException, ConfigurationException, IOException,
657:                    URISyntaxException, ClassNotFoundException,
658:                    SecurityException, InstantiationException,
659:                    IllegalAccessException, InvocationTargetException,
660:                    NoSuchMethodException {
661:
662:                SimpleJSAP jsap = new SimpleJSAP(
663:                        Combine.class.getName(),
664:                        "Combines several indices. By default, documents are concatenated, but you can also merge or paste them by choosing the suitable options, or invoking the corresponding subclass instead of "
665:                                + Combine.class.getName()
666:                                + ". Note that by combining a single input index you can recompress an index with new parameters.",
667:                        new Parameter[] {
668:                                new FlaggedOption(
669:                                        "bufferSize",
670:                                        JSAP.INTSIZE_PARSER,
671:                                        Util
672:                                                .formatBinarySize(DEFAULT_BUFFER_SIZE),
673:                                        JSAP.NOT_REQUIRED, 'b', "buffer-size",
674:                                        "The size of an I/O buffer."),
675:                                new FlaggedOption("comp", JSAP.STRING_PARSER,
676:                                        JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
677:                                        'c', "comp",
678:                                        "A compression flag for the index (may be specified several times).")
679:                                        .setAllowMultipleDeclarations(true),
680:                                new Switch(
681:                                        "skips",
682:                                        JSAP.NO_SHORTFLAG,
683:                                        "skips",
684:                                        "Requires skips (which however are present by default, unless you required an interleaved index)."),
685:                                new Switch("interleaved", JSAP.NO_SHORTFLAG,
686:                                        "interleaved",
687:                                        "Forces an interleaved index."),
688:                                new FlaggedOption("quantum",
689:                                        JSAP.INTSIZE_PARSER, "64",
690:                                        JSAP.NOT_REQUIRED, 'Q', "quantum",
691:                                        "The skip quantum."),
692:                                new FlaggedOption("height",
693:                                        JSAP.INTSIZE_PARSER, "8",
694:                                        JSAP.NOT_REQUIRED, 'H', "height",
695:                                        "The skip height."),
696:                                new Switch("metadataOnly", 'o',
697:                                        "metadata-only",
698:                                        "Combines only metadata (sizes, terms and globcounts)."),
699:                                new Switch("merge", 'm', "merge",
700:                                        "Merges indices (duplicates cause an error)."),
701:                                new Switch("duplicates", 'd', "duplicates",
702:                                        "Pastes indices, concatenating the document positions for duplicates."),
703:                                new Switch(
704:                                        "properties",
705:                                        'p',
706:                                        "properties",
707:                                        "The only specified inputBasename will be used to load a property file written by the scanning process."),
708:                                new FlaggedOption("tempFileDir",
709:                                        FileStringParser.getParser(),
710:                                        JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
711:                                        JSAP.NO_SHORTFLAG, "temp-file-dir",
712:                                        "The directory for the temporary file used during pasting."),
713:                                new FlaggedOption(
714:                                        "tempFileBufferSize",
715:                                        JSAP.INTSIZE_PARSER,
716:                                        Util
717:                                                .formatBinarySize(Paste.DEFAULT_MEMORY_BUFFER_SIZE),
718:                                        JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG,
719:                                        "temp-file-buffer-size",
720:                                        "The size of the buffer for the temporary file during pasting."),
721:                                new FlaggedOption(
722:                                        "skipBufferSize",
723:                                        JSAP.INTSIZE_PARSER,
724:                                        Util
725:                                                .formatBinarySize(SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE),
726:                                        JSAP.NOT_REQUIRED,
727:                                        JSAP.NO_SHORTFLAG,
728:                                        "skip-buffer-size",
729:                                        "The size of the internal temporary buffer used while creating an index with skips."),
730:                                new FlaggedOption(
731:                                        "logInterval",
732:                                        JSAP.LONG_PARSER,
733:                                        Long
734:                                                .toString(ProgressLogger.DEFAULT_LOG_INTERVAL),
735:                                        JSAP.NOT_REQUIRED, 'l', "log-interval",
736:                                        "The minimum time interval between activity logs in milliseconds."),
737:                                new UnflaggedOption("outputBasename",
738:                                        JSAP.STRING_PARSER, JSAP.REQUIRED,
739:                                        "The basename of the resulting index."),
740:                                new UnflaggedOption("inputBasename",
741:                                        JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
742:                                        JSAP.REQUIRED, JSAP.GREEDY,
743:                                        "The basenames of the indices to be merged.") });
744:
745:                JSAPResult jsapResult = jsap.parse(arg);
746:                if (jsap.messagePrinted())
747:                    return;
748:
749:                final boolean skips = jsapResult.getBoolean("skips");
750:                final boolean interleaved = jsapResult
751:                        .getBoolean("interleaved");
752:                if (interleaved
753:                        && !skips
754:                        && (jsapResult.userSpecified("quantum") || jsapResult
755:                                .userSpecified("height"))) {
756:                    System.err
757:                            .println("You specified quantum or height, but did not turn on skips.");
758:                    return;
759:                }
760:
761:                if (combineClass != null
762:                        && jsapResult.userSpecified("duplicates")
763:                        || jsapResult.userSpecified("merge"))
764:                    throw new IllegalArgumentException("When invoking "
765:                            + Combine.class.getName() + " from "
766:                            + combineClass.getName()
767:                            + " you cannot choose the combination process");
768:
769:                final String[] inputBasename;
770:                if (jsapResult.getBoolean("properties")) {
771:                    if (jsapResult.getStringArray("inputBasename").length > 1)
772:                        throw new IllegalArgumentException(
773:                                "When using --properties, you must specify exactly one inputBasename");
774:                    inputBasename = new Properties(jsapResult
775:                            .getStringArray("inputBasename")[0]
776:                            + Scan.CLUSTER_PROPERTIES_EXTENSION)
777:                            .getStringArray(IndexCluster.PropertyKeys.LOCALINDEX);
778:                } else
779:                    inputBasename = jsapResult.getStringArray("inputBasename");
780:                // TODO: resolve problem of passing default flag values without knowing type of index
781:                (combineClass == Paste.class
782:                        || jsapResult.getBoolean("duplicates") ? (Combine) new Paste(
783:                        jsapResult.getString("outputBasename"), inputBasename,
784:                        jsapResult.getBoolean("metadataOnly"), jsapResult
785:                                .getInt("bufferSize"), jsapResult
786:                                .getFile("tempFileDir"), jsapResult
787:                                .getInt("tempFileBufferSize"),
788:                        CompressionFlags.valueOf(jsapResult
789:                                .getStringArray("comp"),
790:                                CompressionFlags.DEFAULT_STANDARD_INDEX),
791:                        interleaved, skips, jsapResult.getInt("quantum"),
792:                        jsapResult.getInt("height"), jsapResult
793:                                .getInt("skipBufferSize"), jsapResult
794:                                .getLong("logInterval"))
795:                        : combineClass == Merge.class
796:                                || jsapResult.getBoolean("merge") ? (Combine) new Merge(
797:                                jsapResult.getString("outputBasename"),
798:                                inputBasename,
799:                                jsapResult.getBoolean("metadataOnly"),
800:                                jsapResult.getInt("bufferSize"),
801:                                CompressionFlags
802:                                        .valueOf(
803:                                                jsapResult
804:                                                        .getStringArray("comp"),
805:                                                CompressionFlags.DEFAULT_STANDARD_INDEX),
806:                                interleaved, skips, jsapResult
807:                                        .getInt("quantum"), jsapResult
808:                                        .getInt("height"), jsapResult
809:                                        .getInt("skipBufferSize"), jsapResult
810:                                        .getLong("logInterval"))
811:                                : (Combine) new Concatenate(
812:                                        jsapResult.getString("outputBasename"),
813:                                        inputBasename,
814:                                        jsapResult.getBoolean("metadataOnly"),
815:                                        jsapResult.getInt("bufferSize"),
816:                                        CompressionFlags
817:                                                .valueOf(
818:                                                        jsapResult
819:                                                                .getStringArray("comp"),
820:                                                        CompressionFlags.DEFAULT_STANDARD_INDEX),
821:                                        interleaved, skips, jsapResult
822:                                                .getInt("quantum"), jsapResult
823:                                                .getInt("height"), jsapResult
824:                                                .getInt("skipBufferSize"),
825:                                        jsapResult.getLong("logInterval"))
826:
827:                ).run();
828:            }
829:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.