001: package it.unimi.dsi.mg4j.tool;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
025: import it.unimi.dsi.fastutil.ints.IntIterator;
026: import it.unimi.dsi.mg4j.index.Index;
027: import it.unimi.dsi.mg4j.index.IndexIterator;
028: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
029: import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
030: import it.unimi.dsi.io.OutputBitStream;
031: import it.unimi.dsi.Util;
032:
033: import java.io.Closeable;
034: import java.io.IOException;
035: import java.lang.reflect.InvocationTargetException;
036: import java.net.URISyntaxException;
037: import java.util.Map;
038:
039: import org.apache.commons.configuration.ConfigurationException;
040: import org.apache.log4j.Logger;
041:
042: import com.martiansoftware.jsap.JSAPException;
043:
044: /** Merges several indices.
045: *
046: * <P>This class merges indices by performing a simple ordered list merge. Documents
047: * appearing in two indices will cause an error.
048: *
049: * @author Sebastiano Vigna
050: * @since 1.0
051: *
052: */
053:
054: public class Merge extends Combine {
055: @SuppressWarnings("unused")
056: private static final Logger LOGGER = Util.getLogger(Merge.class);
057:
058: /** The reference array of the document queue. */
059: protected int[] doc;
060: /** The queue containing document pointers (for remapped indices). */
061: protected IntHeapSemiIndirectPriorityQueue documentQueue;
062:
063: public Merge(final String outputBasename,
064: final String[] inputBasename, final boolean metadataOnly,
065: final int bufferSize,
066: final Map<Component, Coding> writerFlags,
067: final boolean interleaved, final boolean skips,
068: final int quantum, final int height,
069: final int skipBufferSize, final long logInterval)
070: throws IOException, ConfigurationException,
071: URISyntaxException, ClassNotFoundException,
072: SecurityException, InstantiationException,
073: IllegalAccessException, InvocationTargetException,
074: NoSuchMethodException {
075: super (outputBasename, inputBasename, metadataOnly, bufferSize,
076: writerFlags, interleaved, skips, quantum, height,
077: skipBufferSize, logInterval);
078:
079: doc = new int[numIndices];
080: documentQueue = new IntHeapSemiIndirectPriorityQueue(doc,
081: numIndices);
082: }
083:
084: protected int combineNumberOfDocuments() {
085: int n = 0;
086: for (int i = 0; i < numIndices; i++)
087: n = Math.max(n, index[i].numberOfDocuments);
088: return n;
089: }
090:
091: protected int combineSizes() throws IOException {
092: int currDoc = 0, maxDocSize = 0;
093: for (int i = 0; i < numIndices; i++) {
094: final IntIterator sizes = sizes(i);
095:
096: currDoc = 0;
097: int j = index[i].numberOfDocuments;
098: int s;
099: while (j-- != 0) {
100: if ((s = sizes.nextInt()) != 0) {
101: if (size[currDoc] != 0)
102: throw new IllegalArgumentException("Document "
103: + currDoc
104: + " has nonzero length in two indices");
105: size[currDoc] = s;
106: if (s > maxDocSize)
107: maxDocSize = s;
108: }
109: currDoc++;
110: }
111: if (sizes instanceof Closeable)
112: ((Closeable) sizes).close();
113: }
114: return maxDocSize;
115: }
116:
117: protected int combine(final int numUsedIndices) throws IOException {
118: // We gather the frequencies from the subindices and just add up. At the same time, we load the document queue.
119: int totalFrequency = 0, currIndex, lastIndex = -1;
120: for (int k = numUsedIndices; k-- != 0;) {
121: currIndex = usedIndex[k];
122: totalFrequency += (frequency[currIndex] = indexIterator[currIndex]
123: .frequency());
124: doc[currIndex] = indexIterator[currIndex].nextDocument();
125: documentQueue.enqueue(currIndex);
126: }
127:
128: indexWriter.newInvertedList();
129: indexWriter.writeFrequency(totalFrequency);
130:
131: int currDoc = -1, count;
132: OutputBitStream obs;
133: Index i;
134: IndexIterator ir;
135:
136: while (!documentQueue.isEmpty()) {
137: // We extract the smallest document pointer, and enqueue it in the new index.
138: if (currDoc == doc[currIndex = documentQueue.first()])
139: throw new IllegalStateException(
140: "The indices to be merged contain document "
141: + currDoc
142: + " at least twice (once in index "
143: + inputBasename[lastIndex]
144: + " and once in index "
145: + inputBasename[currIndex] + ")");
146: currDoc = doc[currIndex];
147:
148: obs = indexWriter.newDocumentRecord();
149: indexWriter.writeDocumentPointer(obs, currDoc);
150: i = index[currIndex];
151: ir = indexIterator[currIndex];
152:
153: if (i.hasPayloads)
154: indexWriter.writePayload(obs, ir.payload());
155:
156: if (i.hasCounts) {
157: count = ir.count();
158: if (hasCounts)
159: indexWriter.writePositionCount(obs, count);
160: if (i.hasPositions && hasPositions)
161: indexWriter.writeDocumentPositions(obs, ir
162: .positionArray(), 0, count, size[currDoc]);
163: }
164:
165: // If we just wrote the last document pointer of this term in index j, we dequeue it.
166: if (--frequency[currIndex] == 0)
167: documentQueue.dequeue();
168: else {
169: doc[currIndex] = ir.nextDocument();
170: documentQueue.changed();
171: }
172: lastIndex = currIndex;
173: }
174:
175: return totalFrequency;
176: }
177:
178: public static void main(String arg[])
179: throws ConfigurationException, SecurityException,
180: JSAPException, IOException, URISyntaxException,
181: ClassNotFoundException, InstantiationException,
182: IllegalAccessException, InvocationTargetException,
183: NoSuchMethodException {
184: Combine.main(arg, Merge.class);
185: }
186: }
|