001: package it.unimi.dsi.mg4j.index.cluster;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntList;
025: import it.unimi.dsi.fastutil.io.BinIO;
026: import it.unimi.dsi.mg4j.index.DiskBasedIndex;
027: import it.unimi.dsi.mg4j.index.Index;
028: import it.unimi.dsi.mg4j.index.TermProcessor;
029: import it.unimi.dsi.mg4j.index.payload.Payload;
030: import it.unimi.dsi.io.InputBitStream;
031: import it.unimi.dsi.mg4j.search.score.BM25Scorer;
032: import it.unimi.dsi.Util;
033: import it.unimi.dsi.util.BloomFilter;
034: import it.unimi.dsi.util.Properties;
035:
036: import java.io.IOException;
037: import java.lang.reflect.InvocationTargetException;
038: import java.net.URISyntaxException;
039: import java.util.EnumMap;
040:
041: import org.apache.commons.configuration.ConfigurationException;
042: import org.apache.log4j.Logger;
043:
044: /** An abstract index cluster. An index cluster is an index
045: * exposing transparently a list of <em>local indices</em> as a single
046: * <em>global index</em>. A cluster usually is generated by
047: * partitioning an index {@linkplain it.unimi.dsi.mg4j.tool.PartitionLexically lexically}
048: * or {@linkplain it.unimi.dsi.mg4j.tool.PartitionDocumentally documentally}, but nothing
049: * prevents the creation of hand-made clusters.
050: *
051: * <p>Note that, upon creation of an instance, the main index key
052: * of all {@linkplain #localIndex local indices} is
053: * {@linkplain it.unimi.dsi.mg4j.index.Index#keyIndex(Index) set} to that instance.
054: *
055: * <p>An index cluster is defined by a property file. The only properties common
056: * to all index clusters are <samp>localindex</samp>, which can be specified multiple
057: * times (order is relevant) and contains the URIs of the local indices of the cluster,
058: * and <samp>strategy</samp>, which contains the filename of a serialised {@link it.unimi.dsi.mg4j.index.cluster.ClusteringStrategy}.
059: * The indices will be loaded using {@link it.unimi.dsi.mg4j.index.Index#getInstance(CharSequence,boolean,boolean)},
060: * so there is no restriction on the URIs that can be used (e.g., you can cluster
061: * a set of remote indices).
062: *
063: * <p>If you plan to use global document sizes (e.g., for {@linkplain BM25Scorer BM25 scoring}) you will need
064: * to load them explicitly using the property {@link it.unimi.dsi.mg4j.index.Index.UriKeys#SIZES}, which must specify
065: * a size file for the <em>whole collection</em>. If you are clustering a partitioned index,
066: * this is usually the original size file.
067: *
068: * <p>Optionally, an index cluster may provide {@linkplain BloomFilter Bloom filters}
069: * to reduce useless access to local indices that do not contain a term. The filters
070: * have the standard extension {@link #BLOOM_EXTENSION}.
071: *
072: * <p>This class exposes a {@linkplain #getInstance(CharSequence, boolean, boolean, EnumMap) static factory method}
073: * that uses the <samp>indexclass</samp> property to load the appropriate implementing subclass;
074: * Bloom filters are loaded automatically.
075: */
076: public abstract class IndexCluster extends Index {
077: private static final Logger LOGGER = Util
078: .getLogger(IndexCluster.class);
079:
080: /** Symbolic names for properties of an {@link it.unimi.dsi.mg4j.index.cluster.IndexCluster}. */
081: public static enum PropertyKeys {
082: /** A local index (usually used multiple times). */
083: LOCALINDEX,
084: /** The clustering strategy. */
085: STRATEGY,
086: /** A Boolean: whether the cluster has Bloom term filters. */
087: BLOOM,
088: /** A Boolean: whether the cluster is flat (i.e., it is documental and all term lists are the same). */
089: FLAT
090: }
091:
092: /** The default extension of a strategy. */
093: public static final String STRATEGY_DEFAULT_EXTENSION = ".strategy";
094:
095: /** The default extension for Bloom term filters. */
096: public static final String BLOOM_EXTENSION = ".bloom";
097:
098: /** The local indices of this cluster. */
099: protected final Index[] localIndex;
100: /** An array of Bloom filter to reduce index access, or <code>null</code>. */
101: protected final BloomFilter[] termFilter;
102:
103: protected IndexCluster(final Index[] localIndex,
104: final BloomFilter[] termFilter,
105: final int numberOfDocuments, final int numberOfTerms,
106: final long numberOfPostings,
107: final long numberOfOccurrences, final int maxCount,
108: final Payload payload, final boolean hasCounts,
109: final boolean hasPositions,
110: final TermProcessor termProcessor, final String field,
111: final IntList sizes, final Properties properties) {
112: super (numberOfDocuments, numberOfTerms, numberOfPostings,
113: numberOfOccurrences, maxCount, payload, hasCounts,
114: hasPositions, termProcessor, field, sizes, properties);
115: this .localIndex = localIndex;
116: this .termFilter = termFilter;
117: for (int i = 0; i < localIndex.length; i++)
118: localIndex[i].keyIndex(this );
119: }
120:
121: /** Returns a new index cluster.
122: *
123: * <p>This method uses the <samp>LOCALINDEX</samp> property to locate the local indices,
124: * loads them (passing on <code>randomAccess</code>) and
125: * builds a new index cluster using the appropriate implementing subclass.
126: *
127: * <p>Note that <code>documentSizes</code> is just passed to the local indices. This can be useful
128: * in {@linkplain DocumentalCluster documental clusters}, as it allows local scoring, but it is useless in
129: * {@linkplain LexicalCluster lexical clusters}, as scoring is necessarily centralised. In the
130: * latter case, the property {@link it.unimi.dsi.mg4j.index.Index.UriKeys#SIZES} can be used to specify a global sizes file (which
131: * usually comes from an original global index).
132: *
133: * @param basename the basename.
134: * @param randomAccess whether the index should be accessible randomly.
135: * @param documentSizes if true, document sizes will be loaded (note that sometimes document sizes
136: * might be loaded anyway because the compression method for positions requires it).
137: * @param queryProperties a map containing associations between {@link it.unimi.dsi.mg4j.index.Index.UriKeys} and values, or <code>null</code>.
138: */
139: @SuppressWarnings("unchecked")
140: static public Index getInstance(final CharSequence basename,
141: final boolean randomAccess, final boolean documentSizes,
142: final EnumMap<UriKeys, String> queryProperties)
143: throws ConfigurationException, IOException,
144: ClassNotFoundException, SecurityException,
145: URISyntaxException, InstantiationException,
146: IllegalAccessException, InvocationTargetException,
147: NoSuchMethodException {
148: final Properties properties = new Properties(basename
149: + DiskBasedIndex.PROPERTIES_EXTENSION);
150: ClusteringStrategy strategy = (ClusteringStrategy) BinIO
151: .loadObject(properties.getString(PropertyKeys.STRATEGY));
152: final Class<? extends IndexCluster> indexClass = (Class<? extends IndexCluster>) Class
153: .forName(properties.getString(
154: Index.PropertyKeys.INDEXCLASS,
155: "(missing index class)"));
156:
157: String[] localBasename = properties
158: .getStringArray(PropertyKeys.LOCALINDEX);
159: Index[] localIndex = new Index[localBasename.length];
160: for (int i = 0; i < localIndex.length; i++)
161: localIndex[i] = Index.getInstance(localBasename[i],
162: randomAccess, documentSizes);
163:
164: final int numberOfDocuments = properties
165: .getInt(Index.PropertyKeys.DOCUMENTS);
166: final IntList sizes = queryProperties != null
167: && queryProperties.containsKey(Index.UriKeys.SIZES) ? DiskBasedIndex
168: .readSizes(new InputBitStream(queryProperties
169: .get(Index.UriKeys.SIZES)), numberOfDocuments)
170: : null;
171:
172: if (sizes != null && documentSizes)
173: LOGGER
174: .warn("You are loading both local sizes and a global size file specified by the \"size\" properties, which is usually nonsensical");
175:
176: boolean hasCounts = true;
177: boolean hasPositions = true;
178: Payload payload = null;
179:
180: for (int i = 0; i < localIndex.length; i++) {
181: hasCounts = hasCounts && localIndex[i].hasCounts;
182: hasPositions = hasPositions && localIndex[i].hasPositions;
183:
184: if (i == 0)
185: payload = localIndex[i].payload;
186: if ((payload == null) != (localIndex[i].payload == null)
187: || payload != null
188: && !payload.compatibleWith(localIndex[i].payload))
189: throw new IllegalStateException(
190: "The payload specification of index "
191: + localIndex[0]
192: + " is not compatible with that of index "
193: + localIndex[i]);
194: }
195:
196: // We stem the names of Bloom filters from the index basename.
197: BloomFilter[] termFilter = null;
198: if (properties.getBoolean(DocumentalCluster.PropertyKeys.BLOOM)) {
199: LOGGER.debug("Loading Bloom filters...");
200: termFilter = new BloomFilter[localIndex.length];
201: for (int i = 0; i < localIndex.length; i++)
202: termFilter[i] = (BloomFilter) BinIO.loadObject(basename
203: + "-" + i + BLOOM_EXTENSION);
204: LOGGER.debug("Completed.");
205: }
206:
207: // Let us rebuild the strategy in case it's a chained strategy
208: if (strategy instanceof ChainedLexicalClusteringStrategy)
209: strategy = new ChainedLexicalClusteringStrategy(localIndex,
210: termFilter);
211: else if (strategy.numberOfLocalIndices() != localBasename.length)
212: throw new IllegalArgumentException(
213: "The number of local indices of the strategy ("
214: + localIndex.length
215: + ") and the number of local indices specified by the property file ("
216: + localBasename.length + ") differ");
217:
218: if (LexicalCluster.class.isAssignableFrom(indexClass))
219: return new LexicalCluster(localIndex,
220: (LexicalClusteringStrategy) strategy, termFilter,
221: numberOfDocuments, properties
222: .getInt(Index.PropertyKeys.TERMS),
223: properties.getLong(Index.PropertyKeys.POSTINGS),
224: properties.getLong(Index.PropertyKeys.OCCURRENCES),
225: properties.getInt(Index.PropertyKeys.MAXCOUNT),
226: payload, hasCounts, hasPositions, Index
227: .getTermProcessor(properties), properties
228: .getString(Index.PropertyKeys.FIELD),
229: sizes, properties);
230: else if (DocumentalCluster.class.isAssignableFrom(indexClass)) {
231: if (DocumentalConcatenatedCluster.class
232: .isAssignableFrom(indexClass))
233: return new DocumentalConcatenatedCluster(
234: localIndex,
235: (DocumentalClusteringStrategy) strategy,
236: properties
237: .getBoolean(IndexCluster.PropertyKeys.FLAT),
238: termFilter,
239: numberOfDocuments,
240: properties.getInt(Index.PropertyKeys.TERMS),
241: properties.getLong(Index.PropertyKeys.POSTINGS),
242: properties
243: .getLong(Index.PropertyKeys.OCCURRENCES),
244: properties.getInt(Index.PropertyKeys.MAXCOUNT),
245: payload, hasCounts, hasPositions, Index
246: .getTermProcessor(properties),
247: properties.getString(Index.PropertyKeys.FIELD),
248: sizes, properties);
249: return new DocumentalMergedCluster(
250: localIndex,
251: (DocumentalClusteringStrategy) strategy,
252: properties
253: .getBoolean(IndexCluster.PropertyKeys.FLAT),
254: termFilter, numberOfDocuments, properties
255: .getInt(Index.PropertyKeys.TERMS),
256: properties.getLong(Index.PropertyKeys.POSTINGS),
257: properties.getLong(Index.PropertyKeys.OCCURRENCES),
258: properties.getInt(Index.PropertyKeys.MAXCOUNT),
259: payload, hasCounts, hasPositions, Index
260: .getTermProcessor(properties), properties
261: .getString(Index.PropertyKeys.FIELD),
262: sizes, properties);
263: } else
264: throw new IllegalArgumentException(
265: "Unknown IndexCluster implementation: "
266: + indexClass.getName());
267:
268: }
269:
270: @Override
271: public void keyIndex(final Index newKeyIndex) {
272: super .keyIndex(newKeyIndex);
273: for (int i = 0; i < localIndex.length; i++)
274: localIndex[i].keyIndex(this);
275: }
276: }
|