001: package it.unimi.dsi.mg4j.index.cluster;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.lang.MutableString;
025: import it.unimi.dsi.util.Properties;
026:
027: import java.util.Arrays;
028: import java.util.NoSuchElementException;
029:
030: /** A lexical strategy that partitions terms into contiguous segments.
031: *
032: * <p>To partition terms in contiguous segments, you must provide an array
033: * of <em>cutpoints</em>, which define each segment of terms. More precisely, given
034: * cutpoints <var>c</var><sub>0</sub>,<var>c</var><sub>2</sub>,…,<var>c</var><sub><var>k</var></sub>,
035: * terms will be partitioned into <var>k</var> local indices containing the terms numbered
036: * from <var>c</var><sub>0</sub> (included) to <var>c</var><sub>1</sub> (excluded), from
037: * <var>c</var><sub>1</sub> (included) to <var>c</var><sub>2</sub> and so on. Note that
038: * necessarily <var>c</var><sub>0</sub>=0 and <var>c</var><sub><var>k</var></sub>=<var>T</var>,
039: * where <var>T</var> is the number of terms in the global index.
040: *
041: * <p>To make mapping of terms (as opposed to term numbers) possible, you must also provide
042: * a parallel array of {@linkplain java.lang.CharSequence character sequences} that
043: * contain the terms corresponding to the terms numbers
044: * <var>c</var><sub>0</sub>,<var>c</var><sub>2</sub>,…,<var>c</var><sub><var>k</var></sub>.
045: * The content of the last element of the array should be <code>null</code>.
046: *
047: * <p>The {@link #properties()} method provides two properties, <samp>termfrom</samp> and <samp>termto</samp>,
048: * that contain the first (included) and last (excluded) term in the local index, and two
049: * analogous properties <samp>termnumberfrom</samp> and <samp>termnumberto</samp>.
050: *
051: * @author Alessandro Arrabito
052: * @author Sebastiano Vigna
053: */
054:
055: public class ContiguousLexicalStrategy implements
056: LexicalPartitioningStrategy, LexicalClusteringStrategy {
057: static final long serialVersionUID = 0;
058: /** The cutpoints.*/
059: private final int[] cutPoint;
060: /** The cutpoint terms.*/
061: private final MutableString[] cutPointTerm;
062: /** The (cached) number of segments. */
063: private final int k;
064:
065: /** Creates a new contiguous lexical strategy with the given cutpoints.
066: * @param cutPoint an array of cutpoints (see the class description}.
067: * @param cutPointTerm a parallel array of cutpoints terms; the last element must be <code>null</code>.
068: */
069: public ContiguousLexicalStrategy(final int[] cutPoint,
070: final CharSequence[] cutPointTerm) {
071: if (cutPoint.length == 0)
072: throw new IllegalArgumentException("Empty cutpoint array");
073: if (cutPoint.length != cutPointTerm.length)
074: throw new IllegalArgumentException(
075: "The cutpoint array and the term cutpoint array have different lengths ("
076: + cutPoint.length + ", "
077: + cutPointTerm.length + ")");
078: if (cutPoint[0] != 0)
079: throw new IllegalArgumentException(
080: "The first cutpoint must be 0");
081: this .cutPoint = cutPoint;
082: // Defensive copy
083: this .k = cutPoint.length - 1;
084: this .cutPointTerm = new MutableString[k + 1];
085: for (int i = 0; i < k; i++)
086: this .cutPointTerm[i] = new MutableString(cutPointTerm[i]);
087: this .cutPointTerm[k] = new MutableString("\uFFFF");
088: }
089:
090: public int numberOfLocalIndices() {
091: return k;
092: }
093:
094: public int localIndex(final int globalNumber) {
095: if (globalNumber >= cutPoint[k])
096: throw new IndexOutOfBoundsException(Integer
097: .toString(globalNumber));
098: for (int i = k; i-- != 0;)
099: if (cutPoint[i] <= globalNumber)
100: return i;
101: throw new IndexOutOfBoundsException(Integer
102: .toString(globalNumber));
103: }
104:
105: public int localIndex(final CharSequence term) {
106: for (int i = k; i-- != 0;)
107: if (cutPointTerm[i].compareTo(term) <= 0)
108: return i;
109: throw new NoSuchElementException(term.toString());
110: }
111:
112: public int globalNumber(final int localIndex, final int localNumber) {
113: return localNumber + cutPoint[localIndex];
114: }
115:
116: public int localNumber(final int globalNumber) {
117: return globalNumber - cutPoint[localIndex(globalNumber)];
118: }
119:
120: public Properties[] properties() {
121: Properties[] properties = new Properties[k];
122: for (int i = 0; i < k; i++) {
123: properties[i] = new Properties();
124: properties[i].addProperty("termfrom", cutPointTerm[i]);
125: properties[i].addProperty("termto", cutPointTerm[i + 1]);
126: properties[i].addProperty("termnumberfrom", cutPoint[i]);
127: properties[i].addProperty("termnumberto", cutPoint[i + 1]);
128: }
129: return properties;
130: }
131:
132: public String toString() {
133: return "{ " + Arrays.toString(cutPoint) + ", "
134: + Arrays.toString(cutPointTerm) + " }";
135: }
136: }
|