001: package it.unimi.dsi.mg4j.test;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
/**
 * Dumps various data about an inverted list.
 *
 * <P>The only active member of this class is {@link #format(double)}. The command-line
 * entry point that performs the actual dump is currently disabled: it is kept, commented
 * out, at the end of the class body.
 */
public final class DumpWordBlocks {
    //private final static Logger LOGGER = Util.getLogger( DumpWordBlocks.class );

    /** This class only exposes static members and cannot be instantiated. */
    private DumpWordBlocks() {
    }

    /**
     * A reasonable format for real numbers: thousands are grouped and exactly five
     * fractional digits are printed. The grouping and decimal symbols are those of the
     * default locale in effect when this class is initialized.
     */
    private static final java.text.NumberFormat FORMAT_DOUBLE = new java.text.DecimalFormat(
            "#,##0.00000");

    /** Formats a number.
     *
     * <P>This method formats a double separating thousands and printing exactly five
     * fractional digits (the value is rounded by the underlying
     * {@link java.text.DecimalFormat} when it has more).
     *
     * <P>Note that the underlying formatter is a single shared instance and
     * {@link java.text.DecimalFormat} is not synchronized: callers invoking this method
     * from several threads must synchronize externally.
     *
     * @param d a number.
     * @return a string containing a pretty print of the number.
     */
    public static String format(final double d) {
        // NumberFormat.format(double) builds the buffer internally; there is no need to
        // allocate a StringBuffer and a FieldPosition by hand.
        return FORMAT_DOUBLE.format(d);
    }

    // NOTE(review): the command-line entry point below is disabled. Before re-enabling it:
    //  - restore the LOGGER field above and add the imports the code needs (this file
    //    currently declares none);
    //  - the long flag of the "gapBits" switch is spelled "gab-bits" (probably "gap-bits"
    //    was intended);
    //  - the debug message for the global count range labels numberOfOccurrences as
    //    "documents".
    /*
    public static void main( final String[] arg ) throws IOException, JSAPException, ConfigurationException, ClassNotFoundException, InstantiationException, IllegalAccessException {

        SimpleJSAP jsap = new SimpleJSAP( DumpWordBlocks.class.getName(), "Dumps data about terms in an index.",
            new Parameter[] {
                new Switch( "pointers", JSAP.NO_SHORTFLAG, "pointers", "Dump pointers." ),
                new Switch( "gaps", JSAP.NO_SHORTFLAG, "gaps", "Dump gaps between pointers." ),
                new Switch( "gapBits", JSAP.NO_SHORTFLAG, "gab-bits", "Dump lengths in bits of gaps between pointers." ),
                new Switch( "counts", JSAP.NO_SHORTFLAG, "counts", "Dump counts." ),
                new Switch( "relCounts", JSAP.NO_SHORTFLAG, "rel-counts", "Dump relative counts (counts divided by document size)." ),
                new Switch( "posBits", JSAP.NO_SHORTFLAG, "pos-bits", "Dump lengths in bits of occurrence lists." ),
                new Switch( "recordPositions", JSAP.NO_SHORTFLAG, "record-positions", "Dump bit positions (offsets from the start of the list) to document records." ),
                new Switch( "separators", 's', "separators", "Adds a comment containing the term index." ),
                new FlaggedOption( "word", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'w', "word", "The index of a word whose data has to be dumped." ),
                new FlaggedOption( "frequency", JSAP.DOUBLE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "frequency", "The relative frequency that will be used to choose words to dump." ),
                new FlaggedOption( "globalFrequency", JSAP.DOUBLE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'g', "global-frequency", "The global count divided by the sum of document lengths that will be used to choose words to dump." ),
                new FlaggedOption( "error", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'e', "error", "The error w.r.t. frequency (as a percentage) that will be used to choose words to dump." ),
                new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the index." )
        });

        JSAPResult jsapResult = jsap.parse( arg );
        if ( jsap.messagePrinted() ) return;

        final boolean gaps = jsapResult.getBoolean( "gaps" );
        final boolean pointers = jsapResult.getBoolean( "pointers" );
        final boolean gapBits = jsapResult.getBoolean( "gapBits" );
        final boolean counts = jsapResult.getBoolean( "counts" );
        final boolean relCounts = jsapResult.getBoolean( "relCounts" );
        final boolean posBits = jsapResult.getBoolean( "posBits" );
        final boolean recordPositions = jsapResult.getBoolean( "recordPositions" );
        final boolean separators = jsapResult.getBoolean( "separators" );
        final double frequency = jsapResult.getObject( "frequency" ) != null ? jsapResult.getDouble( "frequency" ) : 0;
        final double globalFrequency = jsapResult.getObject( "globalFrequency" ) != null ? jsapResult.getDouble( "globalFrequency" ) : 0;
        final int error = jsapResult.getInt( "error", 1 );
        final double lowFreq = frequency * ( 1 - error / 100.0 );
        final double highFreq = frequency * ( 1 + error / 100.0 );
        final double lowGlobFreq = globalFrequency * ( 1 - error / 100.0 );
        final double highGlobFreq = globalFrequency * ( 1 + error / 100.0 );
        final String basename = jsapResult.getString( "basename" );

        final Properties properties = new Properties( basename + DiskBasedIndex.PROPERTIES_EXTENSION );

        final int numberOfDocuments = properties.getInt( Index.PropertyKeys.DOCUMENTS );
        final int numberOfTerms = properties.getInt( Index.PropertyKeys.TERMS );
        final long numberOfOccurrences = properties.getLong( Index.PropertyKeys.OCCURRENCES );
        final IntList size = DiskBasedIndex.readSizes( new InputBitStream( basename + DiskBasedIndex.SIZES_EXTENSION ), numberOfDocuments );

        DiskBasedIndex index = DiskBasedIndex.getInstance( basename );
        final IntList wordsToDump;

        if ( jsapResult.getObject( "word" ) != null ) {
            wordsToDump = IntLists.singleton( jsapResult.getInt( "word" ) );
            LOGGER.debug( "Dumping word " + wordsToDump );
        }
        else {
            if ( frequency == 0 && globalFrequency == 0 ) throw new IllegalArgumentException( "You must specify either a word or a frequency range" );

            final int min = (int)Math.round( lowFreq * numberOfDocuments );
            final int max = (int)Math.round( highFreq * numberOfDocuments );
            final long globMin = Math.round( lowGlobFreq * numberOfOccurrences );
            final long globMax = Math.round( highGlobFreq * numberOfOccurrences );

            if ( frequency != 0 ) LOGGER.debug( "Dumping words in relative frequency range [" + format( lowFreq ) + ", " + format( highFreq ) + "] (" + numberOfDocuments + " documents, frequency range [" + min + ", " + max + "])" );
            if ( globalFrequency != 0 ) LOGGER.debug( "Dumping words in relative global count range [" + format( lowGlobFreq ) + ", " + format( highGlobFreq ) + "] (" + numberOfOccurrences + " documents, global count range [" + globMin + ", " + globMax + "])" );
            wordsToDump = new IntArrayList();

            final InputBitStream frequencies = new InputBitStream( new FileInputStream ( basename + DiskBasedIndex.FREQUENCIES_EXTENSION ) );
            final InputBitStream globCounts = new InputBitStream( new FileInputStream ( basename + DiskBasedIndex.GLOBCOUNTS_EXTENSION ) );
            int f;
            long fl;
            for( int t = 0; t < numberOfTerms; t++ ) {
                f = frequencies.readGamma();
                fl = globCounts.readLongGamma();
                if ( frequency != 0 && ( f < min || f > max ) ) continue;
                if ( globalFrequency != 0 && ( fl < globMin || fl > globMax ) ) continue;
                wordsToDump.add( t );
            }
            frequencies.close();
            globCounts.close();
        }

        LOGGER.debug( "Dumping " + wordsToDump.size() + " words..." );

        int j, pointer, numOccs, prevPointer;
        long start, startOccs;

        IndexReader indexReader = index.getReader();

        for( int i = 0; i < wordsToDump.size(); i++ ) {

            final int word = wordsToDump.getInt( i );
            IndexIterator indexIterator = indexReader.documents( word );
            j = indexIterator.frequency();
            if ( separators ) System.out.println( "# " + word + " (frequency " + j + ", relative " + (double)j / numberOfDocuments + ")" );
            prevPointer = -1;

            while( j-- != 0 ) {
                start = indexReader.readBits();
                pointer = indexIterator.nextDocument();
                if ( pointers ) System.out.println( pointer );
                if ( gaps ) System.out.println( pointer - prevPointer - 1 );
                if ( gapBits ) System.out.println( indexReader.readBits() - start );
                prevPointer = pointer;

                startOccs = indexReader.readBits();
                numOccs = indexIterator.count();
                if ( posBits ) System.out.println( indexReader.readBits() - startOccs );
                if ( recordPositions ) System.out.println( indexReader.readBits() - start );
                if ( counts ) System.out.println( numOccs );
                if ( relCounts ) System.out.println( (double)numOccs / size.getInt( pointer ) );
            }
            if ( gaps ) System.out.println( numberOfDocuments - prevPointer );
        }

        LOGGER.debug( " done." );
    }
    */
}
|