001: package it.unimi.dsi.mg4j.test;
002:
003: import it.unimi.dsi.mg4j.index.DiskBasedIndex;
004: import it.unimi.dsi.mg4j.index.Index;
005: import it.unimi.dsi.io.FastBufferedReader;
006: import it.unimi.dsi.io.InputBitStream;
007: import it.unimi.dsi.Util;
008: import it.unimi.dsi.lang.MutableString;
009: import it.unimi.dsi.util.Properties;
010:
011: import java.io.FileReader;
012: import java.io.IOException;
013:
014: import org.apache.commons.configuration.ConfigurationException;
015: import org.apache.log4j.Logger;
016:
017: import com.martiansoftware.jsap.FlaggedOption;
018: import com.martiansoftware.jsap.JSAP;
019: import com.martiansoftware.jsap.JSAPException;
020: import com.martiansoftware.jsap.JSAPResult;
021: import com.martiansoftware.jsap.Parameter;
022: import com.martiansoftware.jsap.SimpleJSAP;
023: import com.martiansoftware.jsap.Switch;
024: import com.martiansoftware.jsap.UnflaggedOption;
025:
026: /** Selects part of a stats using global frequency.
027: */
028:
029: final public class SelectStats {
030: @SuppressWarnings("unused")
031: private final static Logger LOGGER = Util
032: .getLogger(SelectStats.class);
033:
034: private SelectStats() {
035: }
036:
037: /** A reasonable format for real numbers. */
038: private static final java.text.NumberFormat formatDouble = new java.text.DecimalFormat(
039: "#,##0.00000");
040:
041: /** Formats a number.
042: *
043: * <P>This method formats a double separating thousands and printing just two fractional digits.
044: * @param d a number.
045: * @return a string containing a pretty print of the number.
046: */
047: public static String format(final double d) {
048: final StringBuffer s = new StringBuffer();
049: return formatDouble
050: .format(d, s, new java.text.FieldPosition(0))
051: .toString();
052: }
053:
054: public static void main(final String[] arg) throws IOException,
055: JSAPException, ConfigurationException {
056:
057: SimpleJSAP jsap = new SimpleJSAP(
058: SelectStats.class.getName(),
059: "Prints or selects parts of a stat file using global counts.",
060: new Parameter[] {
061: new Switch("print", 'p', "print",
062: "Just print global occurrences."),
063: new FlaggedOption(
064: "globalFrequency",
065: JSAP.DOUBLE_PARSER,
066: JSAP.NO_DEFAULT,
067: JSAP.NOT_REQUIRED,
068: 'g',
069: "global-frequency",
070: "The global count divided by the sum of document lengths that will be used to choose words to dump."),
071: new FlaggedOption("quantumBitLength",
072: JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
073: JSAP.NOT_REQUIRED, 'q',
074: "quantum-bit-length",
075: "The quantum bit length that will be used to choose words to dump."),
076: new FlaggedOption(
077: "error",
078: JSAP.INTEGER_PARSER,
079: JSAP.NO_DEFAULT,
080: JSAP.NOT_REQUIRED,
081: 'e',
082: "error",
083: "The error w.r.t. frequency (as a percentage) that will be used to choose words to dump."),
084: new UnflaggedOption("basename",
085: JSAP.STRING_PARSER, JSAP.REQUIRED,
086: "The index basename."),
087: new UnflaggedOption("statFile",
088: JSAP.STRING_PARSER, JSAP.REQUIRED,
089: "The stat file to be scanned.") });
090:
091: JSAPResult jsapResult = jsap.parse(arg);
092: if (jsap.messagePrinted())
093: return;
094:
095: final boolean print = jsapResult.getBoolean("print");
096: final String basename = jsapResult.getString("basename");
097: final String statFile = jsapResult.getString("statFile");
098: final int quantumBitLength = jsapResult.getInt(
099: "quantumBitLength", 0);
100: final double globalFrequency = jsapResult.getDouble(
101: "globalFrequency", 0);
102: final int error = jsapResult.getInt("error", 1);
103: final double lowGlobFreq = globalFrequency
104: * (1 - error / 100.0);
105: final double highGlobFreq = globalFrequency
106: * (1 + error / 100.0);
107: final int lowQbl = (int) Math.round(quantumBitLength
108: * (1 - error / 100.0));
109: final int highQbl = (int) Math.round(quantumBitLength
110: * (1 + error / 100.0));
111:
112: final Properties properties = new Properties(basename
113: + DiskBasedIndex.PROPERTIES_EXTENSION);
114: final int numberOfTerms = properties
115: .getInt(Index.PropertyKeys.TERMS);
116: final long numberOfoccurrences = properties
117: .getLong(Index.PropertyKeys.OCCURRENCES);
118:
119: final InputBitStream globCounts = new InputBitStream(basename
120: + DiskBasedIndex.GLOBCOUNTS_EXTENSION);
121: long gc[] = new long[numberOfTerms];
122: for (int t = 0; t < numberOfTerms; t++)
123: gc[t] = globCounts.readLongGamma();
124: globCounts.close();
125:
126: final MutableString line = new MutableString();
127: MutableString number;
128: final FastBufferedReader reader = new FastBufferedReader(
129: new FileReader(statFile));
130:
131: boolean dumping = false;
132: int f, q;
133: reader.readLine(line);
134: while (reader.readLine(line) != null) {
135: if (line.charAt(0) == '#') {
136: number = line.substring(2);
137: f = Integer.parseInt(number.delete(number.indexOf(' '),
138: number.length()).toString());
139: double freq = (double) gc[f] / numberOfoccurrences;
140: if (print)
141: System.out.println(line + " " + format(freq));
142: else {
143: if (quantumBitLength != 0) {
144: // We choose using the quantum bit length
145: number = line.substring(2);
146: number = number
147: .substring(number.indexOf(' ') + 1);
148: q = Integer.parseInt(number.delete(
149: number.indexOf(' '), number.length())
150: .toString());
151: dumping = q >= lowQbl && q <= highQbl;
152: } else
153: dumping = freq >= lowGlobFreq
154: && freq <= highGlobFreq;
155: }
156: if (dumping)
157: line.println(System.out);
158: } else if (!print && dumping) {
159: line.println(System.out);
160: }
161: }
162: }
163: }
|