001: package it.unimi.dsi.mg4j.test;
002:
003: import it.unimi.dsi.fastutil.ints.IntArrays;
004: import it.unimi.dsi.io.FastBufferedReader;
005: import it.unimi.dsi.Util;
006: import it.unimi.dsi.lang.MutableString;
007: import it.unimi.dsi.logging.ProgressLogger;
008:
009: import java.io.IOException;
010: import java.io.InputStreamReader;
011: import java.util.ArrayList;
012: import java.util.Arrays;
013:
014: import org.apache.log4j.Logger;
015:
016: import com.martiansoftware.jsap.FlaggedOption;
017: import com.martiansoftware.jsap.JSAP;
018: import com.martiansoftware.jsap.JSAPException;
019: import com.martiansoftware.jsap.JSAPResult;
020: import com.martiansoftware.jsap.Parameter;
021: import com.martiansoftware.jsap.SimpleJSAP;
022: import com.martiansoftware.jsap.UnflaggedOption;
023:
024: /** Reads a sequence of documents represented as blank-separated
025: * sequences of words, where documents are separated by new-lines.
026: * Produces and prints <var>q</var> DNF queries (OR's of AND's)
027: * as follows: for every query, <var>k</var> documents are selected
028: * at random, and from each of them <var>h</var> words at most are
029: * selected. The query is a <var>k</var>-ary OR of the corresponding
030: * AND's.
031: */
032:
033: final public class ProduceDNFFromLines {
034: private final static Logger LOGGER = Util
035: .getLogger(ProduceDNFFromLines.class);
036:
037: private ProduceDNFFromLines() {
038: }
039:
040: public static void main(final String[] arg) throws IOException,
041: JSAPException {
042:
043: SimpleJSAP jsap = new SimpleJSAP(
044: ProduceDNFFromLines.class.getName(),
045: "Prints or selects parts of a stat file using global counts.",
046: new Parameter[] {
047: new UnflaggedOption("numberOfDocuments",
048: JSAP.INTEGER_PARSER, JSAP.REQUIRED,
049: "The number of documents."),
050: new FlaggedOption("queries",
051: JSAP.INTEGER_PARSER, "1",
052: JSAP.NOT_REQUIRED, 'q', "queries",
053: "The number of queries to be produced."),
054: new FlaggedOption("docperquery",
055: JSAP.INTEGER_PARSER, "2",
056: JSAP.NOT_REQUIRED, 'd', "docperquery",
057: "The number of documents per query."),
058: new FlaggedOption("wordsperdoc",
059: JSAP.INTEGER_PARSER, "2",
060: JSAP.NOT_REQUIRED, 'w', "words",
061: "The (maximum) number of words per document."),
062:
063: });
064:
065: JSAPResult jsapResult = jsap.parse(arg);
066: if (jsap.messagePrinted())
067: return;
068:
069: final int numberOfDocuments = jsapResult
070: .getInt("numberOfDocuments");
071: final int queries = jsapResult.getInt("queries");
072: final int docperquery = jsapResult.getInt("docperquery");
073: final int wordsperdoc = jsapResult.getInt("wordsperdoc");
074:
075: if (docperquery > numberOfDocuments) {
076: System.err
077: .println("There are not enough documents for the number of documents/query required");
078: System.exit(1);
079: }
080:
081: int i, j, q, t;
082:
083: final int docs[] = new int[numberOfDocuments];
084: final int docForQuery[][] = new int[queries][docperquery];
085: final String query[][][] = new String[queries][docperquery][wordsperdoc];
086:
087: final int coveredForQuery[] = new int[queries];
088: int maxDoc = 0;
089: final boolean[] used = new boolean[numberOfDocuments];
090: for (i = 0; i < numberOfDocuments; i++)
091: docs[i] = i;
092: for (q = 0; q < queries; q++) {
093: for (i = 0; i < docperquery; i++) {
094: j = i + (int) ((numberOfDocuments - i) * Math.random());
095: t = docs[i];
096: docs[i] = docs[j];
097: docs[j] = t;
098: docForQuery[q][i] = docs[i];
099: used[docs[i]] = true;
100: if (docs[i] > maxDoc)
101: maxDoc = docs[i];
102: }
103: Arrays.sort(docForQuery[q]);
104: }
105:
106: //for ( q = 0; q < queries; q++ ) System.out.println( "Query " + q + ": " + new IntArrayList( docForQuery[ q ] ) );
107:
108: String split[];
109: int words[] = new int[1024];
110: final FastBufferedReader reader = new FastBufferedReader(
111: new InputStreamReader(System.in, "UTF-8"));
112:
113: int lineNumber = 0;
114: int numberOfPartialQueries = queries;
115: ProgressLogger pl = new ProgressLogger(LOGGER);
116: pl.itemsName = "Klines";
117: pl.expectedUpdates = maxDoc / 1000;
118: pl.start("Generating queries...");
119: MutableString line = new MutableString();
120: while (reader.readLine(line) != null
121: && numberOfPartialQueries > 0) {
122: if (used[lineNumber]) {
123: for (q = 0; q < queries; q++)
124: if (coveredForQuery[q] < docperquery
125: && docForQuery[q][coveredForQuery[q]] == lineNumber) {
126: split = line.toString().split(" ");
127: int nw = split.length;
128: words = IntArrays.ensureCapacity(words, nw + 1);
129: for (i = 0; i < nw; i++)
130: words[i] = i;
131: for (i = 0; i < Math.min(wordsperdoc, nw); i++) {
132: j = i + (int) ((nw - i) * Math.random());
133: t = words[i];
134: words[i] = words[j];
135: words[j] = t;
136: query[q][coveredForQuery[q]][i] = split[words[i]];
137: }
138: coveredForQuery[q]++;
139: if (coveredForQuery[q] == docperquery)
140: numberOfPartialQueries--;
141: }
142: }
143: lineNumber++;
144: if (lineNumber % 1000 == 0)
145: pl.update();
146: }
147: pl.done();
148:
149: MutableString p[] = new MutableString[Math.max(queries,
150: wordsperdoc)], s = new MutableString();
151: for (i = 0; i < p.length; i++)
152: p[i] = new MutableString();
153:
154: for (q = 0; q < queries; q++) {
155: for (int d = 0; d < wordsperdoc; d++) {
156: int last = 0;
157: while (last < wordsperdoc && query[q][d][last] != null)
158: last++;
159: p[d].replace('(').append(query[q][d], 0, last, " AND ")
160: .append(')');
161: }
162: System.out.println(s.length(0)
163: .append(p, 0, queries, " OR "));
164: }
165:
166: ArrayList<String> l = new ArrayList<String>();
167: final String[] emptyArray = new String[0];
168: for (q = 0; q < queries; q++) {
169: for (int w = 0; w < wordsperdoc; w++) {
170: l.clear();
171: for (int d = 0; d < wordsperdoc; d++)
172: if (query[q][d][w] != null)
173: l.add(query[q][d][w]);
174: p[w].replace('(').append(l.toArray(emptyArray), " OR ")
175: .append(')');
176: }
177: System.err.println(s.length(0).append(p, 0, wordsperdoc,
178: " AND "));
179: }
180:
181: }
182: }
|