Source Code Cross Referenced for BitStreamIndexWriter.java in » Search-Engine » mg4j » it » unimi » dsi » mg4j » index » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » mg4j » it.unimi.dsi.mg4j.index
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        package it.unimi.dsi.mg4j.index;
002:
003:        /*		 
004:         * MG4J: Managing Gigabytes for Java
005:         *
006:         * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna 
007:         *
008:         *  This library is free software; you can redistribute it and/or modify it
009:         *  under the terms of the GNU Lesser General Public License as published by the Free
010:         *  Software Foundation; either version 2.1 of the License, or (at your option)
011:         *  any later version.
012:         *
013:         *  This library is distributed in the hope that it will be useful, but
014:         *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015:         *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
016:         *  for more details.
017:         *
018:         *  You should have received a copy of the GNU Lesser General Public License
019:         *  along with this program; if not, write to the Free Software
020:         *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021:         *
022:         */
023:
024:        import it.unimi.dsi.bits.Fast;
025:        import it.unimi.dsi.fastutil.ints.IntArrays;
026:        import it.unimi.dsi.io.OutputBitStream;
027:        import it.unimi.dsi.lang.MutableString;
028:        import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
029:        import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
030:        import it.unimi.dsi.mg4j.index.payload.Payload;
031:        import it.unimi.dsi.mg4j.io.InterpolativeCoding;
032:        import it.unimi.dsi.util.Properties;
033:        import jal.ints.Sorting;
034:
035:        import java.io.FileOutputStream;
036:        import java.io.IOException;
037:        import java.util.Map;
038:
039:        /** Writes a bitstream-based interleaved index.
040:         *
041:         * <H2>Offsets bit stream</H2>
042:         *
043:         * <P>An inverted index may have an associated {@link OutputBitStream} of
044:         * offsets: this file contains <code>T+1</code> integers, where <code>T</code>
045:         * is the number of inverted lists (i.e., the number of terms), and the
046:         * <code>i</code>-th entry is a suitable coding of the position in bits where
047:         * the <code>i</code>-th inverted list starts (the last entry is actually the
048:         * length, in bytes, of the inverted index file itself). The coding used for
049:         * the offset stream is a &gamma; code of the difference between the current
050:         * position and the last position.
051:         *
052:         * @author Paolo Boldi 
053:         * @author Sebastiano Vigna 
054:         * @since 0.6
055:         */
056:
057:        public class BitStreamIndexWriter extends AbstractBitStreamIndexWriter {
058:            private static final boolean ASSERTS = false;
059:
060:            /** This value of {@link #state} means that we should call {@link #newInvertedList()}.*/
061:            protected static final int BEFORE_INVERTED_LIST = 0;
062:
063:            /** This value of {@link #state} means that we are positioned at the start of an inverted list,
064:             * and we should call {@link #writeFrequency(int)}.*/
065:            protected static final int BEFORE_FREQUENCY = 1;
066:
067:            /** This value of {@link #state} means that we are ready to call {@link #newDocumentRecord()}. */
068:            protected static final int BEFORE_DOCUMENT_RECORD = 2;
069:
070:            /** This value of {@link #state} means that we just started a new document record, and we
071:             * should call {@link #writeDocumentPointer(OutputBitStream, int)}. */
072:            protected static final int BEFORE_POINTER = 3;
073:
074:            /** This value of {@link #state} can be assumed only in indices that contain payloads; it
075:             * means that we are positioned just before the payload for the current document record. */
076:            protected static final int BEFORE_PAYLOAD = 4;
077:
078:            /** This value of {@link #state} can be assumed only in indices that contain counts; it
079:             * means that we are positioned just before the count for the current document record. */
080:            protected static final int BEFORE_COUNT = 5;
081:
082:            /** This value of {@link #state} can be assumed only in indices that contain document positions; 
083:             * it means that we are positioned just before the position list of the current document record. */
084:            protected static final int BEFORE_POSITIONS = 6;
085:
086:            /** This is the first unused state. Subclasses may start from this value to define new states. */
087:            protected static final int FIRST_UNUSED_STATE = 7;
088:
089:            /** The underlying {@link OutputBitStream}. */
090:            protected OutputBitStream obs;
091:            /** The offset {@link OutputBitStream}. */
092:            private OutputBitStream offset;
093:            /** The current state of the writer. */
094:            protected int state;
095:            /** The number of document records that the current inverted list will contain. */
096:            protected int frequency;
097:            /** The number of document records already written for the current inverted list. */
098:            protected int writtenDocuments;
099:            /** The current document pointer. */
100:            protected int currentDocument;
101:            /** The last document pointer in the current list. */
102:            protected int lastDocument;
103:            /** The position (in bytes) where the last inverted list started. */
104:            private long lastInvertedListPos;
105:            /** The parameter <code>b</code> for Golomb coding of pointers. */
106:            protected int b;
107:            /** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */
108:            protected int log2b;
109:            /** The maximum number of positions in a document record so far. */
110:            public int maxCount;
111:            /** A temporary array for skewed Golomb coding. */
112:            private int[] sortedDeltas = IntArrays.EMPTY_ARRAY;
113:
114:            /** Creates a new index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).
115:             *  If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). 
116:             *  When {@link #close()} will be called, the property file will also be produced (stemmed with <samp>.properties</samp>),
117:             *  or enriched if it already exists.
118:             * 
119:             * @param basename the basename.
120:             * @param numberOfDocuments the number of documents in the collection to be indexed.
121:             * @param writeOffsets if <code>true</code>, the offset file will also be produced.
122:             * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
123:             */
124:            public BitStreamIndexWriter(final CharSequence basename,
125:                    final int numberOfDocuments, final boolean writeOffsets,
126:                    final Map<Component, Coding> flags) throws IOException {
127:                this (new OutputBitStream(new FileOutputStream(basename
128:                        + DiskBasedIndex.INDEX_EXTENSION)),
129:                        writeOffsets ? new OutputBitStream(
130:                                new FileOutputStream(basename
131:                                        + DiskBasedIndex.OFFSETS_EXTENSION))
132:                                : null, numberOfDocuments, flags);
133:            }
134:
135:            /** Creates a new index writer with payloads using the specified underlying {@link OutputBitStream}.
136:             *
137:             * @param obs the underlying output bit stream.
138:             * @param offset the offset bit stream, or <code>null</code> if offsets should not be written.
139:             * @param numberOfDocuments the number of documents in the collection to be indexed.
140:             * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
141:             */
142:            public BitStreamIndexWriter(final OutputBitStream obs,
143:                    final OutputBitStream offset, final int numberOfDocuments,
144:                    final Map<Component, Coding> flags) {
145:                super (numberOfDocuments, flags);
146:                this .obs = obs;
147:                this .offset = offset;
148:                this .frequency = -1;
149:                this .currentTerm = -1;
150:                this .maxCount = 0;
151:
152:                if (!hasCounts && hasPositions)
153:                    throw new IllegalArgumentException(
154:                            "Index would have positions but no counts (this can't happen)");
155:            }
156:
157:            /** Creates a new index writer, with the specified underlying {@link OutputBitStream},
158:             *  without an associated offset bit stream.
159:             *
160:             * @param obs the underlying output bit stream.
161:             * @param numberOfDocuments the number of documents in the collection to be indexed.
162:             * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
163:             */
164:            public BitStreamIndexWriter(final OutputBitStream obs,
165:                    final int numberOfDocuments,
166:                    final Map<Component, Coding> flags) {
167:                this (obs, null, numberOfDocuments, flags);
168:            }
169:
170:            public long newInvertedList() throws IOException {
171:                if (frequency >= 0 && frequency != writtenDocuments)
172:                    throw new IllegalStateException(
173:                            "The number of document records ("
174:                                    + this .writtenDocuments
175:                                    + ") does not match the frequency ("
176:                                    + this .frequency + ")");
177:                if (state != BEFORE_INVERTED_LIST
178:                        && state != BEFORE_DOCUMENT_RECORD)
179:                    throw new IllegalStateException(
180:                            "Trying to start new inverted list in state "
181:                                    + state);
182:
183:                // The position (in bits) where the new inverted list starts
184:                long pos = obs.writtenBits();
185:                // Reset variables
186:                writtenDocuments = 0;
187:                currentTerm++;
188:                currentDocument = -1;
189:
190:                // If needed, write the offset
191:                if (offset != null)
192:                    offset.writeLongGamma(pos - lastInvertedListPos);
193:                lastInvertedListPos = pos;
194:                state = BEFORE_FREQUENCY;
195:                return pos;
196:            }
197:
198:            public int writeFrequency(final int frequency) throws IOException {
199:                if (state != BEFORE_FREQUENCY)
200:                    throw new IllegalStateException(
201:                            "Trying to write frequency in state " + state);
202:
203:                int bitCount;
204:                // Write the frequency
205:                switch (frequencyCoding) {
206:                case SHIFTED_GAMMA:
207:                    bitCount = obs.writeShiftedGamma(frequency - 1); // frequency cannot be 0
208:                    break;
209:                case GAMMA:
210:                    bitCount = obs.writeGamma(frequency - 1); // frequency cannot be 0
211:                    break;
212:                case DELTA:
213:                    bitCount = obs.writeDelta(frequency - 1); // frequency cannot be 0
214:                    break;
215:                default:
216:                    throw new IllegalStateException(
217:                            "The required frequency coding (" + frequencyCoding
218:                                    + ") is not supported.");
219:                }
220:
221:                this .frequency = frequency;
222:
223:                // We compute the modulus used for pointer Golomb coding 
224:                if (pointerCoding == Coding.GOLOMB) {
225:                    b = BitStreamIndex.golombModulus(frequency,
226:                            numberOfDocuments);
227:                    log2b = Fast.mostSignificantBit(b);
228:                }
229:
230:                state = BEFORE_DOCUMENT_RECORD;
231:                bitsForFrequencies += bitCount;
232:                return bitCount;
233:            }
234:
235:            public OutputBitStream newDocumentRecord() throws IOException {
236:                if (frequency == writtenDocuments)
237:                    throw new IllegalStateException(
238:                            "Document record overflow (written "
239:                                    + this .frequency + " already)");
240:                if (state != BEFORE_DOCUMENT_RECORD)
241:                    throw new IllegalStateException(
242:                            "Trying to start new document record in state "
243:                                    + state);
244:
245:                writtenDocuments++;
246:                numberOfPostings++;
247:                lastDocument = currentDocument;
248:                state = BEFORE_POINTER;
249:                return obs;
250:            }
251:
252:            public int writeDocumentPointer(final OutputBitStream out,
253:                    final int pointer) throws IOException {
254:                if (state != BEFORE_POINTER)
255:                    throw new IllegalStateException(
256:                            "Trying to write pointer in state " + state);
257:
258:                currentDocument = pointer;
259:                int bitCount = 0;
260:
261:                if (frequency != numberOfDocuments) { // We do not write pointers for everywhere occurring documents.
262:                    switch (pointerCoding) {
263:                    case SHIFTED_GAMMA:
264:                        bitCount = out.writeShiftedGamma(pointer - lastDocument
265:                                - 1);
266:                        break;
267:                    case GAMMA:
268:                        bitCount = out.writeGamma(pointer - lastDocument - 1);
269:                        break;
270:                    case DELTA:
271:                        bitCount = out.writeDelta(pointer - lastDocument - 1);
272:                        break;
273:                    case GOLOMB:
274:                        bitCount = out.writeGolomb(pointer - lastDocument - 1,
275:                                b, log2b);
276:                        break;
277:                    default:
278:                        throw new IllegalStateException(
279:                                "The required pointer coding (" + pointerCoding
280:                                        + ") is not supported.");
281:                    }
282:                } else if (pointer - lastDocument != 1)
283:                    throw new IllegalStateException(
284:                            "Term "
285:                                    + currentTerm
286:                                    + " has frequency equal to the number of documents, but pointers are not consecutive integers");
287:
288:                state = hasPayloads ? BEFORE_PAYLOAD : hasCounts ? BEFORE_COUNT
289:                        : BEFORE_DOCUMENT_RECORD;
290:                bitsForPointers += bitCount;
291:                return bitCount;
292:            }
293:
294:            public int writePayload(final OutputBitStream out,
295:                    final Payload payload) throws IOException {
296:                if (frequency < 0)
297:                    throw new IllegalStateException(
298:                            "Trying to write payload without calling newInvertedList");
299:                if (state != BEFORE_PAYLOAD)
300:                    throw new IllegalStateException(
301:                            "Trying to write payload in state " + state);
302:                final int count = payload.write(out);
303:                bitsForPayloads += count;
304:                state = hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;
305:                return count;
306:            }
307:
308:            public void close() throws IOException {
309:                if (state != BEFORE_DOCUMENT_RECORD
310:                        && state != BEFORE_INVERTED_LIST)
311:                    throw new IllegalStateException(
312:                            "Trying to close index in state " + state);
313:                if (frequency >= 0 && frequency != writtenDocuments)
314:                    throw new IllegalStateException(
315:                            "The number of document records ("
316:                                    + this .writtenDocuments
317:                                    + ") does not match the frequency ("
318:                                    + this .frequency + ")");
319:
320:                if (writtenBits() != obs.writtenBits())
321:                    throw new IllegalStateException(
322:                            "Written bits count mismatch: we say "
323:                                    + writtenBits() + ", the stream says "
324:                                    + obs.writtenBits());
325:
326:                if (offset != null) {
327:                    offset.writeLongGamma(obs.writtenBits()
328:                            - lastInvertedListPos);
329:                    offset.close();
330:                }
331:
332:                obs.close();
333:            }
334:
335:            public int writePositionCount(final OutputBitStream out,
336:                    final int count) throws IOException {
337:                if (frequency < 0)
338:                    throw new IllegalStateException(
339:                            "Trying to write count without calling newInvertedList");
340:                if (state != BEFORE_COUNT)
341:                    throw new IllegalStateException(
342:                            "Trying to write count in state " + state);
343:                final int bitCount;
344:
345:                numberOfOccurrences += count;
346:                switch (countCoding) {
347:                case SHIFTED_GAMMA:
348:                    bitCount = out.writeShiftedGamma(count - 1);
349:                    break;
350:                case GAMMA:
351:                    bitCount = out.writeGamma(count - 1);
352:                    break;
353:                case UNARY:
354:                    bitCount = out.writeUnary(count - 1);
355:                    break;
356:                case DELTA:
357:                    bitCount = out.writeDelta(count - 1);
358:                    break;
359:                default:
360:                    throw new IllegalStateException(
361:                            "The required count coding (" + countCoding
362:                                    + ") is not supported.");
363:                }
364:
365:                state = hasPositions ? BEFORE_POSITIONS
366:                        : BEFORE_DOCUMENT_RECORD;
367:                bitsForCounts += bitCount;
368:                return bitCount;
369:            }
370:
371:            public int writeDocumentPositions(final OutputBitStream out,
372:                    final int[] occ, final int offset, final int len,
373:                    final int docSize) throws IOException {
374:                if (frequency < 0)
375:                    throw new IllegalStateException(
376:                            "Trying to write occurrences without calling newInvertedList");
377:                if (state != BEFORE_POSITIONS)
378:                    throw new IllegalStateException(
379:                            "Trying to write positions in state " + state);
380:
381:                if (ASSERTS && docSize > 0)
382:                    for (int i = 0; i < len; i++)
383:                        assert occ[offset + i] < docSize : "Position "
384:                                + occ[offset + i] + " for document "
385:                                + currentDocument + " is too large; size is "
386:                                + docSize;
387:
388:                int i;
389:                int prev = -1;
390:                int bitCount = 0;
391:                final int end = offset + len;
392:
393:                switch (positionCoding) {
394:                case GAMMA:
395:                    for (i = offset; i < end; i++) {
396:                        bitCount += out.writeGamma(occ[i] - prev - 1);
397:                        prev = occ[i];
398:                    }
399:                    break;
400:                case DELTA:
401:                    for (i = offset; i < end; i++) {
402:                        bitCount += out.writeDelta(occ[i] - prev - 1);
403:                        prev = occ[i];
404:                    }
405:                    break;
406:                case SHIFTED_GAMMA:
407:                    for (i = offset; i < end; i++) {
408:                        bitCount += out.writeShiftedGamma(occ[i] - prev - 1);
409:                        prev = occ[i];
410:                    }
411:                    break;
412:                case GOLOMB:
413:                    if (len < 3) {
414:                        for (i = 0; i < len; i++)
415:                            bitCount += out.writeMinimalBinary(occ[i], docSize);
416:                        break;
417:                    }
418:
419:                    // We compute b and log2b for positions
420:                    final int positionB = BitStreamIndex.golombModulus(len,
421:                            docSize);
422:                    final int positionLog2b = Fast
423:                            .mostSignificantBit(positionB);
424:
425:                    for (i = offset; i < end; i++) {
426:                        bitCount += out.writeGolomb(occ[i] - prev - 1,
427:                                positionB, positionLog2b);
428:                        prev = occ[i];
429:                    }
430:                    break;
431:                case SKEWED_GOLOMB:
432:                    if (len < 3) {
433:                        for (i = 0; i < len; i++)
434:                            bitCount += out.writeMinimalBinary(occ[i], docSize);
435:                        break;
436:                    }
437:
438:                    if (sortedDeltas.length < len)
439:                        sortedDeltas = new int[len];
440:                    System.arraycopy(occ, offset, sortedDeltas, 0, len);
441:                    i = len - 1;
442:                    while (i-- != 0)
443:                        sortedDeltas[i + 1] -= sortedDeltas[i] + 1;
444:                    Sorting.nth_element(sortedDeltas, 0, len / 2, len);
445:                    final int sb = sortedDeltas[len / 2] + 1; // The median of deltas
446:
447:                    bitCount = out.writeMinimalBinary(sb - 1, docSize);
448:
449:                    for (i = offset; i < end; i++) {
450:                        bitCount += out
451:                                .writeSkewedGolomb(occ[i] - prev - 1, sb);
452:                        prev = occ[i];
453:                    }
454:                    break;
455:                case INTERPOLATIVE:
456:                    bitCount = InterpolativeCoding.write(out, occ, 0, len, 0,
457:                            docSize - 1);
458:                    break;
459:                default:
460:                    throw new IllegalStateException(
461:                            "The required position coding (" + positionCoding
462:                                    + ") is not supported.");
463:                }
464:
465:                state = BEFORE_DOCUMENT_RECORD;
466:                bitsForPositions += bitCount;
467:                if (len > maxCount)
468:                    maxCount = len;
469:                return bitCount;
470:            }
471:
472:            public long writtenBits() {
473:                return bitsForFrequencies + bitsForPointers + bitsForPayloads
474:                        + bitsForCounts + bitsForPositions;
475:            }
476:
477:            public Properties properties() {
478:                Properties result = new Properties();
479:                result.setProperty(Index.PropertyKeys.DOCUMENTS,
480:                        numberOfDocuments);
481:                result.setProperty(Index.PropertyKeys.TERMS, currentTerm + 1);
482:                result.setProperty(Index.PropertyKeys.POSTINGS,
483:                        numberOfPostings);
484:                result.setProperty(Index.PropertyKeys.MAXCOUNT, maxCount);
485:                result.setProperty(Index.PropertyKeys.INDEXCLASS,
486:                        FileIndex.class.getName());
487:                // We save all flags, except for the PAYLOAD component, which is just used internally.
488:                for (Map.Entry<Component, Coding> e : flags.entrySet())
489:                    if (e.getKey() != Component.PAYLOADS)
490:                        result.addProperty(Index.PropertyKeys.CODING,
491:                                new MutableString().append(e.getKey()).append(
492:                                        ':').append(e.getValue()));
493:                return result;
494:            }
495:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.