Source Code Cross Referenced for SkipBitStreamIndexWriter.java in  » Search-Engine » mg4j » it » unimi » dsi » mg4j » index » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Sevlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » mg4j » it.unimi.dsi.mg4j.index 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        package it.unimi.dsi.mg4j.index;
002:
003:        /*
004:         * MG4J: Managing Gigabytes for Java
005:         *
006:         * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
007:         *
008:         *  This library is free software; you can redistribute it and/or modify it
009:         *  under the terms of the GNU Lesser General Public License as published by the Free
010:         *  Software Foundation; either version 2.1 of the License, or (at your option)
011:         *  any later version.
012:         *
013:         *  This library is distributed in the hope that it will be useful, but
014:         *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015:         *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
016:         *  for more details.
017:         *
018:         *  You should have received a copy of the GNU Lesser General Public License
019:         *  along with this program; if not, write to the Free Software
020:         *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021:         *
022:         */
023:
024:        import it.unimi.dsi.bits.Fast;
025:        import it.unimi.dsi.fastutil.ints.Int2IntRBTreeMap;
026:        import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
027:        import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
028:        import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
029:        import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;
030:        import it.unimi.dsi.io.NullOutputStream;
031:        import it.unimi.dsi.io.OutputBitStream;
032:        import it.unimi.dsi.Util;
033:        import it.unimi.dsi.util.Properties;
034:
035:        import java.io.File;
036:        import java.io.FileInputStream;
037:        import java.io.FileOutputStream;
038:        import java.io.IOException;
039:        import java.io.PrintStream;
040:        import java.io.PrintWriter;
041:        import java.util.Map;
042:
043:        /** Provides facilities to write skip inverted indices,
044:         *  that is, inverted indices with an additional skip structure. A skip inverted index
045:         *  allows one to skip ahead when reading inverted lists. More specifically,
046:         *  when reading the inverted list relative to a certain term, one may want to
047:         *  decide to skip all document records that concern documents with pointer
048:         *  less than a given integer. In a normal inverted index this is impossible:
049:         *  one would have to read all document records sequentially.
050:         *  
051:         *  <p>The skipping structure used by this class is new: details can be found 
052:         *  <a href="http://vigna.dsi.unimi.it/papers.php#BoVCPESLQIIL">here</a>.
053:         *  
054:         * @author Paolo Boldi
055:         * @author Sebastiano Vigna
056:         * @since 0.6
057:         */
058:        public class SkipBitStreamIndexWriter extends BitStreamIndexWriter {
059:            /** The size of the buffer for the temporary file used to build an inverted list. Inverted lists
060:             * shorter than this number of bytes will be directly rebuilt from the buffer, and never flushed to disk. */
061:            public final static int DEFAULT_TEMP_BUFFER_SIZE = 64 * 1024 * 1024;
062:
063:            private static final boolean ASSERTS = false;
064:            private static final boolean DEBUG = false;
065:            private final static boolean STATS = false;
066:
067:            /** Maximum number of trials when optimising the entry bit length. */
068:            private final static int MAX_TRY = 32;
069:
070:            /** The parameter <code>h</code> (the maximum height of a skip tower). */
071:            private final int h;
072:
073:            /** The parameter <code>q</code> (2<var><sup>h</sup>q</var> documents record are kept in the cache); necessarily a power of two. */
074:            private final int q;
075:
076:            /** We have <var>w</var>=2<sup><var>h</var></sup><var>q</var>. */
077:            private final int w;
078:
079:            /** The number of document records written in the cache containing the current block. */
080:            private int cache;
081:
082:            /** The <var>k</var>-th entry of this array contains the document pointer of the <var>k</var>-th
083:             *  skip document record within the current block. For sake of simplicity, <code>pointer[cache]</code>
084:             *  contains the first document pointer within the next block. */
085:            private final int[] skipPointer;
086:
087:            /** The {@link OutputBitStream}s where cached document pointers are written. */
088:            private final OutputBitStream[] cachePointer;
089:
090:            /** The {@link FastByteArrayOutputStream}s underlying <code>cachePointer</code> . */
091:            private final FastByteArrayOutputStream[] cachePointerByte;
092:
093:            /** The {@link OutputBitStream}s where cached skip towers are written. Indices are skip
094:             *  indices. */
095:            private final OutputBitStream[] cacheSkip;
096:
097:            /** An array whose entries (as many as those of {@link #cacheSkip}) are all {@link #bitCount}. */
098:            private final OutputBitStream[] cacheSkipBitCount;
099:
100:            /** The {@link FastByteArrayOutputStream}s underlying <code>cacheSkip</code> . Indices are skip
101:             *  indices. */
102:            private final FastByteArrayOutputStream[] cacheSkipByte;
103:
104:            /** The {@link OutputBitStream} where cached document data are written. */
105:            private final CachingOutputBitStream cacheDataOut;
106:
107:            /** The {@link FastBufferedInputStream} from which cached document data are read. */
108:            private final FastBufferedInputStream cacheDataIn;
109:
110:            /** The length of the data segment for each quantum. */
111:            private final int[] cacheDataLength;
112:
113:            /** An {@link OutputBitStream} wrapping a {@link NullOutputStream} for code-length preview. */
114:            private final OutputBitStream bitCount;
115:
116:            /** The sum of all tower data computed so far. */
117:            public final TowerData towerData;
118:
119:            /** The number of bits written for quantum lengths. */
120:            public long bitsForQuantumBitLengths;
121:
122:            /** The number of bits written for entry lenghts. */
123:            public long bitsForEntryBitLengths;
124:
125:            /** The number of written blocks. */
126:            public long numberOfBlocks;
127:
128:            /** An estimate on the number of bits occupied per tower entry in the last written cache, or -1 if no cache has been
129:             * written for the current inverted list. */
130:            public int prevEntryBitLength;
131:
132:            /** An estimate on the number of bits occupied per quantum in the last written cache, or -1 if no cache has been
133:             * written for the current inverted list. */
134:            public int prevQuantumBitLength;
135:
136:            /** The Golomb modulus for a top pointer skip, for each level. */
137:            private final int[] towerTopB;
138:
139:            /** The most significant bit of the Golomb modulus for a top pointer skip, for each level. */
140:            private final int[] towerTopLog2B;
141:
142:            /** The Golomb modulus for a lower pointer skip, for each level. */
143:            private final int[] towerLowerB;
144:
145:            /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */
146:            private final int[] towerLowerLog2B;
147:
148:            /** The prediction for a pointer skip, for each level. */
149:            private final int[] pointerPrediction;
150:
151:            private java.io.PrintWriter pointerSkipStats;
152:            private java.io.PrintWriter pointerTopSkipStats;
153:            private java.io.PrintWriter bitSkipStats;
154:            private java.io.PrintWriter bitTopSkipStats;
155:            private String pointerSkipLine, bitSkipLine;
156:
157:            /** Creates a new skip index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).
158:             *  If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). 
159:             *  
160:             *  <p>The size of the internal temporary buffer will be {@link #DEFAULT_TEMP_BUFFER_SIZE}.
161:             * 
162:             * @param basename the basename.
163:             * @param numberOfDocuments the number of documents in the collection to be indexed.
164:             * @param writeOffsets if <code>true</code>, the offset file will also be produced.
165:             * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
166:             * @param q the cache contains at most <var>2<sup>h</sup></var> document records.
167:             * @param h the maximum height of a skip tower.
168:             */
169:            public SkipBitStreamIndexWriter(final CharSequence basename,
170:                    final int numberOfDocuments, final boolean writeOffsets,
171:                    final Map<Component, Coding> flags, final int q, final int h)
172:                    throws IOException {
173:                this (new OutputBitStream(new FileOutputStream(basename
174:                        + DiskBasedIndex.INDEX_EXTENSION)),
175:                        writeOffsets ? new OutputBitStream(
176:                                new FileOutputStream(basename
177:                                        + DiskBasedIndex.OFFSETS_EXTENSION))
178:                                : null, numberOfDocuments,
179:                        DEFAULT_TEMP_BUFFER_SIZE, flags, q, h);
180:            }
181:
182:            /** Creates a new skip index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).
183:             *  If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). 
184:             * 
185:             * @param basename the basename.
186:             * @param numberOfDocuments the number of documents in the collection to be indexed.
187:             * @param writeOffsets if <code>true</code>, the offset file will also be produced.
188:             * @param tempBufferSize the size in bytes of the internal temporary buffer (inverted lists shorter than this size will never be flushed to disk). 
189:             * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
190:             * @param q the cache contains at most <var>2<sup>h</sup></var> document records.
191:             * @param h the maximum height of a skip tower.
192:             */
193:            public SkipBitStreamIndexWriter(final CharSequence basename,
194:                    final int numberOfDocuments, final boolean writeOffsets,
195:                    int tempBufferSize, final Map<Component, Coding> flags,
196:                    final int q, final int h) throws IOException {
197:                this (new OutputBitStream(new FileOutputStream(basename
198:                        + DiskBasedIndex.INDEX_EXTENSION)),
199:                        writeOffsets ? new OutputBitStream(
200:                                new FileOutputStream(basename
201:                                        + DiskBasedIndex.OFFSETS_EXTENSION))
202:                                : null, numberOfDocuments, tempBufferSize,
203:                        flags, q, h);
204:            }
205:
206:            /**
207:             * Creates a new skip index writer.
208:             * 
209:             * @param obs the underlying output bit stream.
210:             * @param offset the offset bit stream.
211:             * @param N the number of documents in the collection to be indexed.
212:             * @param tempBufferSize the size in bytes of the internal temporary buffer (inverted lists shorter than this size will never be flushed to disk). 
213:             * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
214:             * @param q the cache contains at most <var>2<sup>h</sup></var> document records.
215:             * @param h the maximum height of a skip tower.
216:             * @throws IOException
217:             */
218:            public SkipBitStreamIndexWriter(final OutputBitStream obs,
219:                    final OutputBitStream offset, final int N,
220:                    int tempBufferSize, final Map<Component, Coding> flags,
221:                    final int q, final int h) throws IOException {
222:                super (obs, offset, N, flags);
223:                if (h < 0)
224:                    throw new IllegalArgumentException("Illegal height " + h);
225:                if (q <= 0 || (q & -q) != q)
226:                    throw new IllegalArgumentException("Illegal quantum " + q);
227:                this .h = h;
228:                this .q = q;
229:
230:                int two2h = 1 << h;
231:                w = two2h * q;
232:
233:                if (DEBUG) {
234:                    System.err.println("Cache will contain at most " + w
235:                            + " records (q=" + q + ",h=" + h + ")");
236:                    System.err.print("Skip records will be ");
237:                    for (int i = 0; i < two2h; i++)
238:                        System.err.print((i * q) + " ");
239:                    System.err.println();
240:                }
241:
242:                towerData = new TowerData();
243:                tempFile = File.createTempFile("MG4J", ".data");
244:                cacheDataIn = new FastBufferedInputStream(new FileInputStream(
245:                        tempFile));
246:                cacheDataOut = new CachingOutputBitStream(tempFile,
247:                        tempBufferSize);
248:                cacheDataLength = new int[two2h];
249:                cachePointer = new OutputBitStream[two2h];
250:                cachePointerByte = new FastByteArrayOutputStream[two2h];
251:
252:                for (int i = 0; i < two2h; i++)
253:                    cachePointer[i] = new OutputBitStream(
254:                            cachePointerByte[i] = new FastByteArrayOutputStream(),
255:                            0);
256:
257:                cacheSkip = new OutputBitStream[two2h];
258:                cacheSkipBitCount = new OutputBitStream[two2h];
259:                cacheSkipByte = new FastByteArrayOutputStream[two2h];
260:
261:                for (int i = 0; i < two2h; i++) {
262:                    cacheSkip[i] = new OutputBitStream(
263:                            cacheSkipByte[i] = new FastByteArrayOutputStream(),
264:                            0);
265:                    cacheSkipBitCount[i] = new OutputBitStream(NullOutputStream
266:                            .getInstance(), 0);
267:                }
268:
269:                skipPointer = new int[two2h + 1];
270:                distance = new long[two2h + 1];
271:
272:                bitCount = new OutputBitStream(NullOutputStream.getInstance(),
273:                        0);
274:
275:                towerTopB = new int[h + 1];
276:                towerTopLog2B = new int[h + 1];
277:                towerLowerB = new int[h + 1];
278:                towerLowerLog2B = new int[h + 1];
279:                pointerPrediction = new int[h + 1];
280:
281:                if (STATS) {
282:                    try {
283:                        pointerSkipStats = new PrintWriter(
284:                                new java.io.FileWriter("pointerSkip.stats"));
285:                        pointerTopSkipStats = new PrintWriter(
286:                                new java.io.FileWriter("pointerTopSkip.stats"));
287:                        bitSkipStats = new PrintWriter(new java.io.FileWriter(
288:                                "bitSkip.stats"));
289:                        bitTopSkipStats = new PrintWriter(
290:                                new java.io.FileWriter("bitTopSkip.stats"));
291:
292:                        String comment = "# " + System.getProperty("freq")
293:                                + "\u00b1" + Integer.getInteger("error") + "%";
294:                        pointerSkipStats.println(comment);
295:                        pointerTopSkipStats.println(comment);
296:                        bitSkipStats.println(comment);
297:                        bitTopSkipStats.println(comment);
298:                    } catch (IOException e) {
299:                        System.err.println("Can't open stat files: " + e);
300:                    }
301:                }
302:            }
303:
304:            public long newInvertedList() throws IOException {
305:                if (cache != 0)
306:                    writeOutCache(-1);
307:                return super .newInvertedList();
308:            }
309:
310:            public int writeFrequency(final int frequency) throws IOException {
311:                int res = super .writeFrequency(frequency);
312:                prevQuantumBitLength = prevEntryBitLength = -1;
313:
314:                if (DEBUG)
315:                    System.err.println("----------- " + currentTerm + " ("
316:                            + frequency + ")");
317:
318:                final long pointerQuantumSigma = BitStreamIndex.quantumSigma(
319:                        frequency, numberOfDocuments, q);
320:                for (int i = Math
321:                        .min(h, Fast.mostSignificantBit(frequency / q)); i >= 0; i--) {
322:                    towerTopB[i] = BitStreamIndex.gaussianGolombModulus(
323:                            pointerQuantumSigma, i + 1);
324:                    towerTopLog2B[i] = Fast.mostSignificantBit(towerTopB[i]);
325:                    towerLowerB[i] = BitStreamIndex.gaussianGolombModulus(
326:                            pointerQuantumSigma, i);
327:                    towerLowerLog2B[i] = Fast
328:                            .mostSignificantBit(towerLowerB[i]);
329:                    pointerPrediction[i] = (int) ((q * (1L << i)
330:                            * numberOfDocuments + frequency / 2) / frequency);
331:                }
332:
333:                return res;
334:            }
335:
336:            public OutputBitStream newDocumentRecord() throws IOException {
337:                super .newDocumentRecord();
338:                return cacheDataOut;
339:            }
340:
341:            public int writeDocumentPointer(final OutputBitStream out,
342:                    final int pointer) throws IOException {
343:                // If the previous block is over, write it out!
344:
345:                if (cache == w)
346:                    writeOutCache(pointer);
347:
348:                // Record data pointer if we are on a skip; otherwise, write it to the cache.
349:                if (cache % q == 0) {
350:                    if (cache / q > 0)
351:                        cacheDataLength[cache / q - 1] = (int) cacheDataOut
352:                                .writtenBits();
353:                    cacheDataOut.align();
354:                    cacheDataOut.writtenBits(0);
355:                    skipPointer[cache / q] = pointer;
356:                    return super .writeDocumentPointer(
357:                            cachePointer[cache++ / q], pointer);
358:                } else {
359:                    cache++;
360:                    return super .writeDocumentPointer(cacheDataOut, pointer);
361:                }
362:            }
363:
364:            private int writeOutPointer(final OutputBitStream out,
365:                    final int pointer) throws IOException {
366:                if (frequency == numberOfDocuments)
367:                    return 0; // We do not write pointers for everywhere occurring terms.
368:
369:                switch (pointerCoding) {
370:                case GAMMA:
371:                    return out.writeGamma(pointer - lastDocument - 1);
372:                case DELTA:
373:                    return out.writeDelta(pointer - lastDocument - 1);
374:                case GOLOMB:
375:                    return out
376:                            .writeGolomb(pointer - lastDocument - 1, b, log2b);
377:                default:
378:                    throw new IllegalStateException(
379:                            "The required pointer coding (" + pointerCoding
380:                                    + ") is not supported.");
381:                }
382:            }
383:
384:            public void close() throws IOException {
385:
386:                if (cache != 0)
387:                    writeOutCache(-1);
388:                super .close();
389:
390:                cacheDataIn.close();
391:                cacheDataOut.close();
392:                tempFile.delete();
393:
394:                if (STATS) {
395:                    pointerSkipStats.close();
396:                    pointerTopSkipStats.close();
397:                    bitSkipStats.close();
398:                    bitTopSkipStats.close();
399:                }
400:            }
401:
402:            /** A structure maintaining statistical data about tower construction. */
403:
404:            public static class TowerData {
405:                /** The number of bits written for bit skips at the top of a tower. */
406:                public long bitsForTopBitSkips;
407:
408:                /** The number of bits written for skip pointers at the top of a tower. */
409:                public long bitsForTopSkipPointers;
410:
411:                /** The number of bits written for bit skips in the lower part of a tower. */
412:                public long bitsForLowerBitSkips;
413:
414:                /** The number of bits written for skip pointers in the lower part of a tower. */
415:                public long bitsForLowerSkipPointers;
416:
417:                /** The number of bits written for tower lengths. */
418:                public long bitsForTowerLengths;
419:
420:                /** The number of written skip towers. */
421:                public long numberOfSkipTowers;
422:
423:                /** The number of written top skip entries. */
424:                public long numberOfTopEntries;
425:
426:                /** The number of written lower skip entries. */
427:                public long numberOfLowerEntries;
428:
429:                /** Clear all fields of this tower data. */
430:
431:                void clear() {
432:                    bitsForTopBitSkips = 0;
433:                    bitsForTopSkipPointers = 0;
434:                    bitsForLowerBitSkips = 0;
435:                    bitsForLowerSkipPointers = 0;
436:                    bitsForTowerLengths = 0;
437:                    numberOfSkipTowers = 0;
438:                    numberOfTopEntries = 0;
439:                    numberOfLowerEntries = 0;
440:                }
441:
442:                /** Returns the overall number of bits used for skip pointers.
443:                 * @return the overall number of bits used for skip pointers.
444:                 */
445:                public long bitsForSkipPointers() {
446:                    return bitsForTopSkipPointers + bitsForLowerSkipPointers;
447:                }
448:
449:                /** Returns the overall number of bits used for bit skips. 
450:                 * @return the overall number of bits used for bit skips.
451:                 */
452:                public long bitsForBitSkips() {
453:                    return bitsForTopBitSkips + bitsForLowerBitSkips;
454:                }
455:
456:                /** Returns the overall number of bits used for tower entries (bits for tower lengths are not included).
457:                 * @return the overall number of bits used for tower entries.
458:                 */
459:                public long bitsForEntries() {
460:                    return bitsForSkipPointers() + bitsForBitSkips();
461:                }
462:
463:                /** Returns the overall number of bits used for towers.
464:                 * @return the overall number of bits used for towers.
465:                 */
466:                public long bitsForTowers() {
467:                    return bitsForTowerLengths + bitsForEntries();
468:                }
469:
470:                /** Returns the overall number of entries.
471:                 * @return the overall number of entries.
472:                 */
473:                public long numberOfEntries() {
474:                    return numberOfTopEntries + numberOfLowerEntries;
475:                }
476:            }
477:
478:            /** The <var>k</var>-th entry of this array contains the number of bits from the start of
479:             * the <var>k</var>-th skip tower up to the end of the current block (more precisely,
480:             * to the point that should be reached via skipping, which is just after the document pointer).
481:             * Indices are skip indices. It is used just by {@link #tryTower(int, int, long, OutputBitStream[], TowerData, boolean)}, 
482:             * but it is declared here for efficiency.
483:             */
484:            final private long[] distance;
485:
486:            /** The temporary file dumping the index data contained in a block. */
487:            final private File tempFile;
488:
489:            /** Whether we should write stats. */
490:            private boolean writeStats;
491:
492:            /** Computes the towers.
493:             * 
494:             * @param quantumBitLength the length in bits of a quantum.
495:             * @param entryBitLength the estimated length in bits of a tower entry.
496:             * @param toTheEnd the number of bits that must be skipped to reach the next tower (usually,
497:             * the length of the first pointer of the next block or 0 if this is to be the last block).
498:             * @param skip an array of output bit stream where the data related to each tower will be written.
499:             * @param towerData will be filled with statistical date about the towers.
500:             * @param doinIt if true, we are actually writing a tower, not just trying.
501:             */
502:            private void tryTower(final int quantumBitLength,
503:                    final int entryBitLength, long toTheEnd,
504:                    final OutputBitStream[] skip, final TowerData towerData,
505:                    final boolean doinIt) throws IOException {
506:                int i, k, s;
507:                long d;
508:                int basePointer;
509:                // truncated is true only for those towers (in defective blocks) whose height is strictly smaller than the height they should have
510:                boolean truncated = false;
511:
512:                if (DEBUG && doinIt)
513:                    System.err.println("Writing out tower for term "
514:                            + currentTerm + "; quantumBitLength="
515:                            + quantumBitLength + " entryBitLength="
516:                            + entryBitLength);
517:
518:                for (k = (cache - 1) / q; k >= 0; k--) {
519:                    // Where are we? At the end of the k-th quantum. So toTheEnd must be increased by
520:                    // the length of the data contained in the same quantum, moving us...
521:                    toTheEnd += cacheDataLength[k];
522:
523:                    // ...just after the k-th skip tower.
524:                    // We compute the maximum valid index of the skip tower (*MUST* be kept in sync with the subsequent loop).
525:                    s = (k == 0) ? h : Fast.leastSignificantBit(k);
526:
527:                    // This test handles defective blocks. In particular, for defective quanta s=-1,
528:                    // yielding no skipping data at all for such quanta. truncated is true if the
529:                    // current tower is truncated w.r.t. the infinite skip list.
530:                    if (cache < w) {
531:                        final int upperBound = Fast
532:                                .mostSignificantBit((cache / q) - k);
533:                        if (s > upperBound) {
534:                            s = upperBound;
535:                            truncated = true;
536:                        } else
537:                            truncated = false;
538:                    } else
539:                        truncated = k == 0;
540:
541:                    skip[k].writtenBits(0);
542:
543:                    if (s >= 0) {
544:                        if (DEBUG && doinIt)
545:                            System.err.print("% (" + k + ") [" + skipPointer[k]
546:                                    + "] ");
547:
548:                        basePointer = skipPointer[k];
549:
550:                        /* If the current tower is truncated, we must actually write the top of the tower.
551:                         * The top must be forecast in a Bernoullian way: we write it as a difference from the average pointer skip, 
552:                         * which is q 2^s / relativeFrequency. */
553:                        if (truncated) {
554:                            towerData.numberOfTopEntries++;
555:                            towerData.bitsForTopSkipPointers += skip[k]
556:                                    .writeGolomb(Fast.int2nat(skipPointer[k
557:                                            + (1 << s)]
558:                                            - basePointer
559:                                            - pointerPrediction[s]),
560:                                            towerTopB[s], towerTopLog2B[s]);
561:                            towerData.bitsForTopBitSkips += skip[k]
562:                                    .writeLongDelta(Fast
563:                                            .int2nat((int) ((toTheEnd - distance[k
564:                                                    + (1 << s)]) - (quantumBitLength
565:                                                    * (1 << s) + entryBitLength
566:                                                    * ((1 << s + 1) - s - 2)))));
567:                        }
568:
569:                        if (DEBUG && doinIt)
570:                            System.err.print((truncated ? "" : "(")
571:                                    + (skipPointer[k + (1 << s)] - basePointer)
572:                                    + ":" + (toTheEnd - distance[k + (1 << s)])
573:                                    + (truncated ? " " : ") "));
574:
575:                        if (STATS && doinIt && writeStats) {
576:                            // These *MUST* be kept in sync with the writes above.
577:                            if (truncated) {
578:                                pointerTopSkipStats
579:                                        .println(s
580:                                                + "\t"
581:                                                + (skipPointer[k + (1 << s)]
582:                                                        - basePointer - pointerPrediction[s]));
583:                                bitTopSkipStats
584:                                        .println(s
585:                                                + "\t"
586:                                                + (int) ((toTheEnd - distance[k
587:                                                        + (1 << s)]) - (quantumBitLength
588:                                                        * (1 << s) + entryBitLength
589:                                                        * ((1 << s + 1) - s - 2))));
590:                            }
591:                            pointerSkipLine = "";
592:                            bitSkipLine = "";
593:                        }
594:
595:                        // Produce a (single) tower of height s
596:                        for (i = s - 1; i >= 0; i--) {
597:                            towerData.bitsForLowerSkipPointers += skip[k]
598:                                    .writeGolomb(
599:                                            Fast
600:                                                    .int2nat((skipPointer[k
601:                                                            + (1 << i)] - basePointer)
602:                                                            - ((skipPointer[k
603:                                                                    + (1 << i + 1)] - basePointer) / 2)),
604:                                            towerLowerB[i], towerLowerLog2B[i]);
605:
606:                            if (STATS && doinIt && writeStats)
607:                                pointerSkipLine = ((skipPointer[k + (1 << i)] - basePointer) - ((skipPointer[k
608:                                        + (1 << i + 1)] - basePointer) / 2))
609:                                        + "\t" + pointerSkipLine;
610:
611:                            towerData.bitsForLowerBitSkips += skip[k]
612:                                    .writeLongDelta(Fast
613:                                            .int2nat((int) (((toTheEnd
614:                                                    - distance[k
615:                                                            + (1 << (i + 1))] - entryBitLength
616:                                                    * (i + 1)) / 2) - (toTheEnd - distance[k
617:                                                    + (1 << i)]))));
618:
619:                            if (STATS && doinIt && writeStats)
620:                                bitSkipLine = (int) (((toTheEnd
621:                                        - distance[k + (1 << (i + 1))] - entryBitLength
622:                                        * (i + 1)) / 2) - (toTheEnd - distance[k
623:                                        + (1 << i)]))
624:                                        + "\t" + bitSkipLine;
625:
626:                            if (DEBUG && doinIt)
627:                                System.err
628:                                        .print((skipPointer[k + (1 << i)] - basePointer)
629:                                                + ":"
630:                                                + (toTheEnd - distance[k
631:                                                        + (1 << i)]) + " ");
632:                        }
633:
634:                        if (s > 0) { // No length for single-entry towers.
635:                            d = bitCount.writeDelta(Fast.int2nat((int) skip[k]
636:                                    .writtenBits()
637:                                    - (s + 1) * entryBitLength));
638:                            towerData.bitsForTowerLengths += d;
639:                            toTheEnd += d;
640:                        }
641:
642:                        if (STATS && doinIt && writeStats) {
643:                            if (pointerSkipLine.length() > 0)
644:                                pointerSkipStats.println(pointerSkipLine);
645:                            if (bitSkipLine.length() > 0)
646:                                bitSkipStats.println(bitSkipLine);
647:                        }
648:
649:                        toTheEnd += skip[k].writtenBits();
650:
651:                        if (DEBUG && doinIt)
652:                            System.err.print(" (" + (int) skip[k].writtenBits()
653:                                    + " bits)");
654:
655:                        towerData.numberOfLowerEntries += s;
656:                        towerData.numberOfSkipTowers++;
657:
658:                        if (DEBUG && doinIt)
659:                            System.err.println();
660:                    }
661:
662:                    distance[k] = toTheEnd;
663:
664:                    // Where are we? Just before the beginning of the k-th skip tower
665:                    toTheEnd += cachePointer[k].writtenBits();
666:
667:                    // Where are we? Just before the beginning of the k-th document record
668:                }
669:            }
670:
671:            /** Write out the cache content.
672:             * 
673:             * @param nextPointer the first pointer of the next block, or -1 if this is the last block.
674:             */
675:            private void writeOutCache(final int nextPointer)
676:                    throws IOException {
677:                if (DEBUG)
678:                    System.err.println("Entered writeOutCache() with cache="
679:                            + cache + " (H is " + (1 << h) + ", B is " + w
680:                            + ")");
681:
682:                cacheDataLength[(cache + q - 1) / q - 1] = (int) cacheDataOut
683:                        .writtenBits();
684:
685:                /* Number of bits to go after the first pointer of the first record of the next block (or, if there
686:                   is no other block in the current list, to go to the end of the list). */
687:                long toTheEnd;
688:
689:                // Record the new document pointer for the highest tower
690:                int nextAfter = ((cache + q) - 1) / q; // This is ceil( cache / q )
691:
692:                if (nextPointer >= 0) {
693:                    skipPointer[nextAfter] = nextPointer;
694:                    toTheEnd = writeOutPointer(bitCount, nextPointer);
695:                } else {
696:                    skipPointer[nextAfter] = currentDocument + 1; // Fake: just for the last block
697:                    toTheEnd = 0;
698:                }
699:
700:                distance[nextAfter] = 0;
701:
702:                int k, s;
703:                long d;
704:
705:                // Compute quantum length in bits (without towers)
706:                int quantumBitLength = 0, entryBitLength = 0;
707:
708:                for (d = k = 0; k <= ((cache - 1) / q); k++)
709:                    d += (cachePointer[k].writtenBits() + cacheDataLength[k]);
710:                quantumBitLength = (int) (((d * q) + (cache - 1)) / cache);
711:
712:                final TowerData td = new TowerData();
713:                final Int2IntRBTreeMap candidates = new Int2IntRBTreeMap();
714:
715:                /* As a first try, we compute the tower costs using 0 as average entry bit length. */
716:                tryTower(quantumBitLength, 0, toTheEnd, cacheSkipBitCount, td,
717:                        false);
718:
719:                if (td.numberOfSkipTowers > 0) { // There actually is at least a tower.
720:                    /* Now we repeat this operation, trying to obtain the best value for the
721:                     * average entry bit length. 
722:                     */
723:
724:                    while (candidates.size() < MAX_TRY
725:                            && !candidates
726:                                    .containsValue(entryBitLength = (int) (td
727:                                            .bitsForTowers() / td
728:                                            .numberOfEntries()))) {
729:                        td.clear();
730:                        tryTower(quantumBitLength, entryBitLength, toTheEnd,
731:                                cacheSkipBitCount, td, false);
732:                        candidates.put((int) (td.bitsForTowers() / td
733:                                .numberOfEntries()), entryBitLength);
734:                    }
735:
736:                    if (ASSERTS)
737:                        assert candidates.size() < MAX_TRY;
738:
739:                    entryBitLength = candidates.get(candidates.firstIntKey());
740:
741:                    if (STATS && System.getProperty("freq") != null) {
742:                        final double freq = Double.parseDouble(System
743:                                .getProperty("freq"));
744:                        final double error = Integer.getInteger("error")
745:                                .intValue() / 100.0;
746:                        final double relativeFrequency = (double) frequency
747:                                / numberOfDocuments;
748:                        if ((writeStats = (relativeFrequency >= freq
749:                                * (1 - error) && relativeFrequency <= freq
750:                                * (1 + error)))) {
751:                            pointerSkipStats.println("# " + currentTerm);
752:                            pointerTopSkipStats.println("# " + currentTerm);
753:                            bitSkipStats.println("# " + currentTerm + " "
754:                                    + quantumBitLength + " " + entryBitLength);
755:                            bitTopSkipStats.println("# " + currentTerm + " "
756:                                    + quantumBitLength + " " + entryBitLength);
757:                        }
758:                    }
759:                    if (DEBUG)
760:                        System.err.println("Going to write tower at position "
761:                                + obs.writtenBits());
762:                    tryTower(quantumBitLength, entryBitLength, toTheEnd,
763:                            cacheSkip, towerData, true);
764:                }
765:
766:                // Ready to write out cache
767:                int maxCacheDataLength = 0;
768:                for (k = 0; k <= ((cache - 1) / q); k++)
769:                    if (cacheDataLength[k] > maxCacheDataLength)
770:                        maxCacheDataLength = cacheDataLength[k];
771:
772:                /* We have two ways of writing out cached data. If all the data is still in the output bit
773:                 * stream buffer, we just read it directly. Otherwise, we have to pour it into a temporary buffer. */
774:
775:                final byte[] buffer;
776:                final boolean direct;
777:                int pos = 0;
778:
779:                cacheDataOut.align();
780:
781:                if (cacheDataOut.buffer() != null) {
782:                    buffer = cacheDataOut.buffer();
783:                    direct = true;
784:                } else {
785:                    cacheDataOut.flush();
786:                    buffer = new byte[(maxCacheDataLength + 7) / 8];
787:                    direct = false;
788:                    cacheDataIn.flush();
789:                    cacheDataIn.position(0);
790:                }
791:
792:                for (k = 0; k <= ((cache - 1) / q); k++) {
793:
794:                    /* See comments above. */
795:                    s = (k == 0) ? h : Fast.leastSignificantBit(k);
796:
797:                    if (cache < w)
798:                        s = Math.min(s, Fast
799:                                .mostSignificantBit((cache / q) - k));
800:
801:                    d = cachePointer[k].writtenBits();
802:                    cachePointer[k].flush();
803:                    obs.write(cachePointerByte[k].array, d);
804:
805:                    d = cacheSkip[k].writtenBits();
806:                    cacheSkip[k].flush();
807:
808:                    if (s >= 0) {
809:                        if (k == 0) {
810:                            if (prevQuantumBitLength < 0) {
811:                                bitsForQuantumBitLengths += obs
812:                                        .writeLongDelta(quantumBitLength);
813:                                bitsForEntryBitLengths += obs
814:                                        .writeLongDelta(entryBitLength);
815:                            } else {
816:                                bitsForQuantumBitLengths += obs
817:                                        .writeLongDelta(Fast
818:                                                .int2nat(quantumBitLength
819:                                                        - prevQuantumBitLength));
820:                                bitsForEntryBitLengths += obs
821:                                        .writeLongDelta(Fast
822:                                                .int2nat(entryBitLength
823:                                                        - prevEntryBitLength));
824:                            }
825:
826:                            prevQuantumBitLength = quantumBitLength;
827:                            prevEntryBitLength = entryBitLength;
828:
829:                            numberOfBlocks++;
830:                        }
831:
832:                        if (s > 0)
833:                            obs.writeDelta(Fast.int2nat((int) d
834:                                    - entryBitLength * (s + 1))); // No length for single-entry towers.
835:                    } else if (ASSERTS)
836:                        assert d == 0;
837:
838:                    obs.write(cacheSkipByte[k].array, d);
839:
840:                    if (direct) {
841:                        obs.write(buffer, pos * 8, cacheDataLength[k]);
842:                        pos += (cacheDataLength[k] + 7) / 8;
843:                    } else {
844:                        cacheDataIn.read(buffer, 0,
845:                                (cacheDataLength[k] + 7) / 8);
846:                        obs.write(buffer, cacheDataLength[k]);
847:                    }
848:                }
849:
850:                // Clean used caches
851:                for (k = 0; k <= ((cache - 1) / q); k++) {
852:                    cachePointerByte[k].reset();
853:                    cachePointer[k].writtenBits(0);
854:
855:                    cacheSkipByte[k].reset();
856:                    cacheSkip[k].writtenBits(0);
857:
858:                    cacheDataOut.position(0);
859:                    cacheDataOut.writtenBits(0);
860:                }
861:
862:                cache = 0;
863:
864:                if (ASSERTS)
865:                    assert obs.writtenBits() == writtenBits();
866:            }
867:
868:            public long writtenBits() {
869:                return super .writtenBits() + towerData.bitsForTopSkipPointers
870:                        + towerData.bitsForTopBitSkips
871:                        + towerData.bitsForLowerSkipPointers
872:                        + towerData.bitsForLowerBitSkips
873:                        + towerData.bitsForTowerLengths
874:                        + bitsForQuantumBitLengths + bitsForEntryBitLengths;
875:            }
876:
877:            public Properties properties() {
878:                Properties result = super .properties();
879:                result.setProperty(Index.PropertyKeys.INDEXCLASS,
880:                        FileIndex.class.getName());
881:                result.setProperty(BitStreamIndex.PropertyKeys.SKIPQUANTUM, q);
882:                result.setProperty(BitStreamIndex.PropertyKeys.SKIPHEIGHT, h);
883:                return result;
884:            }
885:
886:            public void printStats(final PrintStream stats) {
887:                super .printStats(stats);
888:                stats.println("Skip towers: "
889:                        + Util.format(towerData.numberOfSkipTowers)
890:                        + " ("
891:                        + Util.format(towerData.bitsForTowers())
892:                        + " bits ["
893:                        + Util.format(towerData.bitsForTowers() * 100.0
894:                                / writtenBits())
895:                        + "%], "
896:                        + Util.format(towerData.bitsForTowers()
897:                                / (double) towerData.numberOfSkipTowers)
898:                        + " bits/tower)");
899:                stats.println("Skip entries: "
900:                        + Util.format(towerData.numberOfEntries())
901:                        + " ("
902:                        + Util.format(towerData.bitsForEntries()
903:                                / (double) towerData.numberOfEntries())
904:                        + " bits/entry)");
905:                // Note that lengths are written approximately every other tower.
906:                stats.println("Skip tower lengths: "
907:                        + Util.format(towerData.bitsForTowerLengths)
908:                        + " bits ("
909:                        + Util.format(2.0 * towerData.bitsForTowerLengths
910:                                / towerData.numberOfSkipTowers)
911:                        + " bits/tower)");
912:                stats.println("Quantum bit lengths: "
913:                        + Util.format(bitsForQuantumBitLengths)
914:                        + " bits ("
915:                        + Util.format(bitsForQuantumBitLengths
916:                                / (double) numberOfBlocks) + " bits/block)");
917:                stats.println("Entry bit lengths: "
918:                        + Util.format(bitsForEntryBitLengths)
919:                        + " bits ("
920:                        + Util.format(bitsForEntryBitLengths
921:                                / (double) numberOfBlocks) + " bits/block)");
922:
923:                stats.println("Top bit skips: "
924:                        + Util.format(towerData.bitsForTopBitSkips)
925:                        + " bits ("
926:                        + Util.format(towerData.bitsForTopBitSkips
927:                                / (double) towerData.numberOfTopEntries)
928:                        + " bits/skip)");
929:                stats.println("Top pointer skips: "
930:                        + Util.format(towerData.bitsForTopSkipPointers)
931:                        + " bits ("
932:                        + Util.format(towerData.bitsForTopSkipPointers
933:                                / (double) towerData.numberOfTopEntries)
934:                        + " bits/skip)");
935:                stats.println("Lower bit skips: "
936:                        + Util.format(towerData.bitsForLowerBitSkips)
937:                        + " bits ("
938:                        + Util.format(towerData.bitsForLowerBitSkips
939:                                / (double) towerData.numberOfLowerEntries)
940:                        + " bits/skip)");
941:                stats.println("Lower pointer skips: "
942:                        + Util.format(towerData.bitsForLowerSkipPointers)
943:                        + " bits ("
944:                        + Util.format(towerData.bitsForLowerSkipPointers
945:                                / (double) towerData.numberOfLowerEntries)
946:                        + " bits/skip)");
947:                stats.println("Bit skips: "
948:                        + Util.format(towerData.bitsForBitSkips())
949:                        + " bits ("
950:                        + Util.format(towerData.bitsForBitSkips()
951:                                / (double) towerData.numberOfEntries())
952:                        + " bits/skip)");
953:                stats.println("Pointer skips: "
954:                        + Util.format(towerData.bitsForSkipPointers())
955:                        + " bits ("
956:                        + Util.format(towerData.bitsForSkipPointers()
957:                                / (double) towerData.numberOfEntries())
958:                        + " bits/skip)");
959:            }
960:
961:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.