test_Sort.py :  » Search » PyLucene » pylucene-3.0.1-1 » test » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Search » PyLucene 
PyLucene » pylucene 3.0.1 1 » test » test_Sort.py
# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================

import math

from itertools import izip
from random import randint
from unittest import TestCase,main
from lucene import *

NUM_STRINGS = 6000



class SortTestCase(TestCase):
    """
    Unit tests for sorting code, ported from Java Lucene
    """

    def __init__(self, *args, **kwds):
        """
        Forward all arguments to TestCase and set up ``self.data``, the
        fixture table the test indexes are built from.
        """

        super(SortTestCase, self).__init__(*args, **kwds)

        # One row per document, one column per field.  _getIndex() turns
        # column 0 into the stored "tracer" field, column 1 into the analyzed
        # "contents" field and columns 2..11 into un-analyzed sort fields.
        # A None cell means the document does not get that field at all.
        self.data = [
    #      tracer  contents         int            float           string   custom   i18n               long                  double                 short                 byte            custom parser encoding
        [   "A",   "x a",           "5",           "4f",           "c",    "A-3",   u"p\u00EAche",      "10",                  "-4.0",                "3",                  "126",          "J"  ],
        [   "B",   "y a",           "5",           "3.4028235E38", "i",    "B-10",  "HAT",             "1000000000",          "40.0",                "24",                 "1",            "I"  ],
        [   "C",   "x a b c",       "2147483647",  "1.0",          "j",    "A-2",   u"p\u00E9ch\u00E9", "99999999",            "40.00002343",         "125",                "15",           "H"  ],
        [   "D",   "y a b c",       "-1",          "0.0f",         "a",     "C-0",   "HUT",             str(Long.MAX_VALUE),  str(Double.MIN_VALUE), str(Short.MIN_VALUE), str(Byte.MIN_VALUE), "G"  ],
        [   "E",   "x a b c d",     "5",           "2f",           "h",     "B-8",   "peach",           str(Long.MIN_VALUE),  str(Double.MAX_VALUE), str(Short.MAX_VALUE), str(Byte.MAX_VALUE), "F"  ],
        [   "F",   "y a b c d",     "2",           "3.14159f",     "g",     "B-1",   u"H\u00C5T",        "-44",                "343.034435444",       "-3",                 "0",            "E"  ],
        [   "G",   "x a b c d",     "3",           "-1.0",         "f",     "C-100", "sin",             "323254543543",       "4.043544",            "5",                  "100",          "D"  ],
        [   "H",   "y a b c d",     "0",           "1.4E-45",      "e",     "C-88",  u"H\u00D8T",        "1023423423005",      "4.043545",            "10",                 "-50",          "C"  ],
        [   "I",   "x a b c d e f", "-2147483648", "1.0e+0",       "d",     "A-10",  u"s\u00EDn",        "332422459999",       "4.043546",            "-340",               "51",           "B"  ],
        [   "J",   "y a b c d e f", "4",           ".5",           "b",     "C-7",   "HOT",             "34334543543",        "4.0000220343",        "300",                "2",            "A"  ],
        [   "W",   "g",             "1",           None,           None,    None,    None,              None,                 None,                  None,                 None,           None  ],
        [   "X",   "g",             "1",           "0.1",          None,    None,    None,              None,                 None,                  None,                 None,           None  ],
        [   "Y",   "g",             "1",           "0.2",          None,    None,    None,              None,                 None,                  None,                 None,           None  ],
        [   "Z",   "f g",           None,          None,           None,    None,    None,              None,                 None,                  None,                 None,           None  ],
        ]

    def _getIndex(self, even, odd):
        """
        Build an in-memory index from the rows of ``self.data`` and return
        an IndexSearcher over it.

        even -- when true, rows at even positions (0, 2, ...) are indexed
        odd  -- when true, rows at odd positions (1, 3, ...) are indexed

        Every selected row becomes one document: column 0 is stored (but not
        indexed) as "tracer", column 1 is analyzed as "contents", and each
        remaining column that is not None is indexed un-analyzed under the
        matching name in ``sortFields``.
        """
        # Field names for columns 2..11 of a data row, in column order.
        sortFields = ("int", "float", "string", "custom", "i18n", "long",
                      "double", "short", "byte", "parser")

        indexStore = RAMDirectory()
        writer = IndexWriter(indexStore, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        # NOTE(review): tiny buffer + large merge factor presumably keep the
        # index split over several small segments -- confirm.
        writer.setMaxBufferedDocs(2)
        writer.setMergeFactor(1000)

        for i, row in enumerate(self.data):
            if (i % 2 == 0 and even) or (i % 2 == 1 and odd):
                doc = Document()
                doc.add(Field("tracer", row[0], Field.Store.YES,
                              Field.Index.NO))
                doc.add(Field("contents", row[1], Field.Store.NO,
                              Field.Index.ANALYZED))
                for name, value in zip(sortFields, row[2:]):
                    # a None cell means "this document has no such field"
                    if value is not None:
                        doc.add(Field(name, value, Field.Store.NO,
                                      Field.Index.NOT_ANALYZED))
                doc.setBoost(2.0)  # produce some scores above 1.0
                writer.addDocument(doc)
        # the index is deliberately left un-optimized
        writer.close()

        s = IndexSearcher(indexStore, True)
        s.setDefaultFieldSortScoring(True, True)

        return s

    def _getFullIndex(self):
        """
        Return a searcher over an index holding every row of ``self.data``.
        """
        return self._getIndex(even=True, odd=True)

    def getFullStrings(self):
        """
        Build an in-memory index of NUM_STRINGS documents carrying short
        random strings and return an IndexSearcher over it.

        Each document has "string"/"string2" (indexed, un-analyzed) plus the
        same values stored un-indexed as "tracer"/"tracer2".
        """
        directory = RAMDirectory()
        indexWriter = IndexWriter(directory, SimpleAnalyzer(), True,
                                  IndexWriter.MaxFieldLength.LIMITED)
        indexWriter.setMaxBufferedDocs(4)
        indexWriter.setMergeFactor(97)

        for _ in xrange(NUM_STRINGS):
            document = Document()

            # 2 to 8 characters with codes 48..52
            primaryLength = self.getRandomNumber(2, 8)
            primary = self.getRandomCharString(primaryLength, 48, 52)
            document.add(Field("tracer", primary,
                               Field.Store.YES, Field.Index.NO))
            document.add(Field("string", primary,
                               Field.Store.NO, Field.Index.NOT_ANALYZED))

            # 1 to 4 characters with codes 48..50
            secondaryLength = self.getRandomNumber(1, 4)
            secondary = self.getRandomCharString(secondaryLength, 48, 50)
            document.add(Field("string2", secondary,
                               Field.Store.NO, Field.Index.NOT_ANALYZED))
            document.add(Field("tracer2", secondary,
                               Field.Store.YES, Field.Index.NO))

            document.setBoost(2.0)  # produce some scores above 1.0

            # vary the flush threshold from one document to the next
            indexWriter.setMaxBufferedDocs(self.getRandomNumber(2, 12))
            indexWriter.addDocument(document)

        # no optimize() call: the index is left as the writer produced it
        indexWriter.close()

        return IndexSearcher(directory, True)
  
    def getRandomNumberString(self, num, low, high):

        return ''.join([self.getRandomNumber(low, high) for i in xrange(num)])
  
    def getRandomCharString(self, num):

        return self.getRandomCharString(num, 48, 122)
  
    def getRandomCharString(self, num,  start, end):
        
        return ''.join([chr(self.getRandomNumber(start, end))
                        for i in xrange(num)])
  
    def getRandomNumber(self, low, high):
        """
        Return a random integer drawn with ``randint(low, high)``, i.e. one
        that lies between ``low`` and ``high`` with both ends included.
        """
        number = randint(low, high)
        return number

    def _getXIndex(self):
        """
        Return a searcher over the even-positioned rows of ``self.data``.
        """
        return self._getIndex(even=True, odd=False)

    def _getYIndex(self):
        """
        Return a searcher over the odd-positioned rows of ``self.data``.
        """
        return self._getIndex(even=False, odd=True)

    def _getEmptyIndex(self):
        """
        Return a searcher over an index that contains no documents.
        """
        return self._getIndex(even=False, odd=False)

    def setUp(self):
        """
        Create the three searchers (all rows, even rows, odd rows) and the
        single-term "contents" queries shared by the tests.
        """

        def contentsQuery(word):
            # query for documents whose "contents" field holds ``word``
            return TermQuery(Term("contents", word))

        self.full = self._getFullIndex()
        self.searchX = self._getXIndex()
        self.searchY = self._getYIndex()

        self.queryX = contentsQuery("x")
        self.queryY = contentsQuery("y")
        self.queryA = contentsQuery("a")
        self.queryE = contentsQuery("e")
        self.queryF = contentsQuery("f")
        self.queryG = contentsQuery("g")

    def testBuiltInSorts(self):
        """
        test the sorts by score (a default-constructed Sort) and by
        document number
        """
        expectations = ((self.queryX, "ACEGI"), (self.queryY, "BDFHJ"))

        # default-constructed Sort
        sort = Sort()
        for query, tracers in expectations:
            self._assertMatches(self.full, query, sort, tracers)

        # explicit document-number order gives the same sequences
        sort.setSort(SortField.FIELD_DOC)
        for query, tracers in expectations:
            self._assertMatches(self.full, query, sort, tracers)

    def testTypedSort(self):
        """
        test sorts where the type of field is specified; ties are broken by
        document number
        """
        # field name, SortField type, expected order for queryX / queryY
        cases = (
            ("int",    SortField.INT,    "IGAEC", "DHFJB"),
            ("float",  SortField.FLOAT,  "GCIEA", "DHJFB"),
            ("long",   SortField.LONG,   "EACGI", "FBJHD"),
            ("double", SortField.DOUBLE, "AGICE", "DJHBF"),
            ("byte",   SortField.BYTE,   "CIGAE", "DHFBJ"),
            ("short",  SortField.SHORT,  "IAGCE", "DFHBJ"),
            ("string", SortField.STRING, "AIGEC", "DJHFB"),
        )

        sort = Sort()
        for fieldName, fieldType, expectedX, expectedY in cases:
            sort.setSort([SortField(fieldName, fieldType),
                          SortField.FIELD_DOC])
            self._assertMatches(self.full, self.queryX, sort, expectedX)
            self._assertMatches(self.full, self.queryY, sort, expectedY)
  
    def testStringSort(self):
        """
        Test String sorting: small queue to many matches, multi field sort,
        reverse sort

        The top 500 hits over the random-string index must be ordered by
        "string" ascending, then "string2" descending, then document number
        ascending.  Every violation is collected and reported through the
        assertion message.
        """

        sort = Sort()
        searcher = self.getFullStrings()

        sort.setSort([SortField("string", SortField.STRING),
                      SortField("string2", SortField.STRING, True),
                      SortField.FIELD_DOC])

        result = searcher.search(MatchAllDocsQuery(), None, 500, sort).scoreDocs

        buff = []       # "field1(field2)(docID) " for every hit, in hit order
        failures = []   # one message per ordering violation
        last = None
        lastSub = None
        lastDocId = 0

        for scoreDoc in result:
            doc2 = searcher.doc(scoreDoc.doc)
            v = doc2.getValues("tracer")
            v2 = doc2.getValues("tracer2")
            for _v, _v2 in zip(v, v2):
                if last is not None:
                    if _v < last:
                        # first field must be in ascending order
                        failures.append("fail: %s < %s" % (_v, last))
                    elif _v == last:
                        if _v2 > lastSub:
                            # second field must be in reverse order
                            failures.append("rev field fail: %s > %s"
                                            % (_v2, lastSub))
                        elif _v2 == lastSub and scoreDoc.doc < lastDocId:
                            # full tie: doc ids must be in ascending order
                            failures.append("doc fail: %s < %s"
                                            % (scoreDoc.doc, lastDocId))

                last = _v
                lastSub = _v2
                lastDocId = scoreDoc.doc
                buff.append(_v + "(" + _v2 + ")(" + str(scoreDoc.doc) + ") ")

        message = "Found sort results out of order"
        if failures:
            message += (": " + "; ".join(failures) +
                        " -- topn field1(field2)(docID): " + ''.join(buff))

        self.assertTrue(not failures, message)
  
    def testCustomFieldParserSort(self):
        """
        test sorts where the type of field is specified and a custom field
        parser is used, that uses a simple char encoding. The sorted string
        contains a character beginning from 'A' that is mapped to a numeric
        value using some "funny" algorithm to be different for each data
        type.
        """

        # since tests explicitly use different parsers on the same field name
        # we explicitly check/purge the FieldCache between each assertMatch
        fc = FieldCache.DEFAULT
        
        class intParser(PythonIntParser):
            def parseInt(_self, val):
                return (ord(val[0]) - ord('A')) * 123456

        class floatParser(PythonFloatParser):
            def parseFloat(_self, val):
                return math.sqrt(ord(val[0]))

        class longParser(PythonLongParser):
            def parseLong(_self, val):
                return (ord(val[0]) - ord('A')) * 1234567890L

        class doubleParser(PythonDoubleParser):
            def parseDouble(_self, val):
                return math.pow(ord(val[0]), ord(val[0]) - ord('A'))

        class byteParser(PythonByteParser):
            def parseByte(_self, val):
                return chr(ord(val[0]) - ord('A'))

        class shortParser(PythonShortParser):
            def parseShort(_self, val):
                return ord(val[0]) - ord('A')

        sort = Sort()
        sort.setSort([SortField("parser", intParser()),
                      SortField.FIELD_DOC])
        self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
        self._assertSaneFieldCaches(self.getName() + " IntParser")
        fc.purgeAllCaches()

        sort.setSort([SortField("parser", floatParser()),
                      SortField.FIELD_DOC])
        self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
        self._assertSaneFieldCaches(self.getName() + " FloatParser")
        fc.purgeAllCaches()

        sort.setSort([SortField("parser", longParser()),
                           SortField.FIELD_DOC])
        self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
        self._assertSaneFieldCaches(self.getName() + " LongParser")
        fc.purgeAllCaches()

        sort.setSort([SortField("parser", doubleParser()),
                      SortField.FIELD_DOC])
        self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
        self._assertSaneFieldCaches(self.getName() + " DoubleParser")
        fc.purgeAllCaches()

        sort.setSort([SortField("parser", byteParser()),
                      SortField.FIELD_DOC])
        self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
        self._assertSaneFieldCaches(self.getName() + " ByteParser")
        fc.purgeAllCaches()

        sort.setSort([SortField("parser", shortParser()),
                      SortField.FIELD_DOC])
        self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
        self._assertSaneFieldCaches(self.getName() + " ShortParser")
        fc.purgeAllCaches()

    def testEmptyIndex(self):
        """
        test sorts when there's nothing in the index: every sort must yield
        no hits at all
        """
        empty = self._getEmptyIndex()

        # default-constructed Sort first
        sort = Sort()
        self._assertMatches(empty, self.queryX, sort, "")

        # then a selection of explicit sort criteria
        criteria = (
            SortField.FIELD_DOC,
            [SortField("int", SortField.INT), SortField.FIELD_DOC],
            [SortField("string", SortField.STRING, True),
             SortField.FIELD_DOC],
            [SortField("float", SortField.FLOAT),
             SortField("string", SortField.STRING)],
        )
        for criterion in criteria:
            sort.setSort(criterion)
            self._assertMatches(empty, self.queryX, sort, "")



    def testNewCustomFieldParserSort(self):
        """
        Test sorting the "parser" field w/ a custom FieldComparator source
        """
        comparatorSource = MyFieldComparatorSource()

        sort = Sort()
        sort.setSort([SortField("parser", comparatorSource)])

        self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")

    def testReverseSort(self):
        """
        test sorts in reverse
        """
        # sort criterion, expected order for queryX, expected order for queryY
        cases = (
            ([SortField(None, SortField.SCORE, True), SortField.FIELD_DOC],
             "IEGCA", "JFHDB"),
            (SortField(None, SortField.DOC, True), "IGECA", "JHFDB"),
            (SortField("int", SortField.INT, True), "CAEGI", "BJFHD"),
            (SortField("float", SortField.FLOAT, True), "AECIG", "BFJHD"),
            (SortField("string", SortField.STRING, True), "CEGIA", "BFHJD"),
        )

        sort = Sort()
        for criterion, expectedX, expectedY in cases:
            sort.setSort(criterion)
            self._assertMatches(self.full, self.queryX, sort, expectedX)
            self._assertMatches(self.full, self.queryY, sort, expectedY)

    def testEmptyFieldSort(self):
        """
        test sorting when the sort field is empty (undefined) for some of
        the documents
        """
        sort = Sort()

        # sort criterion and the tracer order expected for queryF
        singleFieldCases = (
            (SortField("string", SortField.STRING), "ZJI"),
            (SortField("string", SortField.STRING, True), "IJZ"),
            (SortField("i18n", Locale.ENGLISH), "ZJI"),
            (SortField("i18n", Locale.ENGLISH, True), "IJZ"),
            (SortField("int", SortField.INT), "IZJ"),
            (SortField("int", SortField.INT, True), "JZI"),
            (SortField("float", SortField.FLOAT), "ZJI"),
            # using a nonexisting field as first sort key shouldn't make a
            # difference:
            ([SortField("nosuchfield", SortField.STRING),
              SortField("float", SortField.FLOAT)], "ZJI"),
            (SortField("float", SortField.FLOAT, True), "IJZ"),
        )
        for criterion, expected in singleFieldCases:
            sort.setSort(criterion)
            self._assertMatches(self.full, self.queryF, sort, expected)

        def intStringFloat(reverseFloat):
            # int, then string, then float (optionally reversed)
            if reverseFloat:
                floatField = SortField("float", SortField.FLOAT, True)
            else:
                floatField = SortField("float", SortField.FLOAT)
            return [SortField("int", SortField.INT),
                    SortField("string", SortField.STRING),
                    floatField]

        # When a field is None for both documents, the next SortField should
        # be used.  The same two checks run against the plain searcher, a
        # MultiSearcher and a ParallelMultiSearcher; each wrapper is only
        # created when its turn comes.
        searcherFactories = (
            lambda: self.full,
            lambda: MultiSearcher([self.full]),
            lambda: ParallelMultiSearcher([self.full]),
        )
        for makeSearcher in searcherFactories:
            searcher = makeSearcher()

            sort.setSort(intStringFloat(False))
            self._assertMatches(searcher, self.queryG, sort, "ZWXY")

            # Reverse the last criterion to make sure the test didn't pass
            # by chance
            sort.setSort(intStringFloat(True))
            self._assertMatches(searcher, self.queryG, sort, "ZYXW")

            # Don't close the wrapping searchers. it would close the full
            # searcher too!

    def testSortCombos(self):
        """
        test sorts using a series of fields
        """
        # list of sort fields and the order expected for queryX
        combos = (
            ([SortField("int", SortField.INT),
              SortField("float", SortField.FLOAT)], "IGEAC"),
            ([SortField("int", SortField.INT, True),
              SortField(None, SortField.DOC, True)], "CEAGI"),
            ([SortField("float", SortField.FLOAT),
              SortField("string", SortField.STRING)], "GICEA"),
        )

        sort = Sort()
        for sortFields, expected in combos:
            sort.setSort(sortFields)
            self._assertMatches(self.full, self.queryX, sort, expected)

    def testLocaleSort(self):
        """
        test using a Locale (Locale.US) for sorting strings, forwards and
        in reverse
        """
        ascending = [SortField("string", Locale.US)]
        descending = [SortField("string", Locale.US, True)]

        sort = Sort()
        for sortFields, expectedX, expectedY in (
                (ascending, "AIGEC", "DJHFB"),
                (descending, "CEGIA", "BFHJD")):
            sort.setSort(sortFields)
            self._assertMatches(self.full, self.queryX, sort, expectedX)
            self._assertMatches(self.full, self.queryY, sort, expectedY)

    def testInternationalSort(self):
        """
        test using various international locales with accented characters
        (which sort differently depending on locale)
        """
        # locale for the "i18n" field, query to run, expected tracer order
        cases = (
            (Locale.US, self.queryY, "BFJDH"),
            (Locale("sv", "se"), self.queryY, "BJDFH"),
            (Locale("da", "dk"), self.queryY, "BJDHF"),
            (Locale.US, self.queryX, "ECAGI"),
            (Locale.FRANCE, self.queryX, "EACGI"),
        )

        sort = Sort()
        for locale, query, expected in cases:
            sort.setSort(SortField("i18n", locale))
            self._assertMatches(self.full, query, sort, expected)

    def testInternationalMultiSearcherSort(self):
        """
        Test the MultiSearcher's ability to preserve locale-sensitive ordering
        by wrapping it around a single searcher
        """
        sort = Sort()
        multiSearcher = MultiSearcher([self.full])

        # locale for the "i18n" field and the order expected for queryY
        cases = (
            (Locale("sv", "se"), "BJDFH"),
            (Locale.US, "BFJDH"),
            (Locale("da", "dk"), "BJDHF"),
        )
        for locale, expected in cases:
            sort.setSort(SortField("i18n", locale))
            self._assertMatches(multiSearcher, self.queryY, sort, expected)

    
    def testMultiSort(self):
        """
        test a variety of sorts using more than one searcher (a
        MultiSearcher over the X and Y indexes)
        """
        subSearchers = [self.searchX, self.searchY]
        self.runMultiSorts(MultiSearcher(subSearchers), False)

    def testParallelMultiSort(self):
        """
        test a variety of sorts using a parallel multisearcher over the X
        and Y indexes
        """
        subSearchers = [self.searchX, self.searchY]
        self.runMultiSorts(ParallelMultiSearcher(subSearchers), False)

    def testNormalizedScores(self):
        """
        test that the relevancy scores are the same even if
        hits are sorted

        Baseline scores are captured from unsorted searches on the full
        index; every sort criterion below must then reproduce them, both on
        the full index and on a MultiSearcher over the X and Y indexes.
        """
        queries = (self.queryX, self.queryY, self.queryA)

        def scoresFor(searcher, query, sort=None):
            # scores of the top 1000 hits, optionally under ``sort``
            if sort is None:
                topDocs = searcher.search(query, None, 1000)
            else:
                topDocs = searcher.search(query, None, 1000, sort)
            return self.getScores(topDocs.scoreDocs, searcher)

        # capture relevancy scores
        baseline = [scoresFor(self.full, query) for query in queries]

        # we'll test searching locally and multi
        multi = MultiSearcher([self.searchX, self.searchY])
        searchers = (self.full, multi)

        def checkScores(sort):
            # same order as before: per query, full index first, then multi
            for query, expected in zip(queries, baseline):
                for searcher in searchers:
                    self._assertSameValues(expected,
                                           scoresFor(searcher, query, sort))

        # change sorting and make sure relevancy stays the same
        sort = Sort()
        checkScores(sort)

        criteria = (
            SortField.FIELD_DOC,
            SortField("int", SortField.INT),
            SortField("float", SortField.FLOAT),
            SortField("string", SortField.STRING),
            [SortField("int", SortField.INT),
             SortField("float", SortField.FLOAT)],
            [SortField("int", SortField.INT, True),
             SortField(None, SortField.DOC, True)],
            [SortField("float", SortField.FLOAT),
             SortField("string", SortField.STRING)],
        )
        for criterion in criteria:
            sort.setSort(criterion)
            checkScores(sort)

    def testTopDocsScores(self):
        """
        There was previously a bug in FieldSortedHitQueue.maxscore when only
        a single doc was added.  That is what the following tests for.

        The score of the top hit must be the same (within 1e-6) whether the
        search is run with or without the filter defined below.
        """

        sort = Sort()
        nDocs = 10

        # try to pick a query that will result in an unnormalized
        # score greater than 1 to test for correct normalization
        docs1 = self.full.search(self.queryE, None, nDocs, sort)

        # a filter built around the first hit of the unfiltered search
        # NOTE(review): set(0, maxDoc) looks like it switches on every bit
        # before the first hit's bit is set, so this filter may well admit
        # all docs rather than only the first hit -- confirm against the
        # Java test this was ported from before tightening it.
        class filter(PythonFilter):
            def getDocIdSet(_self, reader):
                bs = BitSet(reader.maxDoc())
                bs.set(0, reader.maxDoc())
                bs.set(docs1.scoreDocs[0].doc)
                return DocIdBitSet(bs)

        filt = filter()

        docs2 = self.full.search(self.queryE, filt, nDocs, sort)

        # unittest's assertEqual takes a third positional argument as the
        # failure message, not as a tolerance, so the float scores are
        # compared with an explicit delta instead.
        unfiltered = docs1.scoreDocs[0].score
        filtered = docs2.scoreDocs[0].score
        self.assertTrue(abs(unfiltered - filtered) <= 1e-6,
                        "top scores differ: %r != %r" % (unfiltered, filtered))
  
    def testSortWithoutFillFields(self):
        """
        Regression test for TopFieldCollector used with fillFields False.

        A former bug put the same doc and score into every entry of the
        ScoreDoc[] array in that mode.  The collector is created directly
        because Searcher's default search methods (with Sort) all set
        fillFields to True.  Neighbouring results must reference different
        documents.
        """

        # flags handed to TopFieldCollector.create after the hit count
        # (names follow the Lucene 3.0 signature)
        fillFields = False
        trackDocScores = False
        trackMaxScore = False
        docsScoredInOrder = True

        for sort in (Sort(SortField.FIELD_DOC), Sort()):
            query = MatchAllDocsQuery()
            collector = TopFieldCollector.create(sort, 10, fillFields,
                                                 trackDocScores,
                                                 trackMaxScore,
                                                 docsScoredInOrder)
            self.full.search(query, collector)

            # walk the results, comparing each hit with the one before it
            previous = None
            for scoreDoc in collector.topDocs().scoreDocs:
                if previous is not None:
                    self.assertTrue(scoreDoc.doc != previous.doc)
                previous = scoreDoc

    def testSortWithoutScoreTracking(self):
        """
        Without score tracking every hit score and the max score are NaN.

        Two Sort criteria are used to instantiate the multi/single
        comparators.
        """

        # flags handed to TopFieldCollector.create after the hit count
        # (names follow the Lucene 3.0 signature)
        fillFields = True
        trackDocScores = False
        trackMaxScore = False
        docsScoredInOrder = True

        for sort in (Sort(SortField.FIELD_DOC), Sort()):
            query = MatchAllDocsQuery()
            collector = TopFieldCollector.create(sort, 10, fillFields,
                                                 trackDocScores,
                                                 trackMaxScore,
                                                 docsScoredInOrder)
            self.full.search(query, collector)

            topDocs = collector.topDocs()
            for scoreDoc in topDocs.scoreDocs:
                self.assertTrue(Float.isNaN_(scoreDoc.score))

            self.assertTrue(Float.isNaN_(topDocs.getMaxScore()))

    def testSortWithScoreNoMaxScoreTracking(self):
        """
        With doc scores tracked but not the max score, every hit has a
        real score while the max score stays NaN.

        Two Sort criteria are used to instantiate the multi/single
        comparators.
        """

        # flags handed to TopFieldCollector.create after the hit count
        # (names follow the Lucene 3.0 signature)
        fillFields = True
        trackDocScores = True
        trackMaxScore = False
        docsScoredInOrder = True

        for sort in (Sort(SortField.FIELD_DOC), Sort()):
            query = MatchAllDocsQuery()
            collector = TopFieldCollector.create(sort, 10, fillFields,
                                                 trackDocScores,
                                                 trackMaxScore,
                                                 docsScoredInOrder)
            self.full.search(query, collector)

            topDocs = collector.topDocs()
            for scoreDoc in topDocs.scoreDocs:
                self.assertTrue(not Float.isNaN_(scoreDoc.score))

            self.assertTrue(Float.isNaN_(topDocs.getMaxScore()))
  
    def testSortWithScoreAndMaxScoreTracking(self):
        """
        With both doc scores and the max score tracked, neither the hit
        scores nor the max score may be NaN.

        Two Sort criteria are used to instantiate the multi/single
        comparators.
        """

        # flags handed to TopFieldCollector.create after the hit count
        # (names follow the Lucene 3.0 signature)
        fillFields = True
        trackDocScores = True
        trackMaxScore = True
        docsScoredInOrder = True

        for sort in (Sort(SortField.FIELD_DOC), Sort()):
            query = MatchAllDocsQuery()
            collector = TopFieldCollector.create(sort, 10, fillFields,
                                                 trackDocScores,
                                                 trackMaxScore,
                                                 docsScoredInOrder)
            self.full.search(query, collector)

            topDocs = collector.topDocs()
            for scoreDoc in topDocs.scoreDocs:
                self.assertTrue(not Float.isNaN_(scoreDoc.score))

            self.assertTrue(not Float.isNaN_(topDocs.getMaxScore()))

    def testOutOfOrderDocsScoringSort(self):
        """
        Checks which collector class TopFieldCollector.create picks when
        docs are not scored in order, and that each one collects 10 hits.

        Two Sort criteria are used to instantiate the multi/single
        comparators.
        """

        sorts = (Sort(SortField.FIELD_DOC), Sort())

        # each entry pairs the three boolean flags passed to create() after
        # the hit count with the expected (inner) collector class name
        cases = [
            ((False, False, False),
             "OutOfOrderOneComparatorNonScoringCollector"),
            ((False, False, True),
             "OutOfOrderOneComparatorScoringMaxScoreCollector"),
            ((False, True, False),
             "OutOfOrderOneComparatorScoringNoMaxScoreCollector"),
            ((False, True, True),
             "OutOfOrderOneComparatorScoringMaxScoreCollector"),
            ((True, False, False),
             "OutOfOrderOneComparatorNonScoringCollector"),
            ((True, False, True),
             "OutOfOrderOneComparatorScoringMaxScoreCollector"),
            ((True, True, False),
             "OutOfOrderOneComparatorScoringNoMaxScoreCollector"),
            ((True, True, True),
             "OutOfOrderOneComparatorScoringMaxScoreCollector"),
        ]

        booleanQuery = BooleanQuery()

        # Add a Query with SHOULD, since bw.scorer() returns BooleanScorer2
        # which delegates to BS if there are no mandatory clauses.
        booleanQuery.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)

        # Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to
        # return the clause instead of BQ.
        booleanQuery.setMinimumNumberShouldMatch(1)

        for sort in sorts:
            for (first, second, third), className in cases:
                # the trailing False is the "docs scored in order" flag
                collector = TopFieldCollector.create(sort, 10, first,
                                                     second, third, False)

                actualName = collector.getClass().getName()
                self.assertTrue(actualName.endswith("$" + className))

                self.full.search(booleanQuery, collector)

                scoreDocs = collector.topDocs().scoreDocs
                self.assertEqual(10, len(scoreDocs))
  
    def testSortWithScoreAndMaxScoreTrackingNoResults(self):
        """
        A collector that never ran a search reports zero hits and a NaN
        max score.

        Two Sort criteria are used to instantiate the multi/single
        comparators.
        """

        for sort in (Sort(SortField.FIELD_DOC), Sort()):
            # no search is executed: topDocs() is asked for right away
            collector = TopFieldCollector.create(sort, 10,
                                                 True, True, True, True)
            topDocs = collector.topDocs()
            self.assertEqual(0, topDocs.totalHits)
            self.assertTrue(Float.isNaN_(topDocs.getMaxScore()))
  
    def runMultiSorts(self, multi, isFull):
        """
        runs a variety of sorts useful for multisearchers

        multi is the searcher the sorts are run against.  isFull selects
        the expected orderings for the doc-order and int sorts, the only
        ones whose expectation differs between the two cases.
        """
        sort = Sort()

        def check(criteria, query, expected):
            # point the shared Sort at the criteria, then verify the order
            sort.setSort(criteria)
            self._assertMatches(multi, query, sort, expected)

        if isFull:
            byDoc = "ABCDEFGHIJ"
            byInt = "IDHFGJABEC"
            byIntReversed = "CABEJGFHDI"
        else:
            byDoc = "ACEGIBDFHJ"
            byInt = "IDHFGJAEBC"
            byIntReversed = "CAEBJGFHDI"

        queryA = self.queryA
        queryF = self.queryF

        check(SortField.FIELD_DOC, queryA, byDoc)

        check(SortField("int", SortField.INT), queryA, byInt)
        check([SortField("int", SortField.INT), SortField.FIELD_DOC],
              queryA, byInt)
        check(SortField("int", SortField.INT), queryA, byInt)

        check([SortField("float", SortField.FLOAT), SortField.FIELD_DOC],
              queryA, "GDHJCIEFAB")
        check(SortField("float", SortField.FLOAT), queryA, "GDHJCIEFAB")

        check(SortField("string", SortField.STRING), queryA, "DJAIHGFEBC")

        check(SortField("int", SortField.INT, True), queryA, byIntReversed)
        check(SortField("float", SortField.FLOAT, True), queryA,
              "BAFECIJHDG")
        check(SortField("string", SortField.STRING, True), queryA,
              "CBEFGHIAJD")

        check([SortField("int", SortField.INT),
               SortField("float", SortField.FLOAT)],
              queryA, "IDHFGJEABC")
        check([SortField("float", SortField.FLOAT),
               SortField("string", SortField.STRING)],
              queryA, "GDHJICEFAB")

        check(SortField("int", SortField.INT), queryF, "IZJ")
        check(SortField("int", SortField.INT, True), queryF, "JZI")
        check(SortField("float", SortField.FLOAT), queryF, "ZJI")
        check(SortField("string", SortField.STRING), queryF, "ZJI")
        check(SortField("string", SortField.STRING, True), queryF, "IJZ")

        # up to this point, all of the searches should have "sane"
        # FieldCache behavior, and should have reused the cache in several
        # cases
        self._assertSaneFieldCaches(self.getName() + " various")

        # next we'll check Locale based(String[]) for 'string', so purge first
        FieldCache.DEFAULT.purgeAllCaches()

        check([SortField("string", Locale.US)], queryA, "DJAIHGFEBC")
        check([SortField("string", Locale.US, True)], queryA, "CBEFGHIAJD")
        check([SortField("string", Locale.UK)], queryA, "DJAIHGFEBC")

        self._assertSaneFieldCaches(self.getName() + " Locale.US + Locale.UK")
        FieldCache.DEFAULT.purgeAllCaches()

    def _assertMatches(self, searcher, query, sort, expectedResult):
        """
        make sure the documents returned by the search match the expected
        list
        """

        # ScoreDoc[] result = searcher.search(query, None, 1000, sort).scoreDocs
        hits = searcher.search(query, None, len(expectedResult), sort)
        sds = hits.scoreDocs

        self.assertEqual(hits.totalHits, len(expectedResult))
        buff = []
        for sd in sds:
            doc = searcher.doc(sd.doc)
            v = doc.getValues("tracer")
            for _v in v:
                buff.append(_v)

        self.assertEqual(expectedResult, ''.join(buff))

    def getScores(self, hits, searcher):
        """
        Map the "tracer" value of each hit's document to the hit's score.

        Every document is expected to carry exactly one "tracer" value;
        this is asserted for each hit.  Returns the resulting dict.
        """

        scoresByTracer = {}
        for scoreDoc in hits:
            tracers = searcher.doc(scoreDoc.doc).getValues("tracer")
            self.assertEqual(len(tracers), 1)
            scoresByTracer[tracers[0]] = scoreDoc.score

        return scoresByTracer

    def _assertSameValues(self, m1, m2):
        """
        make sure all the values in the maps match
        """

        self.assertEquals(len(m1), len(m2))
        for key in m1.iterkeys():
            self.assertEquals(m1[key], m2[key], 1e-6)

    def getName(self):
        """
        Return the name of this object's class.
        """
        cls = type(self)
        return cls.__name__

    def _assertSaneFieldCaches(self, msg):
        """
        Fail if FieldCacheSanityChecker reports anything for the current
        entries of the default FieldCache.

        msg is prepended to the failure message.
        """

        failure = msg + ": Insane FieldCache usage(s) found"
        insanity = FieldCacheSanityChecker.checkSanity(
            FieldCache.DEFAULT.getCacheEntries())

        self.assertEqual(0, len(insanity), failure)


class MyFieldComparator(PythonFieldComparator):
    """
    Comparator that orders hits by an int parsed from the "parser" field.

    The int for a document is the offset of the field value's first
    character from 'A', multiplied by 123456.
    """

    def __init__(self, numHits):
        super(MyFieldComparator, self).__init__()
        # one value per slot, all starting at 0
        self.slotValues = [0 for _ in range(numHits)]

    def copy(self, slot, doc):
        # remember the value of doc in the given slot
        value = self.docValues[doc]
        self.slotValues[slot] = value

    def compare(self, slot1, slot2):
        # difference of the two slot values
        first = self.slotValues[slot1]
        second = self.slotValues[slot2]
        return first - second

    def compareBottom(self, doc):
        # difference between the bottom value and the value of doc
        candidate = self.docValues[doc]
        return self.bottomValue - candidate

    def setBottom(self, bottom):
        # bottom is a slot number; keep that slot's value at hand
        self.bottomValue = self.slotValues[bottom]

    def setNextReader(self, reader, docBase):
        # docBase is not used; the values are looked up per reader

        class firstCharParser(PythonIntParser):
            def parseInt(_self, val):
                offset = ord(val[0]) - ord('A')
                return offset * 123456

        parser = firstCharParser()
        self.docValues = FieldCache.DEFAULT.getInts(reader, "parser", parser)

    def value(self, slot):
        # slot value wrapped in an Integer
        stored = self.slotValues[slot]
        return Integer(stored)


class MyFieldComparatorSource(PythonFieldComparatorSource):
    """
    Comparator source that hands out MyFieldComparator instances.
    """

    def newComparator(self, fieldname, numHits, sortPos, reversed):
        # only numHits is used; the other arguments are ignored
        comparator = MyFieldComparator(numHits)
        return comparator



if __name__ == "__main__":
    import sys, lucene
    # initialise the lucene VM before the tests are run
    env = lucene.initVM()
    if '-loop' not in sys.argv:
        main()
    else:
        # '-loop' re-runs the whole suite forever; strip the flag first so
        # that unittest's main() does not see it on the command line
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except:
                # deliberately swallow everything, including the SystemExit
                # unittest's main() normally finishes with, so the loop
                # keeps going
                pass
            # debugging aid, kept disabled:
#            refs = sorted(env._dumpRefs(classes=True).items(),
#                          key=lambda x: x[1], reverse=True)
#            print refs[0:4]
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.