From 2a1585f97dbbfe109f70d3b3b55098e4b5d793c9 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 14 Mar 2019 08:46:52 +0100 Subject: [PATCH 1/6] Upgrade to Lucene 8. --- pom.xml | 2 +- .../analysis/EnglishStemmingAnalyzer.java | 11 +- .../anserini/analysis/FreebaseAnalyzer.java | 28 ++--- .../io/anserini/index/IndexCollection.java | 2 +- .../java/io/anserini/index/IndexUtils.java | 13 +- .../java/io/anserini/kg/IndexFreebase.java | 2 +- .../io/anserini/kg/LookupFreebaseNodes.java | 4 +- .../io/anserini/ltr/BaseFeatureExtractor.java | 11 +- .../feature/base/BM25FeatureExtractor.java | 4 +- .../ltr/feature/base/PMIFeatureExtractor.java | 6 +- .../io/anserini/rerank/lib/AxiomReranker.java | 6 +- .../anserini/rerank/lib/RankLibReranker.java | 4 +- .../io/anserini/rerank/lib/Rm3Reranker.java | 4 +- .../java/io/anserini/search/SearchArgs.java | 8 +- .../io/anserini/search/SearchCollection.java | 19 ++- .../io/anserini/search/SimpleSearcher.java | 10 +- .../similarity/AxiomaticSimilarity.java | 116 +++++------------- .../search/similarity/RankLibSimilarity.java | 9 +- .../io/anserini/util/ExtractTopDfTerms.java | 4 +- .../io/anserini/integration/IndexerTest.java | 2 +- .../integration/MultiThreadingSearchTest.java | 3 +- .../integration/TrecEndToEndTest.java | 3 +- .../integration/TweetEndToEndTest.java | 3 +- 23 files changed, 103 insertions(+), 171 deletions(-) diff --git a/pom.xml b/pom.xml index 37c54f0fa2..a2996778e6 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ - 7.6.0 + 8.0.0 UTF-8 diff --git a/src/main/java/io/anserini/analysis/EnglishStemmingAnalyzer.java b/src/main/java/io/anserini/analysis/EnglishStemmingAnalyzer.java index 1ed4055d96..5e8be821c6 100644 --- a/src/main/java/io/anserini/analysis/EnglishStemmingAnalyzer.java +++ b/src/main/java/io/anserini/analysis/EnglishStemmingAnalyzer.java @@ -17,12 +17,11 @@ package io.anserini.analysis; import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.KStemFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; public class EnglishStemmingAnalyzer extends StopwordAnalyzerBase { @@ -30,11 +29,11 @@ public class EnglishStemmingAnalyzer extends StopwordAnalyzerBase { private final CharArraySet stemExclusionSet; public EnglishStemmingAnalyzer() { - this("", StandardAnalyzer.STOP_WORDS_SET); + this("", EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); } public EnglishStemmingAnalyzer(String stemmer) { - this(stemmer, StandardAnalyzer.STOP_WORDS_SET, CharArraySet.EMPTY_SET); + this(stemmer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, CharArraySet.EMPTY_SET); } public EnglishStemmingAnalyzer(CharArraySet stopwords) { @@ -54,7 +53,7 @@ public EnglishStemmingAnalyzer(String stemmer, CharArraySet stopwords, CharArray protected TokenStreamComponents createComponents(String fieldName) { Tokenizer source = new StandardTokenizer(); TokenStream result = null; - result = new StandardFilter(source); + result = source; result = new EnglishPossessiveFilter(result); result = new LowerCaseFilter(result); result = new StopFilter(result, this.stopwords); @@ -72,7 +71,7 @@ protected TokenStreamComponents createComponents(String fieldName) { } protected TokenStream normalize(String fieldName, TokenStream in) { - TokenStream result = new StandardFilter(in); + TokenStream result = in; result = new LowerCaseFilter(result); return result; } diff --git a/src/main/java/io/anserini/analysis/FreebaseAnalyzer.java b/src/main/java/io/anserini/analysis/FreebaseAnalyzer.java index f53237aec2..31eae0065d 100644 --- a/src/main/java/io/anserini/analysis/FreebaseAnalyzer.java +++ b/src/main/java/io/anserini/analysis/FreebaseAnalyzer.java @@ -21,13 +21,11 @@ import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.StopwordAnalyzerBase; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Analyzer.TokenStreamComponents; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; /* ASCIIFoldingFilter is used for accent folding. This will normalize the characters @@ -54,23 +52,23 @@ public FreebaseAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { protected TokenStreamComponents createComponents(String fieldName) { StandardTokenizer source = new StandardTokenizer(); - StandardFilter result = new StandardFilter(source); - EnglishPossessiveFilter result2 = new EnglishPossessiveFilter(result); - LowerCaseFilter result3 = new LowerCaseFilter(result2); - Object result4 = new StopFilter(result3, this.stopwords); - result4 = new ASCIIFoldingFilter((TokenStream) result4); + TokenStream result = source; + result = new EnglishPossessiveFilter(result); + result = new LowerCaseFilter(result); + result = new StopFilter(result, this.stopwords); + result = new ASCIIFoldingFilter(result); if(!this.stemExclusionSet.isEmpty()) { - result4 = new SetKeywordMarkerFilter((TokenStream)result4, this.stemExclusionSet); + result = new SetKeywordMarkerFilter(result, this.stemExclusionSet); } - PorterStemFilter result1 = new PorterStemFilter((TokenStream)result4); - return new TokenStreamComponents(source, result1); + result = new PorterStemFilter(result); + return new TokenStreamComponents(source, result); } protected TokenStream normalize(String fieldName, TokenStream in) { - StandardFilter result = new StandardFilter(in); - LowerCaseFilter result1 = new LowerCaseFilter(result); - return result1; + TokenStream result = in; + result = new LowerCaseFilter(result); + return result; } private static class DefaultSetHolder { @@ -80,7 +78,7 @@ private DefaultSetHolder() { } static { - DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET; + DEFAULT_STOP_SET = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET; } } } \ No newline at end of file diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index 24aecd6d44..6b5d221f50 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -594,7 +594,7 @@ public void run() throws IOException { if (args.solr) { numIndexed = counters.indexed.get(); } else { - numIndexed = args.dryRun ? counters.indexed.get() : writer.maxDoc(); + numIndexed = args.dryRun ? counters.indexed.get() : writer.getDocStats().maxDoc; } // Do a final commit diff --git a/src/main/java/io/anserini/index/IndexUtils.java b/src/main/java/io/anserini/index/IndexUtils.java index 1544510059..2bd7036073 100755 --- a/src/main/java/io/anserini/index/IndexUtils.java +++ b/src/main/java/io/anserini/index/IndexUtils.java @@ -45,7 +45,6 @@ import java.io.*; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; import java.util.ArrayList; @@ -147,8 +146,7 @@ public InputStream getReadFileStream(String path) throws IOException { } void printIndexStats() throws IOException { - Fields fields = MultiFields.getFields(reader); - Terms terms = fields.terms(LuceneDocumentGenerator.FIELD_BODY); + Terms terms = MultiTerms.getTerms(reader, LuceneDocumentGenerator.FIELD_BODY); System.out.println("Index statistics"); System.out.println("----------------"); @@ -159,10 +157,9 @@ void printIndexStats() throws IOException { System.out.println("stored fields:"); - FieldInfos fieldInfos = MultiFields.getMergedFieldInfos(reader); - for (String fd : fields) { - FieldInfo fi = fieldInfos.fieldInfo(fd); - System.out.println(" " + fd + " (" + "indexOption: " + fi.getIndexOptions() + + FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader); + for (FieldInfo fi : fieldInfos) { + System.out.println(" " + fi.name + " (" + "indexOption: " + fi.getIndexOptions() + ", hasVectors: " + fi.hasVectors() + ")"); } } @@ -178,7 +175,7 @@ public void printTermCounts(String termStr) throws IOException, ParseException { System.out.println("collection frequency: " + reader.totalTermFreq(t)); System.out.println("document frequency: " + reader.docFreq(t)); - PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(reader, LuceneDocumentGenerator.FIELD_BODY, t.bytes()); + PostingsEnum postingsEnum = MultiTerms.getTermPostingsEnum(reader, LuceneDocumentGenerator.FIELD_BODY, t.bytes()); System.out.println("postings:\n"); while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { System.out.printf("\t%s, %s\n", postingsEnum.docID(), postingsEnum.freq()); diff --git a/src/main/java/io/anserini/kg/IndexFreebase.java b/src/main/java/io/anserini/kg/IndexFreebase.java index d163c8e448..5732f050e1 100644 --- a/src/main/java/io/anserini/kg/IndexFreebase.java +++ b/src/main/java/io/anserini/kg/IndexFreebase.java @@ -161,7 +161,7 @@ public void run() throws IOException { LOG.info(String.format("%,d triples indexed.", triplesCount.get())); LOG.info(String.format("%,d documents added.", docCount.get())); - int numIndexed = writer.maxDoc(); + int numIndexed = writer.getDocStats().maxDoc; try { writer.commit(); diff --git a/src/main/java/io/anserini/kg/LookupFreebaseNodes.java b/src/main/java/io/anserini/kg/LookupFreebaseNodes.java index 4ac64e11b9..52ebb40d88 100644 --- a/src/main/java/io/anserini/kg/LookupFreebaseNodes.java +++ b/src/main/java/io/anserini/kg/LookupFreebaseNodes.java @@ -104,11 +104,11 @@ public Document lookupMid(String mid) throws IOException { TermQuery query = new TermQuery(new Term(IndexFreebase.FIELD_ID, mid)); TopDocs topDocs = searcher.search(query, 1); - if (topDocs.totalHits == 0) { + if (topDocs.totalHits.value == 0) { System.err.println("Error: mid not found!"); return null; } - if (topDocs.totalHits > 1) { + if (topDocs.totalHits.value > 1) { System.err.println("Error: more than one matching mid found. This shouldn't happen!"); return null; } diff --git a/src/main/java/io/anserini/ltr/BaseFeatureExtractor.java b/src/main/java/io/anserini/ltr/BaseFeatureExtractor.java index 92d384ae6f..e2e0077bed 100644 --- a/src/main/java/io/anserini/ltr/BaseFeatureExtractor.java +++ b/src/main/java/io/anserini/ltr/BaseFeatureExtractor.java @@ -25,7 +25,8 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiBits; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Terms; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -168,7 +169,7 @@ private void printHeader(PrintStream out, FeatureExtractors extractors) { public void printFeatureForAllDocs(PrintStream out) throws IOException { Map> queryContextMap = buildRerankerContextMap(); FeatureExtractors extractors = getExtractors(); - Bits liveDocs = MultiFields.getLiveDocs(reader); + Bits liveDocs = MultiBits.getLiveDocs(reader); Set fieldsToLoad = getFieldsToLoad(); this.printHeader(out, extractors); @@ -183,7 +184,7 @@ public void printFeatureForAllDocs(PrintStream out) throws IOException { String docIdString = doc.get(getIdField()); // NOTE doc frequencies should not be retrieved from here, term vector returned is as if on single document // index - Terms terms = MultiFields.getTerms(reader, getTermVectorField());//reader.getTermVector(docId, getTermVectorField()); + Terms terms = MultiTerms.getTerms(reader, getTermVectorField());//reader.getTermVector(docId, getTermVectorField()); if (terms == null) { continue; @@ -207,7 +208,7 @@ public void printFeatureForAllDocs(PrintStream out) throws IOException { public void printFeatures(PrintStream out) throws IOException { Map> queryContextMap = buildRerankerContextMap(); FeatureExtractors extractors = getExtractors(); - Bits liveDocs = MultiFields.getLiveDocs(reader); + Bits liveDocs = MultiBits.getLiveDocs(reader); Set fieldsToLoad = getFieldsToLoad(); // We need to open a searcher @@ -227,7 +228,7 @@ public void printFeatures(PrintStream out) throws IOException { int qrelScore = entry.getValue(); // We issue a specific query TopDocs topDocs = searcher.search(docIdQuery(docId), 1); - if (topDocs.totalHits == 0) { + if (topDocs.totalHits.value == 0) { LOG.warn(String.format("Document Id %s expected but not found in index, skipping...", docId)); continue; } diff --git a/src/main/java/io/anserini/ltr/feature/base/BM25FeatureExtractor.java b/src/main/java/io/anserini/ltr/feature/base/BM25FeatureExtractor.java index e0bee3295d..be6c845176 100644 --- a/src/main/java/io/anserini/ltr/feature/base/BM25FeatureExtractor.java +++ b/src/main/java/io/anserini/ltr/feature/base/BM25FeatureExtractor.java @@ -23,7 +23,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -83,7 +83,7 @@ private double computeAvgFL(long sumTermFreqs, long maxDocs) { private long getSumTermFrequency(IndexReader reader, String fieldName) { Terms collectionTermVector = null; try { - collectionTermVector = MultiFields.getTerms(reader, fieldName); + collectionTermVector = MultiTerms.getTerms(reader, fieldName); long totalTermFreq = collectionTermVector.getSumTotalTermFreq(); return totalTermFreq; } catch (IOException e) { diff --git a/src/main/java/io/anserini/ltr/feature/base/PMIFeatureExtractor.java b/src/main/java/io/anserini/ltr/feature/base/PMIFeatureExtractor.java index 15385d9d49..80067b6ed4 100644 --- a/src/main/java/io/anserini/ltr/feature/base/PMIFeatureExtractor.java +++ b/src/main/java/io/anserini/ltr/feature/base/PMIFeatureExtractor.java @@ -21,7 +21,7 @@ import io.anserini.rerank.RerankerContext; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; @@ -100,8 +100,8 @@ public float extract(Document doc, Terms terms, RerankerContext context) { for (int j = i +1; j < queryTokens.size(); j++) { pairsComputed ++; String secondToken = queryTokens.get(j); - PostingsEnum firstEnum = MultiFields.getTermDocsEnum(reader,LuceneDocumentGenerator.FIELD_BODY, new BytesRef(firstToken)); - PostingsEnum secondEnum = MultiFields.getTermDocsEnum(reader,LuceneDocumentGenerator.FIELD_BODY, new BytesRef(secondToken)); + PostingsEnum firstEnum = MultiTerms.getTermPostingsEnum(reader,LuceneDocumentGenerator.FIELD_BODY, new BytesRef(firstToken)); + PostingsEnum secondEnum = MultiTerms.getTermPostingsEnum(reader,LuceneDocumentGenerator.FIELD_BODY, new BytesRef(secondToken)); int intersect; if (firstEnum == null || secondEnum == null) { intersect = 0; diff --git a/src/main/java/io/anserini/rerank/lib/AxiomReranker.java b/src/main/java/io/anserini/rerank/lib/AxiomReranker.java index 342afe7b7f..414c118856 100644 --- a/src/main/java/io/anserini/rerank/lib/AxiomReranker.java +++ b/src/main/java/io/anserini/rerank/lib/AxiomReranker.java @@ -192,9 +192,9 @@ private ScoredDocuments searchTopDocs(Query query, RerankerContext context) t if (context.getSearchArgs().arbitraryScoreTieBreak) { rs = searcher.search(finalQuery, context.getSearchArgs().hits); } else if (context.getSearchArgs().searchtweets) { - rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true); } else { - rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true); } return ScoredDocuments.fromTopDocs(rs, searcher); @@ -262,7 +262,7 @@ private ScoredDocuments processExternalContext(ScoredDocuments docs, RerankerCon } IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath)); IndexSearcher searcher = new IndexSearcher(reader); - searcher.setSimilarity(context.getIndexSearcher().getSimilarity(true)); + searcher.setSimilarity(context.getIndexSearcher().getSimilarity()); SearchArgs args = new SearchArgs(); args.hits = this.R; diff --git a/src/main/java/io/anserini/rerank/lib/RankLibReranker.java b/src/main/java/io/anserini/rerank/lib/RankLibReranker.java index ab5d42d302..55475415fb 100644 --- a/src/main/java/io/anserini/rerank/lib/RankLibReranker.java +++ b/src/main/java/io/anserini/rerank/lib/RankLibReranker.java @@ -28,7 +28,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.document.Document; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Terms; import java.io.IOException; @@ -49,7 +49,7 @@ public class RankLibReranker implements Reranker { private DataPoint convertToDataPoint(Document doc, RerankerContext context) { Terms terms = null; try { - terms = MultiFields.getTerms(context.getIndexSearcher().getIndexReader(), this.termsField); + terms = MultiTerms.getTerms(context.getIndexSearcher().getIndexReader(), this.termsField); } catch (IOException e) { LOG.error("Unable to retrieve term vectors"); } diff --git a/src/main/java/io/anserini/rerank/lib/Rm3Reranker.java b/src/main/java/io/anserini/rerank/lib/Rm3Reranker.java index 5e0d014a6e..bc54a54b19 100644 --- a/src/main/java/io/anserini/rerank/lib/Rm3Reranker.java +++ b/src/main/java/io/anserini/rerank/lib/Rm3Reranker.java @@ -107,9 +107,9 @@ public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) { if (context.getSearchArgs().arbitraryScoreTieBreak) { rs = searcher.search(finalQuery, context.getSearchArgs().hits); } else if (context.getSearchArgs().searchtweets) { - rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true); } else { - rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true); } } catch (IOException e) { e.printStackTrace(); diff --git a/src/main/java/io/anserini/search/SearchArgs.java b/src/main/java/io/anserini/search/SearchArgs.java index 74f8a12172..9aa0bf787a 100644 --- a/src/main/java/io/anserini/search/SearchArgs.java +++ b/src/main/java/io/anserini/search/SearchArgs.java @@ -126,11 +126,11 @@ public class SearchArgs { @Option(name = "-b", handler = StringArrayOptionHandler.class, usage = "BM25 b parameter") public String[] b = new String[] {"0.4"}; - @Option(name = "-pl2", usage = "use PL2 scoring model") - public boolean pl2 = false; + @Option(name = "-inl2", usage = "use I(n)L2 scoring model") + public boolean inl2 = false; - @Option(name = "-pl2.c", metaVar = "[value]", usage = "PL2 c parameter") - public String[] pl2_c = new String[] {"0.1"}; + @Option(name = "-inl2.c", metaVar = "[value]", usage = "I(n)L2 c parameter") + public String[] inl2_c = new String[] {"0.1"}; @Option(name = "-spl", usage = "use SPL scoring model") public boolean spl = false; diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index a4bd424238..664b1a0004 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -44,7 +44,6 @@ import org.apache.lucene.document.LongPoint; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; import org.apache.lucene.search.*; @@ -229,9 +228,9 @@ public List constructSimiliries() { similarities.add(new TaggedSimilarity(new BM25Similarity(Float.valueOf(k1), Float.valueOf(b)), "k1:"+k1+",b:"+b)); } } - } else if (args.pl2) { - for (String c : args.pl2_c) { - similarities.add(new TaggedSimilarity(new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH2(Float.valueOf(c))), "c:"+c)); + } else if (args.inl2) { + for (String c : args.inl2_c) { + similarities.add(new TaggedSimilarity(new DFRSimilarity(new BasicModelIn(), new AfterEffectL(), new NormalizationH2(Float.valueOf(c))), "c:"+c)); }; } else if (args.spl) { for (String c : args.spl_c) { @@ -351,12 +350,12 @@ public ScoredDocuments search(IndexSearcher searcher, K qid, String queryStri query = new BagOfWordsQueryGenerator().buildQuery(FIELD_BODY, analyzer, queryString); } - TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN); + TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); if (!(isRerank && args.rerankcutoff <= 0)) { if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties. rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits); } else { - rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true); } } @@ -396,12 +395,12 @@ public ScoredDocuments searchBackgroundLinking(IndexSearcher searcher, K qid, builder.add(q, BooleanClause.Occur.MUST); query = builder.build(); - TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN); + TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); if (!(isRerank && args.rerankcutoff <= 0)) { if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties. rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits); } else { - rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true); } } @@ -464,12 +463,12 @@ public ScoredDocuments searchTweets(IndexSearcher searcher, K qid, String que Query compositeQuery = builder.build(); - TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN); + TopDocs rs = new TopDocs(new TotalHits(0,TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); if (!(isRerank && args.rerankcutoff <= 0)) { if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties. rs = searcher.search(compositeQuery, isRerank ? args.rerankcutoff : args.hits); } else { - rs = searcher.search(compositeQuery, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(compositeQuery, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_TWEETID, true); } } diff --git a/src/main/java/io/anserini/search/SimpleSearcher.java b/src/main/java/io/anserini/search/SimpleSearcher.java index 8ab4876f98..1cc37b57fb 100644 --- a/src/main/java/io/anserini/search/SimpleSearcher.java +++ b/src/main/java/io/anserini/search/SimpleSearcher.java @@ -132,7 +132,7 @@ public void setBM25Similarity(float k1, float b) { } public void setDFRSimilarity(float c) { - this.similarity = new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH2(c)); + this.similarity = new DFRSimilarity(new BasicModelIn(), new AfterEffectL(), new NormalizationH2(c)); } public void setIBSimilarity(float c) { @@ -171,7 +171,7 @@ public Result[] search(String q, int k, long t) throws IOException { searchArgs.hits = k; searchArgs.searchtweets = searchtweets; - TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN); + TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); RerankerContext context; if (searchtweets) { if (t > 0) { @@ -183,14 +183,14 @@ public Result[] search(String q, int k, long t) throws IOException { builder.add(filter, BooleanClause.Occur.FILTER); builder.add(query, BooleanClause.Occur.MUST); Query compositeQuery = builder.build(); - rs = searcher.search(compositeQuery, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(compositeQuery, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true); context = new RerankerContext<>(searcher, null, compositeQuery, null, q, queryTokens, filter, searchArgs); } else { - rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true); context = new RerankerContext<>(searcher, null, query, null, q, queryTokens, null, searchArgs); } } else { - rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true); context = new RerankerContext<>(searcher, null, query, null, q, queryTokens, null, searchArgs); } diff --git a/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java b/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java index 36a4bf1858..d691f32a98 100644 --- a/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java +++ b/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java @@ -16,12 +16,9 @@ package io.anserini.search.similarity; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -37,16 +34,9 @@ public abstract class AxiomaticSimilarity extends Similarity { protected final float s; /** Cache of decoded bytes. */ - protected static final float[] OLD_LENGTH_TABLE = new float[256]; protected static final float[] LENGTH_TABLE = new float[256]; static { - for (int i = 1; i < 256; i++) { - float f = SmallFloat.byte315ToFloat((byte)i); - OLD_LENGTH_TABLE[i] = 1.0f / (f*f); - } - OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf - for (int i = 0; i < 256; i++) { LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); } @@ -226,28 +216,20 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti } @Override - public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { + public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats); float avgdl = avgFieldLength(collectionStats); - float[] oldCache = new float[256]; float[] cache = new float[256]; for (int i = 0; i < cache.length; i++) { - oldCache[i] = s + s * OLD_LENGTH_TABLE[i] / avgdl; cache[i] = s + s * LENGTH_TABLE[i] / avgdl; } - return new Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache); - } - - - @Override - public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException { - Stats axStats = (Stats) stats; - return new AxDocScorer(axStats, context.reader().getMetaData().getCreatedVersionMajor(), context.reader().getNormValues(axStats.field)); + Stats axStats = new Stats(collectionStats.field(), boost, idf, avgdl, cache); + return new AxDocScorer(axStats); } /** DocumentCollection statistics for the F2Log model. */ - static class Stats extends SimWeight { + static class Stats { /** F2Log's idf */ public final Explanation idf; /** The average document length. */ @@ -259,15 +241,14 @@ static class Stats extends SimWeight { /** field name, for pulling norms */ public final String field; /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) - * for both OLD_LENGTH_TABLE and LENGTH_TABLE */ - private final float[] oldCache, cache; + * for LENGTH_TABLE */ + private final float[] cache; - Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) { + Stats(String field, float boost, Explanation idf, float avgdl, float[] cache) { this.field = field; this.idf = idf; this.avgdl = avgdl; - this.weight = idf.getValue() * boost; - this.oldCache = oldCache; + this.weight = (float) (idf.getValue().doubleValue() * boost); this.cache = cache; } } @@ -275,23 +256,13 @@ static class Stats extends SimWeight { class AxDocScorer extends SimScorer { private final Stats stats; private final float weightValue; // boost * idf - private final NumericDocValues norms; - /** precomputed cache for all length values */ - private final float[] lengthCache; /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */ private final float[] cache; - AxDocScorer(Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException { + AxDocScorer(Stats stats) { this.stats = stats; this.weightValue = stats.weight; - this.norms = norms; - if (indexCreatedVersionMajor >= 7) { - lengthCache = LENGTH_TABLE; - cache = stats.cache; - } else { - lengthCache = OLD_LENGTH_TABLE; - cache = stats.oldCache; - } + cache = stats.cache; } /* Score function is: @@ -302,73 +273,44 @@ class AxDocScorer extends SimScorer { */ @Override - public float score(int doc, float freq) throws IOException { + public float score(float freq, long encodedNorm) { // if there are no norms, we act as if b=0 - float norm; - if (norms == null) { - norm = 0.0f; - } else { - if (norms.advanceExact(doc)) { - norm = cache[((byte) norms.longValue()) & 0xFF]; - } else { - norm = cache[0]; - } - } - return weightValue * freq / (freq + norm); + double norm = cache[((byte) encodedNorm) & 0xFF]; + return weightValue * (float) (freq / (freq + norm)); } @Override - public Explanation explain(int doc, Explanation freq) throws IOException { - return explainScore(doc, freq, stats, norms, lengthCache); - } - - @Override - public float computeSlopFactor(int distance) { - return sloppyFreq(distance); - } - - @Override - public float computePayloadFactor(int doc, int start, int end, BytesRef payload) { - return scorePayload(doc, start, end, payload); + public Explanation explain(Explanation freq, long encodedNorm) { + return explainScore(freq, encodedNorm, stats); } } - private Explanation explainTFNorm(int doc, Explanation freq, Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException { + private Explanation explainTFNorm(Explanation freq, long encodedNorm, Stats stats) { List subs = new ArrayList<>(); subs.add(freq); subs.add(Explanation.match(s, "parameter s")); - if (norms == null) { - subs.add(Explanation.match(0, "norm")); - return Explanation.match(1, - "tfNorm, computed as constant from:", subs); - } else { - byte norm; - if (norms.advanceExact(doc)) { - norm = (byte) norms.longValue(); - } else { - norm = 0; - } - float doclen = lengthCache[norm & 0xff]; - subs.add(Explanation.match(stats.avgdl, "avgFieldLength")); - subs.add(Explanation.match(doclen, "fieldLength")); - return Explanation.match( - (freq.getValue() / (freq.getValue() + s + s * doclen/stats.avgdl)), - "tfNorm, computed as (freq / (freq + s + s * fieldLength / avgFieldLength) from:", subs); - } + + byte norm = (byte) encodedNorm; + float doclen = LENGTH_TABLE[norm & 0xff]; + subs.add(Explanation.match(stats.avgdl, "avgFieldLength")); + subs.add(Explanation.match(doclen, "fieldLength")); + return Explanation.match( + (freq.getValue().floatValue() / (freq.getValue().floatValue() + s + s * doclen/stats.avgdl)), + "tfNorm, computed as (freq / (freq + s + s * fieldLength / avgFieldLength) from:", subs); } - private Explanation explainScore(int doc, Explanation freq, Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException { + private Explanation explainScore(Explanation freq, long encodedNorm, Stats stats) { Explanation boostExpl = Explanation.match(stats.boost, "boost"); List subs = new ArrayList<>(); - if (boostExpl.getValue() != 1.0f) + if (boostExpl.getValue().floatValue() != 1.0f) subs.add(boostExpl); subs.add(stats.idf); - Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache); + Explanation tfNormExpl = explainTFNorm(freq, encodedNorm, stats); subs.add(tfNormExpl); return Explanation.match( - boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(), - "score(doc="+doc+",freq="+freq+"), product of:", subs); + boostExpl.getValue().floatValue() * stats.idf.getValue().floatValue() * tfNormExpl.getValue().floatValue(), + "score(freq="+freq+", length=" + LENGTH_TABLE[Byte.toUnsignedInt((byte) encodedNorm)] + "), product of:", subs); } @Override diff --git a/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java b/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java index 9adae2ed86..4fd20480bf 100644 --- a/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java +++ b/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java @@ -17,13 +17,10 @@ package io.anserini.search.similarity; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.similarities.Similarity; -import java.io.IOException; - /** * Similarity that uses a Ranklib ranker to compute the score */ @@ -34,12 +31,8 @@ public long computeNorm(FieldInvertState fieldInvertState) { } @Override - public SimWeight computeWeight(float boost, CollectionStatistics collectionStatistics, TermStatistics... termStatistics) { + public SimScorer scorer(float boost, CollectionStatistics collectionStatistics, TermStatistics... termStatistics) { return null; } - @Override - public SimScorer simScorer(SimWeight simWeight, LeafReaderContext leafReaderContext) throws IOException { - return null; - } } diff --git a/src/main/java/io/anserini/util/ExtractTopDfTerms.java b/src/main/java/io/anserini/util/ExtractTopDfTerms.java index adae2cd631..cf72d6c344 100644 --- a/src/main/java/io/anserini/util/ExtractTopDfTerms.java +++ b/src/main/java/io/anserini/util/ExtractTopDfTerms.java @@ -20,7 +20,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -95,7 +95,7 @@ public int compare(Pair p1, Pair p2) { PriorityQueue queue = new PriorityQueue(myArgs.topK, comp); LOG.info("Starting to iterate through all terms..."); - Terms terms = MultiFields.getFields(reader).terms(myArgs.field); + Terms terms = MultiTerms.getTerms(reader, myArgs.field); TermsEnum termsEnum = terms.iterator(); BytesRef text; int cnt = 0; diff --git a/src/test/java/io/anserini/integration/IndexerTest.java b/src/test/java/io/anserini/integration/IndexerTest.java index c37c9c69f6..61e9503245 100644 --- a/src/test/java/io/anserini/integration/IndexerTest.java +++ b/src/test/java/io/anserini/integration/IndexerTest.java @@ -217,7 +217,7 @@ public void testIterateThroughDocumentVectorComputeBM25() throws Exception { TopDocs rs = searcher.search(finalQuery, 1); // issue the query // The BM25 weight is the maxScore - System.out.println(term + " " + tf + " " + rs.getMaxScore()); + System.out.println(term + " " + tf + " " + (rs.scoreDocs.length == 0 ? Float.NaN : rs.scoreDocs[0].score)); } } } diff --git a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java index 076d2f4854..a16ae32953 100644 --- a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java +++ b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java @@ -35,8 +35,9 @@ protected void init() { termIndexStatusTermCount = 12; // Please note that standard analyzer ignores stopwords. // Also, this includes docids termIndexStatusTotFreq = 17; // - termIndexStatusTotPos = 16; // only "text" fields are indexed with position so we have 16 storedFieldStatusTotalDocCounts = 3; + // 16 positions for text fields, plus 1 for each document because of id + termIndexStatusTotPos = 16 + storedFieldStatusTotalDocCounts; storedFieldStatusTotFields = 9; // 3 docs * (1 id + 1 text + 1 raw) } diff --git a/src/test/java/io/anserini/integration/TrecEndToEndTest.java b/src/test/java/io/anserini/integration/TrecEndToEndTest.java index c5c83c26a2..f1e11885c6 100644 --- a/src/test/java/io/anserini/integration/TrecEndToEndTest.java +++ b/src/test/java/io/anserini/integration/TrecEndToEndTest.java @@ -28,8 +28,9 @@ protected void init() { fieldNormStatusTotalFields = 1; // text termIndexStatusTermCount = 12; // Note that standard analyzer ignores stopwords; includes docids. termIndexStatusTotFreq = 17; - termIndexStatusTotPos = 16; // Only "text" fields are indexed with position so we have 16. storedFieldStatusTotalDocCounts = 3; + // 16 positions for text fields, plus 1 for each document because of id + termIndexStatusTotPos = 16 + storedFieldStatusTotalDocCounts; storedFieldStatusTotFields = 9; // 3 docs * (1 id + 1 text + 1 raw) // The search output should be as follows (for Lucene 7.5): diff --git a/src/test/java/io/anserini/integration/TweetEndToEndTest.java b/src/test/java/io/anserini/integration/TweetEndToEndTest.java index 247f127244..0c8eb3c2e5 100644 --- a/src/test/java/io/anserini/integration/TweetEndToEndTest.java +++ b/src/test/java/io/anserini/integration/TweetEndToEndTest.java @@ -30,8 +30,9 @@ protected void init() { // We set that retweets and the tweets with ids larger than tweetMaxId will NOT be indexed! termIndexStatusTermCount = 32; // other indexable fields: 4 doc ids + 4 "lang" fields + 4 "screen_name" fields termIndexStatusTotFreq = 36; - termIndexStatusTotPos = 24; // only "text" fields are indexed with positions storedFieldStatusTotalDocCounts = 4; + // 24 positions for text fields, plus 3 for each document because of id, screen_name and lang + termIndexStatusTotPos = 24 + 3 * storedFieldStatusTotalDocCounts; storedFieldStatusTotFields = 12; // 4 tweets * (1 id + 1 text + 1 raw) // The search output should be as follows (for Lucene 7.5): From 3b44a7e17195d755f53382d969b1238e6cd68139 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 14 Mar 2019 08:48:57 +0100 Subject: [PATCH 2/6] Use Lucene`s Axiomatic similarity. --- .../io/anserini/search/SearchCollection.java | 6 +- .../similarity/AxiomaticSimilarity.java | 330 ------------------ .../search/similarity/F2ExpSimilarity.java | 59 ---- .../search/similarity/F2LogSimilarity.java | 48 --- 4 files changed, 2 insertions(+), 441 deletions(-) delete mode 100644 src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java delete mode 100644 src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java delete mode 100644 src/main/java/io/anserini/search/similarity/F2LogSimilarity.java diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index 664b1a0004..0b2f868a92 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -29,8 +29,6 @@ import io.anserini.rerank.lib.ScoreTiesAdjusterReranker; import io.anserini.search.query.BagOfWordsQueryGenerator; import io.anserini.search.query.SdmQueryGenerator; -import io.anserini.search.similarity.F2ExpSimilarity; -import io.anserini.search.similarity.F2LogSimilarity; import io.anserini.search.similarity.TaggedSimilarity; import io.anserini.search.topicreader.NewsBackgroundLinkingTopicReader; import io.anserini.search.topicreader.TopicReader; @@ -238,11 +236,11 @@ public List constructSimiliries() { } } else if (args.f2exp) { for (String s : args.f2exp_s) { - similarities.add(new TaggedSimilarity(new F2ExpSimilarity(Float.valueOf(s)), "s:"+s)); + similarities.add(new TaggedSimilarity(new AxiomaticF2EXP(Float.valueOf(s)), "s:"+s)); } } else if (args.f2log) { for (String s : args.f2log_s) { - similarities.add(new TaggedSimilarity(new F2LogSimilarity(Float.valueOf(s)), "s:"+s)); + similarities.add(new TaggedSimilarity(new AxiomaticF2LOG(Float.valueOf(s)), "s:"+s)); } } else { throw new IllegalArgumentException("Error: Must specify scoring model!"); diff --git a/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java b/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java deleted file mode 100644 index d691f32a98..0000000000 --- a/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java +++ /dev/null @@ -1,330 +0,0 @@ -/** - * Anserini: An information retrieval toolkit built on Lucene - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.search.similarity; - -import java.util.ArrayList; -import java.util.List; -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.search.CollectionStatistics; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.TermStatistics; -import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.SmallFloat; - -/** - * Hui Fang and ChengXiang Zhai. 2005. An exploration of axiomatic approaches to information retrieval. - * In Proceedings of the 28th annual international ACM SIGIR conference on Research and development in - * information retrieval (SIGIR '05). ACM, New York, NY, USA, 480-487. - */ -public abstract class AxiomaticSimilarity extends Similarity { - protected final float s; - /** Cache of decoded bytes. */ - protected static final float[] LENGTH_TABLE = new float[256]; - - static { - for (int i = 0; i < 256; i++) { - LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); - } - } - - /** - * @param s Generic parater s - * @throws IllegalArgumentException if {@code s} is infinite or if {@code s} is - * not within the range {@code [0..1]} - */ - AxiomaticSimilarity(float s) { - if (Float.isNaN(s) || s < 0 || s > 1) { - throw new IllegalArgumentException("illegal s value: " + s + ", must be between 0 and 1"); - } - this.s = s; - } - - /** Default parameter: - *
    - *
  • {@code s = 0.5}
  • - *
- */ - AxiomaticSimilarity() { - this(0.5f); - } - - /** Implemented as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)). - * - * @param docFreq terms's document frequency - * @param docCount total document count in the index - * @return inverted document frequency - * */ - float idf(long docFreq, long docCount) { - throw new UnsupportedOperationException(); - } - - /** Implemented as 1 / (distance + 1). - * - * @param distance distance - * @return sloppy frequency - * */ - float sloppyFreq(int distance) { - return 1.0f / (distance + 1); - } - - /** The default implementation returns 1 - * - * @param doc doc - * @param start start - * @param end end - * @param payload payload - * @return 1 - * */ - float scorePayload(int doc, int start, int end, BytesRef payload) { - return 1; - } - - /** The default implementation computes the average as sumTotalTermFreq / docCount, - * or returns 1 if the index does not store sumTotalTermFreq: - * any field that omits frequency information). - * - * @param collectionStats collection-wide statistics - * @return average document length of FIELD_BODY - * */ - float avgFieldLength(CollectionStatistics collectionStats) { - final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); - if (sumTotalTermFreq <= 0) { - return 1f; // field does not exist, or stat is unsupported - } else { - final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); - return (float) (sumTotalTermFreq / (double) docCount); - } - } - - /** - * True if overlap tokens (tokens with a position of increment of zero) are - * discounted from the document's length. - */ - boolean discountOverlaps = true; - - /** Sets whether overlap tokens (Tokens with 0 position increment) are - * ignored when computing norm. By default this is true, meaning overlap - * tokens do not count when computing norms. - * - * @param v v - * */ - public void setDiscountOverlaps(boolean v) { - discountOverlaps = v; - } - - /** - * Returns true if overlap tokens are discounted from the document's length. - * @see #setDiscountOverlaps - * - * @return discountOverlaps - */ - public boolean getDiscountOverlaps() { - return discountOverlaps; - } - - /** Cache of decoded bytes. */ - private static final float[] NORM_TABLE = new float[256]; - - static { - for (int i = 1; i < 256; i++) { - float f = SmallFloat.byte315ToFloat((byte)i); - NORM_TABLE[i] = 1.0f / (f*f); - } - NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf - } - - - @Override - public final long computeNorm(FieldInvertState state) { - final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength(); - int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor(); - if (indexCreatedVersionMajor >= 7) { - return SmallFloat.intToByte4(numTerms); - } else { - return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms))); - } - } - - /** - * Computes a score factor for a simple term and returns an explanation - * for that score factor. - * - *

- * The default implementation uses: - * - *

-   * idf(docFreq, docCount);
-   * 
- * - * Note that {@link CollectionStatistics#docCount()} is used instead of - * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also - * {@link TermStatistics#docFreq()} is used, and when the latter - * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction. - * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse. - * - * @param collectionStats collection-level statistics - * @param termStats term-level statistics for the term - * @return an Explain object that includes both an idf score factor - and an explanation for the term. - */ - public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { - final long df = termStats.docFreq(); - final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); - final float idf = idf(df, docCount); - return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"); - } - - /** - * Computes a score factor for a phrase. - * - *

- * The default implementation sums the idf factor for - * each term in the phrase. - * - * @param collectionStats collection-level statistics - * @param termStats term-level statistics for the terms in the phrase - * @return an Explain object that includes both an idf - * score factor for the phrase and an explanation - * for each term. - */ - public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { - final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); - double idf = 0d; - List details = new ArrayList<>(); - for (final TermStatistics stat : termStats ) { - final long df = stat.docFreq(); - final float termIdf = idf(df, docCount); - details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")")); - idf += termIdf; - } - return Explanation.match((float)idf, "idf(), sum of:", details); - } - - @Override - public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { - Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats); - float avgdl = avgFieldLength(collectionStats); - - float[] cache = new float[256]; - for (int i = 0; i < cache.length; i++) { - cache[i] = s + s * LENGTH_TABLE[i] / avgdl; - } - Stats axStats = new Stats(collectionStats.field(), boost, idf, avgdl, cache); - return new AxDocScorer(axStats); - } - - /** DocumentCollection statistics for the F2Log model. */ - static class Stats { - /** F2Log's idf */ - public final Explanation idf; - /** The average document length. */ - public final float avgdl; - /** query boost */ - public float boost; - /** weight (idf * boost) */ - public float weight; - /** field name, for pulling norms */ - public final String field; - /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) - * for LENGTH_TABLE */ - private final float[] cache; - - Stats(String field, float boost, Explanation idf, float avgdl, float[] cache) { - this.field = field; - this.idf = idf; - this.avgdl = avgdl; - this.weight = (float) (idf.getValue().doubleValue() * boost); - this.cache = cache; - } - } - - class AxDocScorer extends SimScorer { - private final Stats stats; - private final float weightValue; // boost * idf - /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */ - private final float[] cache; - - AxDocScorer(Stats stats) { - this.stats = stats; - this.weightValue = stats.weight; - cache = stats.cache; - } - - /* Score function is: - *

-                                                     occurrences
-      score = termWeight * IDF * ---------------------------------------------------------
-                                 occurrences + s + documentLength * s / avgDocLength
-       
- */ - @Override - public float score(float freq, long encodedNorm) { - // if there are no norms, we act as if b=0 - double norm = cache[((byte) encodedNorm) & 0xFF]; - return weightValue * (float) (freq / (freq + norm)); - } - - @Override - public Explanation explain(Explanation freq, long encodedNorm) { - return explainScore(freq, encodedNorm, stats); - } - } - - private Explanation explainTFNorm(Explanation freq, long encodedNorm, Stats stats) { - List subs = new ArrayList<>(); - subs.add(freq); - subs.add(Explanation.match(s, "parameter s")); - - byte norm = (byte) encodedNorm; - float doclen = LENGTH_TABLE[norm & 0xff]; - subs.add(Explanation.match(stats.avgdl, "avgFieldLength")); - subs.add(Explanation.match(doclen, "fieldLength")); - return Explanation.match( - (freq.getValue().floatValue() / (freq.getValue().floatValue() + s + s * doclen/stats.avgdl)), - "tfNorm, computed as (freq / (freq + s + s * fieldLength / avgFieldLength) from:", subs); - } - - - private Explanation explainScore(Explanation freq, long encodedNorm, Stats stats) { - Explanation boostExpl = Explanation.match(stats.boost, "boost"); - List subs = new ArrayList<>(); - if (boostExpl.getValue().floatValue() != 1.0f) - subs.add(boostExpl); - subs.add(stats.idf); - Explanation tfNormExpl = explainTFNorm(freq, encodedNorm, stats); - subs.add(tfNormExpl); - return Explanation.match( - boostExpl.getValue().floatValue() * stats.idf.getValue().floatValue() * tfNormExpl.getValue().floatValue(), - "score(freq="+freq+", length=" + LENGTH_TABLE[Byte.toUnsignedInt((byte) encodedNorm)] + "), product of:", subs); - } - - @Override - public String toString() { - throw new UnsupportedOperationException(); - } - - /** - * Returns the b parameter - * @see #AxiomaticSimilarity(float) - * - * @return s - */ - public float getS() { - return s; - } -} diff --git a/src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java b/src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java deleted file mode 100644 index c7a2394f71..0000000000 --- a/src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Anserini: An information retrieval toolkit built on Lucene - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.search.similarity; - -public class F2ExpSimilarity extends AxiomaticSimilarity { - private final float k = 0.35f; - - /** - * F2Exp with the supplied parameter values. - * @param s Controls to what degree document length normalizes tf values. - * @throws IllegalArgumentException if {@code s} is infinite or if {@code s} is - * not within the range {@code [0..1]} - */ - public F2ExpSimilarity(float s) { - super(s); - } - - /** F2Exp with these default values: - *
    - *
  • {@code k = 0.35}
  • - *
- */ - public F2ExpSimilarity() { - this(0.5f); - } - - @Override - float idf(long docFreq, long docCount) { - return (float) Math.pow((docCount + 1.0) / docFreq, this.k); - } - - @Override - public String toString() { - return "F2Exp(s=" + s +")"; - } - - /** - * Returns the k parameter - * @see #F2ExpSimilarity(float) - * @return k - */ - public float getK() { - return k; - } -} diff --git a/src/main/java/io/anserini/search/similarity/F2LogSimilarity.java b/src/main/java/io/anserini/search/similarity/F2LogSimilarity.java deleted file mode 100644 index 5e59f665fc..0000000000 --- a/src/main/java/io/anserini/search/similarity/F2LogSimilarity.java +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Anserini: An information retrieval toolkit built on Lucene - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.search.similarity; - -public class F2LogSimilarity extends AxiomaticSimilarity { - /** - * F2Log with the supplied parameter values. - * @param s Controls to what degree document length normalizes tf values. - * @throws IllegalArgumentException if {@code s} is infinite or if {@code s} is - * not within the range {@code [0..1]} - */ - public F2LogSimilarity(float s) { - super(s); - } - - /** F2Log with these default values: - *
    - *
  • {@code s = 0.5}
  • - *
- */ - public F2LogSimilarity() { - this(0.5f); - } - - @Override - float idf(long docFreq, long docCount) { - return (float) Math.log((1.0f + docCount) / docFreq); - } - - @Override - public String toString() { - return "F2Log(s=" + s +")"; - } -} From 813e5923aa146d162c0022504c334ef04c05b404 Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Tue, 23 Apr 2019 12:52:59 -0400 Subject: [PATCH 3/6] Fixed all regressions for Lucene8 (#596) --- docs/experiments-car17.md | 2 +- docs/experiments-core17.md | 2 +- docs/experiments-core18.md | 2 +- docs/experiments-cw09b.md | 24 +++---- docs/experiments-cw12.md | 16 ++--- docs/experiments-cw12b13.md | 16 ++--- docs/experiments-gov2.md | 8 +-- docs/experiments-mb11.md | 2 +- docs/experiments-robust04.md | 2 +- docs/experiments-robust05.md | 2 +- docs/experiments-wt10g.md | 2 +- src/main/resources/regression/car17.yaml | 6 +- src/main/resources/regression/core17.yaml | 4 +- src/main/resources/regression/core18.yaml | 4 +- src/main/resources/regression/cw09b.yaml | 78 ++++++++++----------- src/main/resources/regression/cw12.yaml | 50 ++++++------- src/main/resources/regression/cw12b13.yaml | 62 ++++++++-------- src/main/resources/regression/gov2.yaml | 10 +-- src/main/resources/regression/mb11.yaml | 2 +- src/main/resources/regression/robust04.yaml | 4 +- src/main/resources/regression/robust05.yaml | 4 +- src/main/resources/regression/wt10g.yaml | 8 +-- 22 files changed, 155 insertions(+), 155 deletions(-) diff --git a/docs/experiments-car17.md b/docs/experiments-car17.md index 495e6a8482..c89f471617 100644 --- a/docs/experiments-car17.md +++ b/docs/experiments-car17.md @@ -63,7 +63,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -All Topics | 0.1689 | 0.1287 | 0.1355 | 0.1516 | 0.1173 | 0.1082 | +All Topics | 0.1689 | 0.1286 | 0.1355 | 0.1516 | 0.1173 | 0.1082 | RECIP_RANK | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | diff --git a/docs/experiments-core17.md b/docs/experiments-core17.md index aecc04bdde..4e25a8079b 100644 --- a/docs/experiments-core17.md +++ b/docs/experiments-core17.md @@ -64,7 +64,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -All Topics | 0.1977 | 0.2682 | 0.2700 | 0.1913 | 0.2485 | 0.2514 | +All Topics | 0.1977 | 0.2682 | 0.2701 | 0.1913 | 0.2485 | 0.2514 | P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | diff --git a/docs/experiments-core18.md b/docs/experiments-core18.md index 474d2a1fd2..2b7c9091d9 100644 --- a/docs/experiments-core18.md +++ b/docs/experiments-core18.md @@ -64,7 +64,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -All Topics | 0.2491 | 0.3147 | 0.2921 | 0.2522 | 0.3064 | 0.2975 | +All Topics | 0.2491 | 0.3147 | 0.2926 | 0.2522 | 0.3064 | 0.2975 | P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | diff --git a/docs/experiments-cw09b.md b/docs/experiments-cw09b.md index 9bac861ada..cbf0331ada 100644 --- a/docs/experiments-cw09b.md +++ b/docs/experiments-cw09b.md @@ -110,29 +110,29 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -TREC 2010 Web Track: Topics 51-100 | 0.1126 | 0.0933 | 0.0928 | 0.1060 | 0.1019 | 0.1086 | -TREC 2011 Web Track: Topics 101-150 | 0.1094 | 0.1081 | 0.0974 | 0.0958 | 0.0837 | 0.0879 | -TREC 2012 Web Track: Topics 151-200 | 0.1106 | 0.1107 | 0.1315 | 0.1069 | 0.1059 | 0.1212 | +TREC 2010 Web Track: Topics 51-100 | 0.1126 | 0.0933 | 0.0929 | 0.1060 | 0.1019 | 0.1086 | +TREC 2011 Web Track: Topics 101-150 | 0.1094 | 0.1085 | 0.0975 | 0.0958 | 0.0839 | 0.0879 | +TREC 2012 Web Track: Topics 151-200 | 0.1105 | 0.1107 | 0.1315 | 0.1069 | 0.1058 | 0.1212 | P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -TREC 2010 Web Track: Topics 51-100 | 0.2681 | 0.2389 | 0.2354 | 0.2431 | 0.2312 | 0.2618 | -TREC 2011 Web Track: Topics 101-150 | 0.2513 | 0.2467 | 0.2393 | 0.2147 | 0.2067 | 0.2167 | -TREC 2012 Web Track: Topics 151-200 | 0.2167 | 0.1920 | 0.2553 | 0.2080 | 0.1980 | 0.2140 | +TREC 2010 Web Track: Topics 51-100 | 0.2694 | 0.2389 | 0.2354 | 0.2431 | 0.2312 | 0.2618 | +TREC 2011 Web Track: Topics 101-150 | 0.2513 | 0.2480 | 0.2387 | 0.2147 | 0.2047 | 0.2173 | +TREC 2012 Web Track: Topics 151-200 | 0.2167 | 0.1920 | 0.2553 | 0.2080 | 0.1980 | 0.2147 | NDCG20 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -TREC 2010 Web Track: Topics 51-100 | 0.1354 | 0.1369 | 0.1637 | 0.1143 | 0.1185 | 0.1454 | -TREC 2011 Web Track: Topics 101-150 | 0.1890 | 0.1916 | 0.1833 | 0.1619 | 0.1447 | 0.1509 | -TREC 2012 Web Track: Topics 151-200 | 0.1014 | 0.0917 | 0.1441 | 0.0868 | 0.0896 | 0.1030 | +TREC 2010 Web Track: Topics 51-100 | 0.1354 | 0.1369 | 0.1632 | 0.1143 | 0.1182 | 0.1454 | +TREC 2011 Web Track: Topics 101-150 | 0.1890 | 0.1916 | 0.1835 | 0.1619 | 0.1449 | 0.1517 | +TREC 2012 Web Track: Topics 151-200 | 0.1014 | 0.0918 | 0.1441 | 0.0868 | 0.0896 | 0.1037 | ERR20 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -TREC 2010 Web Track: Topics 51-100 | 0.0733 | 0.0747 | 0.0981 | 0.0599 | 0.0592 | 0.0742 | -TREC 2011 Web Track: Topics 101-150 | 0.0959 | 0.0960 | 0.1091 | 0.0849 | 0.0786 | 0.0820 | -TREC 2012 Web Track: Topics 151-200 | 0.1304 | 0.1493 | 0.2355 | 0.1305 | 0.1334 | 0.1558 | +TREC 2010 Web Track: Topics 51-100 | 0.0733 | 0.0747 | 0.0977 | 0.0599 | 0.0592 | 0.0742 | +TREC 2011 Web Track: Topics 101-150 | 0.0959 | 0.0960 | 0.1091 | 0.0849 | 0.0787 | 0.0821 | +TREC 2012 Web Track: Topics 151-200 | 0.1303 | 0.1494 | 0.2355 | 0.1305 | 0.1334 | 0.1558 | diff --git a/docs/experiments-cw12.md b/docs/experiments-cw12.md index cec82a6012..a7ff8120c3 100644 --- a/docs/experiments-cw12.md +++ b/docs/experiments-cw12.md @@ -73,25 +73,25 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | QL | QL+RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1695 | 0.1464 | 0.1493 | 0.1291 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.2469 | 0.2325 | 0.2467 | 0.2168 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1694 | 0.1464 | 0.1494 | 0.1290 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.2469 | 0.2324 | 0.2466 | 0.2177 | P30 | BM25 | BM25+RM3 | QL | QL+RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2767 | 0.2387 | 0.2613 | 0.2347 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.4533 | 0.4073 | 0.4380 | 0.3793 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2773 | 0.2393 | 0.2607 | 0.2347 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.4547 | 0.4080 | 0.4380 | 0.3800 | NDCG20 | BM25 | BM25+RM3 | QL | QL+RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2086 | 0.2033 | 0.1993 | 0.1725 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.2578 | 0.2530 | 0.2228 | 0.2066 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2088 | 0.2033 | 0.1993 | 0.1725 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.2572 | 0.2530 | 0.2218 | 0.2083 | ERR20 | BM25 | BM25+RM3 | QL | QL+RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1284 | 0.1264 | 0.1232 | 0.1008 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1630 | 0.1655 | 0.1321 | 0.1218 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1284 | 0.1264 | 0.1233 | 0.1008 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1616 | 0.1655 | 0.1322 | 0.1245 | diff --git a/docs/experiments-cw12b13.md b/docs/experiments-cw12b13.md index 4dfe6f754e..d6d7729f3e 100644 --- a/docs/experiments-cw12b13.md +++ b/docs/experiments-cw12b13.md @@ -88,25 +88,25 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.0468 | 0.0412 | 0.0435 | 0.0397 | 0.0322 | 0.0359 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.0224 | 0.0210 | 0.0180 | 0.0235 | 0.0203 | 0.0186 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.0468 | 0.0408 | 0.0435 | 0.0397 | 0.0322 | 0.0358 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.0224 | 0.0210 | 0.0180 | 0.0235 | 0.0203 | 0.0183 | P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2113 | 0.1713 | 0.1840 | 0.1767 | 0.1507 | 0.1513 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1273 | 0.1207 | 0.1107 | 0.1373 | 0.1173 | 0.1167 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2113 | 0.1673 | 0.1833 | 0.1780 | 0.1513 | 0.1507 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1273 | 0.1207 | 0.1107 | 0.1373 | 0.1173 | 0.1147 | NDCG20 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1286 | 0.1129 | 0.1287 | 0.1107 | 0.0920 | 0.1143 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1185 | 0.1080 | 0.0964 | 0.1177 | 0.1003 | 0.1001 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1286 | 0.1119 | 0.1287 | 0.1106 | 0.0920 | 0.1141 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1183 | 0.1081 | 0.0963 | 0.1177 | 0.1004 | 0.0989 | ERR20 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.0838 | 0.0763 | 0.0943 | 0.0769 | 0.0553 | 0.0780 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1201 | 0.1065 | 0.0929 | 0.1091 | 0.0929 | 0.0896 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.0838 | 0.0753 | 0.0941 | 0.0768 | 0.0553 | 0.0780 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1201 | 0.1066 | 0.0928 | 0.1092 | 0.0928 | 0.0900 | diff --git a/docs/experiments-gov2.md b/docs/experiments-gov2.md index 859b7490e7..6a998a8d77 100644 --- a/docs/experiments-gov2.md +++ b/docs/experiments-gov2.md @@ -90,14 +90,14 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)| 0.2689 | 0.2844 | 0.2665 | 0.2681 | 0.2708 | 0.2666 | -[TREC 2005 Terabyte Track: Topics 751-800](http://trec.nist.gov/data/terabyte05.html)| 0.3390 | 0.3820 | 0.3664 | 0.3303 | 0.3559 | 0.3646 | -[TREC 2006 Terabyte Track: Topics 801-850](http://trec.nist.gov/data/terabyte06.html)| 0.3080 | 0.3377 | 0.3069 | 0.2996 | 0.3154 | 0.3084 | +[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)| 0.2689 | 0.2844 | 0.2669 | 0.2681 | 0.2708 | 0.2666 | +[TREC 2005 Terabyte Track: Topics 751-800](http://trec.nist.gov/data/terabyte05.html)| 0.3390 | 0.3820 | 0.3666 | 0.3303 | 0.3559 | 0.3646 | +[TREC 2006 Terabyte Track: Topics 801-850](http://trec.nist.gov/data/terabyte06.html)| 0.3080 | 0.3377 | 0.3069 | 0.2997 | 0.3154 | 0.3084 | P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)| 0.4864 | 0.5190 | 0.4986 | 0.4755 | 0.4925 | 0.4932 | +[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)| 0.4864 | 0.5190 | 0.4993 | 0.4755 | 0.4925 | 0.4932 | [TREC 2005 Terabyte Track: Topics 751-800](http://trec.nist.gov/data/terabyte05.html)| 0.5540 | 0.5920 | 0.5933 | 0.5347 | 0.5620 | 0.5840 | [TREC 2006 Terabyte Track: Topics 801-850](http://trec.nist.gov/data/terabyte06.html)| 0.4907 | 0.5160 | 0.5033 | 0.4720 | 0.4847 | 0.4920 | diff --git a/docs/experiments-mb11.md b/docs/experiments-mb11.md index 9805271083..fd77fac0a1 100644 --- a/docs/experiments-mb11.md +++ b/docs/experiments-mb11.md @@ -95,6 +95,6 @@ MAP | BM25 | BM25+RM3 | BM25+AX | QL P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| [TREC 2011 Microblog Track](http://trec.nist.gov/data/microblog2011.html)| 0.3959 | 0.4170 | 0.4612 | 0.4061 | 0.4435 | 0.4408 | -[TREC 2012 Microblog Track](http://trec.nist.gov/data/microblog2012.html)| 0.3316 | 0.3463 | 0.3554 | 0.3333 | 0.3520 | 0.3842 | +[TREC 2012 Microblog Track](http://trec.nist.gov/data/microblog2012.html)| 0.3316 | 0.3463 | 0.3554 | 0.3333 | 0.3514 | 0.3842 | diff --git a/docs/experiments-robust04.md b/docs/experiments-robust04.md index 69aca1b3ab..d600518575 100644 --- a/docs/experiments-robust04.md +++ b/docs/experiments-robust04.md @@ -63,7 +63,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -All Topics | 0.2531 | 0.2903 | 0.2895 | 0.2467 | 0.2747 | 0.2774 | +All Topics | 0.2531 | 0.2903 | 0.2896 | 0.2467 | 0.2747 | 0.2774 | P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | diff --git a/docs/experiments-robust05.md b/docs/experiments-robust05.md index 96cad0889e..9c50869363 100644 --- a/docs/experiments-robust05.md +++ b/docs/experiments-robust05.md @@ -62,7 +62,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -All Topics | 0.2031 | 0.2602 | 0.2584 | 0.2028 | 0.2491 | 0.2476 | +All Topics | 0.2032 | 0.2602 | 0.2587 | 0.2028 | 0.2491 | 0.2476 | P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | diff --git a/docs/experiments-wt10g.md b/docs/experiments-wt10g.md index cff0a0bf1d..21a24798f2 100644 --- a/docs/experiments-wt10g.md +++ b/docs/experiments-wt10g.md @@ -69,6 +69,6 @@ Wt10g: Topics 451-550 | 0.1992 | 0.2276 | 0.2200 | 0. P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -Wt10g: Topics 451-550 | 0.2218 | 0.2398 | 0.2483 | 0.2180 | 0.2310 | 0.2517 | +Wt10g: Topics 451-550 | 0.2214 | 0.2398 | 0.2483 | 0.2180 | 0.2310 | 0.2514 | diff --git a/src/main/resources/regression/car17.yaml b/src/main/resources/regression/car17.yaml index c2b08c8da8..56b46e2371 100644 --- a/src/main/resources/regression/car17.yaml +++ b/src/main/resources/regression/car17.yaml @@ -21,8 +21,8 @@ index_path: indexes/lucene-index.car17.pos+docvectors+rawdocs # path to the exis collection: CarCollection index_stats: documents: 29678360 - documents (non-empty): 29674409 - total terms: 1257896158 + documents (non-empty): 29674425 + total terms: 1257909884 topics: - name: "All Topics" path: topics.car17.test200.txt @@ -59,7 +59,7 @@ models: - -rm3 results: map: - - 0.1287 + - 0.1286 recip_rank: - 0.1788 - name: bm25+ax diff --git a/src/main/resources/regression/core17.yaml b/src/main/resources/regression/core17.yaml index bd928d6f2b..922b779cc0 100644 --- a/src/main/resources/regression/core17.yaml +++ b/src/main/resources/regression/core17.yaml @@ -22,7 +22,7 @@ collection: NewYorkTimesCollection index_stats: documents: 1831109 documents (non-empty): 1831109 - total terms: 720510677 + total terms: 720510680 topics: - name: "All Topics" path: topics.core17.txt @@ -70,7 +70,7 @@ models: - -axiom.deterministic results: map: - - 0.2700 + - 0.2701 p30: - 0.4927 - name: ql diff --git a/src/main/resources/regression/core18.yaml b/src/main/resources/regression/core18.yaml index af8fb27aad..6bb5eb33d9 100644 --- a/src/main/resources/regression/core18.yaml +++ b/src/main/resources/regression/core18.yaml @@ -22,7 +22,7 @@ collection: WashingtonPostCollection index_stats: documents: 595037 documents (non-empty): 595037 - total terms: 317882653 + total terms: 317898812 topics: - name: "All Topics" path: topics.core18.txt @@ -70,7 +70,7 @@ models: - -axiom.deterministic results: map: - - 0.2921 + - 0.2926 p30: - 0.4007 - name: ql diff --git a/src/main/resources/regression/cw09b.yaml b/src/main/resources/regression/cw09b.yaml index f8901e6ff4..054b3449bc 100644 --- a/src/main/resources/regression/cw09b.yaml +++ b/src/main/resources/regression/cw09b.yaml @@ -22,7 +22,7 @@ topic_reader: Webxml index_stats: documents: 50220189 documents (non-empty): 50220159 - total terms: 31270685466 + total terms: 31302554269 topics: - name: "TREC 2010 Web Track: Topics 51-100" path: topics.web.51-100.txt @@ -70,19 +70,19 @@ models: map: - 0.1126 - 0.1094 - - 0.1106 + - 0.1105 p30: - - 0.2681 + - 0.2694 - 0.2513 - 0.2167 ndcg20: - - 0.13539 - - 0.18901 - - 0.10141 + - 0.13537 + - 0.18900 + - 0.10139 err20: - 0.07335 - 0.09592 - - 0.13036 + - 0.13031 - name: bm25+rm3 params: - -bm25 @@ -90,20 +90,20 @@ models: results: map: - 0.0933 - - 0.1081 + - 0.1085 - 0.1107 p30: - 0.2389 - - 0.2467 + - 0.2480 - 0.1920 ndcg20: - - 0.13690 - - 0.19164 - - 0.09170 + - 0.13693 + - 0.19160 + - 0.09182 err20: - - 0.07470 - - 0.09597 - - 0.14933 + - 0.07473 + - 0.09596 + - 0.14936 - name: bm25+ax params: - -bm25 @@ -113,21 +113,21 @@ models: - -axiom.beta 0.1 results: map: - - 0.0928 - - 0.0974 + - 0.0929 + - 0.0975 - 0.1315 p30: - 0.2354 - - 0.2393 + - 0.2387 - 0.2553 ndcg20: - - 0.16375 - - 0.18330 + - 0.16319 + - 0.18348 - 0.14413 err20: - - 0.09815 - - 0.10909 - - 0.23554 + - 0.09771 + - 0.10912 + - 0.23551 - name: ql params: - -ql @@ -141,12 +141,12 @@ models: - 0.2147 - 0.2080 ndcg20: - - 0.11431 - - 0.16192 + - 0.11432 + - 0.16191 - 0.08682 err20: - 0.05994 - - 0.08487 + - 0.08486 - 0.13052 - name: ql+rm3 params: @@ -155,19 +155,19 @@ models: results: map: - 0.1019 - - 0.0837 - - 0.1059 + - 0.0839 + - 0.1058 p30: - 0.2312 - - 0.2067 + - 0.2047 - 0.1980 ndcg20: - - 0.11852 - - 0.14469 + - 0.11823 + - 0.14487 - 0.08959 err20: - - 0.05920 - - 0.07861 + - 0.05917 + - 0.07872 - 0.13336 - name: ql+ax params: @@ -183,13 +183,13 @@ models: - 0.1212 p30: - 0.2618 - - 0.2167 - - 0.2140 + - 0.2173 + - 0.2147 ndcg20: - 0.14541 - - 0.15091 - - 0.10296 + - 0.15174 + - 0.10373 err20: - 0.07424 - - 0.08203 - - 0.15575 + - 0.08205 + - 0.15577 diff --git a/src/main/resources/regression/cw12.yaml b/src/main/resources/regression/cw12.yaml index 2e221ee26a..1afdf07f67 100644 --- a/src/main/resources/regression/cw12.yaml +++ b/src/main/resources/regression/cw12.yaml @@ -21,8 +21,8 @@ index_options: topic_reader: Webxml index_stats: documents: 731705088 - documents (non-empty): 731556725 - total terms: 428628865985 + documents (non-empty): 731556853 + total terms: 429328271635 topics: - name: "[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)" path: topics.web.201-250.txt @@ -65,17 +65,17 @@ models: - -bm25 results: map: - - 0.1695 + - 0.1694 - 0.2469 p30: - - 0.2767 - - 0.4533 + - 0.2773 + - 0.4547 ndcg20: - - 0.20858 - - 0.25776 + - 0.20881 + - 0.25719 err20: - - 0.12835 - - 0.16305 + - 0.12838 + - 0.16162 - name: bm25+rm3 params: - -bm25 @@ -83,13 +83,13 @@ models: results: map: - 0.1464 - - 0.2325 + - 0.2324 p30: - - 0.2387 - - 0.4073 + - 0.2393 + - 0.4080 ndcg20: - 0.20327 - - 0.25304 + - 0.25303 err20: - 0.12637 - 0.16550 @@ -98,32 +98,32 @@ models: - -ql results: map: - - 0.1493 - - 0.2467 + - 0.1494 + - 0.2466 p30: - - 0.2613 + - 0.2607 - 0.4380 ndcg20: - 0.19935 - - 0.22282 + - 0.22184 err20: - - 0.12319 - - 0.13211 + - 0.12325 + - 0.13218 - name: ql+rm3 params: - -ql - -rm3 results: map: - - 0.1291 - - 0.2168 + - 0.1290 + - 0.2177 p30: - 0.2347 - - 0.3793 + - 0.3800 ndcg20: - 0.17253 - - 0.20662 + - 0.20829 err20: - - 0.10084 - - 0.12179 + - 0.10083 + - 0.12450 diff --git a/src/main/resources/regression/cw12b13.yaml b/src/main/resources/regression/cw12b13.yaml index 584fd76759..6de24bb7d5 100644 --- a/src/main/resources/regression/cw12b13.yaml +++ b/src/main/resources/regression/cw12b13.yaml @@ -21,8 +21,8 @@ index_options: topic_reader: Webxml index_stats: documents: 52249039 - documents (non-empty): 52238521 - total terms: 30617038149 + documents (non-empty): 52238526 + total terms: 30666923268 topics: - name: "[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)" path: topics.web.201-250.txt @@ -72,27 +72,27 @@ models: - 0.1273 ndcg20: - 0.12862 - - 0.11849 + - 0.11835 err20: - - 0.08379 - - 0.12013 + - 0.08378 + - 0.12006 - name: bm25+rm3 params: - -bm25 - -rm3 results: map: - - 0.0412 + - 0.0408 - 0.0210 p30: - - 0.1713 + - 0.1673 - 0.1207 ndcg20: - - 0.11293 - - 0.10796 + - 0.11192 + - 0.10809 err20: - - 0.07629 - - 0.10653 + - 0.07530 + - 0.10662 - name: bm25+ax params: - -bm25 @@ -105,14 +105,14 @@ models: - 0.0435 - 0.0180 p30: - - 0.1840 + - 0.1833 - 0.1107 ndcg20: - - 0.12875 - - 0.09637 + - 0.12867 + - 0.09627 err20: - - 0.09430 - - 0.09289 + - 0.09413 + - 0.09285 - name: ql params: - -ql @@ -121,14 +121,14 @@ models: - 0.0397 - 0.0235 p30: - - 0.1767 + - 0.1780 - 0.1373 ndcg20: - - 0.11067 + - 0.11059 - 0.11765 err20: - - 0.07689 - - 0.10908 + - 0.07679 + - 0.10917 - name: ql+rm3 params: - -ql @@ -138,14 +138,14 @@ models: - 0.0322 - 0.0203 p30: - - 0.1507 + - 0.1513 - 0.1173 ndcg20: - 0.09199 - - 0.10035 + - 0.10036 err20: - 0.05525 - - 0.09289 + - 0.09284 - name: ql+ax params: - -ql @@ -155,14 +155,14 @@ models: - -axiom.beta 0.1 results: map: - - 0.0359 - - 0.0186 + - 0.0358 + - 0.0183 p30: - - 0.1513 - - 0.1167 + - 0.1507 + - 0.1147 ndcg20: - - 0.11435 - - 0.10013 + - 0.11407 + - 0.09891 err20: - - 0.07800 - - 0.08965 + - 0.07803 + - 0.09002 diff --git a/src/main/resources/regression/gov2.yaml b/src/main/resources/regression/gov2.yaml index f0c23ec988..f5f9917c8a 100644 --- a/src/main/resources/regression/gov2.yaml +++ b/src/main/resources/regression/gov2.yaml @@ -39,7 +39,7 @@ evals: index_stats: documents: 25172934 documents (non-empty): 25170664 - total terms: 17343119816 + total terms: 17345062322 topics: - name: "[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)" path: topics.701-750.txt @@ -85,11 +85,11 @@ models: - -axiom.deterministic results: map: - - 0.2665 - - 0.3664 + - 0.2669 + - 0.3666 - 0.3069 p30: - - 0.4986 + - 0.4993 - 0.5933 - 0.5033 - name: ql @@ -99,7 +99,7 @@ models: map: - 0.2681 - 0.3303 - - 0.2996 + - 0.2997 p30: - 0.4755 - 0.5347 diff --git a/src/main/resources/regression/mb11.yaml b/src/main/resources/regression/mb11.yaml index 173b3b48a3..76cbd7dec7 100644 --- a/src/main/resources/regression/mb11.yaml +++ b/src/main/resources/regression/mb11.yaml @@ -112,7 +112,7 @@ models: - 0.2389 p30: - 0.4435 - - 0.3520 + - 0.3514 - name: ql+ax params: - -searchtweets diff --git a/src/main/resources/regression/robust04.yaml b/src/main/resources/regression/robust04.yaml index 5193b946cb..6186e90db3 100644 --- a/src/main/resources/regression/robust04.yaml +++ b/src/main/resources/regression/robust04.yaml @@ -40,7 +40,7 @@ index_path: indexes/lucene-index.robust04.pos+docvectors+rawdocs # path to the e index_stats: documents: 528030 documents (non-empty): 528030 - total terms: 174540587 + total terms: 174540872 topics: - name: "All Topics" path: topics.robust04.301-450.601-700.txt @@ -71,7 +71,7 @@ models: - -axiom.deterministic results: map: - - 0.2895 + - 0.2896 p30: - 0.3333 - name: ql diff --git a/src/main/resources/regression/robust05.yaml b/src/main/resources/regression/robust05.yaml index 500a386f39..3d61dbc57f 100644 --- a/src/main/resources/regression/robust05.yaml +++ b/src/main/resources/regression/robust05.yaml @@ -51,7 +51,7 @@ models: - -bm25 results: map: - - 0.2031 + - 0.2032 p30: - 0.3693 - name: bm25+rm3 @@ -71,7 +71,7 @@ models: - -axiom.deterministic results: map: - - 0.2584 + - 0.2587 p30: - 0.4120 - name: ql diff --git a/src/main/resources/regression/wt10g.yaml b/src/main/resources/regression/wt10g.yaml index fceec4ffac..6db937ec82 100644 --- a/src/main/resources/regression/wt10g.yaml +++ b/src/main/resources/regression/wt10g.yaml @@ -39,8 +39,8 @@ input: collections/web/wt10g/ index_path: indexes/lucene-index.wt10g.pos+docvectors+rawdocs # path to the existing index, used in regression test if `--index` option is absent index_stats: documents: 1688402 - documents (non-empty): 1688290 - total terms: 752326031 + documents (non-empty): 1688291 + total terms: 752790242 topics: - name: "Wt10g: Topics 451-550" path: topics.451-550.txt @@ -53,7 +53,7 @@ models: map: - 0.1992 p30: - - 0.2218 + - 0.2214 - name: bm25+rm3 params: - -bm25 @@ -103,4 +103,4 @@ models: map: - 0.2275 p30: - - 0.2517 + - 0.2514 From 5b4c78585bcffdeddd0e52cd4f762cb48233e140 Mon Sep 17 00:00:00 2001 From: lintool Date: Fri, 7 Jun 2019 11:41:57 -0400 Subject: [PATCH 4/6] Fixed compiler error and regressions. --- src/main/java/io/anserini/search/SimpleSearcher.java | 6 +++--- src/main/resources/regression/car17v2.0.yaml | 4 ++-- src/main/resources/regression/core17.yaml | 4 ++-- src/main/resources/regression/core18.yaml | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/main/java/io/anserini/search/SimpleSearcher.java b/src/main/java/io/anserini/search/SimpleSearcher.java index 6642fdb71e..c15221f2e6 100644 --- a/src/main/java/io/anserini/search/SimpleSearcher.java +++ b/src/main/java/io/anserini/search/SimpleSearcher.java @@ -189,14 +189,14 @@ protected Result[] search(Query query, List queryTokens, String queryStr builder.add(query, BooleanClause.Occur.MUST); Query compositeQuery = builder.build(); rs = searcher.search(compositeQuery, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true); - context = new RerankerContext<>(searcher, null, compositeQuery, null, q, queryTokens, filter, searchArgs); + context = new RerankerContext<>(searcher, null, compositeQuery, null, queryString, queryTokens, filter, searchArgs); } else { rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true); - context = new RerankerContext<>(searcher, null, query, null, q, queryTokens, null, searchArgs); + context = new RerankerContext<>(searcher, null, query, null, queryString, queryTokens, null, searchArgs); } } else { rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true); - context = new RerankerContext<>(searcher, null, query, null, q, queryTokens, null, searchArgs); + context = new RerankerContext<>(searcher, null, query, null, queryString, queryTokens, null, searchArgs); } ScoredDocuments hits = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context); diff --git a/src/main/resources/regression/car17v2.0.yaml b/src/main/resources/regression/car17v2.0.yaml index 551eb028c0..f1ab5cfcb5 100644 --- a/src/main/resources/regression/car17v2.0.yaml +++ b/src/main/resources/regression/car17v2.0.yaml @@ -21,8 +21,8 @@ index_path: indexes/lucene-index.car17v2.0.pos+docvectors+rawdocs collection: CarCollection index_stats: documents: 29794689 - documents (non-empty): 29791041 - total terms: 1249740109 + documents (non-empty): 29791059 + total terms: 1249754054 topics: - name: "benchmarkY1test" path: topics.car17v2.0.benchmarkY1test.txt diff --git a/src/main/resources/regression/core17.yaml b/src/main/resources/regression/core17.yaml index bd20578ba8..f9c022d9d1 100644 --- a/src/main/resources/regression/core17.yaml +++ b/src/main/resources/regression/core17.yaml @@ -22,7 +22,7 @@ collection: NewYorkTimesCollection index_stats: documents: 1855649 documents (non-empty): 1855649 - total terms: 751034051 + total terms: 751034054 topics: - name: "All Topics" path: topics.core17.txt @@ -70,7 +70,7 @@ models: - -axiom.deterministic results: map: - - 0.2787 + - 0.2788 p30: - 0.4980 - name: ql diff --git a/src/main/resources/regression/core18.yaml b/src/main/resources/regression/core18.yaml index d36cdda743..2ff64295cb 100644 --- a/src/main/resources/regression/core18.yaml +++ b/src/main/resources/regression/core18.yaml @@ -21,8 +21,8 @@ index_path: indexes/lucene-index.core18.pos+docvectors+rawdocs # path to the exi collection: WashingtonPostCollection index_stats: documents: 595037 - documents (non-empty): 595037 - total terms: 317898812 + documents (non-empty): 595030 + total terms: 318219945 topics: - name: "All Topics" path: topics.core18.txt @@ -59,7 +59,7 @@ models: - -rm3 results: map: - - 0.3136 + - 0.3135 p30: - 0.4200 - name: bm25+ax @@ -70,7 +70,7 @@ models: - -axiom.deterministic results: map: - - 0.2920 + - 0.2925 p30: - 0.4027 - name: ql From bebbda0c5dd0a0b02e7c6946007591482674194a Mon Sep 17 00:00:00 2001 From: lintool Date: Fri, 7 Jun 2019 20:20:12 -0400 Subject: [PATCH 5/6] Tweaks. --- docs/experiments-core17.md | 2 +- docs/experiments-core18.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/experiments-core17.md b/docs/experiments-core17.md index 6acda9e0c1..9dbf1bdd5b 100644 --- a/docs/experiments-core17.md +++ b/docs/experiments-core17.md @@ -64,7 +64,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -All Topics | 0.2087 | 0.2823 | 0.2787 | 0.2032 | 0.2606 | 0.2613 | +All Topics | 0.2087 | 0.2823 | 0.2788 | 0.2032 | 0.2606 | 0.2613 | P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | diff --git a/docs/experiments-core18.md b/docs/experiments-core18.md index fafa740668..38330b6170 100644 --- a/docs/experiments-core18.md +++ b/docs/experiments-core18.md @@ -64,7 +64,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -All Topics | 0.2495 | 0.3136 | 0.2920 | 0.2526 | 0.3073 | 0.2966 | +All Topics | 0.2495 | 0.3135 | 0.2925 | 0.2526 | 0.3073 | 0.2966 | P30 | BM25 | BM25+RM3 | BM25+AX | QL | QL+RM3 | QL+AX | From b485d236246532d5186902deece179f0ef65f2c8 Mon Sep 17 00:00:00 2001 From: lintool Date: Tue, 11 Jun 2019 23:06:10 -0400 Subject: [PATCH 6/6] Fixed broken regressions. --- docs/regressions-car17v1.5.md | 4 ++-- docs/regressions-msmarco-doc.md | 4 ++-- docs/regressions-msmarco-passage.md | 2 +- src/main/resources/regression/car17v1.5.yaml | 4 ++-- src/main/resources/regression/msmarco-doc.yaml | 8 ++++---- src/main/resources/regression/msmarco-passage.yaml | 6 +++--- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/regressions-car17v1.5.md b/docs/regressions-car17v1.5.md index b73e776dfc..b5873f528e 100644 --- a/docs/regressions-car17v1.5.md +++ b/docs/regressions-car17v1.5.md @@ -64,11 +64,11 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)| 0.1563 | 0.1295 | 0.1358 | 0.1386 | 0.1080 | 0.1048 | +[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)| 0.1562 | 0.1295 | 0.1358 | 0.1386 | 0.1080 | 0.1048 | RECIP_RANK | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)| 0.2336 | 0.1923 | 0.1949 | 0.2037 | 0.1599 | 0.1524 | +[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)| 0.2331 | 0.1923 | 0.1949 | 0.2037 | 0.1599 | 0.1524 | diff --git a/docs/regressions-msmarco-doc.md b/docs/regressions-msmarco-doc.md index 4ce79b929d..6aaf2966d0 100644 --- a/docs/regressions-msmarco-doc.md +++ b/docs/regressions-msmarco-doc.md @@ -47,11 +47,11 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | :---------------------------------------|-----------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)| 0.2308 | 0.1631 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)| 0.2310 | 0.1632 | R@1000 | BM25 | +RM3 | :---------------------------------------|-----------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)| 0.8856 | 0.8787 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)| 0.8856 | 0.8785 | diff --git a/docs/regressions-msmarco-passage.md b/docs/regressions-msmarco-passage.md index 202a3fb53d..d8b29f8b6d 100644 --- a/docs/regressions-msmarco-passage.md +++ b/docs/regressions-msmarco-passage.md @@ -56,7 +56,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 (Default)| +RM3 | BM25 (Tuned)| +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.1924 | 0.1661 | 0.1956 | 0.1766 | +[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.1926 | 0.1661 | 0.1957 | 0.1766 | R@1000 | BM25 (Default)| +RM3 | BM25 (Tuned)| +RM3 | diff --git a/src/main/resources/regression/car17v1.5.yaml b/src/main/resources/regression/car17v1.5.yaml index ed6b77f4d2..e13410f6ea 100644 --- a/src/main/resources/regression/car17v1.5.yaml +++ b/src/main/resources/regression/car17v1.5.yaml @@ -51,9 +51,9 @@ models: - -bm25 results: map: - - 0.1563 + - 0.1562 recip_rank: - - 0.2336 + - 0.2331 - name: bm25+rm3 display: +RM3 params: diff --git a/src/main/resources/regression/msmarco-doc.yaml b/src/main/resources/regression/msmarco-doc.yaml index dcf364fb78..7fdf92776f 100644 --- a/src/main/resources/regression/msmarco-doc.yaml +++ b/src/main/resources/regression/msmarco-doc.yaml @@ -42,7 +42,7 @@ index_path: indexes/lucene-index.msmarco-doc.pos+docvectors+rawdocs index_stats: documents: 3213835 documents (non-empty): 3213835 - total terms: 2746735247 + total terms: 2748636047 topics: - name: "[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)" path: topics.msmarco-doc.dev.txt @@ -54,7 +54,7 @@ models: - -bm25 results: map: - - 0.2308 + - 0.2310 R@1000: - 0.8856 - name: bm25+rm3 @@ -64,6 +64,6 @@ models: - -rm3 results: map: - - 0.1631 + - 0.1632 R@1000: - - 0.8787 + - 0.8785 diff --git a/src/main/resources/regression/msmarco-passage.yaml b/src/main/resources/regression/msmarco-passage.yaml index 8dc22f5af6..c7d46f6380 100644 --- a/src/main/resources/regression/msmarco-passage.yaml +++ b/src/main/resources/regression/msmarco-passage.yaml @@ -42,7 +42,7 @@ index_path: indexes/lucene-index.msmarco-passage.pos+docvectors+rawdocs index_stats: documents: 8841823 documents (non-empty): 8841823 - total terms: 352122244 + total terms: 352316036 topics: - name: "[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)" path: topics.msmarco-passage.dev-subset.txt @@ -54,7 +54,7 @@ models: - -bm25 results: map: - - 0.1924 + - 0.1926 R@1000: - 0.8526 - name: bm25-default+rm3 @@ -75,7 +75,7 @@ models: - -b 0.72 results: map: - - 0.1956 + - 0.1957 R@1000: - 0.8578 - name: bm25-tuned+rm3