queryTokens, String queryStr
builder.add(filter, BooleanClause.Occur.FILTER);
builder.add(query, BooleanClause.Occur.MUST);
Query compositeQuery = builder.build();
- rs = searcher.search(compositeQuery, isRerank ?
- searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true, true);
+ rs = searcher.search(compositeQuery, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true);
context = new RerankerContext<>(searcher, null, compositeQuery, null, queryString, queryTokens, filter, searchArgs);
} else {
- rs = searcher.search(query, isRerank ?
- searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true, true);
+ rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true);
context = new RerankerContext<>(searcher, null, query, null, queryString, queryTokens, null, searchArgs);
}
} else {
- rs = searcher.search(query, isRerank ?
- searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true, true);
+ rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true);
context = new RerankerContext<>(searcher, null, query, null, queryString, queryTokens, null, searchArgs);
}
diff --git a/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java b/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java
deleted file mode 100644
index 12b05c9e5b..0000000000
--- a/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java
+++ /dev/null
@@ -1,389 +0,0 @@
-/**
- * Anserini: A Lucene toolkit for replicable information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.search.similarity;
-
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.NumericDocValues;
-import org.apache.lucene.search.CollectionStatistics;
-import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.TermStatistics;
-import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.SmallFloat;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Hui Fang and ChengXiang Zhai. 2005. An exploration of axiomatic approaches to information retrieval.
- * In Proceedings of the 28th annual international ACM SIGIR conference on Research and development in
- * information retrieval (SIGIR '05). ACM, New York, NY, USA, 480-487.
- */
-public abstract class AxiomaticSimilarity extends Similarity {
- protected final float s;
- /** Cache of decoded bytes. */
- protected static final float[] OLD_LENGTH_TABLE = new float[256];
- protected static final float[] LENGTH_TABLE = new float[256];
-
- static {
- for (int i = 1; i < 256; i++) {
- float f = SmallFloat.byte315ToFloat((byte)i);
- OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
- }
- OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
-
- for (int i = 0; i < 256; i++) {
- LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
- }
- }
-
- /**
- * @param s Generic parater s
- * @throws IllegalArgumentException if {@code s} is infinite or if {@code s} is
- * not within the range {@code [0..1]}
- */
- AxiomaticSimilarity(float s) {
- if (Float.isNaN(s) || s < 0 || s > 1) {
- throw new IllegalArgumentException("illegal s value: " + s + ", must be between 0 and 1");
- }
- this.s = s;
- }
-
- /** Default parameter:
- *
- */
- AxiomaticSimilarity() {
- this(0.5f);
- }
-
- /** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>.
- *
- * @param docFreq terms's document frequency
- * @param docCount total document count in the index
- * @return inverted document frequency
- * */
- float idf(long docFreq, long docCount) {
- throw new UnsupportedOperationException();
- }
-
- /** Implemented as <code>1 / (distance + 1)</code>.
- *
- * @param distance distance
- * @return sloppy frequency
- * */
- float sloppyFreq(int distance) {
- return 1.0f / (distance + 1);
- }
-
- /** The default implementation returns 1
- *
- * @param doc doc
- * @param start start
- * @param end end
- * @param payload payload
- * @return 1
- * */
- float scorePayload(int doc, int start, int end, BytesRef payload) {
- return 1;
- }
-
- /** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
- * or returns <code>1</code> if the index does not store sumTotalTermFreq:
- * any field that omits frequency information).
- *
- * @param collectionStats collection-wide statistics
- * @return average document length of FIELD_BODY
- * */
- float avgFieldLength(CollectionStatistics collectionStats) {
- final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
- if (sumTotalTermFreq <= 0) {
- return 1f; // field does not exist, or stat is unsupported
- } else {
- final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
- return (float) (sumTotalTermFreq / (double) docCount);
- }
- }
-
- /**
- * True if overlap tokens (tokens with a position of increment of zero) are
- * discounted from the document's length.
- */
- boolean discountOverlaps = true;
-
- /** Sets whether overlap tokens (Tokens with 0 position increment) are
- * ignored when computing norm. By default this is true, meaning overlap
- * tokens do not count when computing norms.
- *
- * @param v v
- * */
- public void setDiscountOverlaps(boolean v) {
- discountOverlaps = v;
- }
-
- /**
- * Returns true if overlap tokens are discounted from the document's length.
- * @see #setDiscountOverlaps
- *
- * @return discountOverlaps
- */
- public boolean getDiscountOverlaps() {
- return discountOverlaps;
- }
-
- /** Cache of decoded bytes. */
- private static final float[] NORM_TABLE = new float[256];
-
- static {
- for (int i = 1; i < 256; i++) {
- float f = SmallFloat.byte315ToFloat((byte)i);
- NORM_TABLE[i] = 1.0f / (f*f);
- }
- NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
- }
-
-
- @Override
- public final long computeNorm(FieldInvertState state) {
- final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
- int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
- if (indexCreatedVersionMajor >= 7) {
- return SmallFloat.intToByte4(numTerms);
- } else {
- return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
- }
- }
-
- /**
- * Computes a score factor for a simple term and returns an explanation
- * for that score factor.
- *
- *
- * The default implementation uses:
- *
- *
- * idf(docFreq, docCount);
- *
- *
- * Note that {@link CollectionStatistics#docCount()} is used instead of
- * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
- * {@link TermStatistics#docFreq()} is used, and when the latter
- * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
- * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
- *
- * @param collectionStats collection-level statistics
- * @param termStats term-level statistics for the term
- * @return an Explain object that includes both an idf score factor
- * and an explanation for the term.
- */
- public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
- final long df = termStats.docFreq();
- final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
- final float idf = idf(df, docCount);
- return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
- }
-
- /**
- * Computes a score factor for a phrase.
- *
- *
- * The default implementation sums the idf factor for
- * each term in the phrase.
- *
- * @param collectionStats collection-level statistics
- * @param termStats term-level statistics for the terms in the phrase
- * @return an Explain object that includes both an idf
- * score factor for the phrase and an explanation
- * for each term.
- */
- public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
- final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
- double idf = 0d;
- List details = new ArrayList<>();
- for (final TermStatistics stat : termStats ) {
- final long df = stat.docFreq();
- final float termIdf = idf(df, docCount);
- details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
- idf += termIdf;
- }
- return Explanation.match((float)idf, "idf(), sum of:", details);
- }
-
- @Override
- public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
- Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
- float avgdl = avgFieldLength(collectionStats);
-
- float[] oldCache = new float[256];
- float[] cache = new float[256];
- for (int i = 0; i < cache.length; i++) {
- oldCache[i] = s + s * OLD_LENGTH_TABLE[i] / avgdl;
- cache[i] = s + s * LENGTH_TABLE[i] / avgdl;
- }
- return new Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache);
- }
-
-
- @Override
- public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
- Stats axStats = (Stats) stats;
- return new AxDocScorer(axStats, context.reader().getMetaData().getCreatedVersionMajor(), context.reader().getNormValues(axStats.field));
- }
-
- /** DocumentCollection statistics for the F2Log model. */
- static class Stats extends SimWeight {
- /** F2Log's idf */
- public final Explanation idf;
- /** The average document length. */
- public final float avgdl;
- /** query boost */
- public float boost;
- /** weight (idf * boost) */
- public float weight;
- /** field name, for pulling norms */
- public final String field;
- /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
- * for both OLD_LENGTH_TABLE and LENGTH_TABLE */
- private final float[] oldCache, cache;
-
- Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) {
- this.field = field;
- this.idf = idf;
- this.avgdl = avgdl;
- this.weight = idf.getValue() * boost;
- this.oldCache = oldCache;
- this.cache = cache;
- }
- }
-
- class AxDocScorer extends SimScorer {
- private final Stats stats;
- private final float weightValue; // boost * idf
- private final NumericDocValues norms;
- /** precomputed cache for all length values */
- private final float[] lengthCache;
- /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
- private final float[] cache;
-
- AxDocScorer(Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
- this.stats = stats;
- this.weightValue = stats.weight;
- this.norms = norms;
- if (indexCreatedVersionMajor >= 7) {
- lengthCache = LENGTH_TABLE;
- cache = stats.cache;
- } else {
- lengthCache = OLD_LENGTH_TABLE;
- cache = stats.oldCache;
- }
- }
-
- /* Score function is:
- *
-                                                   occurrences
- score = termWeight * IDF * ---------------------------------------------------------
-                               occurrences + s + documentLength * s / avgDocLength
-
- */
- @Override
- public float score(int doc, float freq) throws IOException {
- // if there are no norms, we act as if b=0
- float norm;
- if (norms == null) {
- norm = 0.0f;
- } else {
- if (norms.advanceExact(doc)) {
- norm = cache[((byte) norms.longValue()) & 0xFF];
- } else {
- norm = cache[0];
- }
- }
- return weightValue * freq / (freq + norm);
- }
-
- @Override
- public Explanation explain(int doc, Explanation freq) throws IOException {
- return explainScore(doc, freq, stats, norms, lengthCache);
- }
-
- @Override
- public float computeSlopFactor(int distance) {
- return sloppyFreq(distance);
- }
-
- @Override
- public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
- return scorePayload(doc, start, end, payload);
- }
- }
-
- private Explanation explainTFNorm(int doc, Explanation freq, Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
- List subs = new ArrayList<>();
- subs.add(freq);
- subs.add(Explanation.match(s, "parameter s"));
- if (norms == null) {
- subs.add(Explanation.match(0, "norm"));
- return Explanation.match(1,
- "tfNorm, computed as constant from:", subs);
- } else {
- byte norm;
- if (norms.advanceExact(doc)) {
- norm = (byte) norms.longValue();
- } else {
- norm = 0;
- }
- float doclen = lengthCache[norm & 0xff];
- subs.add(Explanation.match(stats.avgdl, "avgFieldLength"));
- subs.add(Explanation.match(doclen, "fieldLength"));
- return Explanation.match(
- (freq.getValue() / (freq.getValue() + s + s * doclen/stats.avgdl)),
- "tfNorm, computed as (freq / (freq + s + s * fieldLength / avgFieldLength) from:", subs);
- }
- }
-
-
- private Explanation explainScore(int doc, Explanation freq, Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
- Explanation boostExpl = Explanation.match(stats.boost, "boost");
- List subs = new ArrayList<>();
- if (boostExpl.getValue() != 1.0f)
- subs.add(boostExpl);
- subs.add(stats.idf);
- Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache);
- subs.add(tfNormExpl);
- return Explanation.match(
- boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(),
- "score(doc="+doc+",freq="+freq+"), product of:", subs);
- }
-
- @Override
- public String toString() {
- throw new UnsupportedOperationException();
- }
-
- /**
- * Returns the <code>b</code> parameter
- * @see #AxiomaticSimilarity(float)
- *
- * @return s
- */
- public float getS() {
- return s;
- }
-}
diff --git a/src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java b/src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java
deleted file mode 100644
index ce24a00f61..0000000000
--- a/src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Anserini: A Lucene toolkit for replicable information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.search.similarity;
-
-public class F2ExpSimilarity extends AxiomaticSimilarity {
- private final float k = 0.35f;
-
- /**
- * F2Exp with the supplied parameter values.
- * @param s Controls to what degree document length normalizes tf values.
- * @throws IllegalArgumentException if {@code s} is infinite or if {@code s} is
- * not within the range {@code [0..1]}
- */
- public F2ExpSimilarity(float s) {
- super(s);
- }
-
- /** F2Exp with these default values:
- *
- */
- public F2ExpSimilarity() {
- this(0.5f);
- }
-
- @Override
- float idf(long docFreq, long docCount) {
- return (float) Math.pow((docCount + 1.0) / docFreq, this.k);
- }
-
- @Override
- public String toString() {
- return "F2Exp(s=" + s +")";
- }
-
- /**
- * Returns the <code>k</code> parameter
- * @see #F2ExpSimilarity(float)
- * @return k
- */
- public float getK() {
- return k;
- }
-}
diff --git a/src/main/java/io/anserini/search/similarity/F2LogSimilarity.java b/src/main/java/io/anserini/search/similarity/F2LogSimilarity.java
deleted file mode 100644
index 7967b7b5d9..0000000000
--- a/src/main/java/io/anserini/search/similarity/F2LogSimilarity.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Anserini: A Lucene toolkit for replicable information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.search.similarity;
-
-public class F2LogSimilarity extends AxiomaticSimilarity {
- /**
- * F2Log with the supplied parameter values.
- * @param s Controls to what degree document length normalizes tf values.
- * @throws IllegalArgumentException if {@code s} is infinite or if {@code s} is
- * not within the range {@code [0..1]}
- */
- public F2LogSimilarity(float s) {
- super(s);
- }
-
- /** F2Log with these default values:
- *
- */
- public F2LogSimilarity() {
- this(0.5f);
- }
-
- @Override
- float idf(long docFreq, long docCount) {
- return (float) Math.log((1.0f + docCount) / docFreq);
- }
-
- @Override
- public String toString() {
- return "F2Log(s=" + s +")";
- }
-}
diff --git a/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java b/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java
index b263965723..da1c077b5d 100644
--- a/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java
+++ b/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java
@@ -17,13 +17,10 @@
package io.anserini.search.similarity;
import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
-import java.io.IOException;
-
/**
* Similarity that uses a Ranklib ranker to compute the score
*/
@@ -34,12 +31,8 @@ public long computeNorm(FieldInvertState fieldInvertState) {
}
@Override
- public SimWeight computeWeight(float boost, CollectionStatistics collectionStatistics, TermStatistics... termStatistics) {
+ public SimScorer scorer(float boost, CollectionStatistics collectionStatistics, TermStatistics... termStatistics) {
return null;
}
- @Override
- public SimScorer simScorer(SimWeight simWeight, LeafReaderContext leafReaderContext) throws IOException {
- return null;
- }
}
diff --git a/src/main/java/io/anserini/util/ExtractTopDfTerms.java b/src/main/java/io/anserini/util/ExtractTopDfTerms.java
index 6c33e38619..b5d12c1007 100644
--- a/src/main/java/io/anserini/util/ExtractTopDfTerms.java
+++ b/src/main/java/io/anserini/util/ExtractTopDfTerms.java
@@ -20,7 +20,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@@ -95,7 +95,7 @@ public int compare(Pair p1, Pair p2) {
PriorityQueue queue = new PriorityQueue(myArgs.topK, comp);
LOG.info("Starting to iterate through all terms...");
- Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
+ Terms terms = MultiTerms.getTerms(reader, myArgs.field);
TermsEnum termsEnum = terms.iterator();
BytesRef text;
int cnt = 0;
diff --git a/src/main/resources/regression/car17v1.5.yaml b/src/main/resources/regression/car17v1.5.yaml
index cd85d2a4c0..e13410f6ea 100644
--- a/src/main/resources/regression/car17v1.5.yaml
+++ b/src/main/resources/regression/car17v1.5.yaml
@@ -21,8 +21,8 @@ index_path: indexes/lucene-index.car17v1.5.pos+docvectors+rawdocs # path to the
collection: CarCollection
index_stats:
documents: 29678360
- documents (non-empty): 29674409
- total terms: 1257896158
+ documents (non-empty): 29674425
+ total terms: 1257909884
topics:
- name: "[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)"
path: topics.car17v1.5.benchmarkY1test.txt
@@ -51,9 +51,9 @@ models:
- -bm25
results:
map:
- - 0.1563
+ - 0.1562
recip_rank:
- - 0.2336
+ - 0.2331
- name: bm25+rm3
display: +RM3
params:
diff --git a/src/main/resources/regression/car17v2.0.yaml b/src/main/resources/regression/car17v2.0.yaml
index e9eac18e80..e84552d897 100644
--- a/src/main/resources/regression/car17v2.0.yaml
+++ b/src/main/resources/regression/car17v2.0.yaml
@@ -21,8 +21,8 @@ index_path: indexes/lucene-index.car17v2.0.pos+docvectors+rawdocs
collection: CarCollection
index_stats:
documents: 29794689
- documents (non-empty): 29791041
- total terms: 1249740109
+ documents (non-empty): 29791059
+ total terms: 1249754054
topics:
- name: "[TREC 2017 CAR: benchmarkY1test (v2.0)](http://trec-car.cs.unh.edu/datareleases/)"
path: topics.car17v2.0.benchmarkY1test.txt
diff --git a/src/main/resources/regression/core17.yaml b/src/main/resources/regression/core17.yaml
index c8f80c88f7..b91ebee6a8 100644
--- a/src/main/resources/regression/core17.yaml
+++ b/src/main/resources/regression/core17.yaml
@@ -22,7 +22,7 @@ collection: NewYorkTimesCollection
index_stats:
documents: 1855649
documents (non-empty): 1855649
- total terms: 751034051
+ total terms: 751034054
topics:
- name: "[TREC 2017 Common Core Track Topics](https://trec.nist.gov/data/core/core_nist.txt)"
path: topics.core17.txt
@@ -73,7 +73,7 @@ models:
- -axiom.deterministic
results:
map:
- - 0.2787
+ - 0.2788
p30:
- 0.4980
- name: ql
diff --git a/src/main/resources/regression/core18.yaml b/src/main/resources/regression/core18.yaml
index 9ce7cd6419..658204bacb 100644
--- a/src/main/resources/regression/core18.yaml
+++ b/src/main/resources/regression/core18.yaml
@@ -22,7 +22,7 @@ collection: WashingtonPostCollection
index_stats:
documents: 595037
documents (non-empty): 595030
- total terms: 318203786
+ total terms: 318219945
topics:
- name: "[TREC 2018 Common Core Track Topics](https://trec.nist.gov/data/core/topics2018.txt)"
path: topics.core18.txt
@@ -61,7 +61,7 @@ models:
- -rm3
results:
map:
- - 0.3136
+ - 0.3135
p30:
- 0.4200
- name: bm25+ax
@@ -73,7 +73,7 @@ models:
- -axiom.deterministic
results:
map:
- - 0.2920
+ - 0.2925
p30:
- 0.4027
- name: ql
diff --git a/src/main/resources/regression/cw09b.yaml b/src/main/resources/regression/cw09b.yaml
index 293a03d1c2..55642ea662 100644
--- a/src/main/resources/regression/cw09b.yaml
+++ b/src/main/resources/regression/cw09b.yaml
@@ -22,7 +22,7 @@ topic_reader: Webxml
index_stats:
documents: 50220189
documents (non-empty): 50220159
- total terms: 31270685466
+ total terms: 31302554269
topics:
- name: "[TREC 2010 Web Track: Topics 51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)"
path: topics.web.51-100.txt
@@ -71,19 +71,19 @@ models:
map:
- 0.1126
- 0.1094
- - 0.1106
+ - 0.1105
p30:
- - 0.2681
+ - 0.2694
- 0.2513
- 0.2167
ndcg20:
- - 0.13539
- - 0.18901
- - 0.10141
+ - 0.13537
+ - 0.18900
+ - 0.10139
err20:
- 0.07335
- 0.09592
- - 0.13036
+ - 0.13031
- name: bm25+rm3
display: +RM3
params:
@@ -92,20 +92,20 @@ models:
results:
map:
- 0.0933
- - 0.1081
+ - 0.1085
- 0.1107
p30:
- 0.2389
- - 0.2467
+ - 0.2480
- 0.1920
ndcg20:
- - 0.13690
- - 0.19164
- - 0.09170
+ - 0.13693
+ - 0.19160
+ - 0.09182
err20:
- - 0.07470
- - 0.09597
- - 0.14933
+ - 0.07473
+ - 0.09596
+ - 0.14936
- name: bm25+ax
display: +Ax
params:
@@ -116,21 +116,21 @@ models:
- -axiom.beta 0.1
results:
map:
- - 0.0928
- - 0.0974
+ - 0.0929
+ - 0.0975
- 0.1315
p30:
- 0.2354
- - 0.2393
+ - 0.2387
- 0.2553
ndcg20:
- - 0.16375
- - 0.18330
+ - 0.16319
+ - 0.18348
- 0.14413
err20:
- - 0.09815
- - 0.10909
- - 0.23554
+ - 0.09771
+ - 0.10912
+ - 0.23551
- name: ql
display: QL
params:
@@ -145,12 +145,12 @@ models:
- 0.2147
- 0.2080
ndcg20:
- - 0.11431
- - 0.16192
+ - 0.11432
+ - 0.16191
- 0.08682
err20:
- 0.05994
- - 0.08487
+ - 0.08486
- 0.13052
- name: ql+rm3
display: +RM3
@@ -160,19 +160,19 @@ models:
results:
map:
- 0.1019
- - 0.0837
- - 0.1059
+ - 0.0839
+ - 0.1058
p30:
- 0.2312
- - 0.2067
+ - 0.2047
- 0.1980
ndcg20:
- - 0.11852
- - 0.14469
+ - 0.11823
+ - 0.14487
- 0.08959
err20:
- - 0.05920
- - 0.07861
+ - 0.05917
+ - 0.07872
- 0.13336
- name: ql+ax
display: +Ax
@@ -189,13 +189,13 @@ models:
- 0.1212
p30:
- 0.2618
- - 0.2167
- - 0.2140
+ - 0.2173
+ - 0.2147
ndcg20:
- 0.14541
- - 0.15091
- - 0.10296
+ - 0.15174
+ - 0.10373
err20:
- 0.07424
- - 0.08203
- - 0.15575
+ - 0.08205
+ - 0.15577
diff --git a/src/main/resources/regression/cw12.yaml b/src/main/resources/regression/cw12.yaml
index ac09f64ddb..71b23c4c3f 100644
--- a/src/main/resources/regression/cw12.yaml
+++ b/src/main/resources/regression/cw12.yaml
@@ -21,8 +21,8 @@ index_options:
topic_reader: Webxml
index_stats:
documents: 731705088
- documents (non-empty): 731556725
- total terms: 428628865985
+ documents (non-empty): 731556853
+ total terms: 429328271635
topics:
- name: "[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)"
path: topics.web.201-250.txt
@@ -66,17 +66,17 @@ models:
- -bm25
results:
map:
- - 0.1695
+ - 0.1694
- 0.2469
p30:
- - 0.2767
- - 0.4533
+ - 0.2773
+ - 0.4547
ndcg20:
- - 0.20858
- - 0.25776
+ - 0.20881
+ - 0.25719
err20:
- - 0.12835
- - 0.16305
+ - 0.12838
+ - 0.16162
- name: bm25+rm3
display: +RM3
params:
@@ -85,13 +85,13 @@ models:
results:
map:
- 0.1464
- - 0.2325
+ - 0.2324
p30:
- - 0.2387
- - 0.4073
+ - 0.2393
+ - 0.4080
ndcg20:
- 0.20327
- - 0.25304
+ - 0.25303
err20:
- 0.12637
- 0.16550
@@ -101,17 +101,17 @@ models:
- -ql
results:
map:
- - 0.1493
- - 0.2467
+ - 0.1494
+ - 0.2466
p30:
- - 0.2613
+ - 0.2607
- 0.4380
ndcg20:
- 0.19935
- - 0.22282
+ - 0.22184
err20:
- - 0.12319
- - 0.13211
+ - 0.12325
+ - 0.13218
- name: ql+rm3
display: +RM3
params:
@@ -119,15 +119,15 @@ models:
- -rm3
results:
map:
- - 0.1291
- - 0.2168
+ - 0.1290
+ - 0.2177
p30:
- 0.2347
- - 0.3793
+ - 0.3800
ndcg20:
- 0.17253
- - 0.20662
+ - 0.20829
err20:
- - 0.10084
- - 0.12179
+ - 0.10083
+ - 0.12450
diff --git a/src/main/resources/regression/cw12b13.yaml b/src/main/resources/regression/cw12b13.yaml
index 7ee7c46d3b..98f39a94ea 100644
--- a/src/main/resources/regression/cw12b13.yaml
+++ b/src/main/resources/regression/cw12b13.yaml
@@ -21,8 +21,8 @@ index_options:
topic_reader: Webxml
index_stats:
documents: 52249039
- documents (non-empty): 52238521
- total terms: 30617038149
+ documents (non-empty): 52238526
+ total terms: 30666923268
topics:
- name: "[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)"
path: topics.web.201-250.txt
@@ -73,10 +73,10 @@ models:
- 0.1273
ndcg20:
- 0.12862
- - 0.11849
+ - 0.11835
err20:
- - 0.08379
- - 0.12013
+ - 0.08378
+ - 0.12006
- name: bm25+rm3
display: +RM3
params:
@@ -84,17 +84,17 @@ models:
- -rm3
results:
map:
- - 0.0412
+ - 0.0408
- 0.0210
p30:
- - 0.1713
+ - 0.1673
- 0.1207
ndcg20:
- - 0.11293
- - 0.10796
+ - 0.11192
+ - 0.10809
err20:
- - 0.07629
- - 0.10653
+ - 0.07530
+ - 0.10662
- name: bm25+ax
display: +Ax
params:
@@ -108,14 +108,14 @@ models:
- 0.0435
- 0.0180
p30:
- - 0.1840
+ - 0.1833
- 0.1107
ndcg20:
- - 0.12875
- - 0.09637
+ - 0.12867
+ - 0.09627
err20:
- - 0.09430
- - 0.09289
+ - 0.09413
+ - 0.09285
- name: ql
display: QL
params:
@@ -125,14 +125,14 @@ models:
- 0.0397
- 0.0235
p30:
- - 0.1767
+ - 0.1780
- 0.1373
ndcg20:
- - 0.11067
+ - 0.11059
- 0.11765
err20:
- - 0.07689
- - 0.10908
+ - 0.07679
+ - 0.10917
- name: ql+rm3
display: +RM3
params:
@@ -143,14 +143,14 @@ models:
- 0.0322
- 0.0203
p30:
- - 0.1507
+ - 0.1513
- 0.1173
ndcg20:
- 0.09199
- - 0.10035
+ - 0.10036
err20:
- 0.05525
- - 0.09289
+ - 0.09284
- name: ql+ax
display: +Ax
params:
@@ -161,14 +161,14 @@ models:
- -axiom.beta 0.1
results:
map:
- - 0.0359
- - 0.0186
+ - 0.0358
+ - 0.0183
p30:
- - 0.1513
- - 0.1167
+ - 0.1507
+ - 0.1147
ndcg20:
- - 0.11435
- - 0.10013
+ - 0.11407
+ - 0.09891
err20:
- - 0.07800
- - 0.08965
+ - 0.07803
+ - 0.09002
diff --git a/src/main/resources/regression/gov2.yaml b/src/main/resources/regression/gov2.yaml
index e4f24910f0..de644b0c71 100644
--- a/src/main/resources/regression/gov2.yaml
+++ b/src/main/resources/regression/gov2.yaml
@@ -39,7 +39,7 @@ evals:
index_stats:
documents: 25172934
documents (non-empty): 25170664
- total terms: 17343119816
+ total terms: 17345062322
topics:
- name: "[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)"
path: topics.701-750.txt
@@ -88,11 +88,11 @@ models:
- -axiom.deterministic
results:
map:
- - 0.2665
- - 0.3664
+ - 0.2669
+ - 0.3666
- 0.3069
p30:
- - 0.4986
+ - 0.4993
- 0.5933
- 0.5033
- name: ql
@@ -103,7 +103,7 @@ models:
map:
- 0.2681
- 0.3303
- - 0.2996
+ - 0.2997
p30:
- 0.4755
- 0.5347
diff --git a/src/main/resources/regression/mb11.yaml b/src/main/resources/regression/mb11.yaml
index 118281e6be..8e28d104ef 100644
--- a/src/main/resources/regression/mb11.yaml
+++ b/src/main/resources/regression/mb11.yaml
@@ -117,7 +117,7 @@ models:
- 0.2389
p30:
- 0.4435
- - 0.3520
+ - 0.3514
- name: ql+ax
display: +Ax
params:
diff --git a/src/main/resources/regression/msmarco-doc.yaml b/src/main/resources/regression/msmarco-doc.yaml
index dcf364fb78..7fdf92776f 100644
--- a/src/main/resources/regression/msmarco-doc.yaml
+++ b/src/main/resources/regression/msmarco-doc.yaml
@@ -42,7 +42,7 @@ index_path: indexes/lucene-index.msmarco-doc.pos+docvectors+rawdocs
index_stats:
documents: 3213835
documents (non-empty): 3213835
- total terms: 2746735247
+ total terms: 2748636047
topics:
- name: "[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)"
path: topics.msmarco-doc.dev.txt
@@ -54,7 +54,7 @@ models:
- -bm25
results:
map:
- - 0.2308
+ - 0.2310
R@1000:
- 0.8856
- name: bm25+rm3
@@ -64,6 +64,6 @@ models:
- -rm3
results:
map:
- - 0.1631
+ - 0.1632
R@1000:
- - 0.8787
+ - 0.8785
diff --git a/src/main/resources/regression/msmarco-passage.yaml b/src/main/resources/regression/msmarco-passage.yaml
index 8dc22f5af6..c7d46f6380 100644
--- a/src/main/resources/regression/msmarco-passage.yaml
+++ b/src/main/resources/regression/msmarco-passage.yaml
@@ -42,7 +42,7 @@ index_path: indexes/lucene-index.msmarco-passage.pos+docvectors+rawdocs
index_stats:
documents: 8841823
documents (non-empty): 8841823
- total terms: 352122244
+ total terms: 352316036
topics:
- name: "[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)"
path: topics.msmarco-passage.dev-subset.txt
@@ -54,7 +54,7 @@ models:
- -bm25
results:
map:
- - 0.1924
+ - 0.1926
R@1000:
- 0.8526
- name: bm25-default+rm3
@@ -75,7 +75,7 @@ models:
- -b 0.72
results:
map:
- - 0.1956
+ - 0.1957
R@1000:
- 0.8578
- name: bm25-tuned+rm3
diff --git a/src/main/resources/regression/robust04.yaml b/src/main/resources/regression/robust04.yaml
index 9a3388934c..eb656e8626 100644
--- a/src/main/resources/regression/robust04.yaml
+++ b/src/main/resources/regression/robust04.yaml
@@ -40,7 +40,7 @@ index_path: indexes/lucene-index.robust04.pos+docvectors+rawdocs # path to the e
index_stats:
documents: 528030
documents (non-empty): 528030
- total terms: 174540587
+ total terms: 174540872
topics:
- name: "[TREC 2004 Robust Track Topics](http://trec.nist.gov/data/robust/04.testset.gz)"
path: topics.robust04.301-450.601-700.txt
@@ -74,7 +74,7 @@ models:
- -axiom.deterministic
results:
map:
- - 0.2895
+ - 0.2896
p30:
- 0.3333
- name: ql
diff --git a/src/main/resources/regression/robust05.yaml b/src/main/resources/regression/robust05.yaml
index ec5a33b4d9..b09fac1167 100644
--- a/src/main/resources/regression/robust05.yaml
+++ b/src/main/resources/regression/robust05.yaml
@@ -52,7 +52,7 @@ models:
- -bm25
results:
map:
- - 0.2031
+ - 0.2032
p30:
- 0.3693
- name: bm25+rm3
@@ -74,7 +74,7 @@ models:
- -axiom.deterministic
results:
map:
- - 0.2584
+ - 0.2587
p30:
- 0.4120
- name: ql
diff --git a/src/main/resources/regression/wt10g.yaml b/src/main/resources/regression/wt10g.yaml
index cf4fd4065d..ce85ca198d 100644
--- a/src/main/resources/regression/wt10g.yaml
+++ b/src/main/resources/regression/wt10g.yaml
@@ -39,8 +39,8 @@ input: collections/web/wt10g/
index_path: indexes/lucene-index.wt10g.pos+docvectors+rawdocs # path to the existing index, used in regression test if `--index` option is absent
index_stats:
documents: 1688402
- documents (non-empty): 1688290
- total terms: 752326031
+ documents (non-empty): 1688291
+ total terms: 752790242
topics:
- name: "Wt10g: Topics 451-550"
path: topics.451-550.txt
@@ -54,7 +54,7 @@ models:
map:
- 0.1992
p30:
- - 0.2218
+ - 0.2214
- name: bm25+rm3
display: +RM3
params:
@@ -109,4 +109,4 @@ models:
map:
- 0.2275
p30:
- - 0.2517
+ - 0.2514
diff --git a/src/test/java/io/anserini/integration/IndexerTest.java b/src/test/java/io/anserini/integration/IndexerTest.java
index 353ddbec12..efb7e045c2 100644
--- a/src/test/java/io/anserini/integration/IndexerTest.java
+++ b/src/test/java/io/anserini/integration/IndexerTest.java
@@ -219,7 +219,7 @@ public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
TopDocs rs = searcher.search(finalQuery, 1); // issue the query
// The BM25 weight is the maxScore
- System.out.println(term + " " + tf + " " + rs.getMaxScore());
+ System.out.println(term + " " + tf + " " + (rs.scoreDocs.length == 0 ? Float.NaN : rs.scoreDocs[0].score));
}
}
}
diff --git a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java
index 7903bfcafb..20abf65337 100644
--- a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java
+++ b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java
@@ -34,8 +34,9 @@ protected void init() {
termIndexStatusTermCount = 12; // Please note that standard analyzer ignores stopwords.
// Also, this includes docids
termIndexStatusTotFreq = 17; //
- termIndexStatusTotPos = 16; // only "text" fields are indexed with position so we have 16
storedFieldStatusTotalDocCounts = 3;
+ // 16 positions from the text fields, plus 1 per document for its id field
+ termIndexStatusTotPos = 16 + storedFieldStatusTotalDocCounts;
storedFieldStatusTotFields = 9; // 3 docs * (1 id + 1 text + 1 raw)
}
diff --git a/src/test/java/io/anserini/integration/TrecEndToEndTest.java b/src/test/java/io/anserini/integration/TrecEndToEndTest.java
index b6ef275491..2cb60f5a76 100644
--- a/src/test/java/io/anserini/integration/TrecEndToEndTest.java
+++ b/src/test/java/io/anserini/integration/TrecEndToEndTest.java
@@ -28,8 +28,9 @@ protected void init() {
fieldNormStatusTotalFields = 1; // text
termIndexStatusTermCount = 12; // Note that standard analyzer ignores stopwords; includes docids.
termIndexStatusTotFreq = 17;
- termIndexStatusTotPos = 16; // Only "text" fields are indexed with position so we have 16.
storedFieldStatusTotalDocCounts = 3;
+ // 16 positions from the text fields, plus 1 per document for its id field
+ termIndexStatusTotPos = 16 + storedFieldStatusTotalDocCounts;
storedFieldStatusTotFields = 9; // 3 docs * (1 id + 1 text + 1 raw)
// The search output should be as follows (for Lucene 7.5):
diff --git a/src/test/java/io/anserini/integration/TweetEndToEndTest.java b/src/test/java/io/anserini/integration/TweetEndToEndTest.java
index 46f9a4cd3d..86deb7d0e7 100644
--- a/src/test/java/io/anserini/integration/TweetEndToEndTest.java
+++ b/src/test/java/io/anserini/integration/TweetEndToEndTest.java
@@ -30,8 +30,9 @@ protected void init() {
// We set that retweets and the tweets with ids larger than tweetMaxId will NOT be indexed!
termIndexStatusTermCount = 32; // other indexable fields: 4 doc ids + 4 "lang" fields + 4 "screen_name" fields
termIndexStatusTotFreq = 36;
- termIndexStatusTotPos = 24; // only "text" fields are indexed with positions
storedFieldStatusTotalDocCounts = 4;
+ // 24 positions from the text fields, plus 3 per document for its id, screen_name and lang fields
+ termIndexStatusTotPos = 24 + 3 * storedFieldStatusTotalDocCounts;
storedFieldStatusTotFields = 12; // 4 tweets * (1 id + 1 text + 1 raw)
// The search output should be as follows (for Lucene 7.5):