diff --git a/docs/regressions-car17v1.5.md b/docs/regressions-car17v1.5.md index b73e776dfc..b5873f528e 100644 --- a/docs/regressions-car17v1.5.md +++ b/docs/regressions-car17v1.5.md @@ -64,11 +64,11 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)| 0.1563 | 0.1295 | 0.1358 | 0.1386 | 0.1080 | 0.1048 | +[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)| 0.1562 | 0.1295 | 0.1358 | 0.1386 | 0.1080 | 0.1048 | RECIP_RANK | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)| 0.2336 | 0.1923 | 0.1949 | 0.2037 | 0.1599 | 0.1524 | +[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)| 0.2331 | 0.1923 | 0.1949 | 0.2037 | 0.1599 | 0.1524 | diff --git a/docs/regressions-core17.md b/docs/regressions-core17.md index 7a02edf469..6622c8e469 100644 --- a/docs/regressions-core17.md +++ b/docs/regressions-core17.md @@ -64,7 +64,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 Common Core Track Topics](https://trec.nist.gov/data/core/core_nist.txt)| 0.2087 | 0.2823 | 0.2787 | 0.2032 | 0.2606 | 0.2613 | +[TREC 2017 Common Core Track Topics](https://trec.nist.gov/data/core/core_nist.txt)| 0.2087 | 0.2823 | 0.2788 | 0.2032 | 0.2606 | 0.2613 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | diff --git a/docs/regressions-core18.md b/docs/regressions-core18.md index 4e49c08911..3b1c0e8495 100644 --- a/docs/regressions-core18.md +++ b/docs/regressions-core18.md @@ -64,7 +64,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2018 Common Core Track Topics](https://trec.nist.gov/data/core/topics2018.txt)| 0.2495 | 0.3136 | 0.2920 | 0.2526 | 0.3073 | 0.2966 | +[TREC 2018 Common Core Track Topics](https://trec.nist.gov/data/core/topics2018.txt)| 0.2495 | 0.3135 | 0.2925 | 0.2526 | 0.3073 | 0.2966 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | diff --git a/docs/regressions-cw09b.md b/docs/regressions-cw09b.md index e3fe9a343f..8094c251fe 100644 --- a/docs/regressions-cw09b.md +++ b/docs/regressions-cw09b.md @@ -110,29 +110,29 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track: Topics 51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)| 0.1126 | 0.0933 | 0.0928 | 0.1060 | 0.1019 | 0.1086 | -[TREC 2011 Web Track: Topics 101-150](http://trec.nist.gov/data/web/11/full-topics.xml)| 0.1094 | 0.1081 | 0.0974 | 0.0958 | 0.0837 | 0.0879 | -[TREC 2012 Web Track: Topics 151-200](http://trec.nist.gov/data/web/12/full-topics.xml)| 0.1106 | 0.1107 | 0.1315 | 0.1069 | 0.1059 | 0.1212 | +[TREC 2010 Web Track: Topics 
51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)| 0.1126 | 0.0933 | 0.0929 | 0.1060 | 0.1019 | 0.1086 | +[TREC 2011 Web Track: Topics 101-150](http://trec.nist.gov/data/web/11/full-topics.xml)| 0.1094 | 0.1085 | 0.0975 | 0.0958 | 0.0839 | 0.0879 | +[TREC 2012 Web Track: Topics 151-200](http://trec.nist.gov/data/web/12/full-topics.xml)| 0.1105 | 0.1107 | 0.1315 | 0.1069 | 0.1058 | 0.1212 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track: Topics 51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)| 0.2681 | 0.2389 | 0.2354 | 0.2431 | 0.2312 | 0.2618 | -[TREC 2011 Web Track: Topics 101-150](http://trec.nist.gov/data/web/11/full-topics.xml)| 0.2513 | 0.2467 | 0.2393 | 0.2147 | 0.2067 | 0.2167 | -[TREC 2012 Web Track: Topics 151-200](http://trec.nist.gov/data/web/12/full-topics.xml)| 0.2167 | 0.1920 | 0.2553 | 0.2080 | 0.1980 | 0.2140 | +[TREC 2010 Web Track: Topics 51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)| 0.2694 | 0.2389 | 0.2354 | 0.2431 | 0.2312 | 0.2618 | +[TREC 2011 Web Track: Topics 101-150](http://trec.nist.gov/data/web/11/full-topics.xml)| 0.2513 | 0.2480 | 0.2387 | 0.2147 | 0.2047 | 0.2173 | +[TREC 2012 Web Track: Topics 151-200](http://trec.nist.gov/data/web/12/full-topics.xml)| 0.2167 | 0.1920 | 0.2553 | 0.2080 | 0.1980 | 0.2147 | NDCG20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track: Topics 51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)| 0.1354 | 0.1369 | 0.1637 | 0.1143 | 0.1185 | 0.1454 | -[TREC 2011 Web Track: Topics 101-150](http://trec.nist.gov/data/web/11/full-topics.xml)| 0.1890 | 0.1916 | 0.1833 | 0.1619 | 0.1447 | 0.1509 | -[TREC 2012 Web Track: Topics 151-200](http://trec.nist.gov/data/web/12/full-topics.xml)| 0.1014 | 0.0917 | 0.1441 | 0.0868 | 0.0896 | 0.1030 | +[TREC 2010 Web Track: Topics 51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)| 0.1354 | 0.1369 | 0.1632 | 0.1143 | 0.1182 | 0.1454 | +[TREC 2011 Web Track: Topics 101-150](http://trec.nist.gov/data/web/11/full-topics.xml)| 0.1890 | 0.1916 | 0.1835 | 0.1619 | 0.1449 | 0.1517 | +[TREC 2012 Web Track: Topics 151-200](http://trec.nist.gov/data/web/12/full-topics.xml)| 0.1014 | 0.0918 | 0.1441 | 0.0868 | 0.0896 | 0.1037 | ERR20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track: Topics 51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)| 0.0733 | 0.0747 | 0.0981 | 0.0599 | 0.0592 | 0.0742 | -[TREC 2011 Web Track: Topics 101-150](http://trec.nist.gov/data/web/11/full-topics.xml)| 0.0959 | 0.0960 | 0.1091 | 0.0849 | 0.0786 | 0.0820 | -[TREC 2012 Web Track: Topics 151-200](http://trec.nist.gov/data/web/12/full-topics.xml)| 0.1304 | 0.1493 | 0.2355 | 0.1305 | 0.1334 | 0.1558 | +[TREC 2010 Web Track: Topics 51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)| 0.0733 | 0.0747 | 0.0977 | 0.0599 | 0.0592 | 0.0742 | +[TREC 2011 Web Track: Topics 101-150](http://trec.nist.gov/data/web/11/full-topics.xml)| 0.0959 | 0.0960 | 0.1091 | 0.0849 | 0.0787 | 0.0821 | +[TREC 2012 Web Track: Topics 151-200](http://trec.nist.gov/data/web/12/full-topics.xml)| 0.1303 | 0.1494 | 0.2355 | 0.1305 | 0.1334 | 0.1558 | diff --git a/docs/regressions-cw12.md b/docs/regressions-cw12.md index 
b6db2d5df6..f6c4ece851 100644 --- a/docs/regressions-cw12.md +++ b/docs/regressions-cw12.md @@ -73,25 +73,25 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1695 | 0.1464 | 0.1493 | 0.1291 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.2469 | 0.2325 | 0.2467 | 0.2168 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1694 | 0.1464 | 0.1494 | 0.1290 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.2469 | 0.2324 | 0.2466 | 0.2177 | P30 | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2767 | 0.2387 | 0.2613 | 0.2347 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.4533 | 0.4073 | 0.4380 | 0.3793 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2773 | 0.2393 | 0.2607 | 0.2347 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.4547 | 0.4080 | 0.4380 | 0.3800 | NDCG20 | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2086 | 0.2033 | 0.1993 | 0.1725 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.2578 | 0.2530 | 0.2228 | 0.2066 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2088 | 0.2033 | 0.1993 | 0.1725 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.2572 | 0.2530 | 0.2218 | 0.2083 | ERR20 | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1284 | 0.1264 | 0.1232 | 0.1008 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1630 | 0.1655 | 0.1321 | 0.1218 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1284 | 0.1264 | 0.1233 | 0.1008 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1616 | 0.1655 | 0.1322 | 0.1245 | diff --git a/docs/regressions-cw12b13.md b/docs/regressions-cw12b13.md index 0af491058d..afdf674194 100644 --- a/docs/regressions-cw12b13.md +++ b/docs/regressions-cw12b13.md @@ -88,26 +88,26 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.0468 | 0.0412 | 0.0435 | 0.0397 | 0.0322 | 0.0359 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.0224 | 0.0210 | 0.0180 | 0.0235 | 0.0203 | 0.0186 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.0468 | 0.0408 | 0.0435 | 0.0397 | 0.0322 | 0.0358 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.0224 | 0.0210 | 0.0180 | 0.0235 | 0.0203 | 0.0183 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | 
:---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2113 | 0.1713 | 0.1840 | 0.1767 | 0.1507 | 0.1513 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1273 | 0.1207 | 0.1107 | 0.1373 | 0.1173 | 0.1167 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.2113 | 0.1673 | 0.1833 | 0.1780 | 0.1513 | 0.1507 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1273 | 0.1207 | 0.1107 | 0.1373 | 0.1173 | 0.1147 | NDCG20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1286 | 0.1129 | 0.1287 | 0.1107 | 0.0920 | 0.1143 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1185 | 0.1080 | 0.0964 | 0.1177 | 0.1003 | 0.1001 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.1286 | 0.1119 | 0.1287 | 0.1106 | 0.0920 | 0.1141 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1183 | 0.1081 | 0.0963 | 0.1177 | 0.1004 | 0.0989 | ERR20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.0838 | 0.0763 | 0.0943 | 0.0769 | 0.0553 | 0.0780 | -[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1201 | 0.1065 | 0.0929 | 0.1091 | 0.0929 | 0.0896 | +[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)| 0.0838 | 0.0753 | 0.0941 | 0.0768 | 0.0553 | 0.0780 | +[TREC 2014 Web Track: Topics 251-300](http://trec.nist.gov/data/web2014.html)| 0.1201 | 0.1066 | 0.0928 | 0.1092 | 0.0928 | 0.0900 | diff --git a/docs/regressions-gov2.md b/docs/regressions-gov2.md index 07256103e1..7133ceba6a 100644 --- a/docs/regressions-gov2.md +++ b/docs/regressions-gov2.md @@ -90,14 +90,14 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)| 0.2689 | 0.2844 | 0.2665 | 0.2681 | 0.2708 | 0.2666 | -[TREC 2005 Terabyte Track: Topics 751-800](http://trec.nist.gov/data/terabyte05.html)| 0.3390 | 0.3820 | 0.3664 | 0.3303 | 0.3559 | 0.3646 | -[TREC 2006 Terabyte Track: Topics 801-850](http://trec.nist.gov/data/terabyte06.html)| 0.3080 | 0.3377 | 0.3069 | 0.2996 | 0.3154 | 0.3084 | +[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)| 0.2689 | 0.2844 | 0.2669 | 0.2681 | 0.2708 | 0.2666 | +[TREC 2005 Terabyte Track: Topics 751-800](http://trec.nist.gov/data/terabyte05.html)| 0.3390 | 0.3820 | 0.3666 | 0.3303 | 0.3559 | 0.3646 | +[TREC 2006 Terabyte Track: Topics 801-850](http://trec.nist.gov/data/terabyte06.html)| 0.3080 | 0.3377 | 0.3069 | 0.2997 | 0.3154 | 0.3084 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Terabyte Track: Topics 
701-750](http://trec.nist.gov/data/terabyte04.html)| 0.4864 | 0.5190 | 0.4986 | 0.4755 | 0.4925 | 0.4932 | +[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)| 0.4864 | 0.5190 | 0.4993 | 0.4755 | 0.4925 | 0.4932 | [TREC 2005 Terabyte Track: Topics 751-800](http://trec.nist.gov/data/terabyte05.html)| 0.5540 | 0.5920 | 0.5933 | 0.5347 | 0.5620 | 0.5840 | [TREC 2006 Terabyte Track: Topics 801-850](http://trec.nist.gov/data/terabyte06.html)| 0.4907 | 0.5160 | 0.5033 | 0.4720 | 0.4847 | 0.4920 | diff --git a/docs/regressions-mb11.md b/docs/regressions-mb11.md index 2f1f223236..1eabf89233 100644 --- a/docs/regressions-mb11.md +++ b/docs/regressions-mb11.md @@ -92,6 +92,6 @@ MAP | BM25 | +RM3 | +Ax | QL P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| [TREC 2011 Microblog Track Topics](http://trec.nist.gov/data/microblog2011.html)| 0.3959 | 0.4170 | 0.4612 | 0.4061 | 0.4435 | 0.4408 | -[TREC 2012 Microblog Track Topics](http://trec.nist.gov/data/microblog2012.html)| 0.3316 | 0.3463 | 0.3554 | 0.3333 | 0.3520 | 0.3842 | +[TREC 2012 Microblog Track Topics](http://trec.nist.gov/data/microblog2012.html)| 0.3316 | 0.3463 | 0.3554 | 0.3333 | 0.3514 | 0.3842 | diff --git a/docs/regressions-msmarco-doc.md b/docs/regressions-msmarco-doc.md index 4ce79b929d..6aaf2966d0 100644 --- a/docs/regressions-msmarco-doc.md +++ b/docs/regressions-msmarco-doc.md @@ -47,11 +47,11 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | :---------------------------------------|-----------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)| 0.2308 | 0.1631 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)| 0.2310 | 0.1632 | R@1000 | BM25 | +RM3 | :---------------------------------------|-----------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)| 0.8856 | 0.8787 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)| 0.8856 | 0.8785 | diff --git a/docs/regressions-msmarco-passage.md b/docs/regressions-msmarco-passage.md index 202a3fb53d..d8b29f8b6d 100644 --- a/docs/regressions-msmarco-passage.md +++ b/docs/regressions-msmarco-passage.md @@ -56,7 +56,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 (Default)| +RM3 | BM25 (Tuned)| +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.1924 | 0.1661 | 0.1956 | 0.1766 | +[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.1926 | 0.1661 | 0.1957 | 0.1766 | R@1000 | BM25 (Default)| +RM3 | BM25 (Tuned)| +RM3 | diff --git a/docs/regressions-robust04.md b/docs/regressions-robust04.md index a135548e85..0a3dbb132c 100644 --- a/docs/regressions-robust04.md +++ b/docs/regressions-robust04.md @@ -63,7 +63,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Robust Track Topics](http://trec.nist.gov/data/robust/04.testset.gz)| 0.2531 | 0.2903 | 0.2895 | 0.2467 | 
0.2747 | 0.2774 | +[TREC 2004 Robust Track Topics](http://trec.nist.gov/data/robust/04.testset.gz)| 0.2531 | 0.2903 | 0.2896 | 0.2467 | 0.2747 | 0.2774 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | diff --git a/docs/regressions-robust05.md b/docs/regressions-robust05.md index 3cd52df47c..ce396de720 100644 --- a/docs/regressions-robust05.md +++ b/docs/regressions-robust05.md @@ -62,7 +62,7 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2005 Robust Track Topics](http://trec.nist.gov/data/robust/05/05.50.topics.txt)| 0.2031 | 0.2602 | 0.2584 | 0.2028 | 0.2491 | 0.2476 | +[TREC 2005 Robust Track Topics](http://trec.nist.gov/data/robust/05/05.50.topics.txt)| 0.2032 | 0.2602 | 0.2587 | 0.2028 | 0.2491 | 0.2476 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | diff --git a/docs/regressions-wt10g.md b/docs/regressions-wt10g.md index 676a6f1fe5..b43d55e93d 100644 --- a/docs/regressions-wt10g.md +++ b/docs/regressions-wt10g.md @@ -69,6 +69,6 @@ Wt10g: Topics 451-550 | 0.1992 | 0.2276 | 0.2200 | 0. P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -Wt10g: Topics 451-550 | 0.2218 | 0.2398 | 0.2483 | 0.2180 | 0.2310 | 0.2517 | +Wt10g: Topics 451-550 | 0.2214 | 0.2398 | 0.2483 | 0.2180 | 0.2310 | 0.2514 | diff --git a/pom.xml b/pom.xml index a3f721719e..ec12dbb040 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ - 7.6.0 + 8.0.0 UTF-8 diff --git a/src/main/java/io/anserini/analysis/EnglishStemmingAnalyzer.java b/src/main/java/io/anserini/analysis/EnglishStemmingAnalyzer.java index f30b0653ae..c395fd4105 100644 --- a/src/main/java/io/anserini/analysis/EnglishStemmingAnalyzer.java +++ b/src/main/java/io/anserini/analysis/EnglishStemmingAnalyzer.java @@ -22,12 +22,11 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.KStemFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; public class EnglishStemmingAnalyzer extends StopwordAnalyzerBase { @@ -35,11 +34,11 @@ public class EnglishStemmingAnalyzer extends StopwordAnalyzerBase { private final CharArraySet stemExclusionSet; public EnglishStemmingAnalyzer() { - this("", StandardAnalyzer.STOP_WORDS_SET); + this("", EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); } public EnglishStemmingAnalyzer(String stemmer) { - this(stemmer, StandardAnalyzer.STOP_WORDS_SET, CharArraySet.EMPTY_SET); + this(stemmer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, CharArraySet.EMPTY_SET); } public EnglishStemmingAnalyzer(CharArraySet stopwords) { @@ -59,7 +58,7 @@ public EnglishStemmingAnalyzer(String stemmer, CharArraySet stopwords, CharArray protected TokenStreamComponents createComponents(String fieldName) { Tokenizer source = new StandardTokenizer(); TokenStream result = null; - result = new StandardFilter(source); + result = source; result = new 
EnglishPossessiveFilter(result); result = new LowerCaseFilter(result); result = new StopFilter(result, this.stopwords); @@ -77,7 +76,7 @@ protected TokenStreamComponents createComponents(String fieldName) { } protected TokenStream normalize(String fieldName, TokenStream in) { - TokenStream result = new StandardFilter(in); + TokenStream result = in; result = new LowerCaseFilter(result); return result; } diff --git a/src/main/java/io/anserini/analysis/FreebaseAnalyzer.java b/src/main/java/io/anserini/analysis/FreebaseAnalyzer.java index a4e71a02e6..60b8710ed4 100644 --- a/src/main/java/io/anserini/analysis/FreebaseAnalyzer.java +++ b/src/main/java/io/anserini/analysis/FreebaseAnalyzer.java @@ -21,12 +21,11 @@ import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.StopwordAnalyzerBase; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; /* ASCIIFoldingFilter is used for accent folding. This will normalize the characters @@ -53,23 +52,23 @@ public FreebaseAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { protected TokenStreamComponents createComponents(String fieldName) { StandardTokenizer source = new StandardTokenizer(); - StandardFilter result = new StandardFilter(source); - EnglishPossessiveFilter result2 = new EnglishPossessiveFilter(result); - LowerCaseFilter result3 = new LowerCaseFilter(result2); - Object result4 = new StopFilter(result3, this.stopwords); - result4 = new ASCIIFoldingFilter((TokenStream) result4); + TokenStream result = source; + result = new EnglishPossessiveFilter(result); + result = new LowerCaseFilter(result); + result = new StopFilter(result, this.stopwords); + result = new ASCIIFoldingFilter(result); if(!this.stemExclusionSet.isEmpty()) { - result4 = new SetKeywordMarkerFilter((TokenStream)result4, this.stemExclusionSet); + result = new SetKeywordMarkerFilter(result, this.stemExclusionSet); } - PorterStemFilter result1 = new PorterStemFilter((TokenStream)result4); - return new TokenStreamComponents(source, result1); + result = new PorterStemFilter(result); + return new TokenStreamComponents(source, result); } protected TokenStream normalize(String fieldName, TokenStream in) { - StandardFilter result = new StandardFilter(in); - LowerCaseFilter result1 = new LowerCaseFilter(result); - return result1; + TokenStream result = in; + result = new LowerCaseFilter(result); + return result; } private static class DefaultSetHolder { @@ -79,7 +78,7 @@ private DefaultSetHolder() { } static { - DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET; + DEFAULT_STOP_SET = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET; } } -} \ No newline at end of file +} diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index c261ff5d45..5d53e65812 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -579,7 +579,7 @@ public void run() throws IOException { if (args.solr) { numIndexed = counters.indexed.get(); } else { - numIndexed = 
args.dryRun ? counters.indexed.get() : writer.maxDoc(); + numIndexed = args.dryRun ? counters.indexed.get() : writer.getDocStats().maxDoc; } // Do a final commit diff --git a/src/main/java/io/anserini/index/IndexUtils.java b/src/main/java/io/anserini/index/IndexUtils.java index df235b8103..9e8c91690e 100755 --- a/src/main/java/io/anserini/index/IndexUtils.java +++ b/src/main/java/io/anserini/index/IndexUtils.java @@ -34,10 +34,9 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexableField; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; @@ -170,8 +169,7 @@ public InputStream getReadFileStream(String path) throws IOException { } void printIndexStats() throws IOException { - Fields fields = MultiFields.getFields(reader); - Terms terms = fields.terms(LuceneDocumentGenerator.FIELD_BODY); + Terms terms = MultiTerms.getTerms(reader, LuceneDocumentGenerator.FIELD_BODY); System.out.println("Index statistics"); System.out.println("----------------"); @@ -182,10 +180,9 @@ void printIndexStats() throws IOException { System.out.println("stored fields:"); - FieldInfos fieldInfos = MultiFields.getMergedFieldInfos(reader); - for (String fd : fields) { - FieldInfo fi = fieldInfos.fieldInfo(fd); - System.out.println(" " + fd + " (" + "indexOption: " + fi.getIndexOptions() + + FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader); + for (FieldInfo fi : fieldInfos) { + System.out.println(" " + fi.name + " (" + "indexOption: " + fi.getIndexOptions() + ", hasVectors: " + fi.hasVectors() + ")"); } } @@ -201,7 +198,7 @@ public void printTermCounts(String termStr) throws IOException, ParseException { System.out.println("collection frequency: " + reader.totalTermFreq(t)); System.out.println("document frequency: " + reader.docFreq(t)); - PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(reader, LuceneDocumentGenerator.FIELD_BODY, t.bytes()); + PostingsEnum postingsEnum = MultiTerms.getTermPostingsEnum(reader, LuceneDocumentGenerator.FIELD_BODY, t.bytes()); System.out.println("postings:\n"); while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { System.out.printf("\t%s, %s\n", postingsEnum.docID(), postingsEnum.freq()); diff --git a/src/main/java/io/anserini/kg/IndexFreebase.java b/src/main/java/io/anserini/kg/IndexFreebase.java index ca0eba2bb5..240f80b167 100644 --- a/src/main/java/io/anserini/kg/IndexFreebase.java +++ b/src/main/java/io/anserini/kg/IndexFreebase.java @@ -161,7 +161,7 @@ public void run() throws IOException { LOG.info(String.format("%,d triples indexed.", triplesCount.get())); LOG.info(String.format("%,d documents added.", docCount.get())); - int numIndexed = writer.maxDoc(); + int numIndexed = writer.getDocStats().maxDoc; try { writer.commit(); diff --git a/src/main/java/io/anserini/kg/LookupFreebaseNodes.java b/src/main/java/io/anserini/kg/LookupFreebaseNodes.java index d50c7e8ecb..b816283459 100644 --- a/src/main/java/io/anserini/kg/LookupFreebaseNodes.java +++ b/src/main/java/io/anserini/kg/LookupFreebaseNodes.java @@ -102,11 +102,11 @@ public Document lookupMid(String mid) throws IOException { TermQuery query = new TermQuery(new Term(IndexFreebase.FIELD_ID, mid)); TopDocs topDocs = searcher.search(query, 1); 
- if (topDocs.totalHits == 0) { + if (topDocs.totalHits.value == 0) { System.err.println("Error: mid not found!"); return null; } - if (topDocs.totalHits > 1) { + if (topDocs.totalHits.value > 1) { System.err.println("Error: more than one matching mid found. This shouldn't happen!"); return null; } diff --git a/src/main/java/io/anserini/ltr/BaseFeatureExtractor.java b/src/main/java/io/anserini/ltr/BaseFeatureExtractor.java index c50e0cbb67..2ff3312a84 100644 --- a/src/main/java/io/anserini/ltr/BaseFeatureExtractor.java +++ b/src/main/java/io/anserini/ltr/BaseFeatureExtractor.java @@ -25,7 +25,8 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiBits; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Terms; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -171,7 +172,7 @@ private void printHeader(PrintStream out, FeatureExtractors extractors) { public void printFeatureForAllDocs(PrintStream out) throws IOException { Map> queryContextMap = buildRerankerContextMap(); FeatureExtractors extractors = getExtractors(); - Bits liveDocs = MultiFields.getLiveDocs(reader); + Bits liveDocs = MultiBits.getLiveDocs(reader); Set fieldsToLoad = getFieldsToLoad(); this.printHeader(out, extractors); @@ -186,7 +187,7 @@ public void printFeatureForAllDocs(PrintStream out) throws IOException { String docIdString = doc.get(getIdField()); // NOTE doc frequencies should not be retrieved from here, term vector returned is as if on single document // index - Terms terms = MultiFields.getTerms(reader, getTermVectorField());//reader.getTermVector(docId, getTermVectorField()); + Terms terms = MultiTerms.getTerms(reader, getTermVectorField());//reader.getTermVector(docId, getTermVectorField()); if (terms == null) { continue; @@ -210,7 +211,7 @@ public void printFeatureForAllDocs(PrintStream out) throws IOException { public void printFeatures(PrintStream out) throws IOException { Map> queryContextMap = buildRerankerContextMap(); FeatureExtractors extractors = getExtractors(); - Bits liveDocs = MultiFields.getLiveDocs(reader); + Bits liveDocs = MultiBits.getLiveDocs(reader); Set fieldsToLoad = getFieldsToLoad(); // We need to open a searcher @@ -230,7 +231,7 @@ public void printFeatures(PrintStream out) throws IOException { int qrelScore = entry.getValue(); // We issue a specific query TopDocs topDocs = searcher.search(docIdQuery(docId), 1); - if (topDocs.totalHits == 0) { + if (topDocs.totalHits.value == 0) { LOG.warn(String.format("Document Id %s expected but not found in index, skipping...", docId)); continue; } diff --git a/src/main/java/io/anserini/ltr/feature/base/BM25FeatureExtractor.java b/src/main/java/io/anserini/ltr/feature/base/BM25FeatureExtractor.java index 4619f07d91..10c38add04 100644 --- a/src/main/java/io/anserini/ltr/feature/base/BM25FeatureExtractor.java +++ b/src/main/java/io/anserini/ltr/feature/base/BM25FeatureExtractor.java @@ -23,7 +23,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -83,7 +83,7 @@ private double computeAvgFL(long sumTermFreqs, long maxDocs) { private long 
getSumTermFrequency(IndexReader reader, String fieldName) { Terms collectionTermVector = null; try { - collectionTermVector = MultiFields.getTerms(reader, fieldName); + collectionTermVector = MultiTerms.getTerms(reader, fieldName); long totalTermFreq = collectionTermVector.getSumTotalTermFreq(); return totalTermFreq; } catch (IOException e) { diff --git a/src/main/java/io/anserini/ltr/feature/base/PMIFeatureExtractor.java b/src/main/java/io/anserini/ltr/feature/base/PMIFeatureExtractor.java index 6e953fd2a5..a4a6848c2f 100644 --- a/src/main/java/io/anserini/ltr/feature/base/PMIFeatureExtractor.java +++ b/src/main/java/io/anserini/ltr/feature/base/PMIFeatureExtractor.java @@ -21,7 +21,7 @@ import io.anserini.rerank.RerankerContext; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; @@ -100,8 +100,8 @@ public float extract(Document doc, Terms terms, RerankerContext context) { for (int j = i +1; j < queryTokens.size(); j++) { pairsComputed ++; String secondToken = queryTokens.get(j); - PostingsEnum firstEnum = MultiFields.getTermDocsEnum(reader,LuceneDocumentGenerator.FIELD_BODY, new BytesRef(firstToken)); - PostingsEnum secondEnum = MultiFields.getTermDocsEnum(reader,LuceneDocumentGenerator.FIELD_BODY, new BytesRef(secondToken)); + PostingsEnum firstEnum = MultiTerms.getTermPostingsEnum(reader,LuceneDocumentGenerator.FIELD_BODY, new BytesRef(firstToken)); + PostingsEnum secondEnum = MultiTerms.getTermPostingsEnum(reader,LuceneDocumentGenerator.FIELD_BODY, new BytesRef(secondToken)); int intersect; if (firstEnum == null || secondEnum == null) { intersect = 0; diff --git a/src/main/java/io/anserini/rerank/lib/AxiomReranker.java b/src/main/java/io/anserini/rerank/lib/AxiomReranker.java index 113b4c4407..21a99bad15 100644 --- a/src/main/java/io/anserini/rerank/lib/AxiomReranker.java +++ b/src/main/java/io/anserini/rerank/lib/AxiomReranker.java @@ -207,9 +207,9 @@ private ScoredDocuments searchTopDocs(Query query, RerankerContext context) t if (context.getSearchArgs().arbitraryScoreTieBreak) { rs = searcher.search(finalQuery, context.getSearchArgs().hits); } else if (context.getSearchArgs().searchtweets) { - rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true); } else { - rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true); } return ScoredDocuments.fromTopDocs(rs, searcher); @@ -277,7 +277,7 @@ private ScoredDocuments processExternalContext(ScoredDocuments docs, RerankerCon } IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath)); IndexSearcher searcher = new IndexSearcher(reader); - searcher.setSimilarity(context.getIndexSearcher().getSimilarity(true)); + searcher.setSimilarity(context.getIndexSearcher().getSimilarity()); SearchArgs args = new SearchArgs(); args.hits = this.R; diff --git a/src/main/java/io/anserini/rerank/lib/RankLibReranker.java b/src/main/java/io/anserini/rerank/lib/RankLibReranker.java index 98504e3c11..568a070ee8 100644 --- a/src/main/java/io/anserini/rerank/lib/RankLibReranker.java +++ 
b/src/main/java/io/anserini/rerank/lib/RankLibReranker.java @@ -28,7 +28,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.document.Document; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Terms; import java.io.IOException; @@ -50,7 +50,7 @@ public class RankLibReranker implements Reranker { private DataPoint convertToDataPoint(Document doc, RerankerContext context) { Terms terms = null; try { - terms = MultiFields.getTerms(context.getIndexSearcher().getIndexReader(), this.termsField); + terms = MultiTerms.getTerms(context.getIndexSearcher().getIndexReader(), this.termsField); } catch (IOException e) { LOG.error("Unable to retrieve term vectors"); } diff --git a/src/main/java/io/anserini/rerank/lib/Rm3Reranker.java b/src/main/java/io/anserini/rerank/lib/Rm3Reranker.java index 2ca7d22c6f..844ca7ef2f 100644 --- a/src/main/java/io/anserini/rerank/lib/Rm3Reranker.java +++ b/src/main/java/io/anserini/rerank/lib/Rm3Reranker.java @@ -112,9 +112,9 @@ public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) { if (context.getSearchArgs().arbitraryScoreTieBreak) { rs = searcher.search(finalQuery, context.getSearchArgs().hits); } else if (context.getSearchArgs().searchtweets) { - rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true); } else { - rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true); } } catch (IOException e) { e.printStackTrace(); diff --git a/src/main/java/io/anserini/search/SearchArgs.java b/src/main/java/io/anserini/search/SearchArgs.java index 0f502f492f..26f444466b 100644 --- a/src/main/java/io/anserini/search/SearchArgs.java +++ b/src/main/java/io/anserini/search/SearchArgs.java @@ -126,11 +126,11 @@ public class SearchArgs { @Option(name = "-b", handler = StringArrayOptionHandler.class, usage = "BM25 b parameter") public String[] b = new String[] {"0.4"}; - @Option(name = "-pl2", usage = "use PL2 scoring model") - public boolean pl2 = false; + @Option(name = "-inl2", usage = "use I(n)L2 scoring model") + public boolean inl2 = false; - @Option(name = "-pl2.c", metaVar = "[value]", usage = "PL2 c parameter") - public String[] pl2_c = new String[] {"0.1"}; + @Option(name = "-inl2.c", metaVar = "[value]", usage = "I(n)L2 c parameter") + public String[] inl2_c = new String[] {"0.1"}; @Option(name = "-spl", usage = "use SPL scoring model") public boolean spl = false; diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index 68b128c30d..0c7a41ccbf 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -29,8 +29,6 @@ import io.anserini.rerank.lib.ScoreTiesAdjusterReranker; import io.anserini.search.query.BagOfWordsQueryGenerator; import io.anserini.search.query.SdmQueryGenerator; -import io.anserini.search.similarity.F2ExpSimilarity; -import io.anserini.search.similarity.F2LogSimilarity; import io.anserini.search.similarity.TaggedSimilarity; import io.anserini.search.topicreader.NewsBackgroundLinkingTopicReader; import io.anserini.search.topicreader.TopicReader; @@ -55,9 +53,12 @@ import 
org.apache.lucene.search.SortField; import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHits; import org.apache.lucene.search.similarities.AfterEffectL; +import org.apache.lucene.search.similarities.AxiomaticF2EXP; +import org.apache.lucene.search.similarities.AxiomaticF2LOG; import org.apache.lucene.search.similarities.BM25Similarity; -import org.apache.lucene.search.similarities.BasicModelP; +import org.apache.lucene.search.similarities.BasicModelIn; import org.apache.lucene.search.similarities.DFRSimilarity; import org.apache.lucene.search.similarities.DistributionSPL; import org.apache.lucene.search.similarities.IBSimilarity; @@ -252,9 +253,9 @@ public List constructSimiliries() { similarities.add(new TaggedSimilarity(new BM25Similarity(Float.valueOf(k1), Float.valueOf(b)), "k1="+k1+",b="+b)); } } - } else if (args.pl2) { - for (String c : args.pl2_c) { - similarities.add(new TaggedSimilarity(new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH2(Float.valueOf(c))), "c="+c)); + } else if (args.inl2) { + for (String c : args.inl2_c) { + similarities.add(new TaggedSimilarity(new DFRSimilarity(new BasicModelIn(), new AfterEffectL(), new NormalizationH2(Float.valueOf(c))), "c:"+c)); }; } else if (args.spl) { for (String c : args.spl_c) { @@ -262,11 +263,11 @@ public List constructSimiliries() { } } else if (args.f2exp) { for (String s : args.f2exp_s) { - similarities.add(new TaggedSimilarity(new F2ExpSimilarity(Float.valueOf(s)), "s="+s)); + similarities.add(new TaggedSimilarity(new AxiomaticF2EXP(Float.valueOf(s)), "s:"+s)); } } else if (args.f2log) { for (String s : args.f2log_s) { - similarities.add(new TaggedSimilarity(new F2LogSimilarity(Float.valueOf(s)), "s="+s)); + similarities.add(new TaggedSimilarity(new AxiomaticF2LOG(Float.valueOf(s)), "s:"+s)); } } else { throw new IllegalArgumentException("Error: Must specify scoring model!"); @@ -374,12 +375,12 @@ public ScoredDocuments search(IndexSearcher searcher, K qid, String queryStri query = new BagOfWordsQueryGenerator().buildQuery(FIELD_BODY, analyzer, queryString); } - TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN); + TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); if (!(isRerank && args.rerankcutoff <= 0)) { if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties. rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits); } else { - rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true); } } @@ -419,12 +420,12 @@ public ScoredDocuments searchBackgroundLinking(IndexSearcher searcher, K qid, builder.add(q, BooleanClause.Occur.MUST); query = builder.build(); - TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN); + TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); if (!(isRerank && args.rerankcutoff <= 0)) { if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties. rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits); } else { - rs = searcher.search(query, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(query, isRerank ? 
args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true); } } @@ -487,12 +488,12 @@ public ScoredDocuments searchTweets(IndexSearcher searcher, K qid, String que Query compositeQuery = builder.build(); - TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN); + TopDocs rs = new TopDocs(new TotalHits(0,TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); if (!(isRerank && args.rerankcutoff <= 0)) { if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties. rs = searcher.search(compositeQuery, isRerank ? args.rerankcutoff : args.hits); } else { - rs = searcher.search(compositeQuery, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(compositeQuery, isRerank ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_TWEETID, true); } } diff --git a/src/main/java/io/anserini/search/SimpleSearcher.java b/src/main/java/io/anserini/search/SimpleSearcher.java index 2bc3753227..35f798df9d 100644 --- a/src/main/java/io/anserini/search/SimpleSearcher.java +++ b/src/main/java/io/anserini/search/SimpleSearcher.java @@ -44,11 +44,12 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHits; import org.apache.lucene.search.similarities.AfterEffectL; import org.apache.lucene.search.similarities.AxiomaticF2EXP; import org.apache.lucene.search.similarities.AxiomaticF2LOG; import org.apache.lucene.search.similarities.BM25Similarity; -import org.apache.lucene.search.similarities.BasicModelP; +import org.apache.lucene.search.similarities.BasicModelIn; import org.apache.lucene.search.similarities.DFRSimilarity; import org.apache.lucene.search.similarities.DistributionSPL; import org.apache.lucene.search.similarities.IBSimilarity; @@ -154,7 +155,7 @@ public void setBM25Similarity(float k1, float b) { } public void setDFRSimilarity(float c) { - this.similarity = new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH2(c)); + this.similarity = new DFRSimilarity(new BasicModelIn(), new AfterEffectL(), new NormalizationH2(c)); } public void setIBSimilarity(float c) { @@ -201,7 +202,7 @@ protected Result[] search(Query query, List queryTokens, String queryStr searchArgs.hits = k; searchArgs.searchtweets = searchtweets; - TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN); + TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); RerankerContext context; if (searchtweets) { if (t > 0) { @@ -213,17 +214,14 @@ protected Result[] search(Query query, List queryTokens, String queryStr builder.add(filter, BooleanClause.Occur.FILTER); builder.add(query, BooleanClause.Occur.MUST); Query compositeQuery = builder.build(); - rs = searcher.search(compositeQuery, isRerank ? - searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(compositeQuery, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true); context = new RerankerContext<>(searcher, null, compositeQuery, null, queryString, queryTokens, filter, searchArgs); } else { - rs = searcher.search(query, isRerank ? - searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true, true); + rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true); context = new RerankerContext<>(searcher, null, query, null, queryString, queryTokens, null, searchArgs); } } else { - rs = searcher.search(query, isRerank ? 
- searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true, true); + rs = searcher.search(query, isRerank ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true); context = new RerankerContext<>(searcher, null, query, null, queryString, queryTokens, null, searchArgs); } diff --git a/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java b/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java deleted file mode 100644 index 12b05c9e5b..0000000000 --- a/src/main/java/io/anserini/search/similarity/AxiomaticSimilarity.java +++ /dev/null @@ -1,389 +0,0 @@ -/** - * Anserini: A Lucene toolkit for replicable information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.search.similarity; - -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.search.CollectionStatistics; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.TermStatistics; -import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.SmallFloat; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -/** - * Hui Fang and ChengXiang Zhai. 2005. An exploration of axiomatic approaches to information retrieval. - * In Proceedings of the 28th annual international ACM SIGIR conference on Research and development in - * information retrieval (SIGIR '05). ACM, New York, NY, USA, 480-487. - */ -public abstract class AxiomaticSimilarity extends Similarity { - protected final float s; - /** Cache of decoded bytes. */ - protected static final float[] OLD_LENGTH_TABLE = new float[256]; - protected static final float[] LENGTH_TABLE = new float[256]; - - static { - for (int i = 1; i < 256; i++) { - float f = SmallFloat.byte315ToFloat((byte)i); - OLD_LENGTH_TABLE[i] = 1.0f / (f*f); - } - OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf - - for (int i = 0; i < 256; i++) { - LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); - } - } - - /** - * @param s Generic parater s - * @throws IllegalArgumentException if {@code s} is infinite or if {@code s} is - * not within the range {@code [0..1]} - */ - AxiomaticSimilarity(float s) { - if (Float.isNaN(s) || s < 0 || s > 1) { - throw new IllegalArgumentException("illegal s value: " + s + ", must be between 0 and 1"); - } - this.s = s; - } - - /** Default parameter: - *
- * <ul>
- *   <li>{@code s = 0.5}</li>
- * </ul>
- */ - AxiomaticSimilarity() { - this(0.5f); - } - - /** Implemented as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)). - * - * @param docFreq terms's document frequency - * @param docCount total document count in the index - * @return inverted document frequency - * */ - float idf(long docFreq, long docCount) { - throw new UnsupportedOperationException(); - } - - /** Implemented as 1 / (distance + 1). - * - * @param distance distance - * @return sloppy frequency - * */ - float sloppyFreq(int distance) { - return 1.0f / (distance + 1); - } - - /** The default implementation returns 1 - * - * @param doc doc - * @param start start - * @param end end - * @param payload payload - * @return 1 - * */ - float scorePayload(int doc, int start, int end, BytesRef payload) { - return 1; - } - - /** The default implementation computes the average as sumTotalTermFreq / docCount, - * or returns 1 if the index does not store sumTotalTermFreq: - * any field that omits frequency information). - * - * @param collectionStats collection-wide statistics - * @return average document length of FIELD_BODY - * */ - float avgFieldLength(CollectionStatistics collectionStats) { - final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); - if (sumTotalTermFreq <= 0) { - return 1f; // field does not exist, or stat is unsupported - } else { - final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); - return (float) (sumTotalTermFreq / (double) docCount); - } - } - - /** - * True if overlap tokens (tokens with a position of increment of zero) are - * discounted from the document's length. - */ - boolean discountOverlaps = true; - - /** Sets whether overlap tokens (Tokens with 0 position increment) are - * ignored when computing norm. By default this is true, meaning overlap - * tokens do not count when computing norms. - * - * @param v v - * */ - public void setDiscountOverlaps(boolean v) { - discountOverlaps = v; - } - - /** - * Returns true if overlap tokens are discounted from the document's length. - * @see #setDiscountOverlaps - * - * @return discountOverlaps - */ - public boolean getDiscountOverlaps() { - return discountOverlaps; - } - - /** Cache of decoded bytes. */ - private static final float[] NORM_TABLE = new float[256]; - - static { - for (int i = 1; i < 256; i++) { - float f = SmallFloat.byte315ToFloat((byte)i); - NORM_TABLE[i] = 1.0f / (f*f); - } - NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf - } - - - @Override - public final long computeNorm(FieldInvertState state) { - final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength(); - int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor(); - if (indexCreatedVersionMajor >= 7) { - return SmallFloat.intToByte4(numTerms); - } else { - return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms))); - } - } - - /** - * Computes a score factor for a simple term and returns an explanation - * for that score factor. - * - *

- * The default implementation uses:
- * <pre class="prettyprint">
- *   idf(docFreq, docCount);
- * </pre>
- * - * Note that {@link CollectionStatistics#docCount()} is used instead of - * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also - * {@link TermStatistics#docFreq()} is used, and when the latter - * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction. - * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse. - * - * @param collectionStats collection-level statistics - * @param termStats term-level statistics for the term - * @return an Explain object that includes both an idf score factor - and an explanation for the term. - */ - public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { - final long df = termStats.docFreq(); - final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); - final float idf = idf(df, docCount); - return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"); - } - - /** - * Computes a score factor for a phrase. - * - *

- * The default implementation sums the idf factor for - * each term in the phrase. - * - * @param collectionStats collection-level statistics - * @param termStats term-level statistics for the terms in the phrase - * @return an Explain object that includes both an idf - * score factor for the phrase and an explanation - * for each term. - */ - public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { - final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); - double idf = 0d; - List details = new ArrayList<>(); - for (final TermStatistics stat : termStats ) { - final long df = stat.docFreq(); - final float termIdf = idf(df, docCount); - details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")")); - idf += termIdf; - } - return Explanation.match((float)idf, "idf(), sum of:", details); - } - - @Override - public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { - Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats); - float avgdl = avgFieldLength(collectionStats); - - float[] oldCache = new float[256]; - float[] cache = new float[256]; - for (int i = 0; i < cache.length; i++) { - oldCache[i] = s + s * OLD_LENGTH_TABLE[i] / avgdl; - cache[i] = s + s * LENGTH_TABLE[i] / avgdl; - } - return new Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache); - } - - - @Override - public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException { - Stats axStats = (Stats) stats; - return new AxDocScorer(axStats, context.reader().getMetaData().getCreatedVersionMajor(), context.reader().getNormValues(axStats.field)); - } - - /** DocumentCollection statistics for the F2Log model. */ - static class Stats extends SimWeight { - /** F2Log's idf */ - public final Explanation idf; - /** The average document length. */ - public final float avgdl; - /** query boost */ - public float boost; - /** weight (idf * boost) */ - public float weight; - /** field name, for pulling norms */ - public final String field; - /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) - * for both OLD_LENGTH_TABLE and LENGTH_TABLE */ - private final float[] oldCache, cache; - - Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) { - this.field = field; - this.idf = idf; - this.avgdl = avgdl; - this.weight = idf.getValue() * boost; - this.oldCache = oldCache; - this.cache = cache; - } - } - - class AxDocScorer extends SimScorer { - private final Stats stats; - private final float weightValue; // boost * idf - private final NumericDocValues norms; - /** precomputed cache for all length values */ - private final float[] lengthCache; - /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */ - private final float[] cache; - - AxDocScorer(Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException { - this.stats = stats; - this.weightValue = stats.weight; - this.norms = norms; - if (indexCreatedVersionMajor >= 7) { - lengthCache = LENGTH_TABLE; - cache = stats.cache; - } else { - lengthCache = OLD_LENGTH_TABLE; - cache = stats.oldCache; - } - } - - /* Score function is: - *

- *                                  occurrences
- *   score = termWeight * IDF * ---------------------------------------------------------
- *                               occurrences + s + documentLength * s / avgDocLength
- */ - @Override - public float score(int doc, float freq) throws IOException { - // if there are no norms, we act as if b=0 - float norm; - if (norms == null) { - norm = 0.0f; - } else { - if (norms.advanceExact(doc)) { - norm = cache[((byte) norms.longValue()) & 0xFF]; - } else { - norm = cache[0]; - } - } - return weightValue * freq / (freq + norm); - } - - @Override - public Explanation explain(int doc, Explanation freq) throws IOException { - return explainScore(doc, freq, stats, norms, lengthCache); - } - - @Override - public float computeSlopFactor(int distance) { - return sloppyFreq(distance); - } - - @Override - public float computePayloadFactor(int doc, int start, int end, BytesRef payload) { - return scorePayload(doc, start, end, payload); - } - } - - private Explanation explainTFNorm(int doc, Explanation freq, Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException { - List subs = new ArrayList<>(); - subs.add(freq); - subs.add(Explanation.match(s, "parameter s")); - if (norms == null) { - subs.add(Explanation.match(0, "norm")); - return Explanation.match(1, - "tfNorm, computed as constant from:", subs); - } else { - byte norm; - if (norms.advanceExact(doc)) { - norm = (byte) norms.longValue(); - } else { - norm = 0; - } - float doclen = lengthCache[norm & 0xff]; - subs.add(Explanation.match(stats.avgdl, "avgFieldLength")); - subs.add(Explanation.match(doclen, "fieldLength")); - return Explanation.match( - (freq.getValue() / (freq.getValue() + s + s * doclen/stats.avgdl)), - "tfNorm, computed as (freq / (freq + s + s * fieldLength / avgFieldLength) from:", subs); - } - } - - - private Explanation explainScore(int doc, Explanation freq, Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException { - Explanation boostExpl = Explanation.match(stats.boost, "boost"); - List subs = new ArrayList<>(); - if (boostExpl.getValue() != 1.0f) - subs.add(boostExpl); - subs.add(stats.idf); - Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache); - subs.add(tfNormExpl); - return Explanation.match( - boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(), - "score(doc="+doc+",freq="+freq+"), product of:", subs); - } - - @Override - public String toString() { - throw new UnsupportedOperationException(); - } - - /** - * Returns the b parameter - * @see #AxiomaticSimilarity(float) - * - * @return s - */ - public float getS() { - return s; - } -} diff --git a/src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java b/src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java deleted file mode 100644 index ce24a00f61..0000000000 --- a/src/main/java/io/anserini/search/similarity/F2ExpSimilarity.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Anserini: A Lucene toolkit for replicable information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.anserini.search.similarity; - -public class F2ExpSimilarity extends AxiomaticSimilarity { - private final float k = 0.35f; - - /** - * F2Exp with the supplied parameter values. - * @param s Controls to what degree document length normalizes tf values. - * @throws IllegalArgumentException if {@code s} is infinite or if {@code s} is - * not within the range {@code [0..1]} - */ - public F2ExpSimilarity(float s) { - super(s); - } - - /** F2Exp with these default values: - *
- * <ul>
- *   <li>{@code k = 0.35}</li>
- * </ul>
- */ - public F2ExpSimilarity() { - this(0.5f); - } - - @Override - float idf(long docFreq, long docCount) { - return (float) Math.pow((docCount + 1.0) / docFreq, this.k); - } - - @Override - public String toString() { - return "F2Exp(s=" + s +")"; - } - - /** - * Returns the k parameter - * @see #F2ExpSimilarity(float) - * @return k - */ - public float getK() { - return k; - } -} diff --git a/src/main/java/io/anserini/search/similarity/F2LogSimilarity.java b/src/main/java/io/anserini/search/similarity/F2LogSimilarity.java deleted file mode 100644 index 7967b7b5d9..0000000000 --- a/src/main/java/io/anserini/search/similarity/F2LogSimilarity.java +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Anserini: A Lucene toolkit for replicable information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.search.similarity; - -public class F2LogSimilarity extends AxiomaticSimilarity { - /** - * F2Log with the supplied parameter values. - * @param s Controls to what degree document length normalizes tf values. - * @throws IllegalArgumentException if {@code s} is infinite or if {@code s} is - * not within the range {@code [0..1]} - */ - public F2LogSimilarity(float s) { - super(s); - } - - /** F2Log with these default values: - *
- * <ul>
- *   <li>{@code s = 0.5}</li>
- * </ul>
- */ - public F2LogSimilarity() { - this(0.5f); - } - - @Override - float idf(long docFreq, long docCount) { - return (float) Math.log((1.0f + docCount) / docFreq); - } - - @Override - public String toString() { - return "F2Log(s=" + s +")"; - } -} diff --git a/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java b/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java index b263965723..da1c077b5d 100644 --- a/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java +++ b/src/main/java/io/anserini/search/similarity/RankLibSimilarity.java @@ -17,13 +17,10 @@ package io.anserini.search.similarity; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.similarities.Similarity; -import java.io.IOException; - /** * Similarity that uses a Ranklib ranker to compute the score */ @@ -34,12 +31,8 @@ public long computeNorm(FieldInvertState fieldInvertState) { } @Override - public SimWeight computeWeight(float boost, CollectionStatistics collectionStatistics, TermStatistics... termStatistics) { + public SimScorer scorer(float boost, CollectionStatistics collectionStatistics, TermStatistics... termStatistics) { return null; } - @Override - public SimScorer simScorer(SimWeight simWeight, LeafReaderContext leafReaderContext) throws IOException { - return null; - } } diff --git a/src/main/java/io/anserini/util/ExtractTopDfTerms.java b/src/main/java/io/anserini/util/ExtractTopDfTerms.java index 6c33e38619..b5d12c1007 100644 --- a/src/main/java/io/anserini/util/ExtractTopDfTerms.java +++ b/src/main/java/io/anserini/util/ExtractTopDfTerms.java @@ -20,7 +20,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -95,7 +95,7 @@ public int compare(Pair p1, Pair p2) { PriorityQueue queue = new PriorityQueue(myArgs.topK, comp); LOG.info("Starting to iterate through all terms..."); - Terms terms = MultiFields.getFields(reader).terms(myArgs.field); + Terms terms = MultiTerms.getTerms(reader, myArgs.field); TermsEnum termsEnum = terms.iterator(); BytesRef text; int cnt = 0; diff --git a/src/main/resources/regression/car17v1.5.yaml b/src/main/resources/regression/car17v1.5.yaml index cd85d2a4c0..e13410f6ea 100644 --- a/src/main/resources/regression/car17v1.5.yaml +++ b/src/main/resources/regression/car17v1.5.yaml @@ -21,8 +21,8 @@ index_path: indexes/lucene-index.car17v1.5.pos+docvectors+rawdocs # path to the collection: CarCollection index_stats: documents: 29678360 - documents (non-empty): 29674409 - total terms: 1257896158 + documents (non-empty): 29674425 + total terms: 1257909884 topics: - name: "[TREC 2017 CAR: benchmarkY1test (v1.5)](http://trec-car.cs.unh.edu/datareleases/)" path: topics.car17v1.5.benchmarkY1test.txt @@ -51,9 +51,9 @@ models: - -bm25 results: map: - - 0.1563 + - 0.1562 recip_rank: - - 0.2336 + - 0.2331 - name: bm25+rm3 display: +RM3 params: diff --git a/src/main/resources/regression/car17v2.0.yaml b/src/main/resources/regression/car17v2.0.yaml index e9eac18e80..e84552d897 100644 --- a/src/main/resources/regression/car17v2.0.yaml +++ b/src/main/resources/regression/car17v2.0.yaml @@ -21,8 +21,8 @@ 
index_path: indexes/lucene-index.car17v2.0.pos+docvectors+rawdocs collection: CarCollection index_stats: documents: 29794689 - documents (non-empty): 29791041 - total terms: 1249740109 + documents (non-empty): 29791059 + total terms: 1249754054 topics: - name: "[TREC 2017 CAR: benchmarkY1test (v2.0)](http://trec-car.cs.unh.edu/datareleases/)" path: topics.car17v2.0.benchmarkY1test.txt diff --git a/src/main/resources/regression/core17.yaml b/src/main/resources/regression/core17.yaml index c8f80c88f7..b91ebee6a8 100644 --- a/src/main/resources/regression/core17.yaml +++ b/src/main/resources/regression/core17.yaml @@ -22,7 +22,7 @@ collection: NewYorkTimesCollection index_stats: documents: 1855649 documents (non-empty): 1855649 - total terms: 751034051 + total terms: 751034054 topics: - name: "[TREC 2017 Common Core Track Topics](https://trec.nist.gov/data/core/core_nist.txt)" path: topics.core17.txt @@ -73,7 +73,7 @@ models: - -axiom.deterministic results: map: - - 0.2787 + - 0.2788 p30: - 0.4980 - name: ql diff --git a/src/main/resources/regression/core18.yaml b/src/main/resources/regression/core18.yaml index 9ce7cd6419..658204bacb 100644 --- a/src/main/resources/regression/core18.yaml +++ b/src/main/resources/regression/core18.yaml @@ -22,7 +22,7 @@ collection: WashingtonPostCollection index_stats: documents: 595037 documents (non-empty): 595030 - total terms: 318203786 + total terms: 318219945 topics: - name: "[TREC 2018 Common Core Track Topics](https://trec.nist.gov/data/core/topics2018.txt)" path: topics.core18.txt @@ -61,7 +61,7 @@ models: - -rm3 results: map: - - 0.3136 + - 0.3135 p30: - 0.4200 - name: bm25+ax @@ -73,7 +73,7 @@ models: - -axiom.deterministic results: map: - - 0.2920 + - 0.2925 p30: - 0.4027 - name: ql diff --git a/src/main/resources/regression/cw09b.yaml b/src/main/resources/regression/cw09b.yaml index 293a03d1c2..55642ea662 100644 --- a/src/main/resources/regression/cw09b.yaml +++ b/src/main/resources/regression/cw09b.yaml @@ -22,7 +22,7 @@ topic_reader: Webxml index_stats: documents: 50220189 documents (non-empty): 50220159 - total terms: 31270685466 + total terms: 31302554269 topics: - name: "[TREC 2010 Web Track: Topics 51-100](http://trec.nist.gov/data/web/10/wt2010-topics.xml)" path: topics.web.51-100.txt @@ -71,19 +71,19 @@ models: map: - 0.1126 - 0.1094 - - 0.1106 + - 0.1105 p30: - - 0.2681 + - 0.2694 - 0.2513 - 0.2167 ndcg20: - - 0.13539 - - 0.18901 - - 0.10141 + - 0.13537 + - 0.18900 + - 0.10139 err20: - 0.07335 - 0.09592 - - 0.13036 + - 0.13031 - name: bm25+rm3 display: +RM3 params: @@ -92,20 +92,20 @@ models: results: map: - 0.0933 - - 0.1081 + - 0.1085 - 0.1107 p30: - 0.2389 - - 0.2467 + - 0.2480 - 0.1920 ndcg20: - - 0.13690 - - 0.19164 - - 0.09170 + - 0.13693 + - 0.19160 + - 0.09182 err20: - - 0.07470 - - 0.09597 - - 0.14933 + - 0.07473 + - 0.09596 + - 0.14936 - name: bm25+ax display: +Ax params: @@ -116,21 +116,21 @@ models: - -axiom.beta 0.1 results: map: - - 0.0928 - - 0.0974 + - 0.0929 + - 0.0975 - 0.1315 p30: - 0.2354 - - 0.2393 + - 0.2387 - 0.2553 ndcg20: - - 0.16375 - - 0.18330 + - 0.16319 + - 0.18348 - 0.14413 err20: - - 0.09815 - - 0.10909 - - 0.23554 + - 0.09771 + - 0.10912 + - 0.23551 - name: ql display: QL params: @@ -145,12 +145,12 @@ models: - 0.2147 - 0.2080 ndcg20: - - 0.11431 - - 0.16192 + - 0.11432 + - 0.16191 - 0.08682 err20: - 0.05994 - - 0.08487 + - 0.08486 - 0.13052 - name: ql+rm3 display: +RM3 @@ -160,19 +160,19 @@ models: results: map: - 0.1019 - - 0.0837 - - 0.1059 + - 0.0839 + - 0.1058 p30: - 0.2312 - - 0.2067 + - 0.2047 - 
0.1980 ndcg20: - - 0.11852 - - 0.14469 + - 0.11823 + - 0.14487 - 0.08959 err20: - - 0.05920 - - 0.07861 + - 0.05917 + - 0.07872 - 0.13336 - name: ql+ax display: +Ax @@ -189,13 +189,13 @@ models: - 0.1212 p30: - 0.2618 - - 0.2167 - - 0.2140 + - 0.2173 + - 0.2147 ndcg20: - 0.14541 - - 0.15091 - - 0.10296 + - 0.15174 + - 0.10373 err20: - 0.07424 - - 0.08203 - - 0.15575 + - 0.08205 + - 0.15577 diff --git a/src/main/resources/regression/cw12.yaml b/src/main/resources/regression/cw12.yaml index ac09f64ddb..71b23c4c3f 100644 --- a/src/main/resources/regression/cw12.yaml +++ b/src/main/resources/regression/cw12.yaml @@ -21,8 +21,8 @@ index_options: topic_reader: Webxml index_stats: documents: 731705088 - documents (non-empty): 731556725 - total terms: 428628865985 + documents (non-empty): 731556853 + total terms: 429328271635 topics: - name: "[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)" path: topics.web.201-250.txt @@ -66,17 +66,17 @@ models: - -bm25 results: map: - - 0.1695 + - 0.1694 - 0.2469 p30: - - 0.2767 - - 0.4533 + - 0.2773 + - 0.4547 ndcg20: - - 0.20858 - - 0.25776 + - 0.20881 + - 0.25719 err20: - - 0.12835 - - 0.16305 + - 0.12838 + - 0.16162 - name: bm25+rm3 display: +RM3 params: @@ -85,13 +85,13 @@ models: results: map: - 0.1464 - - 0.2325 + - 0.2324 p30: - - 0.2387 - - 0.4073 + - 0.2393 + - 0.4080 ndcg20: - 0.20327 - - 0.25304 + - 0.25303 err20: - 0.12637 - 0.16550 @@ -101,17 +101,17 @@ models: - -ql results: map: - - 0.1493 - - 0.2467 + - 0.1494 + - 0.2466 p30: - - 0.2613 + - 0.2607 - 0.4380 ndcg20: - 0.19935 - - 0.22282 + - 0.22184 err20: - - 0.12319 - - 0.13211 + - 0.12325 + - 0.13218 - name: ql+rm3 display: +RM3 params: @@ -119,15 +119,15 @@ models: - -rm3 results: map: - - 0.1291 - - 0.2168 + - 0.1290 + - 0.2177 p30: - 0.2347 - - 0.3793 + - 0.3800 ndcg20: - 0.17253 - - 0.20662 + - 0.20829 err20: - - 0.10084 - - 0.12179 + - 0.10083 + - 0.12450 diff --git a/src/main/resources/regression/cw12b13.yaml b/src/main/resources/regression/cw12b13.yaml index 7ee7c46d3b..98f39a94ea 100644 --- a/src/main/resources/regression/cw12b13.yaml +++ b/src/main/resources/regression/cw12b13.yaml @@ -21,8 +21,8 @@ index_options: topic_reader: Webxml index_stats: documents: 52249039 - documents (non-empty): 52238521 - total terms: 30617038149 + documents (non-empty): 52238526 + total terms: 30666923268 topics: - name: "[TREC 2013 Web Track: Topics 201-250](http://trec.nist.gov/data/web2013.html)" path: topics.web.201-250.txt @@ -73,10 +73,10 @@ models: - 0.1273 ndcg20: - 0.12862 - - 0.11849 + - 0.11835 err20: - - 0.08379 - - 0.12013 + - 0.08378 + - 0.12006 - name: bm25+rm3 display: +RM3 params: @@ -84,17 +84,17 @@ models: - -rm3 results: map: - - 0.0412 + - 0.0408 - 0.0210 p30: - - 0.1713 + - 0.1673 - 0.1207 ndcg20: - - 0.11293 - - 0.10796 + - 0.11192 + - 0.10809 err20: - - 0.07629 - - 0.10653 + - 0.07530 + - 0.10662 - name: bm25+ax display: +Ax params: @@ -108,14 +108,14 @@ models: - 0.0435 - 0.0180 p30: - - 0.1840 + - 0.1833 - 0.1107 ndcg20: - - 0.12875 - - 0.09637 + - 0.12867 + - 0.09627 err20: - - 0.09430 - - 0.09289 + - 0.09413 + - 0.09285 - name: ql display: QL params: @@ -125,14 +125,14 @@ models: - 0.0397 - 0.0235 p30: - - 0.1767 + - 0.1780 - 0.1373 ndcg20: - - 0.11067 + - 0.11059 - 0.11765 err20: - - 0.07689 - - 0.10908 + - 0.07679 + - 0.10917 - name: ql+rm3 display: +RM3 params: @@ -143,14 +143,14 @@ models: - 0.0322 - 0.0203 p30: - - 0.1507 + - 0.1513 - 0.1173 ndcg20: - 0.09199 - - 0.10035 + - 0.10036 err20: - 0.05525 - - 0.09289 + - 0.09284 - name: ql+ax 
display: +Ax params: @@ -161,14 +161,14 @@ models: - -axiom.beta 0.1 results: map: - - 0.0359 - - 0.0186 + - 0.0358 + - 0.0183 p30: - - 0.1513 - - 0.1167 + - 0.1507 + - 0.1147 ndcg20: - - 0.11435 - - 0.10013 + - 0.11407 + - 0.09891 err20: - - 0.07800 - - 0.08965 + - 0.07803 + - 0.09002 diff --git a/src/main/resources/regression/gov2.yaml b/src/main/resources/regression/gov2.yaml index e4f24910f0..de644b0c71 100644 --- a/src/main/resources/regression/gov2.yaml +++ b/src/main/resources/regression/gov2.yaml @@ -39,7 +39,7 @@ evals: index_stats: documents: 25172934 documents (non-empty): 25170664 - total terms: 17343119816 + total terms: 17345062322 topics: - name: "[TREC 2004 Terabyte Track: Topics 701-750](http://trec.nist.gov/data/terabyte04.html)" path: topics.701-750.txt @@ -88,11 +88,11 @@ models: - -axiom.deterministic results: map: - - 0.2665 - - 0.3664 + - 0.2669 + - 0.3666 - 0.3069 p30: - - 0.4986 + - 0.4993 - 0.5933 - 0.5033 - name: ql @@ -103,7 +103,7 @@ models: map: - 0.2681 - 0.3303 - - 0.2996 + - 0.2997 p30: - 0.4755 - 0.5347 diff --git a/src/main/resources/regression/mb11.yaml b/src/main/resources/regression/mb11.yaml index 118281e6be..8e28d104ef 100644 --- a/src/main/resources/regression/mb11.yaml +++ b/src/main/resources/regression/mb11.yaml @@ -117,7 +117,7 @@ models: - 0.2389 p30: - 0.4435 - - 0.3520 + - 0.3514 - name: ql+ax display: +Ax params: diff --git a/src/main/resources/regression/msmarco-doc.yaml b/src/main/resources/regression/msmarco-doc.yaml index dcf364fb78..7fdf92776f 100644 --- a/src/main/resources/regression/msmarco-doc.yaml +++ b/src/main/resources/regression/msmarco-doc.yaml @@ -42,7 +42,7 @@ index_path: indexes/lucene-index.msmarco-doc.pos+docvectors+rawdocs index_stats: documents: 3213835 documents (non-empty): 3213835 - total terms: 2746735247 + total terms: 2748636047 topics: - name: "[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/TREC-2019-Deep-Learning)" path: topics.msmarco-doc.dev.txt @@ -54,7 +54,7 @@ models: - -bm25 results: map: - - 0.2308 + - 0.2310 R@1000: - 0.8856 - name: bm25+rm3 @@ -64,6 +64,6 @@ models: - -rm3 results: map: - - 0.1631 + - 0.1632 R@1000: - - 0.8787 + - 0.8785 diff --git a/src/main/resources/regression/msmarco-passage.yaml b/src/main/resources/regression/msmarco-passage.yaml index 8dc22f5af6..c7d46f6380 100644 --- a/src/main/resources/regression/msmarco-passage.yaml +++ b/src/main/resources/regression/msmarco-passage.yaml @@ -42,7 +42,7 @@ index_path: indexes/lucene-index.msmarco-passage.pos+docvectors+rawdocs index_stats: documents: 8841823 documents (non-empty): 8841823 - total terms: 352122244 + total terms: 352316036 topics: - name: "[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)" path: topics.msmarco-passage.dev-subset.txt @@ -54,7 +54,7 @@ models: - -bm25 results: map: - - 0.1924 + - 0.1926 R@1000: - 0.8526 - name: bm25-default+rm3 @@ -75,7 +75,7 @@ models: - -b 0.72 results: map: - - 0.1956 + - 0.1957 R@1000: - 0.8578 - name: bm25-tuned+rm3 diff --git a/src/main/resources/regression/robust04.yaml b/src/main/resources/regression/robust04.yaml index 9a3388934c..eb656e8626 100644 --- a/src/main/resources/regression/robust04.yaml +++ b/src/main/resources/regression/robust04.yaml @@ -40,7 +40,7 @@ index_path: indexes/lucene-index.robust04.pos+docvectors+rawdocs # path to the e index_stats: documents: 528030 documents (non-empty): 528030 - total terms: 174540587 + total terms: 174540872 topics: - name: "[TREC 2004 Robust Track 
Topics](http://trec.nist.gov/data/robust/04.testset.gz)" path: topics.robust04.301-450.601-700.txt @@ -74,7 +74,7 @@ models: - -axiom.deterministic results: map: - - 0.2895 + - 0.2896 p30: - 0.3333 - name: ql diff --git a/src/main/resources/regression/robust05.yaml b/src/main/resources/regression/robust05.yaml index ec5a33b4d9..b09fac1167 100644 --- a/src/main/resources/regression/robust05.yaml +++ b/src/main/resources/regression/robust05.yaml @@ -52,7 +52,7 @@ models: - -bm25 results: map: - - 0.2031 + - 0.2032 p30: - 0.3693 - name: bm25+rm3 @@ -74,7 +74,7 @@ models: - -axiom.deterministic results: map: - - 0.2584 + - 0.2587 p30: - 0.4120 - name: ql diff --git a/src/main/resources/regression/wt10g.yaml b/src/main/resources/regression/wt10g.yaml index cf4fd4065d..ce85ca198d 100644 --- a/src/main/resources/regression/wt10g.yaml +++ b/src/main/resources/regression/wt10g.yaml @@ -39,8 +39,8 @@ input: collections/web/wt10g/ index_path: indexes/lucene-index.wt10g.pos+docvectors+rawdocs # path to the existing index, used in regression test if `--index` option is absent index_stats: documents: 1688402 - documents (non-empty): 1688290 - total terms: 752326031 + documents (non-empty): 1688291 + total terms: 752790242 topics: - name: "Wt10g: Topics 451-550" path: topics.451-550.txt @@ -54,7 +54,7 @@ models: map: - 0.1992 p30: - - 0.2218 + - 0.2214 - name: bm25+rm3 display: +RM3 params: @@ -109,4 +109,4 @@ models: map: - 0.2275 p30: - - 0.2517 + - 0.2514 diff --git a/src/test/java/io/anserini/integration/IndexerTest.java b/src/test/java/io/anserini/integration/IndexerTest.java index 353ddbec12..efb7e045c2 100644 --- a/src/test/java/io/anserini/integration/IndexerTest.java +++ b/src/test/java/io/anserini/integration/IndexerTest.java @@ -219,7 +219,7 @@ public void testIterateThroughDocumentVectorComputeBM25() throws Exception { TopDocs rs = searcher.search(finalQuery, 1); // issue the query // The BM25 weight is the maxScore - System.out.println(term + " " + tf + " " + rs.getMaxScore()); + System.out.println(term + " " + tf + " " + (rs.scoreDocs.length == 0 ? Float.NaN : rs.scoreDocs[0].score)); } } } diff --git a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java index 7903bfcafb..20abf65337 100644 --- a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java +++ b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java @@ -34,8 +34,9 @@ protected void init() { termIndexStatusTermCount = 12; // Please note that standard analyzer ignores stopwords. // Also, this includes docids termIndexStatusTotFreq = 17; // - termIndexStatusTotPos = 16; // only "text" fields are indexed with position so we have 16 storedFieldStatusTotalDocCounts = 3; + // 16 positions for text fields, plus 1 for each document because of id + termIndexStatusTotPos = 16 + storedFieldStatusTotalDocCounts; storedFieldStatusTotFields = 9; // 3 docs * (1 id + 1 text + 1 raw) } diff --git a/src/test/java/io/anserini/integration/TrecEndToEndTest.java b/src/test/java/io/anserini/integration/TrecEndToEndTest.java index b6ef275491..2cb60f5a76 100644 --- a/src/test/java/io/anserini/integration/TrecEndToEndTest.java +++ b/src/test/java/io/anserini/integration/TrecEndToEndTest.java @@ -28,8 +28,9 @@ protected void init() { fieldNormStatusTotalFields = 1; // text termIndexStatusTermCount = 12; // Note that standard analyzer ignores stopwords; includes docids. 
termIndexStatusTotFreq = 17; - termIndexStatusTotPos = 16; // Only "text" fields are indexed with position so we have 16. storedFieldStatusTotalDocCounts = 3; + // 16 positions for text fields, plus 1 for each document because of id + termIndexStatusTotPos = 16 + storedFieldStatusTotalDocCounts; storedFieldStatusTotFields = 9; // 3 docs * (1 id + 1 text + 1 raw) // The search output should be as follows (for Lucene 7.5): diff --git a/src/test/java/io/anserini/integration/TweetEndToEndTest.java b/src/test/java/io/anserini/integration/TweetEndToEndTest.java index 46f9a4cd3d..86deb7d0e7 100644 --- a/src/test/java/io/anserini/integration/TweetEndToEndTest.java +++ b/src/test/java/io/anserini/integration/TweetEndToEndTest.java @@ -30,8 +30,9 @@ protected void init() { // We set that retweets and the tweets with ids larger than tweetMaxId will NOT be indexed! termIndexStatusTermCount = 32; // other indexable fields: 4 doc ids + 4 "lang" fields + 4 "screen_name" fields termIndexStatusTotFreq = 36; - termIndexStatusTotPos = 24; // only "text" fields are indexed with positions storedFieldStatusTotalDocCounts = 4; + // 24 positions for text fields, plus 3 for each document because of id, screen_name and lang + termIndexStatusTotPos = 24 + 3 * storedFieldStatusTotalDocCounts; storedFieldStatusTotFields = 12; // 4 tweets * (1 id + 1 text + 1 raw) // The search output should be as follows (for Lucene 7.5):