From cdb74b738a62bcfecc2b0d910ab38c7a3bae3651 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher <110987709+koncheto-broad@users.noreply.github.com> Date: Thu, 2 Feb 2023 09:33:56 -0500 Subject: [PATCH] [VS-693] Add support for VQSR Lite to GvsCreateFilterSet (#8157) * Added a new suite of tools for variant filtering based on site-level annotations. (#7954) * Adds wdl that tests joint VCF filtering tools (#7932) * adding filtering wdl * renaming pipeline * addressing comments * added bash * renaming json * adding glob to extract for extra files * changing dollar signs * small comments * Added changes for specifying model backend and other tweaks to WDLs and environment. * Added classes for representing a collection of labeled variant annotations. * Added interfaces for modeling and scoring backends. * Added a new suite of tools for variant filtering based on site-level annotations. * Added integration tests. * Added test resources and expected results. * Miscellaneous changes. * Removed non-ASCII characters. * Added documentation for TrainVariantAnnotationsModel and addressed review comments. Co-authored-by: meganshand * Added toggle for selecting resource-matching strategies and miscellaneous minor fixes to new annotation-based filtering tools. 
(#8049) * Adding use_allele_specific_annotation arg and fixing task with empty input in JointVcfFiltering WDL (#8027) * Small changes to JointVCFFiltering WDL * making default for use_allele_specific_annotations * addressing comments * first stab * wire through WDL changes * fixed typo * set model_backend input value * add gatk_override to JointVcfFiltering call * typo in indel_annotations * make model_backend optional * tabs and spaces * make all model_backends optional * use gatk 4.3.0 * no point in changing the table names as this is a POC * adding new branch to dockstore * adding in branching logic for classic VQSR vs VQSR-Lite * implementing the separate schemas for the VQSR vs VQSR-Lite branches, including Java changes necessary to produce the different tsv files * passing classic flag to indel run of CreateFilteringFiles * Update GvsCreateFilterSet.wdl cleaning up verbiage * Removed mapping error rate from estimate of denoised copy ratios output by gCNV and updated sklearn. (#7261) * cleanup up sloppy comment --------- Co-authored-by: samuelklee Co-authored-by: meganshand Co-authored-by: Rebecca Asch --- .dockstore.yml | 2 + .github/workflows/gatk-tests.yml | 8 +- build.gradle | 1 + scripts/gatkcondaenv.yml.template | 3 +- .../variantstore/wdl/GvsCreateFilterSet.wdl | 290 +++++--- .../README.md | 9 + .../run_vcf_site_level_filtering_wdl.sh | 38 + .../vcf_site_level_filtering_travis.json | 14 + .../JointVcfFiltering.wdl | 294 ++++++++ .../tools/copynumber/CollectReadCounts.java | 2 +- .../CreateReadCountPanelOfNormals.java | 2 +- .../tools/copynumber/utils/HDF5Utils.java | 2 +- .../gvs/filtering/CreateFilteringFiles.java | 60 +- .../scalable/ExtractVariantAnnotations.java | 369 +++++++++ .../LabeledVariantAnnotationsWalker.java | 409 ++++++++++ .../scalable/ScoreVariantAnnotations.java | 627 ++++++++++++++++ .../TrainVariantAnnotationsModel.java | 703 ++++++++++++++++++ .../data/LabeledVariantAnnotationsData.java | 284 +++++++ 
.../data/LabeledVariantAnnotationsDatum.java | 104 +++ .../vqsr/scalable/data/VariantType.java | 58 ++ .../modeling/BGMMVariantAnnotationsModel.java | 31 + .../BGMMVariantAnnotationsScorer.java | 67 ++ .../PythonSklearnVariantAnnotationsModel.java | 69 ++ ...PythonSklearnVariantAnnotationsScorer.java | 69 ++ .../modeling/VariantAnnotationsModel.java | 46 ++ .../VariantAnnotationsModelBackend.java | 16 + .../modeling/VariantAnnotationsScorer.java | 111 +++ .../hellbender/utils/MathUtils.java | 1 + .../hellbender/utils/NaturalLogUtils.java | 2 +- .../BayesianGaussianMixtureModeller.java | 35 + .../models/model_denoising_calling.py | 3 +- .../isolation-forest-hyperparameters.json | 3 + .../walkers/vqsr/scalable/isolation-forest.py | 138 ++++ ...ractVariantAnnotationsIntegrationTest.java | 253 +++++++ ...coreVariantAnnotationsIntegrationTest.java | 260 +++++++ .../vqsr/scalable/SystemCommandUtilsTest.java | 62 ++ ...ariantAnnotationsModelIntegrationTest.java | 428 +++++++++++ .../PythonEnvironmentIntegrationTest.java | 2 +- .../test_10_samples.22.avg.vcf.gz | 3 + .../test_10_samples.22.avg.vcf.gz.tbi | 3 + .../test_10_samples.23.avg.vcf.gz | 3 + .../test_10_samples.23.avg.vcf.gz.tbi | 3 + .../test_10_samples.sites_only.vcf.gz | 3 + .../test_10_samples.sites_only.vcf.gz.tbi | 3 + .../expected/extract.AS.indel.pos.annot.hdf5 | 3 + .../extract/expected/extract.AS.indel.pos.vcf | 3 + .../expected/extract.AS.indel.pos.vcf.idx | 3 + .../extract.AS.indel.posUn.annot.hdf5 | 3 + ...xtract.AS.indel.posUn.unlabeled.annot.hdf5 | 3 + .../expected/extract.AS.indel.posUn.vcf | 3 + .../expected/extract.AS.indel.posUn.vcf.idx | 3 + .../expected/extract.AS.snp.pos.annot.hdf5 | 3 + .../extract/expected/extract.AS.snp.pos.vcf | 3 + .../expected/extract.AS.snp.pos.vcf.idx | 3 + .../expected/extract.AS.snp.posUn.annot.hdf5 | 3 + .../extract.AS.snp.posUn.unlabeled.annot.hdf5 | 3 + .../extract/expected/extract.AS.snp.posUn.vcf | 3 + .../expected/extract.AS.snp.posUn.vcf.idx | 3 + 
.../extract.AS.snpIndel.pos.annot.hdf5 | 3 + .../expected/extract.AS.snpIndel.pos.vcf | 3 + .../expected/extract.AS.snpIndel.pos.vcf.idx | 3 + .../extract.AS.snpIndel.posUn.annot.hdf5 | 3 + ...act.AS.snpIndel.posUn.unlabeled.annot.hdf5 | 3 + .../expected/extract.AS.snpIndel.posUn.vcf | 3 + .../extract.AS.snpIndel.posUn.vcf.idx | 3 + .../extract.nonAS.indel.pos.annot.hdf5 | 3 + .../expected/extract.nonAS.indel.pos.vcf | 3 + .../expected/extract.nonAS.indel.pos.vcf.idx | 3 + .../extract.nonAS.indel.posUn.annot.hdf5 | 3 + ...act.nonAS.indel.posUn.unlabeled.annot.hdf5 | 3 + .../expected/extract.nonAS.indel.posUn.vcf | 3 + .../extract.nonAS.indel.posUn.vcf.idx | 3 + .../expected/extract.nonAS.snp.pos.annot.hdf5 | 3 + .../expected/extract.nonAS.snp.pos.vcf | 3 + .../expected/extract.nonAS.snp.pos.vcf.idx | 3 + .../extract.nonAS.snp.posUn.annot.hdf5 | 3 + ...tract.nonAS.snp.posUn.unlabeled.annot.hdf5 | 3 + .../expected/extract.nonAS.snp.posUn.vcf | 3 + .../expected/extract.nonAS.snp.posUn.vcf.idx | 3 + .../extract.nonAS.snpIndel.pos.annot.hdf5 | 3 + .../expected/extract.nonAS.snpIndel.pos.vcf | 3 + .../extract.nonAS.snpIndel.pos.vcf.idx | 3 + .../extract.nonAS.snpIndel.posUn.annot.hdf5 | 3 + ....nonAS.snpIndel.posUn.unlabeled.annot.hdf5 | 3 + .../expected/extract.nonAS.snpIndel.posUn.vcf | 3 + .../extract.nonAS.snpIndel.posUn.vcf.idx | 3 + ...et_low_threshold.sites-only.chr1.1-10M.vcf | 3 + ...ow_threshold.sites-only.chr1.1-10M.vcf.idx | 3 + .../1000G_omni2.5.hg38.chr1.1-5M.vcf.gz | 3 + .../1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi | 3 + .../1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz | 3 + .../1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi | 3 + ...gold_standard.indels.hg38.chr1.1-5M.vcf.gz | 3 + ..._standard.indels.hg38.chr1.1-5M.vcf.gz.tbi | 3 + ...ld_standard.indels.hg38.chr1.5M-10M.vcf.gz | 3 + ...tandard.indels.hg38.chr1.5M-10M.vcf.gz.tbi | 3 + ...in.snpIndel.posNeg.IF.score.snp.annot.hdf5 | 3 + ...n.snpIndel.posNeg.IF.score.snp.scores.hdf5 | 3 + 
...sUn.train.snpIndel.posNeg.IF.score.snp.vcf | 3 + ...train.snpIndel.posNeg.IF.score.snp.vcf.idx | 3 + ...pIndel.posNeg.IF.score.snpIndel.annot.hdf5 | 3 + ...Indel.posNeg.IF.score.snpIndel.scores.hdf5 | 3 + ...rain.snpIndel.posNeg.IF.score.snpIndel.vcf | 3 + ....snpIndel.posNeg.IF.score.snpIndel.vcf.idx | 3 + ...in.snpIndel.posNeg.IF.score.snp.annot.hdf5 | 3 + ...n.snpIndel.posNeg.IF.score.snp.scores.hdf5 | 3 + ...sUn.train.snpIndel.posNeg.IF.score.snp.vcf | 3 + ...train.snpIndel.posNeg.IF.score.snp.vcf.idx | 3 + ...pIndel.posNeg.IF.score.snpIndel.annot.hdf5 | 3 + ...Indel.posNeg.IF.score.snpIndel.scores.hdf5 | 3 + ...rain.snpIndel.posNeg.IF.score.snpIndel.vcf | 3 + ....snpIndel.posNeg.IF.score.snpIndel.vcf.idx | 3 + ...n.snp.posNeg.IF.snp.calibrationScores.hdf5 | 3 + ...rain.snp.posNeg.IF.snp.negative.scorer.pkl | 3 + ...l.posUn.train.snp.posNeg.IF.snp.scorer.pkl | 3 + ...rain.snp.posNeg.IF.snp.trainingScores.hdf5 | 3 + ...ain.snp.posNeg.IF.snp.unlabeledScores.hdf5 | 3 + ...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 + ...eg.IFDifferentSeed.snp.negative.scorer.pkl | 3 + ....snp.posNeg.IFDifferentSeed.snp.scorer.pkl | 3 + ...eg.IFDifferentSeed.snp.trainingScores.hdf5 | 3 + ...g.IFDifferentSeed.snp.unlabeledScores.hdf5 | 3 + ....snp.posOnly.IF.snp.calibrationScores.hdf5 | 3 + ....posUn.train.snp.posOnly.IF.snp.scorer.pkl | 3 + ...ain.snp.posOnly.IF.snp.trainingScores.hdf5 | 3 + ...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 + ...snp.posOnly.IFDifferentSeed.snp.scorer.pkl | 3 + ...ly.IFDifferentSeed.snp.trainingScores.hdf5 | 3 + ...del.posNeg.IF.indel.calibrationScores.hdf5 | 3 + ...pIndel.posNeg.IF.indel.negative.scorer.pkl | 3 + ....train.snpIndel.posNeg.IF.indel.scorer.pkl | 3 + ...pIndel.posNeg.IF.indel.trainingScores.hdf5 | 3 + ...Indel.posNeg.IF.indel.unlabeledScores.hdf5 | 3 + ...Indel.posNeg.IF.snp.calibrationScores.hdf5 | 3 + ...snpIndel.posNeg.IF.snp.negative.scorer.pkl | 3 + ...Un.train.snpIndel.posNeg.IF.snp.scorer.pkl | 3 + 
...snpIndel.posNeg.IF.snp.trainingScores.hdf5 | 3 + ...npIndel.posNeg.IF.snp.unlabeledScores.hdf5 | 3 + ...DifferentSeed.indel.calibrationScores.hdf5 | 3 + ....IFDifferentSeed.indel.negative.scorer.pkl | 3 + ...el.posNeg.IFDifferentSeed.indel.scorer.pkl | 3 + ....IFDifferentSeed.indel.trainingScores.hdf5 | 3 + ...IFDifferentSeed.indel.unlabeledScores.hdf5 | 3 + ...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 + ...eg.IFDifferentSeed.snp.negative.scorer.pkl | 3 + ...ndel.posNeg.IFDifferentSeed.snp.scorer.pkl | 3 + ...eg.IFDifferentSeed.snp.trainingScores.hdf5 | 3 + ...g.IFDifferentSeed.snp.unlabeledScores.hdf5 | 3 + ...el.posOnly.IF.indel.calibrationScores.hdf5 | 3 + ...train.snpIndel.posOnly.IF.indel.scorer.pkl | 3 + ...Indel.posOnly.IF.indel.trainingScores.hdf5 | 3 + ...ndel.posOnly.IF.snp.calibrationScores.hdf5 | 3 + ...n.train.snpIndel.posOnly.IF.snp.scorer.pkl | 3 + ...npIndel.posOnly.IF.snp.trainingScores.hdf5 | 3 + ...DifferentSeed.indel.calibrationScores.hdf5 | 3 + ...l.posOnly.IFDifferentSeed.indel.scorer.pkl | 3 + ....IFDifferentSeed.indel.trainingScores.hdf5 | 3 + ...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 + ...del.posOnly.IFDifferentSeed.snp.scorer.pkl | 3 + ...ly.IFDifferentSeed.snp.trainingScores.hdf5 | 3 + ...n.snp.posNeg.IF.snp.calibrationScores.hdf5 | 3 + ...rain.snp.posNeg.IF.snp.negative.scorer.pkl | 3 + ...l.posUn.train.snp.posNeg.IF.snp.scorer.pkl | 3 + ...rain.snp.posNeg.IF.snp.trainingScores.hdf5 | 3 + ...ain.snp.posNeg.IF.snp.unlabeledScores.hdf5 | 3 + ...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 + ...eg.IFDifferentSeed.snp.negative.scorer.pkl | 3 + ....snp.posNeg.IFDifferentSeed.snp.scorer.pkl | 3 + ...eg.IFDifferentSeed.snp.trainingScores.hdf5 | 3 + ...g.IFDifferentSeed.snp.unlabeledScores.hdf5 | 3 + ....snp.posOnly.IF.snp.calibrationScores.hdf5 | 3 + ....posUn.train.snp.posOnly.IF.snp.scorer.pkl | 3 + ...ain.snp.posOnly.IF.snp.trainingScores.hdf5 | 3 + ...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 + 
...snp.posOnly.IFDifferentSeed.snp.scorer.pkl | 3 + ...ly.IFDifferentSeed.snp.trainingScores.hdf5 | 3 + ...del.posNeg.IF.indel.calibrationScores.hdf5 | 3 + ...pIndel.posNeg.IF.indel.negative.scorer.pkl | 3 + ....train.snpIndel.posNeg.IF.indel.scorer.pkl | 3 + ...pIndel.posNeg.IF.indel.trainingScores.hdf5 | 3 + ...Indel.posNeg.IF.indel.unlabeledScores.hdf5 | 3 + ...Indel.posNeg.IF.snp.calibrationScores.hdf5 | 3 + ...snpIndel.posNeg.IF.snp.negative.scorer.pkl | 3 + ...Un.train.snpIndel.posNeg.IF.snp.scorer.pkl | 3 + ...snpIndel.posNeg.IF.snp.trainingScores.hdf5 | 3 + ...npIndel.posNeg.IF.snp.unlabeledScores.hdf5 | 3 + ...DifferentSeed.indel.calibrationScores.hdf5 | 3 + ....IFDifferentSeed.indel.negative.scorer.pkl | 3 + ...el.posNeg.IFDifferentSeed.indel.scorer.pkl | 3 + ....IFDifferentSeed.indel.trainingScores.hdf5 | 3 + ...IFDifferentSeed.indel.unlabeledScores.hdf5 | 3 + ...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 + ...eg.IFDifferentSeed.snp.negative.scorer.pkl | 3 + ...ndel.posNeg.IFDifferentSeed.snp.scorer.pkl | 3 + ...eg.IFDifferentSeed.snp.trainingScores.hdf5 | 3 + ...g.IFDifferentSeed.snp.unlabeledScores.hdf5 | 3 + ...el.posOnly.IF.indel.calibrationScores.hdf5 | 3 + ...train.snpIndel.posOnly.IF.indel.scorer.pkl | 3 + ...Indel.posOnly.IF.indel.trainingScores.hdf5 | 3 + ...ndel.posOnly.IF.snp.calibrationScores.hdf5 | 3 + ...n.train.snpIndel.posOnly.IF.snp.scorer.pkl | 3 + ...npIndel.posOnly.IF.snp.trainingScores.hdf5 | 3 + ...DifferentSeed.indel.calibrationScores.hdf5 | 3 + ...l.posOnly.IFDifferentSeed.indel.scorer.pkl | 3 + ....IFDifferentSeed.indel.trainingScores.hdf5 | 3 + ...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 + ...del.posOnly.IFDifferentSeed.snp.scorer.pkl | 3 + ...ly.IFDifferentSeed.snp.trainingScores.hdf5 | 3 + ...forest-hyperparameters-different-seed.json | 3 + 209 files changed, 5259 insertions(+), 129 deletions(-) create mode 100644 scripts/vcf_site_level_filtering_cromwell_tests/README.md create mode 100644 
scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh create mode 100644 scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json create mode 100644 scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModelBackend.java create mode 100644 
src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/clustering/BayesianGaussianMixtureModeller.java create mode 100644 src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json create mode 100644 src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf create mode 
100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 create mode 100644 
src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 create mode 
100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json diff --git a/.dockstore.yml b/.dockstore.yml index 441fbe6748e..f9474455764 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -95,6 +95,8 @@ workflows: branches: - master - ah_var_store + - rsa_vqsr_lite_poc + - VS-693_VQSR_lite - name: GvsPopulateAltAllele subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsPopulateAltAllele.wdl diff --git a/.github/workflows/gatk-tests.yml b/.github/workflows/gatk-tests.yml index 6ad01a31f75..a908b98cd15 100644 --- a/.github/workflows/gatk-tests.yml +++ b/.github/workflows/gatk-tests.yml @@ -291,7 +291,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_CNN_WDL' ] + wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_CNN_WDL', 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' ] continue-on-error: true name: WDL test ${{ matrix.wdlTest }} on cromwell steps: @@ -349,3 +349,9 @@ jobs: run: | echo "Running CNN WDL"; bash scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh; + + - name: "VCF_SITE_LEVEL_FILTERING_WDL_TEST" + if: ${{ matrix.wdlTest == 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' }} + run: | + 
echo "Running VCF Site Level Filtering WDL"; + bash scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh; \ No newline at end of file diff --git a/build.gradle b/build.gradle index 85aab88634c..c7698c5a290 100644 --- a/build.gradle +++ b/build.gradle @@ -293,6 +293,7 @@ dependencies { implementation 'org.apache.commons:commons-lang3:3.5' implementation 'org.apache.commons:commons-math3:3.5' + implementation 'org.hipparchus:hipparchus-stat:2.0' implementation 'org.apache.commons:commons-collections4:4.1' implementation 'org.apache.commons:commons-vfs2:2.0' implementation 'org.apache.commons:commons-configuration2:2.4' diff --git a/scripts/gatkcondaenv.yml.template b/scripts/gatkcondaenv.yml.template index 9077fed6296..dbe29ed5a28 100644 --- a/scripts/gatkcondaenv.yml.template +++ b/scripts/gatkcondaenv.yml.template @@ -38,10 +38,11 @@ dependencies: # if you wish to update, note that versions of conda-forge::keras after 2.2.5 # undesirably set the environment variable KERAS_BACKEND = theano by default - defaults::intel-openmp=2019.4 -- conda-forge::scikit-learn=0.22.2 +- conda-forge::scikit-learn=0.23.1 - conda-forge::matplotlib=3.2.1 - conda-forge::pandas=1.0.3 - conda-forge::typing_extensions=4.1.1 # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs +- conda-forge::dill=0.3.4 # used for pickling lambdas in TrainVariantAnnotationsModel # core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies! 
- r-base=3.6.2 diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 7d45b360814..61d6a32e9f7 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -2,6 +2,7 @@ version 1.0 import "GvsWarpTasks.wdl" as Tasks import "GvsUtils.wdl" as Utils +import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VQSRLite workflow GvsCreateFilterSet { input { @@ -17,6 +18,7 @@ workflow GvsCreateFilterSet { File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" File? gatk_override + Boolean use_classic_VQSR = true Int? INDEL_VQSR_max_gaussians_override = 4 Int? INDEL_VQSR_maximum_training_variants Int? INDEL_VQSR_mem_gb_override @@ -54,9 +56,13 @@ workflow GvsCreateFilterSet { String fq_sample_table = "~{project_id}.~{dataset_name}.sample_info" String fq_alt_allele_table = "~{project_id}.~{dataset_name}.alt_allele" String fq_info_destination_table = "~{project_id}.~{dataset_name}.filter_set_info" + String fq_info_destination_table_vqsr_lite = "~{project_id}.~{dataset_name}.vqsr_lite_filter_set_info" String fq_tranches_destination_table = "~{project_id}.~{dataset_name}.filter_set_tranches" String fq_filter_sites_destination_table = "~{project_id}.~{dataset_name}.filter_set_sites" + String fq_info_destination_table_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string" + String fq_info_destination_table_vqsr_lite_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string,calibration_sensitivity:float" + call Utils.GetBQTableLastModifiedDatetime as SamplesTableDatetimeCheck { input: query_project = project_id, @@ -118,63 +124,98 @@ workflow GvsCreateFilterSet { preemptible_tries = 3, 
} - call Tasks.IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, - sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - recalibration_filename = filter_set_name + ".indels.recal", - tranches_filename = filter_set_name + ".indels.tranches", - recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_vcf, - dbsnp_resource_vcf_index = dbsnp_vcf_index, - use_allele_specific_annotations = true, - disk_size = "1000", - machine_mem_gb = INDEL_VQSR_mem_gb_override, - max_gaussians = INDEL_VQSR_max_gaussians_override, - maximum_training_variants = INDEL_VQSR_maximum_training_variants, + # From this point, the paths diverge depending on whether they're using classic VQSR or VQSR-Lite + # The first branch here is VQSR-Lite, and the second is classic VQSR + if (!use_classic_VQSR) { + call VQSRLite.JointVcfFiltering as JointVcfFiltering { + input: + vcf = ExtractFilterTask.output_vcf, + vcf_index = ExtractFilterTask.output_vcf_index, + sites_only_vcf = MergeVCFs.output_vcf, + sites_only_vcf_index = MergeVCFs.output_vcf_index, + basename = filter_set_name, + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0", + extract_interval_list = interval_list, + score_interval_list = interval_list, + snp_annotations = "-A AS_QD -A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_MQ -A AS_SOR", + indel_annotations = "-A AS_FS -A AS_ReadPosRankSum -A AS_MQRankSum -A AS_QD -A AS_SOR", + use_allele_specific_annotations = true, + } + + call Utils.MergeVCFs as MergeINDELScoredVCFs { + input: + input_vcfs = 
JointVcfFiltering.indels_variant_scored_vcf, + gather_type = "CONVENTIONAL", + output_vcf_name = "${filter_set_name}.indel.vrecalibration.gz", + preemptible_tries = 3, + } + + call Utils.MergeVCFs as MergeSNPScoredVCFs { + input: + input_vcfs = JointVcfFiltering.snps_variant_scored_vcf, + gather_type = "CONVENTIONAL", + output_vcf_name = "${filter_set_name}.snp.vrecalibration.gz", + preemptible_tries = 3, + } + + call PopulateFilterSetInfo { + input: + gatk_override = gatk_override, + filter_set_name = filter_set_name, + snp_recal_file = MergeSNPScoredVCFs.output_vcf, + snp_recal_file_index = MergeSNPScoredVCFs.output_vcf_index, + indel_recal_file = MergeINDELScoredVCFs.output_vcf, + indel_recal_file_index = MergeINDELScoredVCFs.output_vcf_index, + fq_info_destination_table = fq_info_destination_table_vqsr_lite, + filter_schema = fq_info_destination_table_vqsr_lite_schema, + query_project = project_id, + useClassic = false + } + + call PopulateFilterSetSites { + input: + gatk_override = gatk_override, + filter_set_name = filter_set_name, + sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, + sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, + fq_filter_sites_destination_table = fq_filter_sites_destination_table, + query_project = project_id + } } - if (GetNumSamplesLoaded.num_samples > snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibratorCreateModel { + if (use_classic_VQSR) { + + call Tasks.IndelsVariantRecalibrator { input: sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - recalibration_filename = filter_set_name + ".snps.recal", - tranches_filename = filter_set_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report_filename = filter_set_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - 
hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + recalibration_filename = filter_set_name + ".indels.recal", + tranches_filename = filter_set_name + ".indels.tranches", + recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], + recalibration_annotation_values = indel_recalibration_annotation_values, + mills_resource_vcf = mills_resource_vcf, + mills_resource_vcf_index = mills_resource_vcf_index, + axiomPoly_resource_vcf = axiomPoly_resource_vcf, + axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, dbsnp_resource_vcf = dbsnp_vcf, dbsnp_resource_vcf_index = dbsnp_vcf_index, use_allele_specific_annotations = true, disk_size = "1000", - machine_mem_gb = SNP_VQSR_mem_gb_override, - max_gaussians = SNP_VQSR_max_gaussians_override, - sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant, - maximum_training_variants = SNP_VQSR_maximum_training_variants + machine_mem_gb = INDEL_VQSR_mem_gb_override, + max_gaussians = INDEL_VQSR_max_gaussians_override, + maximum_training_variants = INDEL_VQSR_maximum_training_variants, } - scatter (idx in range(length(ExtractFilterTask.output_vcf))) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + if (GetNumSamplesLoaded.num_samples > snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibratorCreateModel { input: - sites_only_variant_filtered_vcf = ExtractFilterTask.output_vcf[idx], - sites_only_variant_filtered_vcf_index = ExtractFilterTask.output_vcf_index[idx], - recalibration_filename = filter_set_name + ".snps." + idx + ".recal", - tranches_filename = filter_set_name + ".snps." 
+ idx + ".tranches", + sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, + sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, + recalibration_filename = filter_set_name + ".snps.recal", + tranches_filename = filter_set_name + ".snps.tranches", recalibration_tranche_values = snp_recalibration_tranche_values, recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, + model_report_filename = filter_set_name + ".snps.model.report", hapmap_resource_vcf = hapmap_resource_vcf, hapmap_resource_vcf_index = hapmap_resource_vcf_index, omni_resource_vcf = omni_resource_vcf, @@ -185,84 +226,114 @@ workflow GvsCreateFilterSet { dbsnp_resource_vcf_index = dbsnp_vcf_index, use_allele_specific_annotations = true, disk_size = "1000", - machine_mem_gb = SNP_VQSR_mem_gb_override + machine_mem_gb = SNP_VQSR_mem_gb_override, + max_gaussians = SNP_VQSR_max_gaussians_override, + sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant, + maximum_training_variants = SNP_VQSR_maximum_training_variants + } + + scatter (idx in range(length(ExtractFilterTask.output_vcf))) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + input: + sites_only_variant_filtered_vcf = ExtractFilterTask.output_vcf[idx], + sites_only_variant_filtered_vcf_index = ExtractFilterTask.output_vcf_index[idx], + recalibration_filename = filter_set_name + ".snps." + idx + ".recal", + tranches_filename = filter_set_name + ".snps." 
+ idx + ".tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + model_report = SNPsVariantRecalibratorCreateModel.model_report, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_vcf, + dbsnp_resource_vcf_index = dbsnp_vcf_index, + use_allele_specific_annotations = true, + disk_size = "1000", + machine_mem_gb = SNP_VQSR_mem_gb_override + } + } + + call Tasks.GatherTranches as SNPGatherTranches { + input: + tranches = SNPsVariantRecalibratorScattered.tranches, + output_filename = filter_set_name + ".snps.gathered.tranches", + output_tranche_values = snp_recalibration_tranche_values, + mode = "SNP", + disk_size = "200", + gatk_override = gatk_override + } + + call Utils.MergeVCFs as MergeRecalibrationFiles { + input: + input_vcfs = SNPsVariantRecalibratorScattered.recalibration, + gather_type = "CONVENTIONAL", + output_vcf_name = "${filter_set_name}.vrecalibration.gz", + preemptible_tries = 3, } } - call Tasks.GatherTranches as SNPGatherTranches { - input: - tranches = SNPsVariantRecalibratorScattered.tranches, - output_filename = filter_set_name + ".snps.gathered.tranches", - output_tranche_values = snp_recalibration_tranche_values, - mode = "SNP", - disk_size = "200", - gatk_override = gatk_override + if (GetNumSamplesLoaded.num_samples <= snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + input: + sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, + sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, + recalibration_filename = filter_set_name + 
".snps.recal", + tranches_filename = filter_set_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_vcf, + dbsnp_resource_vcf_index = dbsnp_vcf_index, + use_allele_specific_annotations = true, + disk_size = "1000", + machine_mem_gb = SNP_VQSR_mem_gb_override, + max_gaussians = SNP_VQSR_max_gaussians_override, + } } - call Utils.MergeVCFs as MergeRecalibrationFiles { + call PopulateFilterSetInfo as PopulateFilterSetInfoCLassic { input: - input_vcfs = SNPsVariantRecalibratorScattered.recalibration, - gather_type = "CONVENTIONAL", - output_vcf_name = "${filter_set_name}.vrecalibration.gz", - preemptible_tries = 3, + gatk_override = gatk_override, + filter_set_name = filter_set_name, + snp_recal_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration]), + snp_recal_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index]), + indel_recal_file = IndelsVariantRecalibrator.recalibration, + indel_recal_file_index = IndelsVariantRecalibrator.recalibration_index, + fq_info_destination_table = fq_info_destination_table, + filter_schema = fq_info_destination_table_schema, + query_project = project_id, + useClassic = true } - } - if (GetNumSamplesLoaded.num_samples <= snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + call PopulateFilterSetSites as PopulateFilterSetSitesClassic { input: + gatk_override = gatk_override, + 
filter_set_name = filter_set_name, sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - recalibration_filename = filter_set_name + ".snps.recal", - tranches_filename = filter_set_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_vcf, - dbsnp_resource_vcf_index = dbsnp_vcf_index, - use_allele_specific_annotations = true, - disk_size = "1000", - machine_mem_gb = SNP_VQSR_mem_gb_override, - max_gaussians = SNP_VQSR_max_gaussians_override, + fq_filter_sites_destination_table = fq_filter_sites_destination_table, + query_project = project_id } - } - - call PopulateFilterSetInfo { - input: - gatk_override = gatk_override, - filter_set_name = filter_set_name, - snp_recal_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration]), - snp_recal_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index]), - indel_recal_file = IndelsVariantRecalibrator.recalibration, - indel_recal_file_index = IndelsVariantRecalibrator.recalibration_index, - fq_info_destination_table = fq_info_destination_table, - query_project = project_id - } - call PopulateFilterSetSites { - input: - gatk_override = gatk_override, - filter_set_name = filter_set_name, - sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, - sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - fq_filter_sites_destination_table = 
fq_filter_sites_destination_table, - query_project = project_id + call PopulateFilterSetTranches as PopulateFilterSetTranchesClassic { + input: + gatk_override = gatk_override, + filter_set_name = filter_set_name, + snp_recal_tranches = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches]), + indel_recal_tranches = IndelsVariantRecalibrator.tranches, + fq_tranches_destination_table = fq_tranches_destination_table, + query_project = project_id + } } - call PopulateFilterSetTranches { - input: - gatk_override = gatk_override, - filter_set_name = filter_set_name, - snp_recal_tranches = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches]), - indel_recal_tranches = IndelsVariantRecalibrator.tranches, - fq_tranches_destination_table = fq_tranches_destination_table, - query_project = project_id - } output { File output_vcf = MergeVCFs.output_vcf @@ -349,7 +420,9 @@ task ExtractFilterTask { task PopulateFilterSetInfo { input { String filter_set_name + String filter_schema String fq_info_destination_table + Boolean useClassic = true File snp_recal_file File snp_recal_file_index @@ -378,6 +451,7 @@ task PopulateFilterSetInfo { --ref-version 38 \ --filter-set-name ~{filter_set_name} \ -mode SNP \ + --classic ~{useClassic} \ -V ~{snp_recal_file} \ -O ~{filter_set_name}.snps.recal.tsv @@ -387,6 +461,7 @@ task PopulateFilterSetInfo { --ref-version 38 \ --filter-set-name ~{filter_set_name} \ -mode INDEL \ + --classic ~{useClassic} \ -V ~{indel_recal_file} \ -O ~{filter_set_name}.indels.recal.tsv @@ -401,7 +476,7 @@ task PopulateFilterSetInfo { bq load --project_id=~{query_project} --skip_leading_rows 0 -F "tab" \ --range_partitioning=location,0,26000000000000,6500000000 \ --clustering_fields=location \ - --schema "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string" \ + --schema "~{filter_schema}" \ ${bq_table} \ 
~{filter_set_name}.filter_set_load.tsv > status_load_filter_set_info >>> @@ -473,7 +548,6 @@ task PopulateFilterSetSites { output { String status_load_filter_set_sites = read_string("status_load_filter_set_sites") - } } diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/README.md b/scripts/vcf_site_level_filtering_cromwell_tests/README.md new file mode 100644 index 00000000000..6f9950fa36d --- /dev/null +++ b/scripts/vcf_site_level_filtering_cromwell_tests/README.md @@ -0,0 +1,9 @@ +# Filtering Automated Tests for WDL + +**This directory is for GATK devs only** + +This directory contains scripts for running Variant Site Level WDL tests in the automated travis build environment. + +Please note that this only tests whether the WDL will complete successfully. + +Test data is a "plumbing test" using a small portion of a 10 sample callset. \ No newline at end of file diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh new file mode 100644 index 00000000000..1c19d18c3b6 --- /dev/null +++ b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh @@ -0,0 +1,38 @@ +#!/bin/bash -l +set -e +#cd in the directory of the script in order to use relative paths +script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P ) +cd "$script_path" + +WORKING_DIR=/home/runner/work/gatk + +set -e +echo "Building docker image for VCF Site Level Filtering WDL tests (skipping unit tests)..." + +#assume Dockerfile is in root +echo "Building docker without running unit tests... =========" +cd $WORKING_DIR/gatk + +# IMPORTANT: This code is duplicated in the cnv and M2 WDL test. +if [ ! 
-z "$CI_PULL_REQUEST" ]; then + HASH_TO_USE=FETCH_HEAD + sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${CI_PULL_REQUEST}; + echo "using fetch head:"$HASH_TO_USE +else + HASH_TO_USE=${CI_COMMIT} + sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/; + echo "using travis commit:"$HASH_TO_USE +fi +echo "Docker build done ==========" + +cd $WORKING_DIR/gatk/scripts/ +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json >$WORKING_DIR/vcf_site_level_filtering_travis.json +echo "JSON FILES (modified) =======" +cat $WORKING_DIR/vcf_site_level_filtering_travis.json +echo "==================" + + +echo "Running Filtering WDL through cromwell" +ln -fs $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl +cd $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/ +java -jar $CROMWELL_JAR run JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_travis.json diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json new file mode 100644 index 00000000000..8165e199d22 --- /dev/null +++ b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json @@ -0,0 +1,14 @@ +{ + "JointVcfFiltering.gatk_docker": "__GATK_DOCKER__", + "JointVcfFiltering.vcf": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz"], + "JointVcfFiltering.vcf_index": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi"], + "JointVcfFiltering.sites_only_vcf": 
"/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz", + "JointVcfFiltering.sites_only_vcf_index": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi", + "JointVcfFiltering.basename": "test_10_samples", + "JointVcfFiltering.snp_annotations": "-A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS", + "JointVcfFiltering.indel_annotations": "-A MQRankSum -A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE", + "JointVcfFiltering.model_backend": "PYTHON_IFOREST", + "JointVcfFiltering.use_allele_specific_annotations": false +} diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl new file mode 100644 index 00000000000..63d69efa560 --- /dev/null +++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl @@ -0,0 +1,294 @@ +version 1.0 + +# This is a workflow for filtering a joint callset VCF using INFO level annotations (so filtering is at the site level). +# Note that the input VCFs here may be sharded by genomic position which may be helpful for large cohorts. The script +# will output the same number of shards that are input. +# This portion of the filtering pipeline will assign a SCORE INFO field annotation to each site, but does not yet apply +# the filtering threshold to the final VCF. + +workflow JointVcfFiltering { + input { + Array[File] vcf + Array[File] vcf_index + File sites_only_vcf + File sites_only_vcf_index + String basename + + String? model_backend + File? training_python_script + File? scoring_python_script + File? hyperparameters_json + + String gatk_docker + File? extract_interval_list + File? score_interval_list + + String snp_annotations + String indel_annotations + File? 
gatk_override + + Boolean use_allele_specific_annotations + + String snp_resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" + String indel_resource_args = "--resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" + } + + parameter_meta { + vcf: "An array of input VCFs that are one callset sharded by genomic region." + sites_only_vcf: "The full VCF callset without any genotype or sample level information." + basename: "Desired output file basename." + } + + call ExtractVariantAnnotations as ExtractVariantAnnotationsSNPs { + input: + input_vcf = sites_only_vcf, + input_vcf_index = sites_only_vcf_index, + mode = "SNP", + annotations = snp_annotations, + resource_args = snp_resource_args, + basename = basename, + interval_list = extract_interval_list, + use_allele_specific_annotations = use_allele_specific_annotations, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + call ExtractVariantAnnotations as ExtractVariantAnnotationsINDELs { + input: + input_vcf = sites_only_vcf, + input_vcf_index = sites_only_vcf_index, + mode = "INDEL", + annotations = indel_annotations, + resource_args = indel_resource_args, + basename = basename, + interval_list = extract_interval_list, + use_allele_specific_annotations = use_allele_specific_annotations, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + call TrainVariantAnnotationModel as TrainVariantAnnotationModelSNPs { + input: + annots = ExtractVariantAnnotationsSNPs.annots, + basename = basename, + mode = "snp", + model_backend = model_backend, + python_script = 
training_python_script, + hyperparameters_json = hyperparameters_json, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + call TrainVariantAnnotationModel as TrainVariantAnnotationModelINDELs { + input: + annots = ExtractVariantAnnotationsINDELs.annots, + basename = basename, + mode = "indel", + model_backend = model_backend, + python_script = training_python_script, + hyperparameters_json = hyperparameters_json, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + scatter(idx in range(length(vcf))) { + call ScoreVariantAnnotations as ScoreVariantAnnotationsSNPs { + input: + vcf = vcf[idx], + vcf_index = vcf_index[idx], + basename = basename, + mode = "SNP", + model_backend = model_backend, + python_script = scoring_python_script, + annotations = snp_annotations, + extracted_training_vcf = ExtractVariantAnnotationsSNPs.extracted_training_vcf, + extracted_training_vcf_index = ExtractVariantAnnotationsSNPs.extracted_training_vcf_index, + interval_list = score_interval_list, + model_files = TrainVariantAnnotationModelSNPs.outputs, + resource_args = snp_resource_args, + use_allele_specific_annotations = use_allele_specific_annotations, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + call ScoreVariantAnnotations as ScoreVariantAnnotationsINDELs { + input: + vcf = vcf[idx], + vcf_index = vcf_index[idx], + basename = basename, + mode = "INDEL", + model_backend = model_backend, + python_script = scoring_python_script, + annotations = indel_annotations, + extracted_training_vcf = ExtractVariantAnnotationsINDELs.extracted_training_vcf, + extracted_training_vcf_index = ExtractVariantAnnotationsINDELs.extracted_training_vcf_index, + interval_list = score_interval_list, + model_files = TrainVariantAnnotationModelINDELs.outputs, + resource_args = indel_resource_args, + use_allele_specific_annotations = use_allele_specific_annotations, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + } + + output { + 
Array[File] indels_variant_scored_vcf = ScoreVariantAnnotationsINDELs.output_vcf + Array[File] indels_variant_scored_vcf_index = ScoreVariantAnnotationsINDELs.output_vcf_index + Array[File] snps_variant_scored_vcf = ScoreVariantAnnotationsSNPs.output_vcf + Array[File] snps_variant_scored_vcf_index = ScoreVariantAnnotationsSNPs.output_vcf_index + } + +} + +task ExtractVariantAnnotations { + input { + String gatk_docker + File? gatk_override + File input_vcf + File input_vcf_index + String basename + String mode + String annotations + String resource_args + File? interval_list + Boolean use_allele_specific_annotations + + Int memory_mb = 14000 + Int command_mem = memory_mb - 1000 + } + Int disk_size = ceil(size(input_vcf, "GB") + 50) + command { + set -e + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx~{command_mem}m" \ + ExtractVariantAnnotations \ + -V ~{input_vcf} \ + -O ~{basename}.~{mode} \ + ~{annotations} \ + ~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \ + ~{"-L " + interval_list} \ + --mode ~{mode} \ + ~{resource_args} + } + output { + File annots = "~{basename}.~{mode}.annot.hdf5" + File extracted_training_vcf = "~{basename}.~{mode}.vcf.gz" + File extracted_training_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi" + Array[File] outputs = glob("~{basename}.~{mode}.*") + } + runtime { + docker: gatk_docker + disks: "local-disk " + disk_size + " LOCAL" + memory: memory_mb + " MiB" + } +} + +task TrainVariantAnnotationModel { + input { + String gatk_docker + File? gatk_override + File annots + String basename + String mode + String? model_backend + File? python_script + File? 
hyperparameters_json + + Int memory_mb = 14000 + Int command_mem = memory_mb - 1000 + } + Int disk_size = ceil(size(annots, "GB") + 100) + command <<< + set -e + + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + mode=$(echo "~{mode}" | awk '{print toupper($0)}') + + gatk --java-options "-Xmx~{command_mem}m" \ + TrainVariantAnnotationsModel \ + --annotations-hdf5 ~{annots} \ + -O ~{basename} \ + ~{"--model-backend " + model_backend} \ + ~{"--python-script " + python_script} \ + ~{"--hyperparameters-json " + hyperparameters_json} \ + --mode $mode + + >>> + output { + Array[File] outputs = glob("~{basename}.~{mode}.*") + } + runtime { + docker: gatk_docker + disks: "local-disk " + disk_size + " LOCAL" + memory: memory_mb + " MiB" + } +} + +task ScoreVariantAnnotations { + input { + String gatk_docker + File? gatk_override + File vcf + File vcf_index + String basename + String mode + String? model_backend + File? python_script + String annotations + String resource_args + File extracted_training_vcf + File extracted_training_vcf_index + File? interval_list + Array[File] model_files + Boolean use_allele_specific_annotations + + Int memory_mb = 16000 + Int command_mem = memory_mb - 1000 + } + Int disk_size = ceil(size(vcf, "GB") *2 + 50) + + command { + zgrep -v '#' ~{vcf} > empty.txt + set -e + + if [ -s empty.txt ]; then + ln -s ~{sep=" . && ln -s " model_files} . 
+ + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx~{command_mem}m" \ + ScoreVariantAnnotations \ + ~{"-L " + interval_list} \ + -V ~{vcf} \ + -O ~{basename}.~{mode} \ + ~{"--model-backend " + model_backend} \ + ~{"--python-script " + python_script} \ + --model-prefix ~{basename} \ + ~{annotations} \ + ~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \ + -mode ~{mode} \ + --resource:extracted,extracted=true ~{extracted_training_vcf} \ + ~{resource_args} + else + echo "Input VCF was empty so we'll return the same VCF that was input." + echo "Scores and annot hdf5 files will not be produced since the input was empty." + ln -s ~{vcf} ~{basename}.~{mode}.vcf.gz + ln -s ~{vcf_index} ~{basename}.~{mode}.vcf.gz.tbi + fi + } + output { + File? scores = "~{basename}.~{mode}.scores.hdf5" + File? annots = "~{basename}.~{mode}.annot.hdf5" + File output_vcf = "~{basename}.~{mode}.vcf.gz" + File output_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi" + } + runtime { + docker: gatk_docker + disks: "local-disk " + disk_size + " LOCAL" + memory: memory_mb + " MiB" + } +} + diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java index 878d2706cbc..d248d9c8a2a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java @@ -68,7 +68,7 @@ * to TSV format. Using HDF5 files with {@link CreateReadCountPanelOfNormals} * can decrease runtime, by reducing time spent on IO, so this is the default output format. * The HDF5 format contains information in the paths defined in {@link HDF5SimpleCountCollection}. HDF5 files may be viewed using - * hdfview or loaded in python using + * hdfview or loaded in Python using * PyTables or h5py. 
* The TSV format has a SAM-style header containing a read group sample name, a sequence dictionary, a row specifying the column headers contained in * {@link SimpleCountCollection.SimpleCountTableColumn}, and the corresponding entry rows. diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java index 9c7ef423fb2..63afaab70bd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java @@ -85,7 +85,7 @@ * Panel-of-normals file. * This is an HDF5 file containing the panel data in the paths defined in {@link HDF5SVDReadCountPanelOfNormals}. * HDF5 files may be viewed using hdfview - * or loaded in python using PyTables or h5py. + * or loaded in Python using PyTables or h5py. * * * diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java index 8590e3476f2..870ce37b7dc 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java @@ -135,7 +135,7 @@ public static double[][] readChunkedDoubleMatrix(final HDF5File file, * Given a large matrix, chunks the matrix into equally sized subsets of rows * (plus a subset containing the remainder, if necessary) and writes these submatrices to indexed sub-paths * to avoid a hard limit in Java HDF5 on the number of elements in a matrix given by - * {@code MAX_NUM_VALUES_PER_HDF5_MATRIX}. The number of chunks is determined by {@code maxChunkSize}, + * {@code MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX}. 
The number of chunks is determined by {@code maxChunkSize}, * which should be set appropriately for the desired number of columns. * * @param maxChunkSize The maximum number of values in each chunk. Decreasing this number will reduce diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java index 44f46e8a903..8faf58d109b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java @@ -38,6 +38,8 @@ public final class CreateFilteringFiles extends VariantWalker { private List HEADER = Arrays.asList("filter_set_name","mode","location","ref","alt","vqslod","culprit","training_label","yng"); + private List HEADER_VQSR_LITE = + Arrays.asList("filter_set_name","mode","location","ref","alt","vqslod","culprit","training_label","yng", "calibration_sensitivity"); @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, @@ -64,6 +66,12 @@ public final class CreateFilteringFiles extends VariantWalker { optional = false) private String mode; + @Argument( + fullName = "classic", + doc = "Whether or not this is using classic VQSR or the newer VQSR-Lite", + optional = true) + private Boolean usingOldVQSR = null; + @Override public boolean requiresIntervals() { return false; @@ -76,7 +84,17 @@ public void onTraversalStart() { } catch (IOException ioe) { throw new GATKException("Unable to initialize writer", ioe); } - writer.setHeaderLine(HEADER); + + if (usingOldVQSR == null) { // default to using the old, or "classic" VQSR if the user specifies nothing + usingOldVQSR = Boolean.TRUE; + } + + if (usingOldVQSR) { + writer.setHeaderLine(HEADER); + } else { + writer.setHeaderLine(HEADER_VQSR_LITE); + } + // Set reference version -- TODO remove this in the future, also, can we get ref version from the 
header? ChromosomeEnum.setRefVersion(refVersion); @@ -99,17 +117,35 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, // TODO: check with Laura -- should NEGATIVES also be NAYs? String yng = variant.hasAttribute("POSITIVE_TRAIN_SITE")?"Y":"G"; - List row = Arrays.asList( - filterSetName, - mode, - location.toString(), - ref, - alt, - vqslod, - culprit, - trainingLabel, - yng - ); + List row; + if (usingOldVQSR) { + row = Arrays.asList( + filterSetName, + mode, + location.toString(), + ref, + alt, + vqslod, + culprit, + trainingLabel, + yng + ); + } else { + // New VQSR-Lite has CALIBRATION_SENSITIVITY present, so add that column too. + String calibration_sensitivity = variant.getAttributeAsString("CALIBRATION_SENSITIVITY",""); + row = Arrays.asList( + filterSetName, + mode, + location.toString(), + ref, + alt, + vqslod, + culprit, + trainingLabel, + yng, + calibration_sensitivity + ); + } writer.getNewLineBuilder().setRow(row).write(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java new file mode 100644 index 00000000000..dc98d99072e --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java @@ -0,0 +1,369 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.commons.lang3.tuple.Triple; +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.random.RandomGeneratorFactory; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import 
org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.VariantRecalibrator; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +/** + * Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files. + * + *

+ * This tool is intended to be used as the first step in a variant-filtering workflow that supersedes the + * {@link VariantRecalibrator} workflow. This tool extracts site-level annotations, labels, and other relevant metadata + * from variant sites (or alleles, in allele-specific mode) that are or are not present in specified labeled + * resource VCFs (e.g., training or calibration VCFs). Input sites that are present in the resources are considered + * labeled; each site can have multiple labels if it is present in multiple resources. Other input sites that are + * not present in any resources are considered unlabeled and can be randomly sampled using reservoir sampling; + * extraction of these is optional. The outputs of the tool are HDF5 files containing the extracted data for + * labeled and (optional) unlabeled variant sets, as well as a sites-only indexed VCF containing the labeled variants. + *

+ * + *

+ * The extracted sets can be provided as input to the {@link TrainVariantAnnotationsModel} tool + * to produce an annotation-based model for scoring variant calls. This model can in turn be provided + * along with a VCF file to the {@link ScoreVariantAnnotations} tool, which assigns a score to each call + * (with a lower score indicating that a call is more likely to be an artifact and should perhaps be filtered). + * Each score can also be converted to a corresponding sensitivity with respect to a calibration set, if the latter is available. + *

+ * + *

+ * Note that annotations and metadata are collected in memory during traversal until they are written to HDF5 files + * upon completion of the traversal. Memory requirements thus roughly scale linearly with both the number of sites + * extracted and the number of annotations. + *

+ * + *

+ * Note that HDF5 files may be viewed using hdfview + * or loaded in Python using PyTables or h5py. + *

+ * + *

Inputs

+ * + *
    + *
  • + * Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles, + * if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified). + *
  • + *
  • + * Annotations to extract. + *
  • + *
  • + * Variant types (i.e., SNP and/or INDEL) to extract. Logic for determining variant type was retained from + * {@link VariantRecalibrator}; see {@link VariantType}. Extracting SNPs and INDELs separately in two runs of + * this tool can be useful if one wishes to extract different sets of annotations for each variant type, + * for example. + *
  • + *
  • + * (Optional) Resource VCF file(s). Each resource should be tagged with a label, which will be assigned to + * extracted sites that are present in the resource. In typical use, the {@value LabeledVariantAnnotationsData#TRAINING_LABEL} + * and {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL} labels should be used to tag at least one resource + * apiece. The resulting sets of sites will be used for model training and conversion of scores to + * calibration-set sensitivity, respectively; the trustworthiness of the respective resources should be + * taken into account accordingly. The {@value LabeledVariantAnnotationsData#SNP_LABEL} label is + * reserved by the tool, as it is used to label sites determined to be SNPs, and thus it cannot be used to tag + * provided resources. + *
  • + *
  • + * (Optional) Maximum number of unlabeled variants (or alleles) to randomly sample with reservoir sampling. + * If nonzero, annotations will also be extracted from unlabeled sites (i.e., those that are not present + * in the labeled resources). + *
  • + *
  • + * Output prefix. + * This is used as the basename for output files. + *
  • + *
+ * + *

Outputs

+ * + *
    + *
  • + * (Optional) Labeled-annotations HDF5 file (.annot.hdf5). Annotation data and metadata for those sites that + * are present in labeled resources are stored in the following HDF5 directory structure: + * + *

    + * |--- alleles
    + * | |--- alt
    + * | |--- ref
    + * |--- annotations
    + * | |--- chunk_0
    + * | |--- ...
    + * | |--- chunk_{num_chunks - 1}
    + * | |--- names
    + * | |--- num_chunks
    + * | |--- num_columns
    + * | |--- num_rows
    + * |--- intervals
    + * | |--- indexed_contig_names
    + * | |--- transposed_index_start_end
    + * |--- labels
    + * | |--- snp
    + * | |--- ... (e.g., training, calibration, etc.)
    + * | |--- ...
    + *

    + * + *

    + * Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations). + * See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details. + * If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is specified, each record corresponds to an individual allele; + * otherwise, each record corresponds to a variant site, which may contain multiple alleles. + * Storage of alleles can be omitted using the {@value OMIT_ALLELES_IN_HDF5_LONG_NAME} argument, which will reduce + * the size of the file. This file will only be produced if resources are provided and the number of extracted + * labeled sites is nonzero. + *

    + * + *
  • + *
  • + * Labeled sites-only VCF file and index. The VCF will not be gzipped if the {@value DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME} + * argument is set to true. The VCF can be provided as a resource in subsequent runs of + * {@link ScoreVariantAnnotations} and used to indicate labeled sites that were extracted. + * This can be useful if the {@value StandardArgumentDefinitions#INTERVALS_LONG_NAME} argument was used to + * subset sites in training or calibration resources for extraction; this may occur when setting up + * training/validation/test splits, for example. Note that records for the random sample of unlabeled sites are + * currently not included in the VCF. + *
  • + *
  • + * (Optional) Unlabeled-annotations HDF5 file. This will have the same directory structure as in the + * labeled-annotations HDF5 file. However, note that records are currently written in the order they + * appear in the downsampling reservoir after random sampling, and hence, are not in genomic order. + * This file will only be produced if a nonzero value of the {@value MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME} + * argument is provided. + *
  • + *
+ * + *

Usage examples

+ * + *

+ * Extract annotations from training/calibration SNP/INDEL sites, producing the outputs + * 1) {@code extract.annot.hdf5}, 2) {@code extract.vcf.gz}, and 3) {@code extract.vcf.gz.tbi}. + * The HDF5 file can then be provided to {@link TrainVariantAnnotationsModel} + * to train a model using a positive-only approach. Note that the {@value MODE_LONG_NAME} arguments are made + * explicit here, although both SNP and INDEL modes are selected by default. + * + *

+ *     gatk ExtractVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          -O extract
+ * 
+ *

+ * + *

+ * Extract annotations from both training/calibration SNP/INDEL sites and a random sample of + * 1000000 unlabeled (i.e., non-training/calibration) sites, producing the outputs + * 1) {@code extract.annot.hdf5}, 2) {@code extract.unlabeled.annot.hdf5}, 3) {@code extract.vcf.gz}, + * and 4) {@code extract.vcf.gz.tbi}. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel} + * to train a model using a positive-negative approach (similar to that used in {@link VariantRecalibrator}). + * Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are + * selected by default. + * + *

+ *     gatk ExtractVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          --maximum-number-of-unlabeled-variants 1000000 \
+ *          -O extract
+ * 
+ *

+ * + *

+ * In the (atypical) event that resource VCFs are unavailable, one can still extract annotations from a random sample of + * unlabeled sites, producing the outputs 1) {@code extract.unlabeled.annot.hdf5}, + * 2) {@code extract.vcf.gz} (which will contain no records), and 3) {@code extract.vcf.gz.tbi}. + * This random sample cannot be used by {@link TrainVariantAnnotationsModel}, but may still be useful for + * exploratory analyses. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both + * SNP and INDEL modes are selected by default. + * + *

+ *     gatk ExtractVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --mode SNP \
+ *          --mode INDEL \
+ *          --maximum-number-of-unlabeled-variants 1000000 \
+ *          -O extract
+ * 
+ *

+ * + * DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files.", + oneLineSummary = "Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public final class ExtractVariantAnnotations extends LabeledVariantAnnotationsWalker { + + public static final String MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME = "maximum-number-of-unlabeled-variants"; + public static final String RESERVOIR_SAMPLING_RANDOM_SEED_LONG_NAME = "reservoir-sampling-random-seed"; + + public static final String UNLABELED_TAG = ".unlabeled"; + + @Argument( + fullName = MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME, + doc = "Maximum number of unlabeled variants to extract. " + + "If greater than zero, reservoir sampling will be used to randomly sample this number " + + "of sites from input sites that are not present in the specified resources. 
" + + "Choice of this number should be guided by considerations for training the negative model in " + + "TrainVariantAnnotationsModel; users may wish to choose a number that is comparable to the " + + "expected size of the labeled training set or that is compatible with available memory resources.", + minValue = 0) + private int maximumNumberOfUnlabeledVariants = 0; + + @Argument( + fullName = RESERVOIR_SAMPLING_RANDOM_SEED_LONG_NAME, + doc = "Random seed to use for reservoir sampling of unlabeled variants.") + private int reservoirSamplingRandomSeed = 0; + + private RandomGenerator rng; + private LabeledVariantAnnotationsData unlabeledDataReservoir; // will not be sorted in genomic order + private int unlabeledIndex = 0; + + @Override + public void afterOnTraversalStart() { + if (!resourceLabels.contains(LabeledVariantAnnotationsData.TRAINING_LABEL)) { + logger.warn("No training set found! If you are using the downstream TrainVariantAnnotationsModel and ScoreVariantAnnotations tools, " + + "provide sets of known polymorphic loci marked with the training=true feature input tag. " + + "For example, --resource:hapmap,training=true hapmap.vcf"); + } + if (!resourceLabels.contains(LabeledVariantAnnotationsData.CALIBRATION_LABEL)) { + logger.warn("No calibration set found! If you are using the downstream TrainVariantAnnotationsModel and ScoreVariantAnnotations tools " + + "and wish to convert scores to sensitivity with respect to a calibration set of variants, " + + "provide sets of known polymorphic loci marked with the calibration=true feature input tag. " + + "For example, --resource:hapmap,calibration=true hapmap.vcf"); + } + + rng = RandomGeneratorFactory.createRandomGenerator(new Random(reservoirSamplingRandomSeed)); + unlabeledDataReservoir = maximumNumberOfUnlabeledVariants == 0 + ? 
null + : new LabeledVariantAnnotationsData(annotationNames, resourceLabels, useASAnnotations, maximumNumberOfUnlabeledVariants); // we pass resourceLabels here so that both labeled and unlabeled + } // HDF5 files will have the same directory structure + + @Override + protected void nthPassApply(final VariantContext variant, + final ReadsContext readsContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext, + final int n) { + if (n == 0) { + final List, VariantType, TreeSet>> metadata = extractVariantMetadata( + variant, featureContext, unlabeledDataReservoir != null); + final boolean isVariantExtracted = !metadata.isEmpty(); + if (isVariantExtracted) { + final boolean isUnlabeled = metadata.stream().map(Triple::getRight).allMatch(Set::isEmpty); + if (!isUnlabeled) { + addExtractedVariantToData(data, variant, metadata); + writeExtractedVariantToVCF(variant, metadata); + } else { + // Algorithm R for reservoir sampling: https://en.wikipedia.org/wiki/Reservoir_sampling#Simple_algorithm + if (unlabeledIndex < maximumNumberOfUnlabeledVariants) { + addExtractedVariantToData(unlabeledDataReservoir, variant, metadata); + } else { + final int j = rng.nextInt(unlabeledIndex); + if (j < maximumNumberOfUnlabeledVariants) { + setExtractedVariantInData(unlabeledDataReservoir, variant, metadata, j); + } + } + unlabeledIndex++; + } + } + } + } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + writeAnnotationsToHDF5(); + data.clear(); + if (unlabeledDataReservoir != null) { + writeUnlabeledAnnotationsToHDF5(); + // TODO write extracted unlabeled variants to VCF, which can be used to mark extraction in scoring step + unlabeledDataReservoir.clear(); + } + if (vcfWriter != null) { + vcfWriter.close(); + } + } + } + + @Override + public Object onTraversalSuccess() { + + logger.info(String.format("%s complete.", getClass().getSimpleName())); + + return null; + } + + private static void setExtractedVariantInData(final 
LabeledVariantAnnotationsData data, + final VariantContext variant, + final List, VariantType, TreeSet>> metadata, + final int index) { + data.set(index, variant, + metadata.stream().map(Triple::getLeft).collect(Collectors.toList()), + metadata.stream().map(Triple::getMiddle).collect(Collectors.toList()), + metadata.stream().map(Triple::getRight).collect(Collectors.toList())); + } + + private void writeUnlabeledAnnotationsToHDF5() { + final File outputUnlabeledAnnotationsFile = new File(outputPrefix + UNLABELED_TAG + ANNOTATIONS_HDF5_SUFFIX); + if (unlabeledDataReservoir.size() == 0) { + throw new GATKException(String.format("No unlabeled variants were present in the input VCF. " + + "Consider setting the %s argument to 0.", MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME)); + } + for (final VariantType variantType : variantTypesToExtract) { + logger.info(String.format("Extracted unlabeled annotations for %d variants of type %s.", + unlabeledDataReservoir.getVariantTypeFlat().stream().mapToInt(t -> t == variantType ? 
1 : 0).sum(), variantType)); + } + logger.info(String.format("Extracted unlabeled annotations for %s total variants.", unlabeledDataReservoir.size())); + + logger.info("Writing unlabeled annotations..."); + // TODO coordinate sort + unlabeledDataReservoir.writeHDF5(outputUnlabeledAnnotationsFile, omitAllelesInHDF5); + logger.info(String.format("Unlabeled annotations and metadata written to %s.", outputUnlabeledAnnotationsFile.getAbsolutePath())); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java new file mode 100644 index 00000000000..e1ebf3ce608 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java @@ -0,0 +1,409 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Sets; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.collections4.ListUtils; +import org.apache.commons.lang3.tuple.Triple; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.FeatureInput; +import org.broadinstitute.hellbender.engine.MultiplePassVariantWalker; +import org.broadinstitute.hellbender.exceptions.GATKException; 
+import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; +import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.hellbender.utils.variant.VcfUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +/** + * Base walker for both {@link ExtractVariantAnnotations} and {@link ScoreVariantAnnotations}, + * which enforces identical variant-extraction behavior in both tools via {@link #extractVariantMetadata}. + * + * This base implementation covers functionality for {@link ExtractVariantAnnotations}. 
Namely, it is a single-pass + * walker, performing the operations: + * + * - nthPassApply(n = 0) + * - if variant/alleles pass filters and variant-type/resource-match checks, then: + * - add variant/alleles to a {@link LabeledVariantAnnotationsData} collection + * - write variant/alleles with labels appended to a sites-only VCF file + * - afterNthPass(n = 0) + * - write the resulting {@link LabeledVariantAnnotationsData} collection to an HDF5 file + * + * This results in the following output: + * + * - an HDF5 file, with the directory structure documented in {@link LabeledVariantAnnotationsData#writeHDF5}; + * note that the matrix of annotations contains a single row per datum (i.e., per allele, in allele-specific mode, + * and per variant otherwise) + * - a sites-only VCF file, containing a single line per extracted variant, with labels appended + * + * In contrast, the {@link ScoreVariantAnnotations} implementation overrides methods to yield a two-pass walker, + * performing the operations: + * + * - nthPassApply(n = 0) + * - if variant/alleles pass filters and variant-type checks, then: + * - add variant/alleles to a {@link LabeledVariantAnnotationsData} collection + * - afterNthPass(n = 0) + * - write the resulting {@link LabeledVariantAnnotationsData} collection to an HDF5 file + * - pass this annotations HDF5 file to a {@link VariantAnnotationsScorer}, which generates and writes scores to an HDF5 file + * - read the scores back in and load them into an iterator + * - nthPassApply(n = 1) + * - if variant/alleles pass filters and variant-type checks (which are identical to the first pass), then: + * - draw the corresponding score (or scores, in allele-specific mode) from the iterator + * - write the variant (with all alleles, not just those extracted) with the score + * (or best score, in allele-specific mode) appended to a VCF file + * - else: + * - write an unprocessed copy of the variant to a VCF file + * + * This results in the following output: + * + * - 
an HDF5 file, as above + * - a VCF file, containing the input variants, with labels, scores, and filters appended/applied for those passing variant-type checks + */ +public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVariantWalker { + + public static final String MODE_LONG_NAME = "mode"; + public static final String USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME = "use-allele-specific-annotations"; + public static final String IGNORE_FILTER_LONG_NAME = "ignore-filter"; + public static final String IGNORE_ALL_FILTERS_LONG_NAME = "ignore-all-filters"; + public static final String DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME = "do-not-trust-all-polymorphic"; + public static final String RESOURCE_MATCHING_STRATEGY_LONG_NAME = "resource-matching-strategy"; + public static final String OMIT_ALLELES_IN_HDF5_LONG_NAME = "omit-alleles-in-hdf5"; + public static final String DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME = "do-not-gzip-vcf-output"; + + public static final String ANNOTATIONS_HDF5_SUFFIX = ".annot.hdf5"; + + public static final String RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING = "This site was labeled as %s according to resources"; + + enum ResourceMatchingStrategy { + START_POSITION, START_POSITION_AND_GIVEN_REPRESENTATION, START_POSITION_AND_MINIMAL_REPRESENTATION + } + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Prefix for output filenames.") + String outputPrefix; + + @Argument( + fullName = StandardArgumentDefinitions.RESOURCE_LONG_NAME, + doc = "Resource VCFs used to label extracted variants.", + optional = true) + private List> resources = new ArrayList<>(10); + + @Argument( + fullName = StandardArgumentDefinitions.ANNOTATION_LONG_NAME, + shortName = StandardArgumentDefinitions.ANNOTATION_SHORT_NAME, + doc = "Names of the annotations to extract. 
Note that a requested annotation may in fact not be present " + + "at any extraction site; NaN missing values will be generated for such annotations.", + minElements = 1) + List annotationNames = new ArrayList<>(); + + @Argument( + fullName = MODE_LONG_NAME, + doc = "Variant types to extract.", + minElements = 1) + private List variantTypesToExtractList = new ArrayList<>(Arrays.asList(VariantType.SNP, VariantType.INDEL)); + + @Argument( + fullName = USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME, + doc = "If true, use the allele-specific versions of the specified annotations.", + optional = true) + boolean useASAnnotations = false; + + @Argument( + fullName = IGNORE_FILTER_LONG_NAME, + doc = "Ignore the specified filter(s) in the input VCF.", + optional = true) + private List ignoreInputFilters = new ArrayList<>(); + + @Argument( + fullName = IGNORE_ALL_FILTERS_LONG_NAME, + doc = "If true, ignore all filters in the input VCF.", + optional = true) + private boolean ignoreAllFilters = false; + + // TODO this is a perhaps vestigial argument inherited from VQSR; its impact and necessity could be reevaluated + @Argument( + fullName = DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME, + doc = "If true, do not trust that unfiltered records in the resources contain only polymorphic sites. " + + "This may increase runtime if the resources are not sites-only VCFs.", + optional = true) + private boolean doNotTrustAllPolymorphic = false; + + + @Argument( + fullName = RESOURCE_MATCHING_STRATEGY_LONG_NAME, + doc = "The strategy to use for determining whether an input variant is present in a resource " + + "in non-allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " false). " + + "START_POSITION: Start positions of input and resource variants must match. " + + "START_POSITION_AND_GIVEN_REPRESENTATION: The intersection of the sets of input and resource alleles " + + "(in their given representations) must also be non-empty. 
" + + "START_POSITION_AND_MINIMAL_REPRESENTATION: The intersection of the sets of input and resource alleles " + + "(after converting alleles to their minimal representations) must also be non-empty. " + + "This argument has no effect in allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " true), " + + "in which the minimal representations of the input and resource alleles must match.", + optional = true) + private ResourceMatchingStrategy resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION; + @Argument( + fullName = OMIT_ALLELES_IN_HDF5_LONG_NAME, + doc = "If true, omit alleles in output HDF5 files in order to decrease file sizes.", + optional = true + ) + boolean omitAllelesInHDF5 = false; + + @Argument( + fullName = DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME, + doc = "If true, VCF output will not be compressed.", + optional = true + ) + boolean doNotGZIPVCFOutput = false; + + private final Set ignoreInputFilterSet = new TreeSet<>(); + Set variantTypesToExtract; + TreeSet resourceLabels = new TreeSet<>(); + + File outputAnnotationsFile; + VariantContextWriter vcfWriter; + + LabeledVariantAnnotationsData data; + + @Override + public void onTraversalStart() { + + ignoreInputFilterSet.addAll(ignoreInputFilters); + + variantTypesToExtract = EnumSet.copyOf(variantTypesToExtractList); + + outputAnnotationsFile = new File(outputPrefix + ANNOTATIONS_HDF5_SUFFIX); + final String vcfSuffix = doNotGZIPVCFOutput ? 
".vcf" : ".vcf.gz"; + final File outputVCFFile = new File(outputPrefix + vcfSuffix); + + // TODO this validation method should perhaps be moved outside of the CNV code + CopyNumberArgumentValidationUtils.validateOutputFiles(outputAnnotationsFile, outputVCFFile); + + for (final FeatureInput resource : resources) { + final TreeSet trackResourceLabels = resource.getTagAttributes().entrySet().stream() + .filter(e -> e.getValue().equals("true")) + .map(Map.Entry::getKey) + .sorted() + .collect(Collectors.toCollection(TreeSet::new)); + resourceLabels.addAll(trackResourceLabels); + logger.info( String.format("Found %s track: labels = %s", resource.getName(), trackResourceLabels)); + } + resourceLabels.forEach(String::intern); // TODO evaluate if this affects memory usage and remove if not needed + + if (resourceLabels.contains(LabeledVariantAnnotationsData.SNP_LABEL)) { + throw new UserException.BadInput(String.format("The resource label \"%s\" is reserved for labeling variant types.", + LabeledVariantAnnotationsData.SNP_LABEL)); + } + + data = new LabeledVariantAnnotationsData(annotationNames, resourceLabels, useASAnnotations); + + vcfWriter = createVCFWriter(outputVCFFile); + vcfWriter.writeHeader(constructVCFHeader(data.getSortedLabels())); + + afterOnTraversalStart(); // perform additional validation, set modes in child tools, etc. 
+ } + + public void afterOnTraversalStart() { + // override + } + + @Override + protected int numberOfPasses() { + return 1; + } + + @Override + public Object onTraversalSuccess() { + return null; + } + + static void addExtractedVariantToData(final LabeledVariantAnnotationsData data, + final VariantContext variant, + final List, VariantType, TreeSet>> metadata) { + data.add(variant, + metadata.stream().map(Triple::getLeft).collect(Collectors.toList()), + metadata.stream().map(Triple::getMiddle).collect(Collectors.toList()), + metadata.stream().map(Triple::getRight).collect(Collectors.toList())); + } + + void writeExtractedVariantToVCF(final VariantContext variant, + final List, VariantType, TreeSet>> metadata) { + writeExtractedVariantToVCF(variant, + metadata.stream().map(Triple::getLeft).flatMap(List::stream).collect(Collectors.toList()), + metadata.stream().map(Triple::getRight).flatMap(Set::stream).collect(Collectors.toSet())); + } + + void writeAnnotationsToHDF5() { + if (data.size() == 0) { + logger.warn("Found no input variants for extraction. This may be because the specified " + + "genomic region contains no input variants of the requested type(s) or, if extracting " + + "training labels, because none of the input variants were contained in the resource VCFs " + + "or no resource VCFs were provided. The annotations HDF5 file will not be generated."); + return; + } + for (final VariantType variantType : variantTypesToExtract) { + logger.info(String.format("Extracted annotations for %d variants of type %s.", + data.getVariantTypeFlat().stream().mapToInt(t -> t == variantType ? 1 : 0).sum(), variantType)); + } + for (final String label : data.getSortedLabels()) { + logger.info(String.format("Extracted annotations for %d variants labeled as %s.", + data.isLabelFlat(label).stream().mapToInt(b -> b ? 
1 : 0).sum(), label)); + } + logger.info(String.format("Extracted annotations for %s total variants.", data.size())); + + logger.info("Writing annotations..."); + data.writeHDF5(outputAnnotationsFile, omitAllelesInHDF5); + logger.info(String.format("Annotations and metadata written to %s.", outputAnnotationsFile.getAbsolutePath())); + } + + /** + * Writes a sites-only VCF containing the extracted variants and corresponding labels. + */ + void writeExtractedVariantToVCF(final VariantContext vc, + final List altAlleles, + final Set labels) { + final List alleles = ListUtils.union(Collections.singletonList(vc.getReference()), altAlleles); + final VariantContextBuilder builder = new VariantContextBuilder( + vc.getSource(), vc.getContig(), vc.getStart(), vc.getEnd(), alleles); + labels.forEach(l -> builder.attribute(l, true)); // labels should already be sorted as a TreeSet + vcfWriter.add(builder.make()); + } + + // modified from VQSR code + // TODO we're just writing a standard sites-only VCF here, maybe there's a nicer way to do this? + VCFHeader constructVCFHeader(final List sortedLabels) { + Set hInfo = sortedLabels.stream() + .map(l -> new VCFInfoHeaderLine(l, 1, VCFHeaderLineType.Flag, String.format(RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING, l))) + .collect(Collectors.toCollection(TreeSet::new)); + hInfo.add(GATKVCFHeaderLines.getFilterLine(VCFConstants.PASSES_FILTERS_v4)); + final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary(); + if (sequenceDictionary != null) { + hInfo = VcfUtils.updateHeaderContigLines(hInfo, referenceArguments.getReferencePath(), sequenceDictionary, true); + } + hInfo.addAll(getDefaultToolVCFHeaderLines()); + return new VCFHeader(hInfo); + } + + /** + * Performs variant-filter and variant-type checks to determine variants/alleles suitable for extraction, and returns + * a corresponding list of metadata. 
This method should not be overridden, as it is intended to enforce identical + * variant-extraction behavior in all child tools. Logic here and below for filtering and determining variant type + * was retained from VQSR, but has been heavily refactored. + */ + final List, VariantType, TreeSet>> extractVariantMetadata(final VariantContext vc, + final FeatureContext featureContext, + final boolean isExtractUnlabeled) { + // if variant is filtered, do not consume here + if (vc == null || !(ignoreAllFilters || vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()))) { + return Collections.emptyList(); + } + if (!useASAnnotations) { + // in non-allele-specific mode, get a singleton list of the triple + // (list of alt alleles passing variant-type and resource-match checks, variant type, set of labels) + final VariantType variantType = VariantType.getVariantType(vc); + if (variantTypesToExtract.contains(variantType)) { + final TreeSet matchingResourceLabels = findMatchingResourceLabels(vc, null, featureContext); + if (isExtractUnlabeled || !matchingResourceLabels.isEmpty()) { + return Collections.singletonList(Triple.of(vc.getAlternateAlleles(), variantType, matchingResourceLabels)); + } + } + } else { + // in allele-specific mode, get a list containing the triples + // (singleton list of alt allele, variant type, set of labels) + // corresponding to alt alleles that pass variant-type and resource-match checks + return vc.getAlternateAlleles().stream() + .filter(a -> !GATKVCFConstants.isSpanningDeletion(a)) + .filter(a -> variantTypesToExtract.contains(VariantType.getAlleleSpecificVariantType(vc, a))) + .map(a -> Triple.of(Collections.singletonList(a), VariantType.getAlleleSpecificVariantType(vc, a), + findMatchingResourceLabels(vc, a, featureContext))) + .filter(t -> isExtractUnlabeled || !t.getRight().isEmpty()) + .collect(Collectors.toList()); + } + // if variant-type and resource-match checks failed, return an empty list + return 
Collections.emptyList(); + } + + private TreeSet findMatchingResourceLabels(final VariantContext vc, + final Allele altAllele, + final FeatureContext featureContext) { + final TreeSet matchingResourceLabels = new TreeSet<>(); + for (final FeatureInput resource : resources) { + final List resourceVCs = featureContext.getValues(resource, featureContext.getInterval().getStart()); + for (final VariantContext resourceVC : resourceVCs) { + if (useASAnnotations && !doAllelesMatch(vc.getReference(), altAllele, resourceVC)) { + continue; + } + if (isMatchingVariant(vc, resourceVC, !doNotTrustAllPolymorphic, resourceMatchingStrategy)) { + resource.getTagAttributes().entrySet().stream() + .filter(e -> e.getValue().equals("true")) + .map(Map.Entry::getKey) + .forEach(matchingResourceLabels::add); + } + } + } + return matchingResourceLabels; + } + + private static boolean isMatchingVariant(final VariantContext vc, + final VariantContext resourceVC, + final boolean trustAllPolymorphic, + final ResourceMatchingStrategy resourceMatchingStrategy) { + if (resourceVC != null && resourceVC.isNotFiltered() && resourceVC.isVariant() && VariantType.checkVariantType(vc, resourceVC) && + (trustAllPolymorphic || !resourceVC.hasGenotypes() || resourceVC.isPolymorphicInSamples())) { // this is the check originally performed by VQSR + switch (resourceMatchingStrategy) { + case START_POSITION: + return true; + case START_POSITION_AND_GIVEN_REPRESENTATION: + // we further require that at least one alt allele is present in the resource alt alleles, but don't reconcile representations + return !Sets.intersection(Sets.newHashSet(vc.getAlternateAlleles()), Sets.newHashSet(resourceVC.getAlternateAlleles())).isEmpty(); + case START_POSITION_AND_MINIMAL_REPRESENTATION: + // we further require that at least one alt allele is present in the resource alt alleles, and do reconcile representations + return vc.getAlternateAlleles().stream() + .anyMatch(altAllele -> 
GATKVariantContextUtils.isAlleleInList(vc.getReference(), altAllele, resourceVC.getReference(), resourceVC.getAlternateAlleles())); + default: + throw new GATKException.ShouldNeverReachHereException("Unknown ResourceMatchingStrategy."); + } + } + return false; + } + + private static boolean doAllelesMatch(final Allele refAllele, + final Allele altAllele, + final VariantContext resourceVC) { + if (altAllele == null) { + return true; + } + try { + return GATKVariantContextUtils.isAlleleInList(refAllele, altAllele, resourceVC.getReference(), resourceVC.getAlternateAlleles()); + } catch (final IllegalStateException e) { + throw new IllegalStateException("Reference allele mismatch at position " + resourceVC.getContig() + ':' + resourceVC.getStart() + " : ", e); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java new file mode 100644 index 00000000000..fbbbed81faf --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java @@ -0,0 +1,627 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.primitives.Doubles; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.lang3.tuple.Triple; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import 
org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.VariantRecalibrator; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Scores 
variant calls in a VCF file based on site-level annotations using a previously trained model. + * + *

+ * This tool is intended to be used as the last step in a variant-filtering workflow that supersedes the + * {@link VariantRecalibrator} workflow. Using a previously trained model produced by {@link TrainVariantAnnotationsModel}, + * this tool assigns a score to each call (with a lower score indicating that a call is more likely to be an artifact). + * Each score can also be converted to a corresponding sensitivity with respect to a calibration set, if the latter is available. + * Each VCF record can also be annotated with additional resource labels and/or hard filtered based on its + * calibration-set sensitivity, if desired. + *

+ * + *

+ * Note that annotations and metadata are collected in memory during traversal until they are written to HDF5 files + * upon completion of the traversal. Memory and disk requirements thus roughly scale linearly with both the number + * of sites scored and the number of annotations. For large callsets, this tool may be run in parallel over separate + * genomic shards using the {@value StandardArgumentDefinitions#INTERVALS_LONG_NAME} argument as usual. + *

+ * + *

+ * Scores and annotations are also output to HDF5 files, which may be viewed using + * hdfview or loaded in Python using + * PyTables or h5py. + *

+ * + *

Inputs

+ * + *
    + *
  • + * Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles, + * if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified). + *
  • + *
  • + * Annotations to use for scoring. These should be identical to those used in the {@link ExtractVariantAnnotations} + * step to create the training set. + *
  • + *
  • + * Variant types (i.e., SNP and/or INDEL) to score. Logic for determining variant type was retained from + * {@link VariantRecalibrator}; see {@link VariantType}. To use different models for SNPs and INDELs + * (e.g., if it is desired to use different sets of annotations for each variant type), one can first run + * this tool to score SNPs and then again on the resulting output to score INDELs. + *
  • + *
  • + * Model prefix. This should denote the path of model files produced by {@link TrainVariantAnnotationsModel}. + *
  • + *
  • + * (Optional) Model backend. This should be identical to that specified in {@link TrainVariantAnnotationsModel}. + * The default Python IsolationForest implementation requires either the GATK Python environment + * or that certain Python packages (argparse, h5py, numpy, sklearn, and dill) are otherwise available. + * A custom backend can also be specified in conjunction with the {@value PYTHON_SCRIPT_LONG_NAME} argument. + *
  • + *
  • + * (Optional) Resource VCF file(s). See the corresponding documentation in {@link ExtractVariantAnnotations}. + * In typical usage, the same resource VCFs and tags provided to that tool should also be provided here. + * In addition, the sites-only VCF that is produced by that tool can also be provided here and used to + * mark those labeled sites that were extracted, which can be useful if these are a subset of the resource sites. + *
  • + *
  • + * (Optional) Calibration-set sensitivity thresholds for SNPs and INDELs. If the corresponding SNP or INDEL + * calibration-set scores are available in the provided model files, sites that have a calibration-set + * sensitivity falling above the corresponding threshold (i.e., a score falling below the corresponding + * score threshold) will have a filter applied. + *
  • + *
  • + * Output prefix. + * This is used as the basename for output files. + *
  • + *
+ * + *

Outputs

+ * + *
    + *
  • + * Scored VCF file and index. The VCF will not be gzipped if the {@value DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME} + * argument is set to true. The INFO field in each VCF record will be annotated with: + * + *

    + * 1) a score (with a key as given by the {@value SCORE_KEY_LONG_NAME} argument, + * which has a default value of {@value DEFAULT_SCORE_KEY}), + *

    + *

    + * 2) if resources are provided, flags corresponding to the labels (e.g., + * {@value LabeledVariantAnnotationsData#TRAINING_LABEL}, {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL}, etc.) + * of resources containing the record, + *

    + *

    + * 3) if the {@value SNP_KEY_LONG_NAME} argument (which has a default value of {@value DEFAULT_SNP_KEY}) + * is non-null, a flag corresponding to whether a site is treated as a SNP, + *

    + *

    + * 4) if {@value SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} and/or + * {@value INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} are provided, a filter (with name given by + * the {@value LOW_SCORE_FILTER_NAME_LONG_NAME} argument, which has a default value of + * {@value DEFAULT_LOW_SCORE_FILTER_NAME}) will be applied if a record has a calibration-set sensitivity + * falling above the appropriate threshold (i.e., if it has a score falling below the corresponding + * score threshold). + *

    + *

    + * If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is true, the score, SNP flag, calibration sensitivity, + * and filter appropriate for the highest scoring allele are used; however, the resource labels for all alleles + * are applied. + *

    + * + *
  • + *
  • + * (Optional) Annotations HDF5 file (.annot.hdf5). Annotation data and metadata for all scored sites + * (labeled and unlabeled) are stored in the HDF5 directory structure given in the documentation for the + * {@link ExtractVariantAnnotations} tool. This file will only be produced if the number of scored sites + * is nonzero. + *

    + * + *
  • + *
  • + * (Optional) Scores HDF5 file (.scores.hdf5). Scores for all scored sites are stored in the + * HDF5 path {@value VariantAnnotationsScorer#SCORES_PATH}. Scores are given in the same order as records + * in both the VCF and the annotations HDF5 file. This file will only be produced if the number of scored sites + * is nonzero. + *

    + *
  • + *
+ * + *

Usage examples

+ * + *

+ * Score sites using a model (produced by {@link TrainVariantAnnotationsModel} using the default + * {@link VariantAnnotationsModelBackend#PYTHON_IFOREST} model backend and contained in the directory + * {@code model_dir}), producing the outputs 1) {@code output.vcf.gz}, 2) {@code output.vcf.gz.tbi}, + * 3) {@code output.annot.hdf5}, and 4) {@code output.scores.hdf5}. Note that {@code extract.vcf.gz} is + * produced by {@link ExtractVariantAnnotations}. Records will be filtered according to the values provided to the + * {@value SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} and {@value INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} + * arguments; the values below are only meant to be illustrative and should be set as appropriate for a given analysis. + * Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are + * selected by default. + * + *

+ *     gatk ScoreVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --model-prefix model_dir \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          --resource:extracted,extracted=true extract.vcf.gz \
+ *          --snp-calibration-sensitivity-threshold 0.99 \
+ *          --indel-calibration-sensitivity-threshold 0.99 \
+ *          -O output
+ * 
+ * + *

+ * One may chain together two runs of this tool to score SNPs and INDELs using different models + * (note that SNP and INDEL models have "snp" and "indel" tags in their respective filenames, so these + * models can still be contained in the same {@code model_dir} directory). + * This may have implications for mixed SNP/INDEL sites, especially if filters are applied; see also the + * {@value IGNORE_ALL_FILTERS_LONG_NAME} and {@value IGNORE_FILTER_LONG_NAME} arguments. + * + *

+ *     gatk ScoreVariantAnnotations \
+ *          -V input.vcf \
+ *          -A snp_annotation_1 \
+ *          ...
+ *          -A snp_annotation_N \
+ *          --model-prefix model_dir \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --resource:extracted,extracted=true snp-extract.vcf.gz \
+ *          --snp-calibration-sensitivity-threshold 0.99 \
+ *          -O intermediate-output
+ *
+ *     gatk ScoreVariantAnnotations \
+ *          -V intermediate-output.vcf \
+ *          -A indel_annotation_1 \
+ *          ...
+ *          -A indel_annotation_M \
+ *          --model-prefix model_dir \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          --resource:extracted,extracted=true indel-extract.vcf.gz \
+ *          --indel-calibration-sensitivity-threshold 0.99 \
+ *          -O output
+ * 
+ * + *

Custom modeling/scoring backends (ADVANCED)

+ * + *

+ * The primary scoring functionality performed by this tool is accomplished by a "scoring backend" + * whose fundamental contract is to take an input annotation matrix and to output corresponding scores, + * with both input and output given as HDF5 files. Rather than using one of the available, implemented backends, + * advanced users may provide their own backend via the {@value PYTHON_SCRIPT_LONG_NAME} argument. + * See documentation in the modeling and scoring interfaces ({@link VariantAnnotationsModel} and + * {@link VariantAnnotationsScorer}, respectively), as well as the default Python IsolationForest implementation at + * {@link PythonSklearnVariantAnnotationsScorer} and + * src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py. + *

+ * + * DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Scores variant calls in a VCF file based on site-level annotations using a previously trained model.", + oneLineSummary = "Scores variant calls in a VCF file based on site-level annotations using a previously trained model", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public class ScoreVariantAnnotations extends LabeledVariantAnnotationsWalker { + + public static final String MODEL_PREFIX_LONG_NAME = "model-prefix"; + public static final String MODEL_BACKEND_LONG_NAME = TrainVariantAnnotationsModel.MODEL_BACKEND_LONG_NAME; + public static final String PYTHON_SCRIPT_LONG_NAME = "python-script"; + public static final String SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "snp-calibration-sensitivity-threshold"; + public static final String INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "indel-calibration-sensitivity-threshold"; + + public static final String SNP_KEY_LONG_NAME = "snp-key"; + public static final String SCORE_KEY_LONG_NAME = "score-key"; + public static final String CALIBRATION_SENSITIVITY_KEY_LONG_NAME = "calibration-sensitivity-key"; + public static final String LOW_SCORE_FILTER_NAME_LONG_NAME = "low-score-filter-name"; + public static final String DOUBLE_FORMAT_LONG_NAME = "double-format"; + + public static final String DEFAULT_SNP_KEY = LabeledVariantAnnotationsData.SNP_LABEL; + public static final String DEFAULT_SCORE_KEY = "SCORE"; + public static final String DEFAULT_CALIBRATION_SENSITIVITY_KEY = "CALIBRATION_SENSITIVITY"; + public static final String DEFAULT_LOW_SCORE_FILTER_NAME = "LOW_SCORE"; + public static final String DEFAULT_DOUBLE_FORMAT = "%.4f"; + + public static final String SCORES_HDF5_SUFFIX = ".scores.hdf5"; + + @Argument( + fullName = MODEL_PREFIX_LONG_NAME, + doc = "Prefix for model files. 
This should be identical to the output prefix specified in TrainVariantAnnotationsModel." ) + private String modelPrefix; + + @Argument( + fullName = MODEL_BACKEND_LONG_NAME, + doc = "Backend to use for scoring. " + + "JAVA_BGMM will use a pure Java implementation (ported from Python scikit-learn) of the Bayesian Gaussian Mixture Model. " + + "PYTHON_IFOREST will use the Python scikit-learn implementation of the IsolationForest method and " + + "will require that the corresponding Python dependencies are present in the environment. " + + "PYTHON_SCRIPT will use the script specified by the " + PYTHON_SCRIPT_LONG_NAME + " argument. " + + "See the tool documentation for more details." ) + private VariantAnnotationsModelBackend modelBackend = VariantAnnotationsModelBackend.PYTHON_IFOREST; + + @Argument( + fullName = PYTHON_SCRIPT_LONG_NAME, + doc = "Python script used for specifying a custom scoring backend. If provided, " + MODEL_BACKEND_LONG_NAME + " must also be set to PYTHON_SCRIPT.", + optional = true) + private File pythonScriptFile; + + @Argument( + fullName = SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, + doc = "If specified, SNPs with scores corresponding to a calibration sensitivity that is greater than or equal to this threshold will be hard filtered.", + optional = true, + minValue = 0., + maxValue = 1.) + private Double snpCalibrationSensitivityThreshold; + + @Argument( + fullName = INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, + doc = "If specified, indels with scores corresponding to a calibration sensitivity that is greater than or equal to this threshold will be hard filtered.", + optional = true, + minValue = 0., + maxValue = 1.) + private Double indelCalibrationSensitivityThreshold; + + @Argument( + fullName = SNP_KEY_LONG_NAME, + doc = "Annotation flag to use for labeling sites as SNPs in output. 
" + + "Set this to \"null\" to omit these labels.") + private String snpKey = DEFAULT_SNP_KEY; + + @Argument( + fullName = SCORE_KEY_LONG_NAME, + doc = "Annotation key to use for score values in output.") + private String scoreKey = DEFAULT_SCORE_KEY; + + @Argument( + fullName = CALIBRATION_SENSITIVITY_KEY_LONG_NAME, + doc = "Annotation key to use for calibration-sensitivity values in output.") + private String calibrationSensitivityKey = DEFAULT_CALIBRATION_SENSITIVITY_KEY; + + @Argument( + fullName = LOW_SCORE_FILTER_NAME_LONG_NAME, + doc = "Name to use for low-score filter in output.") + private String lowScoreFilterName = DEFAULT_LOW_SCORE_FILTER_NAME; + + @Argument( + fullName = DOUBLE_FORMAT_LONG_NAME, + doc = "Format string to use for formatting score and calibration-sensitivity values in output.") + private String doubleFormat = DEFAULT_DOUBLE_FORMAT; + + private File outputScoresFile; + private Iterator scoresIterator; + private Iterator isSNPIterator; + + private VariantAnnotationsScorer snpScorer; + private VariantAnnotationsScorer indelScorer; + + private Function snpCalibrationSensitivityConverter; + private Function indelCalibrationSensitivityConverter; + + @Override + protected int numberOfPasses() { + return 2; + } + + @Override + public void afterOnTraversalStart() { + + Utils.nonNull(scoreKey); + Utils.nonNull(calibrationSensitivityKey); + Utils.nonNull(lowScoreFilterName); + Utils.nonNull(doubleFormat); + + switch (modelBackend) { + case JAVA_BGMM: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using JAVA_BGMM backend."); + logger.info("Running in JAVA_BGMM mode..."); + snpScorer = deserializeScorerFromSerFiles(VariantType.SNP); + indelScorer = deserializeScorerFromSerFiles(VariantType.INDEL); + break; + case PYTHON_IFOREST: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using PYTHON_IFOREST backend."); + + pythonScriptFile = IOUtils.writeTempResource(new 
Resource(TrainVariantAnnotationsModel.ISOLATION_FOREST_PYTHON_SCRIPT, TrainVariantAnnotationsModel.class)); + PythonScriptExecutor.checkPythonEnvironmentForPackage("argparse"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("h5py"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("numpy"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("sklearn"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("dill"); + logger.info("Running in PYTHON_IFOREST mode..."); + snpScorer = deserializeScorerFromPklFiles(VariantType.SNP); + indelScorer = deserializeScorerFromPklFiles(VariantType.INDEL); + break; + case PYTHON_SCRIPT: + IOUtils.canReadFile(pythonScriptFile); + logger.info("Running in PYTHON_SCRIPT mode..."); + snpScorer = deserializeScorerFromPklFiles(VariantType.SNP); + indelScorer = deserializeScorerFromPklFiles(VariantType.INDEL); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model-backend mode."); + } + + if (snpScorer == null && indelScorer == null) { + throw new UserException.BadInput(String.format("At least one serialized scorer must be present " + + "in the model files with the prefix %s.", modelPrefix)); + } + if (variantTypesToExtract.contains(VariantType.SNP) && snpScorer == null) { + throw new UserException.BadInput(String.format("SNPs were indicated for extraction via the %s argument, " + + "but no serialized SNP scorer was available in the model files with the prefix.", MODE_LONG_NAME, modelPrefix)); + } + if (variantTypesToExtract.contains(VariantType.INDEL) && indelScorer == null) { + throw new UserException.BadInput(String.format("INDELs were indicated for extraction via the %s argument, " + + "but no serialized INDEL scorer was available in the model files with the prefix.", MODE_LONG_NAME, modelPrefix)); + } + + snpCalibrationSensitivityConverter = readCalibrationScoresAndCreateConverter(VariantType.SNP); + indelCalibrationSensitivityConverter = 
readCalibrationScoresAndCreateConverter(VariantType.INDEL); + + if (snpCalibrationSensitivityConverter == null && snpCalibrationSensitivityThreshold != null) { + throw new UserException.BadInput(String.format("The %s argument was specified, " + + "but no SNP calibration scores were provided in the model files with the prefix %s.", + SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, modelPrefix)); + } + if (indelCalibrationSensitivityConverter == null && indelCalibrationSensitivityThreshold != null) { + throw new UserException.BadInput(String.format("The %s argument was specified, " + + "but no INDEL calibration scores were provided in the model files with the prefix %s.", + INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, modelPrefix)); + } + + outputScoresFile = new File(outputPrefix + SCORES_HDF5_SUFFIX); + + // TODO this validation method should perhaps be moved outside of the CNV code + CopyNumberArgumentValidationUtils.validateOutputFiles(outputScoresFile); + } + + @Override + protected void nthPassApply(final VariantContext variant, + final ReadsContext readsContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext, + final int n) { + final List, VariantType, TreeSet>> metadata = extractVariantMetadata(variant, featureContext, true); + final boolean isVariantExtracted = !metadata.isEmpty(); + if (n == 0 && isVariantExtracted) { + addExtractedVariantToData(data, variant, metadata); + } + if (n == 1) { + if (isVariantExtracted) { + writeExtractedVariantToVCF(variant, metadata); + } else { + vcfWriter.add(variant); + } + } + } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + // TODO if BGMM, preprocess annotations and write to HDF5 with BGMMVariantAnnotationsScorer.preprocessAnnotationsWithBGMMAndWriteHDF5 + writeAnnotationsToHDF5(); + if (data.size() > 0) { + data.clear(); + readAnnotationsAndWriteScoresToHDF5(); + scoresIterator = 
Arrays.stream(VariantAnnotationsScorer.readScores(outputScoresFile)).iterator(); + isSNPIterator = LabeledVariantAnnotationsData.readLabel(outputAnnotationsFile, LabeledVariantAnnotationsData.SNP_LABEL).iterator(); + } else { + scoresIterator = Collections.emptyIterator(); + isSNPIterator = Collections.emptyIterator(); + } + } + if (n == 1) { + if (scoresIterator.hasNext()) { + throw new IllegalStateException("Traversals of scores and variants " + + "(or alleles, in allele-specific mode) were not correctly synchronized."); + } + if (vcfWriter != null) { + vcfWriter.close(); + } + } + } + + private VariantAnnotationsScorer deserializeScorerFromPklFiles(final VariantType variantType) { + final String variantTypeTag = '.' + variantType.toString().toLowerCase(); + final File scorerPklFile = new File( + modelPrefix + variantTypeTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX); + final File negativeScorerPklFile = new File( + modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX); + return scorerPklFile.canRead() + ? negativeScorerPklFile.canRead() + ? VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, scorerPklFile), + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, negativeScorerPklFile)) + : new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, scorerPklFile) + : null; + } + + private VariantAnnotationsScorer deserializeScorerFromSerFiles(final VariantType variantType) { + final String variantTypeTag = '.' 
+ variantType.toString().toLowerCase(); + final File scorerSerFile = new File( + modelPrefix + variantTypeTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX); + final File negativeScorerSerFile = new File( + modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX); + return scorerSerFile.canRead() + ? negativeScorerSerFile.canRead() + ? VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + BGMMVariantAnnotationsScorer.deserialize(scorerSerFile), + BGMMVariantAnnotationsScorer.deserialize(negativeScorerSerFile)) + : BGMMVariantAnnotationsScorer.deserialize(scorerSerFile) + : null; + } + + private Function readCalibrationScoresAndCreateConverter(final VariantType variantType) { + final String variantTypeTag = '.' + variantType.toString().toLowerCase(); + final File calibrationScores = new File( + modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX); + return calibrationScores.canRead() + ? 
VariantAnnotationsScorer.createScoreToCalibrationSensitivityConverter(VariantAnnotationsScorer.readScores(calibrationScores)) + : null; + } + + private void readAnnotationsAndWriteScoresToHDF5() { + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(outputAnnotationsFile); + final List isSNP = LabeledVariantAnnotationsData.readLabel(outputAnnotationsFile, LabeledVariantAnnotationsData.SNP_LABEL); + final double[][] allAnnotations = LabeledVariantAnnotationsData.readAnnotations(outputAnnotationsFile); + final int numAll = allAnnotations.length; + final List allScores = new ArrayList<>(Collections.nCopies(numAll, Double.NaN)); + if (variantTypesToExtract.contains(VariantType.SNP)) { + logger.info("Scoring SNP variants..."); + scoreVariantTypeAndSetElementsOfAllScores(annotationNames, allAnnotations, isSNP, snpScorer, allScores); + } + if (variantTypesToExtract.contains(VariantType.INDEL)) { + logger.info("Scoring INDEL variants..."); + final List isIndel = isSNP.stream().map(x -> !x).collect(Collectors.toList()); + scoreVariantTypeAndSetElementsOfAllScores(annotationNames, allAnnotations, isIndel, indelScorer, allScores); + } + VariantAnnotationsScorer.writeScores(outputScoresFile, Doubles.toArray(allScores)); + logger.info(String.format("Scores written to %s.", outputScoresFile.getAbsolutePath())); + } + + private static void scoreVariantTypeAndSetElementsOfAllScores(final List annotationNames, + final double[][] allAnnotations, + final List isVariantType, + final VariantAnnotationsScorer variantTypeScorer, + final List allScores) { + final File variantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, allAnnotations, isVariantType); + final File variantTypeScoresFile = IOUtils.createTempFile("temp", ".scores.hdf5"); + variantTypeScorer.score(variantTypeAnnotationsFile, variantTypeScoresFile); // TODO we do not fail until here in the case of mismatched annotation names; we could fail 
earlier + final double[] variantTypeScores = VariantAnnotationsScorer.readScores(variantTypeScoresFile); + final Iterator variantTypeScoresIterator = Arrays.stream(variantTypeScores).iterator(); + IntStream.range(0, allScores.size()).filter(isVariantType::get).forEach(i -> allScores.set(i, variantTypeScoresIterator.next())); + } + + @Override + void writeExtractedVariantToVCF(final VariantContext vc, + final List altAlleles, + final Set labels) { + final VariantContextBuilder builder = new VariantContextBuilder(vc); + labels.forEach(l -> builder.attribute(l, true)); // labels should already be sorted as a TreeSet + + final List scores = useASAnnotations + ? altAlleles.stream().map(a -> scoresIterator.next()).collect(Collectors.toList()) + : Collections.singletonList(scoresIterator.next()); + final double score = Collections.max(scores); + final int scoreIndex = scores.indexOf(score); + builder.attribute(scoreKey, formatDouble(score)); + + final List isSNP = useASAnnotations + ? altAlleles.stream().map(a -> isSNPIterator.next()).collect(Collectors.toList()) + : Collections.singletonList(isSNPIterator.next()); + final boolean isSNPMax = isSNP.get(scoreIndex); + + if (snpKey != null) { + builder.attribute(snpKey, isSNPMax); + } + + final Function calibrationSensitivityConverter = isSNPMax ? snpCalibrationSensitivityConverter : indelCalibrationSensitivityConverter; + if (calibrationSensitivityConverter != null) { + final double calibrationSensitivity = calibrationSensitivityConverter.apply(score); + builder.attribute(calibrationSensitivityKey, formatDouble(calibrationSensitivity)); + final Double calibrationSensitivityThreshold = isSNPMax ? snpCalibrationSensitivityThreshold : indelCalibrationSensitivityThreshold; + if (calibrationSensitivityThreshold != null && calibrationSensitivity >= calibrationSensitivityThreshold) { + builder.filter(lowScoreFilterName); // TODO does this sufficiently cover the desired behavior when dealing with previously filtered sites, etc.? 
+ } + } + + vcfWriter.add(builder.make()); + } + + private String formatDouble(final double x) { + return String.format(doubleFormat, x); + } + + /** + * Copies the header from the input VCF and adds info lines for the score, calibration-sensitivity, and label keys, + * as well as the filter line. + */ + @Override + VCFHeader constructVCFHeader(final List sortedLabels) { + final VCFHeader inputHeader = getHeaderForVariants(); + final Set sortedInputHeaderMetaData = inputHeader.getMetaDataInSortedOrder(); + + final Set hInfo = new HashSet<>(sortedInputHeaderMetaData); + hInfo.add(new VCFInfoHeaderLine(scoreKey, 1, VCFHeaderLineType.Float, + "Score according to the model applied by ScoreVariantAnnotations")); + hInfo.add(new VCFInfoHeaderLine(calibrationSensitivityKey, 1, VCFHeaderLineType.Float, + String.format("Calibration sensitivity corresponding to the value of %s", scoreKey))); + hInfo.add(new VCFFilterHeaderLine(lowScoreFilterName, "Low score (corresponding to high calibration sensitivity)")); + + if (snpKey != null) { + hInfo.add(new VCFInfoHeaderLine(snpKey, 1, VCFHeaderLineType.Flag, "This site was considered a SNP during filtering")); + } + hInfo.addAll(sortedLabels.stream() + .map(l -> new VCFInfoHeaderLine(l, 1, VCFHeaderLineType.Flag, String.format(RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING, l))) + .collect(Collectors.toList())); + hInfo.addAll(getDefaultToolVCFHeaderLines()); + + return new VCFHeader(hInfo, inputHeader.getGenotypeSamples()); + } + + @Override + public Object onTraversalSuccess() { + + logger.info(String.format("%s complete.", getClass().getSimpleName())); + + return null; + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java new file mode 100644 index 00000000000..3aa67197f8b --- /dev/null +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java @@ -0,0 +1,703 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Streams; +import com.google.common.primitives.Doubles; +import org.apache.commons.math3.stat.descriptive.moment.Variance; +import org.apache.commons.math3.stat.descriptive.rank.Percentile; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.CommandLineProgram; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.vqsr.VariantRecalibrator; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.Utils; 
+import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Trains a model for scoring variant calls based on site-level annotations. + * + *

+ * This tool is intended to be used as the second step in a variant-filtering workflow that supersedes the + * {@link VariantRecalibrator} workflow. Given training (and optionally, calibration) sets of site-level annotations + * produced by {@link ExtractVariantAnnotations}, this tool can be used to train a model for scoring variant + * calls. For each variant type (i.e., SNP or INDEL) specified using the {@value MODE_LONG_NAME} argument, the tool + * outputs files that are either: 1) serialized scorers, each of which persists to disk a function for computing + * scores given subsequent annotations, or 2) HDF5 files containing a set of scores, each corresponding to training, + * calibration, and unlabeled sets, as appropriate. + *

+ * + *

+ * The model files produced by this tool can in turn be provided along with a VCF file to the {@link ScoreVariantAnnotations} + * tool, which assigns a score to each call (with a lower score indicating that a call is more likely to be an artifact + * and should perhaps be filtered). Each score can also be converted to a corresponding sensitivity with respect to a + * calibration set, if the latter is available. + *

+ * + *

Modeling approaches

+ * + *

+ * This tool can perform modeling using either a positive-only approach or a positive-negative approach. + * In a positive-only approach, the annotation-space distribution of training sites is used to learn a + * function for converting annotations for subsequent sites into a score; typically, higher scores correspond to + * regions of annotation space that are more densely populated by training sites. In contrast, a positive-negative + * approach attempts to additionally use unlabeled sites to better identify regions of annotation space that correspond + * to low scores against the original, positive-only model (with the assumption being that unlabeled sites are + * more likely to populate such regions than are training sites). A second, negative model can then be trained, + * and the resulting scores (which are presumably higher in regions of annotation space that are less densely + * populated by the original training sites) can be subtracted from the original scores to produce a final score. + * (Note that this positive-negative approach could be considered as a single iteration of a more general + * approach typically referred to as positive-unlabeled learning.) + *

+ * + *

+ * A positive-only approach is likely to perform well in cases where a sufficient number of reliable training sites + * is available. In contrast, if 1) only a small number of reliable training sites is available, and/or + * 2) the reliability of the training sites is questionable (e.g., the sites may be contaminated by + * a non-negligible number of sequencing artifacts), then a positive-negative approach may be beneficial. + * However, note that the positive-negative approach introduces an additional hyperparameter---the threshold + * that determines the selection of sites for training the negative model, controlled by the + * {@value CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} argument---which may require tuning. + * Further note that although {@link VariantRecalibrator} (which this tool supplants) has typically been used to + * implement a positive-negative approach, a positive-only approach likely suffices in many use cases. + *

+ * + *

+ * If a positive-only approach has been specified, then if training sites of the variant type are available: + * + *

    + *
  • 1) A positive model is trained using these training sites and is serialized to file,
  • + *
  • 2) Scores for these training sites are generated using the positive model and output to a file,
  • + *
  • 3) If calibration sites of the variant type are available, scores for these calibration sites are + * generated using the positive model and output to a file.
  • + *
+ * + * Additionally, if a positive-negative approach has been specified (i.e., the {@value UNLABELED_ANNOTATIONS_HDF5_LONG_NAME} + * and {@value CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} arguments have been provided), + * and if both unlabeled and calibration sites of the variant type are available, then: + * + *
    + *
  • 4) The calibration scores generated from the positive model are used to convert the + * calibration-sensitivity threshold into a score threshold,
  • + *
  • 5) Training sites with scores below the score threshold are selected for training a negative model,
  • + *
  • 6) Scores for unlabeled sites are generated using the positive model and output to a file,
  • + *
  • 7) Unlabeled sites with scores below the score threshold are selected for training a negative model,
  • + *
  • 8) A negative model is trained using these selected training and unlabeled sites and is serialized to file,
  • + *
  • 9) Scores for calibration sites are generated using the positive-negative model and overwritten in the existing file.
  • + *
+ * + * Note that the positive-negative approach thus yields 1) scores for training and unlabeled sites generated from + * the positive model and 2) scores for calibration sites generated from the positive-negative model. This is opposed + * to generating scores from all sites from the positive-negative model, since these can simply be obtained from + * a downstream run of {@link ScoreVariantAnnotations}. + *

+ * + *

Modeling backends

+ * + *

+ * This tool allows the use of different backends for modeling and scoring. See also below + * for instructions for using a custom, user-provided implementation. + *

+ * + *

Python isolation-forest backend

+ * + *

+ * + * This backend uses scikit-learn modules to train models and scoring functions using the + * isolation-forest method for anomaly detection. + * Median imputation of missing annotation values is performed before applying the method. + *

+ * + *

+ * This backend can be selected by specifying {@code PYTHON_IFOREST} to the {@value MODEL_BACKEND_LONG_NAME} argument + * and is also currently the default backend. It is implemented by the script at + * src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py, which + * requires that the argparse, h5py, numpy, sklearn, and dill packages be present in the Python environment; users + * may wish to simply use the provided GATK conda environment to ensure that the correct versions of all packages are available. + * See the IsolationForest documentation here + * as appropriate for the version of scikit-learn used in your Python environment. The hyperparameters documented + * there can be specified using the {@value HYPERPARAMETERS_JSON_LONG_NAME} argument; see + * src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json + * for an example and the default values. + *

+ * + *

+ * Note that HDF5 files may be viewed using hdfview + * or loaded in Python using PyTables or h5py. + *

+ * + *

Calibration sets

+ * + *

+ * The choice of calibration set will determine the conversion between model scores and calibration-set sensitivities. + * Ideally, the calibration set should be comprised of an unbiased sample from the full distribution of true sites + * in annotation space; the score-sensitivity conversion can roughly be thought of as a mapping from sensitivities in + * [0, 1] to a contour of this annotation-space distribution. In practice, any biases in the calibration set (e.g., + * if it consists of high quality, previously filtered calls, which may be biased towards the high density regions + * of the full distribution) will be reflected in the conversion and should be taken into consideration when + * interpreting calibration-set sensitivities. + *

+ * + *

Inputs

+ * + *
    + *
  • + * Labeled-annotations HDF5 file (.annot.hdf5). Annotation data and metadata for labeled sites are stored in the + * HDF5 directory structure given in the documentation for the {@link ExtractVariantAnnotations} tool. In typical + * usage, both the {@value LabeledVariantAnnotationsData#TRAINING_LABEL} and + * {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL} labels would be available for non-empty sets of + * sites of the requested variant type. + *
  • + *
  • + * (Optional) Unlabeled-annotations HDF5 file (.unlabeled.annot.hdf5). Annotation data and metadata for + * unlabeled sites are stored in the HDF5 directory structure given in the documentation for the + * {@link ExtractVariantAnnotations} tool. If provided, a positive-negative modeling approach (similar to + * that used in {@link VariantRecalibrator}) will be used. + *
  • + *
  • + * Variant types (i.e., SNP and/or INDEL) for which to train models. Logic for determining variant type was retained from + * {@link VariantRecalibrator}; see {@link VariantType}. A separate model will be trained for each variant type + * and separate sets of outputs with corresponding tags in the filenames (i.e., "snp" or "indel") will be produced. + * Alternatively, the tool can be run twice, once for each variant type; this may be useful if one wishes to use + * different argument values or modeling approaches. + *
  • + *
  • + * (Optional) Model backend. The Python isolation-forest backend is currently the default backend. + * A custom backend can also be specified in conjunction with the {@value PYTHON_SCRIPT_LONG_NAME} argument. + *
  • + *
  • + * (Optional) Model hyperparameters JSON file. This file can be used to specify backend-specific + * hyperparameters in JSON format, which is to be consumed by the modeling script. This is required if a + * custom backend is used. + *
  • + *
  • + * (Optional) Calibration-set sensitivity threshold. The same threshold will be used for both SNP and INDEL + * variant types. If different thresholds are desired, the tool can be run twice, once for each variant type. + *
  • + *
  • + * Output prefix. + * This is used as the basename for output files. + *
  • + *
+ * + *

Outputs

+ * + *

+ * The following outputs are produced for each variant type specified by the {@value MODE_LONG_NAME} argument + * and are delineated by type-specific tags in the filename of each output, which take the form of + * {@code {output-prefix}.{variant-type}.{file-suffix}}. For example, scores for the SNP calibration set + * will be output to the {@code {output-prefix}.snp.calibrationScores.hdf5} file. + *

+ * + *
    + *
  • + * Training-set positive-model scores HDF5 file (.trainingScores.hdf5). + *
  • + *
  • + * Positive-model serialized scorer file. (.scorer.pkl for the default {@code PYTHON_IFOREST} model backend). + *
  • + *
  • + * (Optional) Unlabeled-set positive-model scores HDF5 file (.unlabeledScores.hdf5). This is only output + * if a positive-negative modeling approach is used. + *
  • + *
  • + * (Optional) Calibration-set scores HDF5 file (.calibrationScores.hdf5). This is only output if a calibration + * set is provided. If a positive-only modeling approach is used, scores will be generated from the positive model; + * if a positive-negative modeling approach is used, scores will be generated from the positive-negative model. + *
  • + *
  • + * (Optional) Negative-model serialized scorer file. (.negative.scorer.pkl for the default {@code PYTHON_IFOREST} model backend). + * This is only output if a positive-negative modeling approach is used. + *
  • + *
+ * + *

Usage examples

+ * + *

+ * Train SNP and INDEL models using the default Python IsolationForest model backend with a positive-only approach, + * given an input labeled-annotations HDF5 file generated by {@link ExtractVariantAnnotations} that contains + * labels for both training and calibration sets, producing the outputs 1) train.snp.scorer.pkl, + * 2) train.snp.trainingScores.hdf5, and 3) train.snp.calibrationScores.hdf5, as well as analogous files + * for the INDEL model. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both + * SNP and INDEL modes are selected by default. + * + *

+ *     gatk TrainVariantAnnotationsModel \
+ *          --annotations-hdf5 extract.annot.hdf5 \
+ *          --mode SNP \
+ *          --mode INDEL \
+ *          -O train
+ * 
+ *

+ * + *

+ * Train SNP and INDEL models using the default Python IsolationForest model backend with a positive-negative approach + * (using a calibration-sensitivity threshold of 0.95 to select sites for training the negative model), + * given an input labeled-annotations HDF5 file that contains labels for both training and calibration sets + * and an input unlabeled-annotations HDF5 file (with both HDF5 files generated by {@link ExtractVariantAnnotations}), + * producing the outputs 1) train.snp.scorer.pkl, 2) train.snp.negative.scorer.pkl, 3) train.snp.trainingScores.hdf5, + * 4) train.snp.calibrationScores.hdf5, and 5) train.snp.unlabeledScores.hdf5, as well as analogous files + * for the INDEL model. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both + * SNP and INDEL modes are selected by default. + * + *

+ *     gatk TrainVariantAnnotationsModel \
+ *          --annotations-hdf5 extract.annot.hdf5 \
+ *          --unlabeled-annotations-hdf5 extract.unlabeled.annot.hdf5 \
+ *          --mode SNP \
+ *          --mode INDEL \
+ *          --calibration-sensitivity-threshold 0.95 \
+ *          -O train
+ * 
+ *

+ * + *

Custom modeling/scoring backends (ADVANCED)

+ * + *

+ * The primary modeling functionality performed by this tool is accomplished by a "modeling backend" + * whose fundamental contract is to take an input HDF5 file containing an annotation matrix for sites of a + * single variant type (i.e., SNP or INDEL) and to output a serialized scorer for that variant type. + * Rather than using one of the available, implemented backends, advanced users may provide their own backend + * via the {@value PYTHON_SCRIPT_LONG_NAME} argument. See documentation in the modeling and scoring interfaces + * ({@link VariantAnnotationsModel} and {@link VariantAnnotationsScorer}, respectively), as well as the default + * Python IsolationForest implementation at {@link PythonSklearnVariantAnnotationsModel} and + * src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py. + *

+ * + *

+ * Extremely advanced users could potentially substitute their own implementation for the entire + * {@link TrainVariantAnnotationsModel} tool, while still making use of the up/downstream + * {@link ExtractVariantAnnotations} and {@link ScoreVariantAnnotations} tools. To do so, one would additionally + * have to implement functionality for subsetting training/calibration sets by variant type, + * calling modeling backends as appropriate, and scoring calibration sets. + *

+ * + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Trains a model for scoring variant calls based on site-level annotations.", + oneLineSummary = "Trains a model for scoring variant calls based on site-level annotations", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public final class TrainVariantAnnotationsModel extends CommandLineProgram { + + public static final String MODE_LONG_NAME = "mode"; + public static final String ANNOTATIONS_HDF5_LONG_NAME = "annotations-hdf5"; + public static final String UNLABELED_ANNOTATIONS_HDF5_LONG_NAME = "unlabeled-annotations-hdf5"; + public static final String MODEL_BACKEND_LONG_NAME = "model-backend"; + public static final String PYTHON_SCRIPT_LONG_NAME = "python-script"; + public static final String HYPERPARAMETERS_JSON_LONG_NAME = "hyperparameters-json"; + public static final String CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "calibration-sensitivity-threshold"; + + public static final String ISOLATION_FOREST_PYTHON_SCRIPT = "isolation-forest.py"; + public static final String ISOLATION_FOREST_HYPERPARAMETERS_JSON = "isolation-forest-hyperparameters.json"; + + enum AvailableLabelsMode { + POSITIVE_ONLY, POSITIVE_UNLABELED + } + + public static final String TRAINING_SCORES_HDF5_SUFFIX = ".trainingScores.hdf5"; + public static final String CALIBRATION_SCORES_HDF5_SUFFIX = ".calibrationScores.hdf5"; + public static final String UNLABELED_SCORES_HDF5_SUFFIX = ".unlabeledScores.hdf5"; + public static final String NEGATIVE_TAG = ".negative"; + + @Argument( + fullName = ANNOTATIONS_HDF5_LONG_NAME, + doc = "HDF5 file containing annotations extracted with ExtractVariantAnnotations.") + private File inputAnnotationsFile; + + @Argument( + fullName = UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, + doc = "HDF5 file containing annotations extracted with ExtractVariantAnnotations. 
" + + "If specified with " + CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME + ", " + + "a positive-unlabeled modeling approach will be used; otherwise, a positive-only modeling " + + "approach will be used.", + optional = true) + private File inputUnlabeledAnnotationsFile; + + @Argument( + fullName = MODEL_BACKEND_LONG_NAME, + doc = "Backend to use for training models. " + + "JAVA_BGMM will use a pure Java implementation (ported from Python scikit-learn) of the Bayesian Gaussian Mixture Model. " + + "PYTHON_IFOREST will use the Python scikit-learn implementation of the IsolationForest method and " + + "will require that the corresponding Python dependencies are present in the environment. " + + "PYTHON_SCRIPT will use the script specified by the " + PYTHON_SCRIPT_LONG_NAME + " argument. " + + "See the tool documentation for more details.") + private VariantAnnotationsModelBackend modelBackend = VariantAnnotationsModelBackend.PYTHON_IFOREST; + + @Argument( + fullName = PYTHON_SCRIPT_LONG_NAME, + doc = "Python script used for specifying a custom scoring backend. If provided, " + MODEL_BACKEND_LONG_NAME + " must also be set to PYTHON_SCRIPT.", + optional = true) + private File pythonScriptFile; + + @Argument( + fullName = HYPERPARAMETERS_JSON_LONG_NAME, + doc = "JSON file containing hyperparameters. Optional if the PYTHON_IFOREST backend is used " + + "(if not specified, a default set of hyperparameters will be used); otherwise required.", + optional = true) + private File hyperparametersJSONFile; + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output prefix.") + private String outputPrefix; + + @Argument( + fullName = CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, + doc = "Calibration-sensitivity threshold that determines which sites will be used for training the negative model " + + "in the positive-unlabeled modeling approach. 
" + + "Increasing this will decrease the corresponding positive-model score threshold; sites with scores below this score " + + "threshold will be used for training the negative model. Thus, this parameter should typically be chosen to " + + "be close to 1, so that sites that score highly according to the positive model will not be used to train the negative model. " + + "The " + UNLABELED_ANNOTATIONS_HDF5_LONG_NAME + " argument must be specified in conjunction with this argument. " + + "If separate thresholds for SNP and INDEL models are desired, run the tool separately for each mode with its respective threshold.", + optional = true, + minValue = 0., + maxValue = 1.) + private Double calibrationSensitivityThreshold; + + @Argument( + fullName = MODE_LONG_NAME, + doc = "Variant types for which to train models. Duplicate values will be ignored.", + minElements = 1) + public List variantTypes = new ArrayList<>(Arrays.asList(VariantType.SNP, VariantType.INDEL)); + + private AvailableLabelsMode availableLabelsMode; + + @Override + protected Object doWork() { + + validateArgumentsAndSetModes(); + + logger.info("Starting training..."); + + for (final VariantType variantType : VariantType.values()) { // enforces order in which models are trained + if (variantTypes.contains(variantType)) { + doModelingWorkForVariantType(variantType); + } + } + + logger.info(String.format("%s complete.", getClass().getSimpleName())); + + return null; + } + + private void validateArgumentsAndSetModes() { + IOUtils.canReadFile(inputAnnotationsFile); + + Utils.validateArg((inputUnlabeledAnnotationsFile == null) == (calibrationSensitivityThreshold == null), + "Unlabeled annotations and calibration-sensitivity threshold must both be unspecified (for positive-only model training) " + + "or specified (for positive-negative model training)."); + + availableLabelsMode = inputUnlabeledAnnotationsFile != null && calibrationSensitivityThreshold != null + ? 
AvailableLabelsMode.POSITIVE_UNLABELED + : AvailableLabelsMode.POSITIVE_ONLY; + + if (inputUnlabeledAnnotationsFile != null) { + IOUtils.canReadFile(inputUnlabeledAnnotationsFile); + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(inputAnnotationsFile); + final List unlabeledAnnotationNames = LabeledVariantAnnotationsData.readAnnotationNames(inputUnlabeledAnnotationsFile); + Utils.validateArg(annotationNames.equals(unlabeledAnnotationNames), "Annotation names must be identical for positive and unlabeled annotations."); + } + + switch (modelBackend) { + case JAVA_BGMM: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using JAVA_BGMM backend."); + IOUtils.canReadFile(hyperparametersJSONFile); + logger.info("Running in JAVA_BGMM mode..."); + break; + case PYTHON_IFOREST: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using PYTHON_IFOREST backend."); + + pythonScriptFile = IOUtils.writeTempResource(new Resource(ISOLATION_FOREST_PYTHON_SCRIPT, TrainVariantAnnotationsModel.class)); + if (hyperparametersJSONFile == null) { + hyperparametersJSONFile = IOUtils.writeTempResource(new Resource(ISOLATION_FOREST_HYPERPARAMETERS_JSON, TrainVariantAnnotationsModel.class)); + } + IOUtils.canReadFile(hyperparametersJSONFile); + PythonScriptExecutor.checkPythonEnvironmentForPackage("argparse"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("h5py"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("numpy"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("sklearn"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("dill"); + logger.info("Running in PYTHON_IFOREST mode..."); + break; + case PYTHON_SCRIPT: + IOUtils.canReadFile(pythonScriptFile); + IOUtils.canReadFile(hyperparametersJSONFile); + logger.info("Running in PYTHON_SCRIPT mode..."); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model-backend 
mode."); + } + } + + /** + * This method does all modeling and scoring work for a given {@code variantType}. See the tool-level documentation + * for the steps expected to be performed. + */ + private void doModelingWorkForVariantType(final VariantType variantType) { + // positive model + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(inputAnnotationsFile); + final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(inputAnnotationsFile); + + final List isTraining = LabeledVariantAnnotationsData.readLabel(inputAnnotationsFile, LabeledVariantAnnotationsData.TRAINING_LABEL); + final List isCalibration = LabeledVariantAnnotationsData.readLabel(inputAnnotationsFile, LabeledVariantAnnotationsData.CALIBRATION_LABEL); + final List isSNP = LabeledVariantAnnotationsData.readLabel(inputAnnotationsFile, LabeledVariantAnnotationsData.SNP_LABEL); + final List isVariantType = variantType == VariantType.SNP ? isSNP : isSNP.stream().map(x -> !x).collect(Collectors.toList()); + + final List isTrainingAndVariantType = Streams.zip(isTraining.stream(), isVariantType.stream(), (a, b) -> a && b).collect(Collectors.toList()); + final int numTrainingAndVariantType = numPassingFilter(isTrainingAndVariantType); + + final String variantTypeString = variantType.toString(); + final String outputPrefixTag = '.' 
+ variantType.toString().toLowerCase(); + + if (numTrainingAndVariantType > 0) { + logger.info(String.format("Training %s model with %d training sites x %d annotations %s...", + variantTypeString, numTrainingAndVariantType, annotationNames.size(), annotationNames)); + final File labeledTrainingAndVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isTrainingAndVariantType); + trainAndSerializeModel(labeledTrainingAndVariantTypeAnnotationsFile, outputPrefixTag); + logger.info(String.format("%s model trained and serialized with output prefix \"%s\".", variantTypeString, outputPrefix + outputPrefixTag)); + + if (modelBackend == VariantAnnotationsModelBackend.JAVA_BGMM) { + BGMMVariantAnnotationsScorer.preprocessAnnotationsWithBGMMAndWriteHDF5( + annotationNames, outputPrefix + outputPrefixTag, labeledTrainingAndVariantTypeAnnotationsFile, logger); + } + + logger.info(String.format("Scoring %d %s training sites...", numTrainingAndVariantType, variantTypeString)); + final File labeledTrainingAndVariantTypeScoresFile = score(labeledTrainingAndVariantTypeAnnotationsFile, outputPrefixTag, TRAINING_SCORES_HDF5_SUFFIX); + logger.info(String.format("%s training scores written to %s.", variantTypeString, labeledTrainingAndVariantTypeScoresFile.getAbsolutePath())); + + final List isLabeledCalibrationAndVariantType = Streams.zip(isCalibration.stream(), isVariantType.stream(), (a, b) -> a && b).collect(Collectors.toList()); + final int numLabeledCalibrationAndVariantType = numPassingFilter(isLabeledCalibrationAndVariantType); + if (numLabeledCalibrationAndVariantType > 0) { + logger.info(String.format("Scoring %d %s calibration sites...", numLabeledCalibrationAndVariantType, variantTypeString)); + final File labeledCalibrationAndVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isLabeledCalibrationAndVariantType); + final File 
labeledCalibrationAndVariantTypeScoresFile = score(labeledCalibrationAndVariantTypeAnnotationsFile, outputPrefixTag, CALIBRATION_SCORES_HDF5_SUFFIX); + logger.info(String.format("%s calibration scores written to %s.", variantTypeString, labeledCalibrationAndVariantTypeScoresFile.getAbsolutePath())); + } else { + logger.warn(String.format("No %s calibration sites were available.", variantTypeString)); + } + + // negative model + if (availableLabelsMode == AvailableLabelsMode.POSITIVE_UNLABELED) { + if (numLabeledCalibrationAndVariantType == 0) { + throw new UserException.BadInput(String.format("Attempted to train %s negative model, " + + "but no suitable calibration sites were found in the provided annotations.", variantTypeString)); + } + final double[][] unlabeledAnnotations = LabeledVariantAnnotationsData.readAnnotations(inputUnlabeledAnnotationsFile); + final List unlabeledIsSNP = LabeledVariantAnnotationsData.readLabel(inputUnlabeledAnnotationsFile, "snp"); + final List isUnlabeledVariantType = variantType == VariantType.SNP ? unlabeledIsSNP : unlabeledIsSNP.stream().map(x -> !x).collect(Collectors.toList()); + + final int numUnlabeledVariantType = numPassingFilter(isUnlabeledVariantType); + + if (numUnlabeledVariantType > 0) { + final File labeledCalibrationAndVariantTypeScoresFile = new File(outputPrefix + outputPrefixTag + CALIBRATION_SCORES_HDF5_SUFFIX); + final double[] labeledCalibrationAndVariantTypeScores = VariantAnnotationsScorer.readScores(labeledCalibrationAndVariantTypeScoresFile); + final double scoreThreshold = calibrationSensitivityThreshold == 1. // Percentile requires quantile > 0, so we treat this as a special case + ? Doubles.min(labeledCalibrationAndVariantTypeScores) + : new Percentile(100. * (1. 
- calibrationSensitivityThreshold)).evaluate(labeledCalibrationAndVariantTypeScores); + logger.info(String.format("Using %s score threshold of %.4f corresponding to specified calibration-sensitivity threshold of %.4f ...", + variantTypeString, scoreThreshold, calibrationSensitivityThreshold)); + + final double[] labeledTrainingAndVariantTypeScores = VariantAnnotationsScorer.readScores(labeledTrainingAndVariantTypeScoresFile); + final List isNegativeTrainingFromLabeledTrainingAndVariantType = Arrays.stream(labeledTrainingAndVariantTypeScores).boxed().map(s -> s < scoreThreshold).collect(Collectors.toList()); + final int numNegativeTrainingFromLabeledTrainingAndVariantType = numPassingFilter(isNegativeTrainingFromLabeledTrainingAndVariantType); + logger.info(String.format("Selected %d labeled %s sites below score threshold of %.4f for negative-model training...", + numNegativeTrainingFromLabeledTrainingAndVariantType, variantTypeString, scoreThreshold)); + + logger.info(String.format("Scoring %d unlabeled %s sites...", numUnlabeledVariantType, variantTypeString)); + final File unlabeledVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, unlabeledAnnotations, isUnlabeledVariantType); + final File unlabeledVariantTypeScoresFile = score(unlabeledVariantTypeAnnotationsFile, outputPrefixTag, UNLABELED_SCORES_HDF5_SUFFIX); + final double[] unlabeledVariantTypeScores = VariantAnnotationsScorer.readScores(unlabeledVariantTypeScoresFile); + final List isNegativeTrainingFromUnlabeledVariantType = Arrays.stream(unlabeledVariantTypeScores).boxed().map(s -> s < scoreThreshold).collect(Collectors.toList()); // length matches unlabeledAnnotationsFile + final int numNegativeTrainingFromUnlabeledVariantType = numPassingFilter(isNegativeTrainingFromUnlabeledVariantType); + logger.info(String.format("Selected %d unlabeled %s sites below score threshold of %.4f for negative-model training...", + 
numNegativeTrainingFromUnlabeledVariantType, variantTypeString, scoreThreshold)); + + final double[][] negativeTrainingAndVariantTypeAnnotations = concatenateLabeledAndUnlabeledNegativeTrainingData( + annotationNames, annotations, unlabeledAnnotations, isNegativeTrainingFromLabeledTrainingAndVariantType, isNegativeTrainingFromUnlabeledVariantType); + final int numNegativeTrainingAndVariantType = negativeTrainingAndVariantTypeAnnotations.length; + final List isNegativeTrainingAndVariantType = Collections.nCopies(numNegativeTrainingAndVariantType, true); + + logger.info(String.format("Training %s negative model with %d negative-training sites x %d annotations %s...", + variantTypeString, numNegativeTrainingAndVariantType, annotationNames.size(), annotationNames)); + final File negativeTrainingAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile( + annotationNames, negativeTrainingAndVariantTypeAnnotations, isNegativeTrainingAndVariantType); + trainAndSerializeModel(negativeTrainingAnnotationsFile, outputPrefixTag + NEGATIVE_TAG); + logger.info(String.format("%s negative model trained and serialized with output prefix \"%s\".", variantTypeString, outputPrefix + outputPrefixTag + NEGATIVE_TAG)); + + logger.info(String.format("Re-scoring %d %s calibration sites...", numLabeledCalibrationAndVariantType, variantTypeString)); + final File labeledCalibrationAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isLabeledCalibrationAndVariantType); + final File labeledCalibrationScoresFile = positiveNegativeScore(labeledCalibrationAnnotationsFile, outputPrefixTag, CALIBRATION_SCORES_HDF5_SUFFIX); + logger.info(String.format("Calibration scores written to %s.", labeledCalibrationScoresFile.getAbsolutePath())); + } else { + throw new UserException.BadInput(String.format("Attempted to train %s negative model, " + + "but no suitable unlabeled sites were found in the provided annotations.", 
variantTypeString)); + } + } + } else { + throw new UserException.BadInput(String.format("Attempted to train %s model, " + + "but no suitable training sites were found in the provided annotations.", variantTypeString)); + } + } + + private static int numPassingFilter(final List isPassing) { + return (int) isPassing.stream().filter(x -> x).count(); + } + + private void trainAndSerializeModel(final File trainingAnnotationsFile, + final String outputPrefixTag) { + readAndValidateTrainingAnnotations(trainingAnnotationsFile, outputPrefixTag); + final VariantAnnotationsModel model; + switch (modelBackend) { + case JAVA_BGMM: + model = new BGMMVariantAnnotationsModel(hyperparametersJSONFile); + break; + case PYTHON_IFOREST: + model = new PythonSklearnVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile); + break; + case PYTHON_SCRIPT: + model = new PythonSklearnVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model mode."); + } + model.trainAndSerialize(trainingAnnotationsFile, outputPrefix + outputPrefixTag); + } + + /** + * When training models on data that has been subset to a given variant type, + * we FAIL if any annotation is completely missing and WARN if any annotation has zero variance. 
+ */ + private void readAndValidateTrainingAnnotations(final File trainingAnnotationsFile, + final String outputPrefixTag) { + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(trainingAnnotationsFile); + final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(trainingAnnotationsFile); + + // these checks are redundant, but we err on the side of robustness + final int numAnnotationNames = annotationNames.size(); + final int numData = annotations.length; + Utils.validateArg(numAnnotationNames > 0, "Number of annotation names must be positive."); + Utils.validateArg(numData > 0, "Number of data points must be positive."); + final int numFeatures = annotations[0].length; + Utils.validateArg(numAnnotationNames == numFeatures, + "Number of annotation names must match the number of features in the annotation data."); + + final List completelyMissingAnnotationNames = new ArrayList<>(numFeatures); + IntStream.range(0, numFeatures).forEach( + i -> { + if (new Variance().evaluate(IntStream.range(0, numData).mapToDouble(n -> annotations[n][i]).toArray()) == 0.) { + logger.warn(String.format("All values of the annotation %s are identical in the training data for the %s model.", + annotationNames.get(i), outputPrefix + outputPrefixTag)); + } + if (IntStream.range(0, numData).boxed().map(n -> annotations[n][i]).allMatch(x -> Double.isNaN(x))) { + completelyMissingAnnotationNames.add(annotationNames.get(i)); + } + } + ); + + if (!completelyMissingAnnotationNames.isEmpty()) { + throw new UserException.BadInput( + String.format("All values of the following annotations are missing in the training data for the %s model: %s. " + + "Consider repeating the extraction step with this annotation dropped. 
" + + "If this is a negative model and the amount of negative training data is small, " + + "perhaps also consider lowering the value of the %s argument so that more " + + "training data is considered, which may ultimately admit data with non-missing values for the annotation " + + "(although note that this will also have implications for the resulting model fit); " + + "alternatively, consider excluding the %s and %s arguments and running positive-only modeling.", + outputPrefix + outputPrefixTag, completelyMissingAnnotationNames, + CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME)); + } + } + + private File score(final File annotationsFile, + final String outputPrefixTag, + final String outputSuffix) { + final VariantAnnotationsScorer scorer; + switch (modelBackend) { + case JAVA_BGMM: + scorer = BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX)); + break; + case PYTHON_IFOREST: + case PYTHON_SCRIPT: + scorer = new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX)); + break; + + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model mode."); + } + final File outputScoresFile = new File(outputPrefix + outputPrefixTag + outputSuffix); + scorer.score(annotationsFile, outputScoresFile); + return outputScoresFile; + } + + private File positiveNegativeScore(final File annotationsFile, + final String outputPrefixTag, + final String outputSuffix) { + final VariantAnnotationsScorer scorer; + switch (modelBackend) { + case JAVA_BGMM: + scorer = VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX)), + 
BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX))); + break; + case PYTHON_IFOREST: + case PYTHON_SCRIPT: + scorer = VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX)), + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX))); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model mode."); + } + final File outputScoresFile = new File(outputPrefix + outputPrefixTag + outputSuffix); + scorer.score(annotationsFile, outputScoresFile); + return outputScoresFile; + } + + private static double[][] concatenateLabeledAndUnlabeledNegativeTrainingData(final List annotationNames, + final double[][] annotations, + final double[][] unlabeledAnnotations, + final List isNegativeTrainingFromLabeledTrainingAndVariantType, + final List isNegativeTrainingFromUnlabeledVariantType) { + final File negativeTrainingFromLabeledTrainingAndVariantTypeAnnotationsFile = + LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isNegativeTrainingFromLabeledTrainingAndVariantType); + final double[][] negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations = LabeledVariantAnnotationsData.readAnnotations(negativeTrainingFromLabeledTrainingAndVariantTypeAnnotationsFile); + + final File negativeTrainingFromUnlabeledVariantTypeAnnotationsFile = + LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, unlabeledAnnotations, isNegativeTrainingFromUnlabeledVariantType); + final double[][] negativeTrainingFromUnlabeledVariantTypeAnnotations = 
LabeledVariantAnnotationsData.readAnnotations(negativeTrainingFromUnlabeledVariantTypeAnnotationsFile); + + return Streams.concat( + Arrays.stream(negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations), + Arrays.stream(negativeTrainingFromUnlabeledVariantTypeAnnotations)).toArray(double[][]::new); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java new file mode 100644 index 00000000000..75d8046f09a --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java @@ -0,0 +1,284 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data; + +import com.google.common.collect.ImmutableList; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hdf5.HDF5LibException; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.TreeSet; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +/** + * Represents a collection of {@link 
LabeledVariantAnnotationsDatum} as a list of lists of datums. + * The outer list is always per-variant. In allele-specific mode, each datum in the inner lists + * corresponds to a single allele; otherwise, each inner list trivially contains a single datum corresponding + * to the variant. + */ +public final class LabeledVariantAnnotationsData { + private static final Logger logger = LogManager.getLogger(LabeledVariantAnnotationsData.class); + + // chunk size in temporary annotation files + // TODO this could be exposed + private static final int CHUNK_DIVISOR = 16; + private static final int MAXIMUM_CHUNK_SIZE = HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX / CHUNK_DIVISOR; + + private static final int INITIAL_SIZE = 10_000_000; + + public static final String TRAINING_LABEL = "training"; + public static final String CALIBRATION_LABEL = "calibration"; + public static final String SNP_LABEL = "snp"; + + public static final String INTERVALS_PATH = "/intervals"; + public static final String ALLELES_REF_PATH = "/alleles/ref"; + public static final String ALLELES_ALT_PATH = "/alleles/alt"; + public static final String ANNOTATIONS_NAMES_PATH = "/annotations/names"; + public static final String ANNOTATIONS_PATH = "/annotations"; + public static final String LABELS_PATH = "/labels"; + public static final String LABELS_SNP_PATH = LABELS_PATH + "/snp"; + + private final List sortedAnnotationNames; + final List sortedLabels; + + private final List> data; + private final boolean useASAnnotations; + + public LabeledVariantAnnotationsData(final Collection annotationNames, + final Collection labels, + final boolean useASAnnotations, + final int initialSize) { + data = new ArrayList<>(initialSize); + sortedAnnotationNames = ImmutableList.copyOf(annotationNames.stream().distinct().sorted().collect(Collectors.toList())); + Utils.validateArg(sortedAnnotationNames.size() > 0, "Number of annotation names must be positive."); + if (sortedAnnotationNames.size() != annotationNames.size()) 
{ + logger.warn(String.format("Ignoring duplicate annotations: %s.", Utils.getDuplicatedItems(annotationNames))); + } + sortedLabels = ImmutableList.copyOf(labels.stream().distinct().sorted().collect(Collectors.toList())); + if (sortedLabels.size() != labels.size()) { + logger.warn(String.format("Ignoring duplicate labels: %s.", Utils.getDuplicatedItems(labels))); + } + this.useASAnnotations = useASAnnotations; + } + + public LabeledVariantAnnotationsData(final Collection annotationNames, + final Collection labels, + final boolean useASAnnotations) { + this(annotationNames, labels, useASAnnotations, INITIAL_SIZE); + } + + public List getSortedAnnotationNames() { + return sortedAnnotationNames; + } + + public List getSortedLabels() { + return sortedLabels; + } + + public int size() { + return data.size(); + } + + public void clear() { + data.clear(); + } + + /** + * Adds an element to the underlying {@link #data} collection. + */ + public void add(final VariantContext vc, + final List> altAllelesPerDatum, + final List variantTypePerDatum, + final List> labelsPerDatum) { + if (!useASAnnotations) { + data.add(Collections.singletonList(new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(0), variantTypePerDatum.get(0), labelsPerDatum.get(0), sortedAnnotationNames, useASAnnotations))); + } else { + data.add(IntStream.range(0, altAllelesPerDatum.size()).boxed() + .map(i -> new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(i), variantTypePerDatum.get(i), labelsPerDatum.get(i), sortedAnnotationNames, useASAnnotations)) + .collect(Collectors.toList())); + } + } + + /** + * Sets the element at a specified index in the underlying {@link #data} collection. 
+ */ + public void set(final int index, + final VariantContext vc, + final List> altAllelesPerDatum, + final List variantTypePerDatum, + final List> labelsPerDatum) { + if (!useASAnnotations) { + data.set(index, Collections.singletonList(new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(0), variantTypePerDatum.get(0), labelsPerDatum.get(0), sortedAnnotationNames, useASAnnotations))); + } else { + data.set(index, IntStream.range(0, altAllelesPerDatum.size()).boxed() + .map(i -> new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(i), variantTypePerDatum.get(i), labelsPerDatum.get(i), sortedAnnotationNames, useASAnnotations)) + .collect(Collectors.toList())); + } + } + + /** + * @return list of {@link VariantType} indicators, with length given by the number of corresponding sites + */ + public List getVariantTypeFlat() { + return streamFlattenedData().map(datum -> datum.variantType).collect(Collectors.toList()); + } + + /** + * @return list of boolean label indicators, with length given by the number of sites; + * an element in the list will be true if the corresponding site is assigned to the specified label + */ + public List isLabelFlat(final String label) { + return streamFlattenedData().map(datum -> datum.labels.contains(label)).collect(Collectors.toList()); + } + + private Stream streamFlattenedData() { + return data.stream().flatMap(List::stream); + } + + /** + * Writes a representation of the collection to an HDF5 file with the following directory structure: + * + *

+ * |--- alleles
+ * | |--- alt
+ * | |--- ref
+ * |--- annotations
+ * | |--- chunk_0
+ * | |--- ...
+ * | |--- chunk_{num_chunks - 1}
+ * | |--- names
+ * | |--- num_chunks
+ * | |--- num_columns
+ * | |--- num_rows
+ * |--- intervals
+ * | |--- indexed_contig_names
+ * | |--- transposed_index_start_end
+ * |--- labels
+ * | |--- snp
+ * | |--- ... (e.g., training, calibration, etc.)
+ * | |--- ...
+ *

+ * + * Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations). + * See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details. + * + * @param omitAllelesInHDF5 string arrays containing ref/alt alleles can be large, so we allow the option of omitting them + */ + public void writeHDF5(final File outputFile, + final boolean omitAllelesInHDF5) { + + try (final HDF5File outputHDF5File = new HDF5File(outputFile, HDF5File.OpenMode.CREATE)) { + IOUtils.canReadFile(outputHDF5File.getFile()); + HDF5Utils.writeIntervals(outputHDF5File, INTERVALS_PATH, + streamFlattenedData().map(datum -> datum.interval).collect(Collectors.toList())); + if (!omitAllelesInHDF5) { + outputHDF5File.makeStringArray(ALLELES_REF_PATH, + streamFlattenedData().map(datum -> datum.refAllele.getDisplayString()).toArray(String[]::new)); + if (!useASAnnotations) { + outputHDF5File.makeStringArray(ALLELES_ALT_PATH, + streamFlattenedData() + .map(datum -> datum.altAlleles.stream().map(Allele::getDisplayString).collect(Collectors.joining(","))) + .toArray(String[]::new)); + } else { + outputHDF5File.makeStringArray(ALLELES_ALT_PATH, + streamFlattenedData().map(datum -> datum.altAlleles.get(0).getDisplayString()).toArray(String[]::new)); + } + } + outputHDF5File.makeStringArray(ANNOTATIONS_NAMES_PATH, sortedAnnotationNames.toArray(new String[0])); + HDF5Utils.writeChunkedDoubleMatrix(outputHDF5File, ANNOTATIONS_PATH, + streamFlattenedData().map(datum -> datum.annotations).toArray(double[][]::new), MAXIMUM_CHUNK_SIZE); + outputHDF5File.makeDoubleArray(LABELS_SNP_PATH, + streamFlattenedData().mapToDouble(datum -> datum.variantType == VariantType.SNP ? 1 : 0).toArray()); + for (final String label : sortedLabels) { + outputHDF5File.makeDoubleArray(String.format("%s/%s", LABELS_PATH, label), + streamFlattenedData().mapToDouble(datum -> datum.labels.contains(label) ? 
1 : 0).toArray()); + } + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of annotations and metadata (%s). Output file at %s may be in a bad state.", + exception, outputFile.getAbsolutePath())); + } + } + + /** + * @return list of annotation names, with length given by the number of annotations, read from the specified file + */ + public static List readAnnotationNames(final File annotationsFile) { + try (final HDF5File annotationsHDF5File = new HDF5File(annotationsFile, HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(annotationsHDF5File.getFile()); + return Arrays.asList(annotationsHDF5File.readStringArray(ANNOTATIONS_NAMES_PATH)); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of annotation names from %s: %s", + annotationsFile.getAbsolutePath(), exception)); + } + } + + /** + * @return matrix with dimensions (number of sites) x (number of annotations), read from the specified file + */ + public static double[][] readAnnotations(final File annotationsFile) { + try (final HDF5File annotationsHDF5File = new HDF5File(annotationsFile, HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(annotationsHDF5File.getFile()); + return HDF5Utils.readChunkedDoubleMatrix(annotationsHDF5File, ANNOTATIONS_PATH); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of annotations from %s: %s", + annotationsFile.getAbsolutePath(), exception)); + } + } + + /** + * @return list of boolean label indicators, with length given by the number of corresponding sites, read from the specified file; + * an element in the list will be true if the corresponding site is assigned to the specified label + */ + public static List readLabel(final File annotationsFile, + final String label) { + try (final HDF5File annotationsHDF5File = new HDF5File(annotationsFile, 
HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(annotationsHDF5File.getFile()); + return Arrays.stream(annotationsHDF5File.readDoubleArray(String.format("/labels/%s", label))).boxed().map(d -> d == 1).collect(Collectors.toList()); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of label %s from %s: %s", + label, annotationsFile.getAbsolutePath(), exception)); + } + } + + /** + * Subsets annotation data according to a boolean filter and writes a limited representation to a temporary HDF5 file. + * Intended for passing annotations via the file interfaces of {@link VariantAnnotationsModel} and {@link VariantAnnotationsScorer}. + */ + public static File subsetAnnotationsToTemporaryFile(final List annotationNames, + final double[][] allAnnotations, + final List isSubset) { + Utils.validateArg(annotationNames.size() > 0, "Number of annotation names must be positive."); + Utils.validateArg(allAnnotations.length > 0, "Number of annotation data points must be positive."); + Utils.validateArg(annotationNames.size() == allAnnotations[0].length, + "Number of annotation names must match number of features in annotation data."); + final double[][] subsetData = IntStream.range(0, isSubset.size()).boxed().filter(isSubset::get).map(i -> allAnnotations[i]).toArray(double[][]::new); + final File subsetAnnotationsFile = IOUtils.createTempFile("subset.annot", ".hdf5"); + try (final HDF5File subsetAnnotationsHDF5File = new HDF5File(subsetAnnotationsFile, HDF5File.OpenMode.CREATE)) { + subsetAnnotationsHDF5File.makeStringArray(ANNOTATIONS_NAMES_PATH, annotationNames.toArray(new String[0])); + HDF5Utils.writeChunkedDoubleMatrix(subsetAnnotationsHDF5File, ANNOTATIONS_PATH, subsetData, MAXIMUM_CHUNK_SIZE); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of annotations (%s). 
Output file at %s may be in a bad state.", + exception, subsetAnnotationsFile.getAbsolutePath())); + } + return subsetAnnotationsFile; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java new file mode 100644 index 00000000000..884529f5c56 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java @@ -0,0 +1,104 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import htsjdk.samtools.util.Locatable; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.LabeledVariantAnnotationsWalker; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; + +import java.util.List; +import java.util.TreeSet; + +/** + * Represents metadata and annotations extracted from either a variant or a single alt allele (if in allele-specific mode). + * Intended to be package-private and accessed only by {@link LabeledVariantAnnotationsData}. + */ +final class LabeledVariantAnnotationsDatum implements Locatable { + final SimpleInterval interval; + final Allele refAllele; + final ImmutableList altAlleles; // in allele-specific mode, this contains a single alt allele; otherwise, it contains all alt alleles that passed variant-type checks + final VariantType variantType; + final ImmutableSet labels; // sorted TreeSet + final double[] annotations; // TODO use ImmutableDoubleArray? 
+ + LabeledVariantAnnotationsDatum(final VariantContext vc, + final List altAlleles, + final VariantType variantType, + final TreeSet labels, + final List sortedAnnotationNames, + final boolean useASAnnotations) { + Utils.validate(!useASAnnotations || altAlleles.size() == 1, + "Datum should only be associated with one alt allele in allele-specific mode."); + this.interval = new SimpleInterval(vc); + this.refAllele = vc.getReference(); + this.altAlleles = ImmutableList.copyOf(altAlleles); + this.variantType = variantType; + this.labels = ImmutableSet.copyOf(labels); + this.annotations = sortedAnnotationNames.stream() + .mapToDouble(a -> decodeAnnotation(vc, altAlleles, a, useASAnnotations)) + .toArray(); + } + + @Override + public String getContig() { + return interval.getContig(); + } + + @Override + public int getStart() { + return interval.getStart(); + } + + @Override + public int getEnd() { + return interval.getEnd(); + } + + // code mostly retained from VQSR; some exception catching added + private static double decodeAnnotation(final VariantContext vc, + final List altAlleles, + final String annotationName, + final boolean useASAnnotations) { + double value; + try { + // if we're in allele-specific mode and an allele-specific annotation has been requested, parse the appropriate value from the list + // TODO: can we trigger allele-specific parsing based on annotation prefix or some other logic? + if (useASAnnotations && annotationName.startsWith(GATKVCFConstants.ALLELE_SPECIFIC_PREFIX)) { + final List valueList = vc.getAttributeAsList(annotationName); + final Allele altAllele = altAlleles.get(0); + // FIXME: we need to look at the ref allele here too (SL: this comment was retained from VQSR code, I'm not sure what it means...) 
+ if (vc.hasAllele(altAllele)) { + final int altIndex = vc.getAlleleIndex(altAllele) - 1; //- 1 is to convert the index from all alleles (including reference) to just alternate alleles + try { + value = Double.parseDouble((String) valueList.get(altIndex)); + } catch (final IndexOutOfBoundsException e) { + throw new UserException(String.format("Could not extract annotation %s from variant context: %s. " + + "Encountered exception: %s", annotationName, vc, e)); + } + } else { + //if somehow our alleles got mixed up + throw new IllegalStateException("Allele " + altAllele + " is not contained in the input VariantContext."); + } + } else { + try { + value = vc.getAttributeAsDouble(annotationName, Double.NaN); + } catch (final ClassCastException e) { + throw new UserException(String.format("Could not extract annotation %s from variant context: %s. " + + "Ensure that %s is specified, if desired. Encountered exception: %s", + annotationName, vc, LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME, e)); + } + } + if (Double.isInfinite(value)) { + value = Double.NaN; + } + } catch (final NumberFormatException e) { + value = Double.NaN; + } + return value; + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java new file mode 100644 index 00000000000..0c9560d76fc --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java @@ -0,0 +1,58 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; + +/** + * This code and logic for determining variant types was mostly retained from VQSR. 
+ * Note that there may be some inconsistencies and room for improvement in these definitions; + * see comments in https://github.com/broadinstitute/gatk/pull/7954. + */ +public enum VariantType { + SNP, + INDEL; + + /** + * Returns true if both {@code vc} and {@code resourceVC} are the same variant type, + * following our definitions. + */ + public static boolean checkVariantType(final VariantContext vc, + final VariantContext resourceVC) { + switch (resourceVC.getType()) { + case SNP: + case MNP: + return getVariantType(vc) == SNP; + case INDEL: + case MIXED: + case SYMBOLIC: + return getVariantType(vc) == INDEL; + default: + return false; + } + } + + public static VariantType getVariantType(final VariantContext vc) { + if (vc.isSNP() || vc.isMNP()) { + return SNP; + } else if (vc.isStructuralIndel() || vc.isIndel() || vc.isMixed() || vc.isSymbolic()) { + return INDEL; + } else { + throw new IllegalStateException("Encountered unknown variant type: " + vc.getType()); + } + } + + /** + * Note that spanning deletions are expected to be filtered out upstream of this method + * to preserve VQSR behavior; we do not explicitly check this. + * See VariantDataManager#checkVariationClass(VariantContext, Allele, VariantRecalibratorArgumentCollection.Mode), + * from which this method originated. 
+ */ + public static VariantType getAlleleSpecificVariantType(final VariantContext vc, + final Allele allele) { + if (vc.getReference().length() == allele.length()) { + // note that spanning deletions would be considered SNPs by this logic + return SNP; + } + return INDEL; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java new file mode 100644 index 00000000000..14fedaa0a98 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java @@ -0,0 +1,31 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.apache.commons.lang.NotImplementedException; + +import java.io.File; +import java.io.Serializable; + +// TODO this is just a stub, will be fleshed out in a separate PR +public final class BGMMVariantAnnotationsModel implements VariantAnnotationsModel { + + public BGMMVariantAnnotationsModel(final File hyperparametersJSONFile) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + @Override + public void trainAndSerialize(final File trainingAnnotationsFile, + final String outputPrefix) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + static final class Preprocesser implements Serializable { + private static final long serialVersionUID = 1L; + + Preprocesser() { + } + + double[][] transform(final double[][] data) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java new file mode 100644 index 00000000000..5a51dcf8dfb --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java @@ -0,0 +1,67 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hdf5.HDF5LibException; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.utils.clustering.BayesianGaussianMixtureModeller; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.File; +import java.io.Serializable; +import java.util.List; + +// TODO this is just a stub, will be fleshed out in a separate PR +public final class BGMMVariantAnnotationsScorer implements VariantAnnotationsScorer, Serializable { + + private static final long serialVersionUID = 1L; + + public static final String BGMM_SCORER_SER_SUFFIX = ".bgmmScorer.ser"; + + public BGMMVariantAnnotationsScorer(final List annotationNames, + final BGMMVariantAnnotationsModel.Preprocesser preprocesser, + final BayesianGaussianMixtureModeller bgmm) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + @Override + public void score(final File inputAnnotationsFile, + final File outputScoresFile) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + public double[][] preprocess(final double[][] annotations) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + public void serialize(final 
File scorerFile) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + public static BGMMVariantAnnotationsScorer deserialize(final File scorerFile) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + // TODO clean this up, copy more fields + public static void preprocessAnnotationsWithBGMMAndWriteHDF5(final List annotationNames, + final String outputPrefix, + final File labeledTrainingAndVariantTypeAnnotationsFile, + final Logger logger) { + final double[][] rawAnnotations = LabeledVariantAnnotationsData.readAnnotations(labeledTrainingAndVariantTypeAnnotationsFile); + final BGMMVariantAnnotationsScorer scorer = BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + BGMM_SCORER_SER_SUFFIX)); + final double[][] preprocessedAnnotations = scorer.preprocess(rawAnnotations); + final File outputPreprocessedAnnotationsFile = new File(outputPrefix + ".annot.pre.hdf5"); + try (final HDF5File hdf5File = new HDF5File(outputPreprocessedAnnotationsFile, HDF5File.OpenMode.CREATE)) { + IOUtils.canReadFile(hdf5File.getFile()); + hdf5File.makeStringArray("/data/annotation_names", annotationNames.toArray(new String[0])); + HDF5Utils.writeChunkedDoubleMatrix(hdf5File, "/data/annotations", preprocessedAnnotations, HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX / 16); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of preprocessed annotations (%s). 
Output file at %s may be in a bad state.", + exception, outputPreprocessedAnnotationsFile.getAbsolutePath())); + } + logger.info(String.format("Preprocessed annotations written to %s.", outputPreprocessedAnnotationsFile.getAbsolutePath())); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java new file mode 100644 index 00000000000..bbe082186a3 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java @@ -0,0 +1,69 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import org.broadinstitute.hellbender.utils.runtime.ProcessOutput; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Given an HDF5 file containing annotations for a training set (in the format specified by + * {@link VariantAnnotationsModel#trainAndSerialize}), a Python script containing modeling code, + * and a JSON file containing hyperparameters, the {@link #trainAndSerialize} method can be used to train a model. + * + * The modeling script should take the arguments: {@code annotations_file}, {@code hyperparameters_json_file}, + * and {@code output_prefix}. The script is expected to generate the file {outputPrefix}.scorer.pkl. This file should + * contain a pickled Python lambda function to be used for generating scores from annotations in a subsequent test set. 
+ * The lambda should have the signature: + * + * lambda test_annotation_names_i, test_X_ni + * + * Here, test_annotation_names_i is a numpy array of strings containing the annotation names, and + * test X_ni is a numpy matrix of float-valued annotations, with dimensions (number of data points) x (number of annotations). + * The lambda should check the test annotation names against the training annotation names and + * then return a numpy array of float-valued scores with length given by the number of data points. + * + * See src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py for an example implementation. + */ +public final class PythonSklearnVariantAnnotationsModel implements VariantAnnotationsModel { + + private final File pythonScriptFile; + private final File hyperparametersJSONFile; + + public PythonSklearnVariantAnnotationsModel(final File pythonScriptFile, + final File hyperparametersJSONFile) { + this.pythonScriptFile = pythonScriptFile; + this.hyperparametersJSONFile = hyperparametersJSONFile; + } + + @Override + public void trainAndSerialize(final File trainingAnnotationsFile, + final String outputPrefix) { + final PythonScriptExecutor executor = new PythonScriptExecutor(true); + final ProcessOutput pythonProcessOutput = executor.executeScriptAndGetOutput( + pythonScriptFile.getAbsolutePath(), + null, + composePythonArguments(trainingAnnotationsFile, hyperparametersJSONFile, outputPrefix)); + + if (pythonProcessOutput.getExitValue() != 0) { + throw executor.getScriptException(executor.getExceptionMessageFromScriptError(pythonProcessOutput)); + } + } + + private static List composePythonArguments(final File annotationsFile, + final File hyperparametersJSONFile, + final String outputPrefix) { + try { + return new ArrayList<>(Arrays.asList( + "--annotations_file=" + annotationsFile.getCanonicalPath(), + "--hyperparameters_json_file=" + hyperparametersJSONFile.getCanonicalPath(), + "--output_prefix=" + 
outputPrefix)); + } catch (final IOException e) { + throw new UserException.BadInput(String.format("Encountered exception resolving canonical file paths: %s", e)); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java new file mode 100644 index 00000000000..51e4e9a4e9b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java @@ -0,0 +1,69 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import org.broadinstitute.hellbender.utils.runtime.ProcessOutput; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Given an HDF5 file containing annotations for a test set (in the format specified by + * {@link VariantAnnotationsScorer#score}), a Python script containing scoring code, + * and a file containing a pickled Python lambda function for scoring, + * the {@link #score} method can be used to generate scores. + * + * The scoring script should take the arguments: {@code annotations_file}, {@code scorer_pkl_file}, + * and {@code output_scores_file}. The script is expected to load both the annotations and the pickled scoring function, + * which are then used to generate the file {outputPrefix}.scores.hdf5. This HDF5 file should contain + * a double array of the scores in {@value SCORES_PATH}, in the same order as the corresponding data points + * in the provided annotations. 
package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor;
import org.broadinstitute.hellbender.utils.runtime.ProcessOutput;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Given an HDF5 file containing annotations for a test set (in the format specified by
 * {@link VariantAnnotationsScorer#score}), a Python script containing scoring code,
 * and a file containing a pickled Python lambda function for scoring,
 * the {@link #score} method can be used to generate scores.
 *
 * The scoring script should take the arguments: {@code annotations_file}, {@code scorer_pkl_file},
 * and {@code output_scores_file}. The script is expected to load both the annotations and the pickled scoring function,
 * which are then used to generate the file {outputPrefix}.scores.hdf5. This HDF5 file should contain
 * a double array of the scores in {@value VariantAnnotationsScorer#SCORES_PATH}, in the same order as the corresponding data points
 * in the provided annotations.
 *
 * See src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py for an example implementation.
 */
public final class PythonSklearnVariantAnnotationsScorer implements VariantAnnotationsScorer, Serializable {

    private static final long serialVersionUID = 1L;

    // suffix used for serialized scorer files produced at training time
    public static final String PYTHON_SCORER_PKL_SUFFIX = ".scorer.pkl";

    private final File pythonScriptFile;
    private final File scorerPklFile;

    /**
     * @param pythonScriptFile Python script implementing scoring; see class documentation for the expected contract
     * @param scorerPklFile    pickled scoring lambda produced at training time
     */
    public PythonSklearnVariantAnnotationsScorer(final File pythonScriptFile,
                                                 final File scorerPklFile) {
        this.pythonScriptFile = pythonScriptFile;
        this.scorerPklFile = scorerPklFile;
    }

    /**
     * Invokes the Python scoring script on the provided annotations, writing scores to {@code outputScoresFile}.
     *
     * @throws RuntimeException (via {@link PythonScriptExecutor#getScriptException}) if the script exits nonzero
     */
    @Override
    public void score(final File inputAnnotationsFile,
                      final File outputScoresFile) {
        // true = ensure the GATK Python environment is used by the executor
        final PythonScriptExecutor executor = new PythonScriptExecutor(true);
        final ProcessOutput pythonProcessOutput = executor.executeScriptAndGetOutput(
                pythonScriptFile.getAbsolutePath(),
                null,
                composePythonArguments(inputAnnotationsFile, scorerPklFile, outputScoresFile));

        if (pythonProcessOutput.getExitValue() != 0) {
            throw executor.getScriptException(executor.getExceptionMessageFromScriptError(pythonProcessOutput));
        }
    }

    /**
     * Builds the command-line arguments passed to the scoring script.
     * Canonical paths are used so that the script receives unambiguous file locations.
     */
    private static List<String> composePythonArguments(final File annotationsFile,
                                                       final File scorerPklFile,
                                                       final File outputScoresFile) {
        try {
            return new ArrayList<>(Arrays.asList(
                    "--annotations_file=" + annotationsFile.getCanonicalPath(),
                    "--scorer_pkl_file=" + scorerPklFile.getCanonicalPath(),
                    "--output_scores_file=" + outputScoresFile.getCanonicalPath()));
        } catch (final IOException e) {
            throw new UserException.BadInput(String.format("Encountered exception resolving canonical file paths: %s", e));
        }
    }
}
package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling;

import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;

import java.io.File;

/**
 * File interface for passing annotations to a modeling backend and indicating a path prefix for resulting output.
 */
public interface VariantAnnotationsModel {

    /**
     * @param trainingAnnotationsFile Training annotations in HDF5 format, containing at least the directory structure
     *
     * <pre>
     *   |--- annotations
     *   |    |--- chunk_0
     *   |    |--- ...
     *   |    |--- chunk_{num_chunks - 1}
     *   |    |--- names
     *   |    |--- num_chunks
     *   |    |--- num_columns
     *   |    |--- num_rows
     * </pre>
     *
     * Here, each chunk is a double matrix, with dimensions given by
     * (number of sites in the chunk) x (number of annotations).
     * See {@link LabeledVariantAnnotationsData#writeHDF5}.
     *
     * Modeling backends are responsible for consuming annotations in this format
     * and outputting a {@link VariantAnnotationsScorer} for each variant type
     * with the appropriate output names. This responsibility includes the
     * implementation of functionality that allows validation of annotation names
     * in downstream {@link VariantAnnotationsScorer} instances.
     *
     * In current use, we assume that a single model will be trained, so either
     * 1) training annotations have already been subset to a single variant type (SNP or INDEL), or
     * 2) we assume the model does not care about the variant type.
     * TODO we could also pass additional labels to be used in training,
     *      but all backends would have to likewise respect directory structure
     *
     * @param outputPrefix Path prefix for all output files
     */
    void trainAndSerialize(final File trainingAnnotationsFile,
                           final String outputPrefix);
}

/**
 * Enumerates the available training backends for annotation-based filtering models.
 */
enum VariantAnnotationsModelBackend {
    // TODO will be added in a separate PR
    JAVA_BGMM,

    /**
     * Use the script at org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py
     */
    PYTHON_IFOREST,

    /**
     * Use a user-provided script.
     */
    PYTHON_SCRIPT
}
package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling;

import org.broadinstitute.hdf5.HDF5File;
import org.broadinstitute.hdf5.HDF5LibException;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.hipparchus.stat.fitting.EmpiricalDistribution;

import java.io.File;
import java.util.Arrays;
import java.util.function.Function;
import java.util.stream.IntStream;

/**
 * File interface for passing annotations to a scoring backend and returning scores.
 */
public interface VariantAnnotationsScorer {

    String SCORES_PATH = "/data/scores"; // our HDF5 library does not allow writing to a bare/root path (e.g., /scores)

    /**
     * @param inputAnnotationsFile Annotations to be scored in HDF5 format, containing at least the directory structure
     *
     * <pre>
     *   |--- annotations
     *   |    |--- chunk_0
     *   |    |--- ...
     *   |    |--- chunk_{num_chunks - 1}
     *   |    |--- names
     *   |    |--- num_chunks
     *   |    |--- num_columns
     *   |    |--- num_rows
     * </pre>
     *
     * Here, each chunk is a double matrix, with dimensions given by
     * (number of sites in the chunk) x (number of annotations).
     * See {@link LabeledVariantAnnotationsData#writeHDF5}.
     *
     * Scoring backends are responsible for consuming annotations in this format and
     * outputting a double array of scores to file. This responsibility includes
     * validation of annotation names.
     *
     * @param outputScoresFile Output file in HDF5 format, containing scores at {@link VariantAnnotationsScorer#SCORES_PATH}.
     */
    void score(final File inputAnnotationsFile,
               final File outputScoresFile);

    /**
     * Given scores for a calibration set, returns a function for converting a subsequent score to a
     * sensitivity with respect to that calibration set. This function is simply given by 1 - ECDF,
     * where ECDF is the empirical cumulative distribution function of the calibration scores
     * (see, e.g., the Wikipedia article on empirical distribution functions).
     * For example, a score that is very low relative to the calibration scores would yield a
     * high calibration sensitivity; that is, using this score as the minimum allowable threshold for filtering
     * would result in a high sensitivity with respect to the calibration set.
     *
     * @param calibrationScores must be non-empty and all finite
     */
    static Function<Double, Double> createScoreToCalibrationSensitivityConverter(final double[] calibrationScores) {
        // EmpiricalDistribution cannot be fit to an empty or non-finite sample; fail fast with clear messages
        Utils.validateArg(calibrationScores.length > 0,
                "Calibration scores must be non-empty.");
        Utils.validateArg(Arrays.stream(calibrationScores).allMatch(Double::isFinite),
                "Calibration scores must all be finite.");
        final EmpiricalDistribution empiricalDistribution = new EmpiricalDistribution();
        empiricalDistribution.load(calibrationScores);
        return score -> 1. - empiricalDistribution.cumulativeProbability(score);
    }

    /**
     * Reads a double array of scores from {@value SCORES_PATH} in an HDF5 file.
     */
    static double[] readScores(final File inputFile) {
        try (final HDF5File inputHDF5File = new HDF5File(inputFile, HDF5File.OpenMode.READ_ONLY)) {
            IOUtils.canReadFile(inputHDF5File.getFile());
            return inputHDF5File.readDoubleArray(SCORES_PATH);
        } catch (final HDF5LibException exception) {
            throw new GATKException(String.format("Exception encountered during reading of scores from %s: %s",
                    inputFile.getAbsolutePath(), exception));
        }
    }

    /**
     * Writes a double array of scores to {@value SCORES_PATH} in an HDF5 file.
     */
    static void writeScores(final File outputFile,
                            final double[] scores) {
        try (final HDF5File outputHDF5File = new HDF5File(outputFile, HDF5File.OpenMode.CREATE)) {
            outputHDF5File.makeDoubleArray(SCORES_PATH, scores);
        } catch (final HDF5LibException exception) {
            throw new GATKException(String.format("Exception encountered during writing of scores (%s). Output file at %s may be in a bad state.",
                    exception, outputFile.getAbsolutePath()));
        }
    }

    /**
     * Yields a VQSR-style positive-negative scorer that returns the difference of the positive score and the negative score.
     */
    static VariantAnnotationsScorer combinePositiveAndNegativeScorer(final VariantAnnotationsScorer positiveScorer,
                                                                     final VariantAnnotationsScorer negativeScorer) {
        return (inputAnnotationsFile, outputScoresFile) -> {
            final File tempPositiveScoresFile = IOUtils.createTempFile("positive", "scores.hdf5");
            final File tempNegativeScoresFile = IOUtils.createTempFile("negative", "scores.hdf5");
            positiveScorer.score(inputAnnotationsFile, tempPositiveScoresFile);
            final double[] positiveScores = VariantAnnotationsScorer.readScores(tempPositiveScoresFile);
            negativeScorer.score(inputAnnotationsFile, tempNegativeScoresFile);
            final double[] negativeScores = VariantAnnotationsScorer.readScores(tempNegativeScoresFile);
            // both scorers consumed the same annotations, so score counts must agree; fail fast if a backend misbehaved
            Utils.validateArg(positiveScores.length == negativeScores.length,
                    "Positive and negative scorers must yield the same number of scores.");
            final double[] scores = IntStream.range(0, positiveScores.length).mapToDouble(i -> positiveScores[i] - negativeScores[i]).toArray();
            VariantAnnotationsScorer.writeScores(outputScoresFile, scores);
        };
    }
}
package org.broadinstitute.hellbender.utils.clustering;

import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.RealVector;

import java.io.Serializable;

// Placeholder for the Bayesian Gaussian mixture model (BGMM) backend; the constructor signature
// fixes the intended API (mirroring sklearn.mixture.BayesianGaussianMixture hyperparameters) while
// the implementation lands in a separate PR — see the thrown NotImplementedException below.
public final class BayesianGaussianMixtureModeller implements Serializable {
    private static final long serialVersionUID = 1L;

    // Strategy used to initialize component responsibilities before variational inference.
    public enum InitMethod {
        K_MEANS_PLUS_PLUS, RANDOM, TEST
    }

    // Private: instances are expected to be obtained via a builder/factory once implemented.
    private BayesianGaussianMixtureModeller(final int nComponents,
                                            final double tol,
                                            final double regCovar,
                                            final int maxIter,
                                            final int nInit,
                                            final InitMethod initMethod,
                                            final double weightConcentrationPrior,
                                            final double meanPrecisionPrior,
                                            final RealVector meanPrior,
                                            final Double degreesOfFreedomPrior,
                                            final RealMatrix covariancePrior,
                                            final int seed,
                                            final boolean warmStart,
                                            final int verboseInterval,
                                            final double relativeSymmetryThreshold,
                                            final double absolutePositivityThreshold,
                                            final double epsilon) {
        throw new NotImplementedException("BGMM module implemented in separate PR.");
    }
}
import argparse
import h5py
import sklearn.ensemble
import sklearn.impute
import numpy as np
import dill
import json


def read_annotations(h5file):
    """Read annotation names and the row-concatenated annotation matrix from an HDF5 file.

    The file is expected to contain /annotations/{names, num_chunks, num_columns, num_rows}
    and per-chunk matrices /annotations/chunk_{i}; chunks are stacked in order into a single
    (num_rows x num_columns) matrix.
    """
    with h5py.File(h5file, 'r') as f:
        annotation_names_i = f['/annotations/names'][()].astype(str)

        # read chunked annotations
        num_chunks = int(f['/annotations/num_chunks'][()])
        num_columns = int(f['/annotations/num_columns'][()])
        num_rows = int(f['/annotations/num_rows'][()])
        X_ni = np.zeros((num_rows, num_columns))
        n = 0
        for chunk_index in range(num_chunks):
            chunk_ni = f[f'/annotations/chunk_{chunk_index}'][()]
            num_rows_in_chunk = len(chunk_ni)
            X_ni[n:n + num_rows_in_chunk, :] = chunk_ni
            n += num_rows_in_chunk
        assert n == num_rows
    return annotation_names_i, X_ni


def train(annotations_file,
          hyperparameters_json_file,
          output_prefix):
    """Fit an IsolationForest to the annotations and pickle a scoring lambda to {output_prefix}.scorer.pkl."""
    print('Reading annotations...')
    annotation_names_i, X_ni = read_annotations(annotations_file)
    print(f'Annotations: {annotation_names_i}.')

    print('Reading hyperparameters...')
    with open(hyperparameters_json_file) as json_file:
        hyperparameters_kwargs = json.load(json_file)
    print('Hyperparameters:', hyperparameters_kwargs)

    print('Imputing annotations...')
    imputer = sklearn.impute.SimpleImputer(strategy='median')
    imputed_X_ni = imputer.fit_transform(X_ni)

    # SimpleImputer will drop any features that are completely missing, resulting in different shapes for
    # imputed_X_ni and X_ni and misalignment of features when training and scoring downstream if not checked.
    # We externally check for and fail in the presence of any entirely missing features, but we do a redundant check here.
    assert imputed_X_ni.shape == X_ni.shape, \
        f'Shape of imputed annotations differs from shape of raw annotations; at least one feature is completely missing ' \
        f'and hence dropped during imputation.'

    print(f'Training IsolationForest with {imputed_X_ni.shape[0]} training sites x {imputed_X_ni.shape[1]} annotations...')
    clf = sklearn.ensemble.IsolationForest(**hyperparameters_kwargs)
    clf.fit(imputed_X_ni)
    print('Training complete.')

    def score_samples(test_annotation_names_i,
                      test_X_ni):
        # guard against train/score annotation mismatch: names must match exactly, in order
        assert np.array_equal(test_annotation_names_i, annotation_names_i), \
            f'Input annotation names ({test_annotation_names_i}) must be identical to those used to train the scorer ({annotation_names_i}).'
        return clf.score_samples(imputer.transform(test_X_ni))  # TODO sklearn's implementation is single-threaded, but this could perhaps be parallelized

    scorer_lambda = lambda test_annotation_names_i, test_X_ni: score_samples(test_annotation_names_i, test_X_ni)

    print(f'Pickling scorer...')
    output_scorer_pkl_file = f'{output_prefix}.scorer.pkl'
    with open(output_scorer_pkl_file, 'wb') as f:
        dill.dump(scorer_lambda, f)  # the dill package can be used to pickle lambda functions
    print(f'Scorer pickled to {output_scorer_pkl_file}.')


def score(annotations_file,
          scorer_pkl_file,
          output_scores_file):
    """Load a pickled scoring lambda, score the annotations, and write scores to /data/scores in an HDF5 file."""
    annotation_names_i, X_ni = read_annotations(annotations_file)

    with open(scorer_pkl_file, 'rb') as f:
        scorer_lambda = dill.load(f)
    score_n = scorer_lambda(annotation_names_i, X_ni)

    with h5py.File(output_scores_file, 'w') as f:
        scores_dset = f.create_dataset('data/scores', (len(score_n),), dtype='d')
        scores_dset[:] = score_n


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--annotations_file',
                        type=str,
                        required=True,
                        help='')

    parser.add_argument('--hyperparameters_json_file',
                        type=str,
                        required=False,
                        help='')

    parser.add_argument('--output_prefix',
                        type=str,
                        required=False,
                        help='')

    parser.add_argument('--scorer_pkl_file',
                        type=str,
                        required=False,
                        help='')

    parser.add_argument('--output_scores_file',
                        type=str,
                        required=False,
                        help='')

    args = parser.parse_args()

    annotations_file = args.annotations_file

    # this script can handle both training and scoring; we check the passed arguments to determine which is appropriate
    if args.hyperparameters_json_file is not None and args.output_prefix is not None and \
            args.scorer_pkl_file is None and args.output_scores_file is None:
        hyperparameters_json_file = args.hyperparameters_json_file
        output_prefix = args.output_prefix
        train(annotations_file,
              hyperparameters_json_file,
              output_prefix)
    elif args.hyperparameters_json_file is None and args.output_prefix is None and \
            args.scorer_pkl_file is not None and args.output_scores_file is not None:
        scorer_pkl_file = args.scorer_pkl_file
        output_scores_file = args.output_scores_file
        score(annotations_file,
              scorer_pkl_file,
              output_scores_file)
    else:
        # a bare `raise` here would itself fail with "No active exception to re-raise";
        # raise an informative error describing the two valid argument combinations instead
        raise ValueError('Invalid argument combination: to train, provide only --hyperparameters_json_file and '
                         '--output_prefix; to score, provide only --scorer_pkl_file and --output_scores_file.')


if __name__ == '__main__':
    main()
package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable;

import com.google.common.collect.Lists;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;

/**
 * Note that the expected outputs for the exact-match tests below are used as inputs for
 * {@link TrainVariantAnnotationsModelIntegrationTest}. Similarly, the expected outputs for
 * {@link TrainVariantAnnotationsModelIntegrationTest} are used as inputs for {@link ScoreVariantAnnotationsIntegrationTest}.
 * Thus, developers should keep the expected outputs for all of these integration tests in sync when updating any of them.
 * This can easily be done by setting the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS flags for all tools to be true and then running
 * the tests in order.
 */
public final class ExtractVariantAnnotationsIntegrationTest extends CommandLineProgramTest {

    // If true, update the expected outputs in tests that assert an exact match vs. prior output,
    // instead of actually running the tests. Can be used with "./gradlew test -Dtest.single=ExtractVariantAnnotationsIntegrationTest"
    // to update all of the exact-match tests at once. After you do this, you should look at the
    // diffs in the new expected outputs in git to confirm that they are consistent with expectations.
    public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false;

    /**
     * Make sure that someone didn't leave the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS toggle turned on.
     */
    @Test
    public void assertThatExpectedOutputUpdateToggleIsDisabled() {
        Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled.");
    }

    private static final List<String> NON_ALLELE_SPECIFIC_ANNOTATIONS = Arrays.asList(
            "DP", "FS", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR");

    private static final List<String> ALLELE_SPECIFIC_ANNOTATIONS = Arrays.asList(
            "AS_FS", "AS_MQ", "AS_MQRankSum", "AS_QD", "AS_ReadPosRankSum", "AS_SOR");

    private static final File PACKAGE_TEST_FILES_DIR = new File(largeFileTestDir,
            "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/");
    private static final File TEST_FILES_DIR = new File(largeFileTestDir,
            "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract");
    private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected");

    // The input VCF should cover a genomic region given by the union of regions in the below training and calibration resources
    // and should also contain a few multiallelics that overlap those resources.
    private static final File INPUT_VCF = new File(PACKAGE_TEST_FILES_DIR, "input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf");

    // We use snippets of the Omni sites for SNP training (chr1:1-5000000) and calibration (chr1:5000000-10000000); we don't sweat the 1bp overlap.
    private static final File SNP_TRAINING_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz");
    private static final File SNP_CALIBRATION_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz");

    // We use snippets of the Mills sites for indel training (chr1:1-5000000) and calibration (chr1:5000000-10000000); we don't sweat the 1bp overlap.
    private static final File INDEL_TRAINING_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz");
    private static final File INDEL_CALIBRATION_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz");

    private static final int MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS = 100;

    // Supplier and functions for creating and adding various arguments to an ArgumentsBuilder.
    private static final Supplier<ArgumentsBuilder> BASE_ARGUMENTS_BUILDER_SUPPLIER = () -> {
        final ArgumentsBuilder argsBuilder = new ArgumentsBuilder();
        argsBuilder.addVCF(INPUT_VCF);
        argsBuilder.addFlag(LabeledVariantAnnotationsWalker.DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME); // we do not gzip VCF outputs so that we can use diff to compare to the expected result
        argsBuilder.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, false);
        return argsBuilder;
    };
    static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS = argsBuilder -> {
        NON_ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a));
        return argsBuilder;
    };
    static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_ALLELE_SPECIFIC_ANNOTATIONS = argsBuilder -> {
        argsBuilder.addFlag(LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME);
        ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a));
        return argsBuilder;
    };
    static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_SNP_MODE_AND_RESOURCES = argsBuilder -> {
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP)
                .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":omni-training,%s=true", LabeledVariantAnnotationsData.TRAINING_LABEL), SNP_TRAINING_VCF)
                .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":omni-calibration,%s=true", LabeledVariantAnnotationsData.CALIBRATION_LABEL), SNP_CALIBRATION_VCF);
        return argsBuilder;
    };
    static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_INDEL_MODE_AND_RESOURCES = argsBuilder -> {
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.INDEL)
                .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":mills-training,%s=true", LabeledVariantAnnotationsData.TRAINING_LABEL), INDEL_TRAINING_VCF)
                .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":mills-calibration,%s=true", LabeledVariantAnnotationsData.CALIBRATION_LABEL), INDEL_CALIBRATION_VCF);
        return argsBuilder;
    };
    private static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS = argsBuilder -> {
        argsBuilder.add(ExtractVariantAnnotations.MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME, MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS);
        return argsBuilder;
    };

    /**
     * Exact-match tests for configurations given by the Cartesian product of the following options:
     *  1) non-allele-specific ("nonAS") vs. allele-specific ("AS")
     *  2) SNP-only ("snp") vs. INDEL-only ("indel") vs. SNP+INDEL ("snpIndel")
     *  3) positive ("pos") vs. positive-unlabeled ("posUn")
     */
    @DataProvider(name = "dataValidInputs")
    public Object[][] dataValidInputs() {
        final List<List<Pair<String, Function<ArgumentsBuilder, ArgumentsBuilder>>>> testConfigurations = Lists.cartesianProduct(
                Collections.singletonList(
                        Pair.of("extract", Function.identity())),
                Arrays.asList(
                        Pair.of("nonAS", ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS),
                        Pair.of("AS", ADD_ALLELE_SPECIFIC_ANNOTATIONS)),
                Arrays.asList(
                        Pair.of("snp", ADD_SNP_MODE_AND_RESOURCES),
                        Pair.of("indel", ADD_INDEL_MODE_AND_RESOURCES),
                        Pair.of("snpIndel", ADD_SNP_MODE_AND_RESOURCES.andThen(ADD_INDEL_MODE_AND_RESOURCES))),
                Arrays.asList(
                        Pair.of("pos", Function.identity()),
                        Pair.of("posUn", ADD_MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS)));

        return testConfigurations.stream()
                .map(tagAndAddFunctionPairs -> new Object[]{
                        tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")),    // e.g., "extract.nonAS.snp.pos"
                        tagAndAddFunctionPairs.stream().map(Pair::getRight)                                     // creates the corresponding ArgumentsBuilder
                                .reduce(Function.identity(), Function::andThen)                                 //  by stringing together functions that add the
                                .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())})                                 //  appropriate arguments
                .toArray(Object[][]::new);
    }

    /**
     * Checks expected outputs given a tag (e.g., "extract.nonAS.snp.pos") and arguments corresponding to the
     * Cartesian products generated in {@link #dataValidInputs}.
     *
     * We perform exact-match tests of any annotation HDF5 files produced using h5diff, which is insensitive to timestamps within the file.
     * We also perform exact-match tests of VCF files using diff. VCF indices may not be diff equivalent, so
     * we just check for their existence.
     */
    @Test(dataProvider = "dataValidInputs", groups = {"python"}) // python environment is required to use h5diff for exact-match comparisons
    public void testValidInputs(final String tag,
                                final ArgumentsBuilder argsBuilder) {
        final File outputDir = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? EXPECTED_TEST_FILES_DIR : createTempDir("extract");
        final String outputPrefix = String.format("%s/%s", outputDir, tag);
        argsBuilder.addOutput(outputPrefix);
        runCommandLine(argsBuilder);

        if (!UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS) {
            assertOutputs(tag, outputPrefix);
        }
    }

    // Compares produced outputs against the checked-in expected outputs for a given configuration tag.
    private static void assertOutputs(final String tag,
                                      final String outputPrefix) {
        // vcf.idx files are not reproducible
        SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s",
                EXPECTED_TEST_FILES_DIR,
                tag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX,
                outputPrefix + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX));
        SystemCommandUtilsTest.runSystemCommand(String.format("diff %s/%s.vcf %s.vcf",
                EXPECTED_TEST_FILES_DIR, tag, outputPrefix));
        if (tag.contains("posUn")) {
            SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s",
                    EXPECTED_TEST_FILES_DIR,
                    tag + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX,
                    outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX));
        } else {
            Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists());
        }
    }

    /**
     * If no resources are provided and we do not extract unlabeled sites, then only a zero-record VCF and the corresponding index are created.
     * This is because we cannot create HDF5 files with empty arrays/matrices.
     */
    @Test
    public void testNoResources() {
        final File outputDir = createTempDir("extract");
        final String outputPrefix = String.format("%s/test", outputDir);
        final ArgumentsBuilder argsBuilder = ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get());
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP)
                .addOutput(outputPrefix);
        runCommandLine(argsBuilder);
        Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists());
        Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf").exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists());
    }

    /**
     * If no resources are provided but we do extract unlabeled sites, then all output files except the labeled-annotations HDF5 file are created.
     * This is because we cannot create HDF5 files with empty arrays/matrices.
     */
    @Test
    public void testNoResourcesAndExtractUnlabeled() {
        final File outputDir = createTempDir("extract");
        final String outputPrefix = String.format("%s/test", outputDir);
        final ArgumentsBuilder argsBuilder = ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get());
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP)
                .add(ExtractVariantAnnotations.MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME, 1)
                .addOutput(outputPrefix);
        runCommandLine(argsBuilder);
        Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists());
        Assert.assertTrue(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf").exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists());
    }

    /**
     * If no variants are present in the input in the specified region, then only a zero-record VCF and the corresponding index are created.
     * This is because we cannot create HDF5 files with empty arrays/matrices.
     */
    @Test
    public void testNoVariantsInInput() {
        final File outputDir = createTempDir("extract");
        final String outputPrefix = String.format("%s/test", outputDir);
        final ArgumentsBuilder argsBuilder = ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get());
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP)
                .addOutput(outputPrefix);
        runCommandLine(argsBuilder);
        Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists());
        Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.SCORES_HDF5_SUFFIX).exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf").exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists());
    }

    @Test(expectedExceptions = UserException.class)
    public void testForgotToSpecifyUseAlleleSpecificAnnotationsFlag() {
        final File outputDir = createTempDir("extract");
        final String outputPrefix = String.format("%s/test", outputDir);
        final ArgumentsBuilder argsBuilder = ADD_SNP_MODE_AND_RESOURCES.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get());
        ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a));
        argsBuilder.addOutput(outputPrefix);
        runCommandLine(argsBuilder);
    }

    @Test(expectedExceptions = UserException.class)
    public void testReservedSNPResourceLabel() {
        final File outputDir = createTempDir("extract");
        final String outputPrefix = String.format("%s/test", outputDir);
        final ArgumentsBuilder argsBuilder = ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get());
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP)
                .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":snp,%s=true", LabeledVariantAnnotationsData.SNP_LABEL), SNP_TRAINING_VCF)
                .addOutput(outputPrefix);
        runCommandLine(argsBuilder);
    }
}
a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java new file mode 100644 index 00000000000..289821d0e54 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java @@ -0,0 +1,260 @@
package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable;

import com.google.common.collect.Lists;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.io.Resource;
import org.broadinstitute.hellbender.utils.python.PythonScriptExecutorException;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;

/**
 * See documentation for {@link ExtractVariantAnnotationsIntegrationTest} for information about how inputs and
 * expected outputs used there are related to those used here and in {@link TrainVariantAnnotationsModelIntegrationTest}.
 */
public final class ScoreVariantAnnotationsIntegrationTest extends CommandLineProgramTest {

    // If true, update the expected outputs in tests that assert an exact match vs. prior output,
    // instead of actually running the tests. Can be used with "./gradlew test -Dtest.single=ScoreVariantAnnotationsIntegrationTest"
    // to update all of the exact-match tests at once. After you do this, you should look at the
    // diffs in the new expected outputs in git to confirm that they are consistent with expectations.
    public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false;

    /**
     * Make sure that someone didn't leave the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS toggle turned on.
     */
    @Test
    public void assertThatExpectedOutputUpdateToggleIsDisabled() {
        Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled.");
    }

    private static final double CALIBRATION_SENSITIVITY_THRESHOLD = 0.9;

    private static final File PACKAGE_TEST_FILES_DIR = new File(largeFileTestDir,
            "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/");
    private static final File TEST_FILES_DIR = new File(largeFileTestDir,
            "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score");
    private static final File INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR = new File(largeFileTestDir,
            "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected");
    private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected");

    private static final File ISOLATION_FOREST_PYTHON_SCRIPT = IOUtils.writeTempResource(
            new Resource("isolation-forest.py", TrainVariantAnnotationsModel.class));

    private static final File INPUT_VCF = new File(PACKAGE_TEST_FILES_DIR, "input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf");

    // Supplier and functions for creating and adding various arguments to an ArgumentsBuilder.
    // NOTE(review): the generic type parameters below were reconstructed — the extraction of this patch
    // stripped all angle-bracketed text (e.g., "Supplier BASE_ARGUMENTS_BUILDER_SUPPLIER"). TODO confirm upstream.
    private static final Supplier<ArgumentsBuilder> BASE_ARGUMENTS_BUILDER_SUPPLIER = () -> {
        final ArgumentsBuilder argsBuilder = new ArgumentsBuilder();
        argsBuilder.addVCF(INPUT_VCF);
        argsBuilder.addFlag(LabeledVariantAnnotationsWalker.DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME);
        argsBuilder.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, false);
        return argsBuilder;
    };
    private static final BiFunction<ArgumentsBuilder, String, ArgumentsBuilder> ADD_MODEL_PREFIX = (argsBuilder, modelPrefix) -> {
        argsBuilder.add(ScoreVariantAnnotations.MODEL_PREFIX_LONG_NAME, modelPrefix);
        return argsBuilder;
    };
    private static final BiFunction<ArgumentsBuilder, Double, ArgumentsBuilder> ADD_CALIBRATION_SENSITIVITY_THRESHOLD = (argsBuilder, calibrationSensitivityThreshold) -> {
        argsBuilder.add(ScoreVariantAnnotations.SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold);
        argsBuilder.add(ScoreVariantAnnotations.INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold);
        return argsBuilder;
    };
    private static final BiFunction<ArgumentsBuilder, VariantAnnotationsModelBackend, ArgumentsBuilder> ADD_MODEL_BACKEND = (argsBuilder, modelBackendMode) -> {
        argsBuilder.add(TrainVariantAnnotationsModel.MODEL_BACKEND_LONG_NAME, modelBackendMode);
        return argsBuilder;
    };
    private static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_ISOLATION_FOREST_PYTHON_SCRIPT = argsBuilder -> {
        argsBuilder.add(ScoreVariantAnnotations.PYTHON_SCRIPT_LONG_NAME, ISOLATION_FOREST_PYTHON_SCRIPT);
        return argsBuilder;
    };

    /**
     * Exact-match tests for (non-exhaustive) configurations given by the Cartesian product of the following options:
     *  1) non-allele-specific ("nonAS") vs. allele-specific ("AS")
     *  2) model backend
     *      2a) Java Bayesian Gaussian Mixture Model (BGMM) backend TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub
     *      2b) default PYTHON_IFOREST ("IF.score")
     *      2c) specified PYTHON_SCRIPT ("IF.score"); we will simply use the same script as the default PYTHON_IFOREST backend, so this is just a test of the command-line interface
     *     We should expect 2b-c to give functionally identical results.
     *  3) SNP-only ("snp") vs. SNP+INDEL ("snpIndel") (for both of these options, we use trained models that contain both SNP and INDEL scorers as input)
     */
    @DataProvider(name = "dataValidInputs")
    public Object[][] dataValidInputs() {
        final List<List<Pair<String, Function<ArgumentsBuilder, ArgumentsBuilder>>>> testConfigurations = Lists.cartesianProduct(
                Arrays.asList(
                        Pair.of("extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg", Function.<ArgumentsBuilder>identity()),
                        Pair.of("extract.AS.snpIndel.posUn.train.snpIndel.posNeg", Function.<ArgumentsBuilder>identity())),
                Arrays.asList(
                        Pair.of("IF.score", ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST)), // this and the following case give the same results, so they are given the same IF.score tag
                        Pair.of("IF.score", ADD_ISOLATION_FOREST_PYTHON_SCRIPT
                                .andThen(ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_SCRIPT)))),
                Arrays.asList(
                        Pair.of("snp", ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES),
                        Pair.of("snpIndel", ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES
                                .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_INDEL_MODE_AND_RESOURCES))));

        return testConfigurations.stream()
                .map(tagAndAddFunctionPairs -> new Object[]{
                        tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")),    // e.g., extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp
                        tagAndAddFunctionPairs.stream().map(Pair::getRight)                                     // creates the corresponding ArgumentsBuilder
                                .reduce(Function.identity(), Function::andThen)                                 //  by stringing together functions that add the
                                .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())})                                 //  appropriate arguments
                .toArray(Object[][]::new);
    }

    /**
     * Checks expected outputs given a tag (e.g., "extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp") and arguments corresponding to the
     * Cartesian products generated in {@link #dataValidInputs}.
     *
     * We perform exact-match tests of any HDF5 files produced using h5diff, which is insensitive to timestamps within the file.
     * We also perform exact-match tests of VCF files using diff. VCF indices may not be diff equivalent, so
     * we just check for their existence.
     */
    @Test(dataProvider = "dataValidInputs", groups = {"python"}) // python environment is required to run tool and to use h5diff for exact-match comparisons
    public void testValidInputs(final String tag,
                                final ArgumentsBuilder argsBuilder) {
        final File outputDir = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? EXPECTED_TEST_FILES_DIR : createTempDir("score");
        final String outputPrefix = String.format("%s/%s", outputDir, tag);
        argsBuilder.addOutput(outputPrefix);

        // add arguments for model prefix based on the
        // train tag (the portion of the tag preceding ".score", e.g., extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF),
        // which gives the basename for the model files
        final String trainTag = tag.split("\\.score")[0]; // FIX: String.split takes a regex; an unescaped '.' matches any character
        if (tag.contains("nonAS")) {
            ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS.apply(argsBuilder);
        } else {
            ExtractVariantAnnotationsIntegrationTest.ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(argsBuilder);
        }
        final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, trainTag).toString();
        final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab ->
                ADD_MODEL_PREFIX.apply(ab, modelPrefix);
        final Function<ArgumentsBuilder, ArgumentsBuilder> addCalibrationSensitivityThreshold = ab ->
                ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD);
        addModelPrefix.andThen(addCalibrationSensitivityThreshold).apply(argsBuilder);

        // TODO test use of sites-only VCF (output by extract tool) to label extracted sites

        runCommandLine(argsBuilder);

        if (!UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS) {
            assertExpectedOutputs(tag, outputPrefix);
        }
    }

    private static void assertExpectedOutputs(final String tag,
                                              final String outputPrefix) {
        // vcf.idx files are not reproducible
        SystemCommandUtilsTest.runSystemCommand(String.format("diff %s/%s.vcf %s.vcf",
                EXPECTED_TEST_FILES_DIR, tag, outputPrefix));
        SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s.annot.hdf5 %s.annot.hdf5",
                EXPECTED_TEST_FILES_DIR, tag, outputPrefix));
        SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s.scores.hdf5 %s.scores.hdf5",
                EXPECTED_TEST_FILES_DIR, tag, outputPrefix));
    }

    /**
     * In contrast to {@link ExtractVariantAnnotationsIntegrationTest#testNoResources}, the non-presence of
     * resources here does not really affect the output.
     */
    @Test(groups = {"python"}) // python environment is required to run tool
    public void testNoResources() {
        final File outputDir = createTempDir("score");
        final String outputPrefix = String.format("%s/test", outputDir);
        final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get();
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP)
                .addOutput(outputPrefix);
        final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR,
                "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString();
        final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab ->
                ADD_MODEL_PREFIX.apply(ab, modelPrefix);
        addModelPrefix
                .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS)
                .apply(argsBuilder);
        runCommandLine(argsBuilder);
        // Scoring proceeds regardless of resources: all four outputs are produced.
        Assert.assertTrue(new File(outputPrefix + ScoreVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists());
        Assert.assertTrue(new File(outputPrefix + ScoreVariantAnnotations.SCORES_HDF5_SUFFIX).exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf").exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists());
    }

    /**
     * If no variants are present in the input in the specified region, we do not create the scores or annotations HDF5 files.
     * This is because we cannot create HDF5 files with empty arrays/matrices.
     */
    @Test(groups = {"python"}) // python environment is required to run tool
    public void testNoVariantsInInput() {
        final File outputDir = createTempDir("score");
        final String outputPrefix = String.format("%s/test", outputDir);
        final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get();
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP)
                .add(StandardArgumentDefinitions.INTERVALS_LONG_NAME, "chr2") // the test input VCF does not have variants here
                .addOutput(outputPrefix);
        final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR,
                "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString();
        final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab ->
                ADD_MODEL_PREFIX.apply(ab, modelPrefix);
        addModelPrefix
                .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS)
                .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES)
                .apply(argsBuilder);
        runCommandLine(argsBuilder);
        Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists());
        Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.SCORES_HDF5_SUFFIX).exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf").exists());
        Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists());
    }

    /**
     * Scoring with annotations that do not match those used to train the model should fail in the Python backend.
     */
    @Test(expectedExceptions = PythonScriptExecutorException.class, groups = {"python"}) // python environment is required to run tool
    public void testAnnotationsDoNotMatchThoseUsedToTrainModel() {
        final File outputDir = createTempDir("score");
        final String outputPrefix = String.format("%s/test", outputDir);
        final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get();
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP)
                .addOutput(outputPrefix);
        final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR,
                "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString();
        final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab ->
                ADD_MODEL_PREFIX.apply(ab, modelPrefix);
        addModelPrefix
                .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS) // model was trained with non-AS annotations
                .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_ALLELE_SPECIFIC_ANNOTATIONS)     // but we additionally specify AS annotations
                .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES)
                .apply(argsBuilder);
        runCommandLine(argsBuilder);
    }

    /**
     * The "snp" label is reserved for internal use, so a resource tagged with it should be rejected.
     */
    @Test(expectedExceptions = UserException.class)
    public void testReservedSNPResourceLabel() {
        final File outputDir = createTempDir("extract"); // NOTE(review): "extract" prefix looks copy-pasted from the Extract test; harmless (temp-dir name only)
        final String outputPrefix = String.format("%s/test", outputDir);
        final ArgumentsBuilder argsBuilder = ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get());
        argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP)
                .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":snp,%s=true", LabeledVariantAnnotationsData.SNP_LABEL), INPUT_VCF) // we just use the input VCF as a dummy resource
                .addOutput(outputPrefix);
        final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR,
                "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString();
        final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab ->
                ADD_MODEL_PREFIX.apply(ab, modelPrefix);
        addModelPrefix.apply(argsBuilder);
        runCommandLine(argsBuilder);
    }
}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java new file mode 100644 index 00000000000..705f292116a --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java @@ -0,0 +1,62 @@
package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable;

import
org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; + +public final class SystemCommandUtilsTest extends GATKBaseTest { + + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + static void runSystemCommand(final String command) { + logger.debug(String.format("Testing command: %s", command)); + try { + final ProcessBuilder processBuilder = new ProcessBuilder("sh", "-c", command).redirectErrorStream(true); + final Process process = processBuilder.start(); + + final BufferedReader stdInReader = new BufferedReader(new InputStreamReader(process.getInputStream())); + String stdInLine; + while ((stdInLine = stdInReader.readLine()) != null) { + Assert.fail(String.format("The command \"%s\" resulted in: %s", command, stdInLine)); + } + stdInReader.close(); + + } catch (final IOException e) { + throw new GATKException.ShouldNeverReachHereException(e.getMessage()); + } + } + + @Test(groups = {"python"}) // python environment is required to use h5diff + public void testRunSystemCommand() { + runSystemCommand(String.format("h5diff %s/extract.AS.indel.pos.annot.hdf5 %s/extract.AS.indel.pos.annot.hdf5", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + runSystemCommand(String.format("diff %s/extract.AS.indel.pos.vcf %s/extract.AS.indel.pos.vcf", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } + + @Test(expectedExceptions = AssertionError.class, groups = {"python"}) // python environment is required to use h5diff + public void testRunSystemCommandH5diffException() { + runSystemCommand(String.format("h5diff %s/extract.AS.indel.pos.annot.hdf5 
%s/extract.AS.snp.pos.annot.hdf5", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } + + @Test(expectedExceptions = AssertionError.class) + public void testRunSystemCommandDiffException() { + runSystemCommand(String.format("diff %s/extract.AS.indel.pos.vcf %s/extract.AS.snp.pos.vcf", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } + + @Test(expectedExceptions = AssertionError.class) + public void testRunSystemCommandDiffNoSuchFileException() { + runSystemCommand(String.format("diff %s/blahblah %s/extract.AS.snp.pos.vcf", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java new file mode 100644 index 00000000000..9082fe7a0ad --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java @@ -0,0 +1,428 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Lists; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import 
org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * See documentation for {@link ExtractVariantAnnotationsIntegrationTest} for information about how inputs and + * expected outputs used there are related to those used here and in {@link ScoreVariantAnnotationsIntegrationTest}. + */ +public final class TrainVariantAnnotationsModelIntegrationTest extends CommandLineProgramTest { + + // If true, update the expected outputs in tests that assert an exact match vs. prior output, + // instead of actually running the tests. Can be used with "./gradlew test -Dtest.single=TrainVariantAnnotationsIntegrationTest" + // to update all of the exact-match tests at once. After you do this, you should look at the + // diffs in the new expected outputs in git to confirm that they are consistent with expectations. + public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false; + + /** + * Make sure that someone didn't leave the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS toggle turned on. 
+ */ + @Test + public void assertThatExpectedOutputUpdateToggleIsDisabled() { + Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled."); + } + + private static final double CALIBRATION_SENSITIVITY_THRESHOLD = 0.9; + + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train"); + private static final File INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + private static final File ISOLATION_FOREST_PYTHON_SCRIPT = IOUtils.writeTempResource( + new Resource("isolation-forest.py", TrainVariantAnnotationsModel.class)); + private static final File ISOLATION_FOREST_HYPERPARAMETERS_JSON = new File(TEST_FILES_DIR, + "isolation-forest-hyperparameters-different-seed.json"); + + // Supplier and functions for creating and adding various arguments to an ArgumentsBuilder. 
+ private static final Supplier BASE_ARGUMENTS_BUILDER_SUPPLIER = ArgumentsBuilder::new; + private static final BiFunction ADD_ANNOTATIONS_HDF5 = (argsBuilder, annotationsHDF5) -> { + argsBuilder.add(TrainVariantAnnotationsModel.ANNOTATIONS_HDF5_LONG_NAME, annotationsHDF5); + return argsBuilder; + }; + private static final BiFunction ADD_UNLABELED_ANNOTATIONS_HDF5 = (argsBuilder, unlabeledAnnotationsHDF5) -> { + argsBuilder.add(TrainVariantAnnotationsModel.UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, unlabeledAnnotationsHDF5); + return argsBuilder; + }; + private static final BiFunction ADD_CALIBRATION_SENSITIVITY_THRESHOLD = (argsBuilder, calibrationSensitivityThreshold) -> { + argsBuilder.add(TrainVariantAnnotationsModel.CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold); + return argsBuilder; + }; + private static final Function ADD_SNP_MODE = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP); + return argsBuilder; + }; + private static final Function ADD_INDEL_MODE = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.INDEL); + return argsBuilder; + }; + private static final BiFunction ADD_MODEL_BACKEND = (argsBuilder, modelBackendMode) -> { + argsBuilder.add(TrainVariantAnnotationsModel.MODEL_BACKEND_LONG_NAME, modelBackendMode); + return argsBuilder; + }; + private static final Function ADD_ISOLATION_FOREST_PYTHON_SCRIPT = argsBuilder -> { + argsBuilder.add(TrainVariantAnnotationsModel.PYTHON_SCRIPT_LONG_NAME, ISOLATION_FOREST_PYTHON_SCRIPT); + return argsBuilder; + }; + private static final Function ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON = argsBuilder -> { + argsBuilder.add(TrainVariantAnnotationsModel.HYPERPARAMETERS_JSON_LONG_NAME, ISOLATION_FOREST_HYPERPARAMETERS_JSON); + return argsBuilder; + }; + + /** + * Exact-match tests for (non-exhaustive) configurations given by the Cartesian product of the following options: + * 1) 
non-allele-specific ("nonAS") vs. allele-specific ("AS") + * 2) SNP-only ("snp") vs. SNP+INDEL ("snpIndel") (for both of these options, we use extracted annotations that contain both SNP and INDEL variants as input) + * 3) positive training with {extract-tag}.annot.hdf5 ("posOnly") vs. positive-negative training with {extract-tag}.annot.hdf5 and {extract-tag}.unlabeled.annot.hdf5 ("posNeg") + * 4) model backend + * 4a) Java Bayesian Gaussian Mixture Model (BGMM) backend TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub + * 4b) default PYTHON_IFOREST with default hyperparameters ("IF") + * 4c) default PYTHON_IFOREST with non-default seed hyperparameter ("IFDifferentSeed") + * 4d) specified PYTHON_SCRIPT with non-default seed hyperparameter ("IFDifferentSeed"); we will simply use the same script as the default PYTHON_IFOREST backend, so this is just a test of the command-line interface + * We should expect 4c-d to give functionally identical results. 
+ */ + @DataProvider(name = "dataValidInputs") + public Object[][] dataValidInputs() { + final List>>> testConfigurations = Lists.cartesianProduct( + Arrays.asList( + Pair.of("extract.nonAS.snpIndel.posUn.train", Function.identity()), + Pair.of("extract.AS.snpIndel.posUn.train", Function.identity())), + Arrays.asList( + Pair.of("snp", ADD_SNP_MODE), + Pair.of("snpIndel", ADD_SNP_MODE.andThen(ADD_INDEL_MODE))), + Arrays.asList( // we will consume the tag and add appropriate arguments for positive and positive-negative training below + Pair.of("posOnly", Function.identity()), + Pair.of("posNeg", Function.identity())), + Arrays.asList( + Pair.of("IF", ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST)), + Pair.of("IFDifferentSeed", ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON + .andThen(ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST))), // this and the following case give the same results, so they are given the same IFDifferentSeed tag + Pair.of("IFDifferentSeed", ADD_ISOLATION_FOREST_PYTHON_SCRIPT + .andThen(ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON) + .andThen(ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_SCRIPT))))); + + return testConfigurations.stream() + .map(tagAndAddFunctionPairs -> new Object[]{ + tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")), // e.g., extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF + tagAndAddFunctionPairs.stream().map(Pair::getRight) // creates the corresponding ArgumentsBuilder + .reduce(Function.identity(), Function::andThen) // by stringing together functions that add the + .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())}) // appropriate arguments + .toArray(Object[][]::new); + } + + /** + * Checks expected outputs given a tag (e.g., "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF") and arguments corresponding to the + * Cartesian products generated in {@link #dataValidInputs}. 
+ * + * We perform exact-match tests of any HDF5 files produced using h5diff, which is insensitive to timestamps within the file. + * Binary serialized scorers may not be diff equivalent, so we just check for their existence. + */ + @Test(dataProvider = "dataValidInputs", groups = {"python"}) // python environment is required to run tool and to use h5diff for exact-match comparisons + public void testValidInputs(final String tag, + final ArgumentsBuilder argsBuilder) { + final File outputDir = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? EXPECTED_TEST_FILES_DIR : createTempDir("train"); + final String outputPrefix = String.format("%s/%s", outputDir, tag); + argsBuilder.addOutput(outputPrefix); + + // add arguments for positive/unlabeled annotations based on the + // extract tag (the portion of the tag preceding ".train", e.g., extract.nonAS.snpIndel.posUn), + // which gives the basename for the annotation files + final String extractTag = tag.split(".train")[0]; + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + if (tag.contains("posNeg")) { + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + final Function addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + addPositiveAnnotations.andThen(addUnlabeledAnnotations).andThen(addCalibrationSensitivityThreshold).apply(argsBuilder); + } else { + addPositiveAnnotations.apply(argsBuilder); + } + + runCommandLine(argsBuilder); + + if 
(!UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS) { + assertExpectedOutputs(tag, outputPrefix); + } + } + + private static void assertExpectedOutputs(final String tag, + final String outputPrefix) { + if (tag.contains("train.snp.")) { + assertExpectedOutputsForVariantType(tag, outputPrefix, "snp"); + assertOutputsForVariantTypeDoNotExist(outputPrefix, "indel"); + } else if (tag.contains("train.snpIndel.")) { + assertExpectedOutputsForVariantType(tag, outputPrefix, "snp"); + assertExpectedOutputsForVariantType(tag, outputPrefix, "indel"); + } else { + Assert.fail("Unknown variant-type tag."); + } + } + + private static void assertExpectedOutputsForVariantType(final String tag, + final String outputPrefix, + final String variantType) { + final String tagAndVariantType = String.format("%s.%s", tag, variantType); + final String outputPrefixAndVariantType = String.format("%s.%s", outputPrefix, variantType); + + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tagAndVariantType + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX, + outputPrefixAndVariantType + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tagAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX, + outputPrefixAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX)); + + assertScorerExpectedOutputs(tagAndVariantType, outputPrefixAndVariantType, false); + + if (tag.contains("posNeg")) { + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tagAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX, + outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX)); + assertScorerExpectedOutputs(tagAndVariantType, outputPrefixAndVariantType, true); + } else { + Assert.assertFalse(new 
File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + } + } + + private static void assertOutputsForVariantTypeDoNotExist(final String outputPrefix, + final String variantType) { + final String outputPrefixAndVariantType = String.format("%s.%s", outputPrefix, variantType); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + } + + /** + * Binary serialized scorers may not be diff equivalent, so we just check for their existence. + * We assume that checking elsewhere for equivalence of the scores that the scorers generate provides sufficient + * coverage. 
+ */ + private static void assertScorerExpectedOutputs(final String tagAndVariantType, + final String outputPrefixAndVariantType, + final boolean isNegative) { + final String positiveOrNegativeTag = isNegative ? ".negative" : ""; + final String scorerTag = outputPrefixAndVariantType + positiveOrNegativeTag; + if (tagAndVariantType.contains("BGMM")) { + Assert.assertTrue(new File(scorerTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(scorerTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + } else if (tagAndVariantType.contains("IF")) { + Assert.assertTrue(new File(scorerTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + Assert.assertFalse(new File(scorerTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + } else { + Assert.fail("Unknown model-backend tag."); + } + } + + @Test(groups = {"python"}) // python environment is required to run tool and to use h5diff for exact-match comparisons + public void testSNPOnlyModelsFromSNPOnlyAndSNPPlusIndelAnnotationsAreIdentical() { + final File outputDir = createTempDir("train"); + + final String outputPrefixSNPOnly = String.format("%s/test-snp", outputDir); + final ArgumentsBuilder argsBuilderSNPOnly = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilderSNPOnly.addOutput(outputPrefixSNPOnly); + final File positiveAnnotationsHDF5SNPOnly = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snp.pos" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotationsSNPOnly = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5SNPOnly); + addPositiveAnnotationsSNPOnly + .andThen(ADD_SNP_MODE) + .apply(argsBuilderSNPOnly); + runCommandLine(argsBuilderSNPOnly); + + final String outputPrefixSNPPlusIndel = String.format("%s/test-snpIndel", outputDir); + final ArgumentsBuilder argsBuilderSNPPlusIndel = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + 
argsBuilderSNPPlusIndel.addOutput(outputPrefixSNPPlusIndel); + final File positiveAnnotationsHDF5SNPPlusIndel = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.pos" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotationsSNPPlusIndel = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5SNPPlusIndel); + addPositiveAnnotationsSNPPlusIndel + .andThen(ADD_SNP_MODE) + .apply(argsBuilderSNPPlusIndel); + runCommandLine(argsBuilderSNPPlusIndel); + + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s %s", + outputPrefixSNPOnly + ".snp" + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX, + outputPrefixSNPPlusIndel + ".snp" + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s %s", + outputPrefixSNPOnly + ".snp" + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX, + outputPrefixSNPPlusIndel + ".snp" + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX)); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testUnlabeledAnnotationsSpecifiedWithoutCalibrationSensitivityThreshold() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final String extractTag = "extract.nonAS.snpIndel.posUn"; + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final 
Function addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + addPositiveAnnotations + .andThen(addUnlabeledAnnotations) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testCalibrationSensitivityThresholdSpecifiedWithoutUnlabeledAnnotations() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final String extractTag = "extract.nonAS.snpIndel.posUn"; + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final Function addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + addPositiveAnnotations + .andThen(addCalibrationSensitivityThreshold) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = IllegalArgumentException.class) // python environment is required to run tool + public void testPositiveAndUnlabeledAnnotationNamesAreNotIdentical() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // non-allele-specific + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final File unlabeledAnnotationsHDF5 = new 
File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.AS.snpIndel.posUn" + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // allele-specific + final Function addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + final Function addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + addPositiveAnnotations + .andThen(addUnlabeledAnnotations) + .andThen(addCalibrationSensitivityThreshold) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool + public void testPositiveAnnotationsOfSpecifiedVariantTypesNotPresent() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snp.posUn" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // contains only SNPs, but SNP+INDEL is specified + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + ADD_SNP_MODE + .andThen(ADD_INDEL_MODE) + .andThen(addPositiveAnnotations) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool + public void testUnlabeledAnnotationsOfSpecifiedVariantTypesNotPresent() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final File 
positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snp.posUn" + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // contains only SNPs, but SNP+INDEL is specified + final Function addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + final Function addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + ADD_SNP_MODE.andThen(ADD_INDEL_MODE) + .andThen(addPositiveAnnotations) + .andThen(addUnlabeledAnnotations) + .andThen(addCalibrationSensitivityThreshold) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool + public void testPositiveAnnotationForOneVariantTypeIsCompletelyMissing() { // TODO add analogous test that warning is emitted when annotation has zero variance? 
+ final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + + // we will dummy up an annotations file that contains 2 annotations (ANNOT_1 and ANNOT_2) + // for 4 variants (2 SNPs and 2 INDELs); the INDELs will all have missing (i.e., NaN) ANNOT_1 values + final List annotationNames = Arrays.asList("ANNOT_1", "ANNOT_2"); + final double[][] annotations = new double[][]{ + new double[]{1, 2}, // SNP + new double[]{3, 4}, // SNP + new double[]{Double.NaN, 2}, // INDEL + new double[]{Double.NaN, 4}}; // INDEL + final List isSubset = Collections.nCopies(4, true); + + final File positiveAnnotationsHDF5 = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile( + annotationNames, annotations, isSubset); + + try (final HDF5File positiveAnnotationsHDF5File = new HDF5File(positiveAnnotationsHDF5, HDF5File.OpenMode.READ_WRITE)) { + positiveAnnotationsHDF5File.makeDoubleArray("/labels/snp", new double[]{1, 1, 0, 0}); + positiveAnnotationsHDF5File.makeDoubleArray("/labels/training", new double[]{1, 1, 1, 1}); + positiveAnnotationsHDF5File.makeDoubleArray("/labels/calibration", new double[]{1, 1, 1, 1}); + } + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + + ADD_SNP_MODE.andThen(ADD_INDEL_MODE) + .andThen(addPositiveAnnotations) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/utils/python/PythonEnvironmentIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/utils/python/PythonEnvironmentIntegrationTest.java index fe89b721a1a..b055d8461cd 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/python/PythonEnvironmentIntegrationTest.java +++ 
b/src/test/java/org/broadinstitute/hellbender/utils/python/PythonEnvironmentIntegrationTest.java @@ -25,7 +25,7 @@ public Object[][] getDataPackagePresent() { { "pymc3", "3.1" }, { "keras", "2.2.4" }, { "h5py", "2.10.0" }, - { "sklearn", "0.22.2.post1" }, + { "sklearn", "0.23.1" }, { "matplotlib", "3.2.1" }, { "pandas", "1.0.3" }, { "argparse", null }, diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz new file mode 100644 index 00000000000..31cba1e00f8 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcf1dbda2255fbe1372d09d364835452d610822070b6b9b56b1733388aa3cd19 +size 140900871 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi new file mode 100644 index 00000000000..5fd47681849 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af32939cd4f63a0a9251a50cc5658738285d4cee4833bcf1cda6b92d90c4b99b +size 100153 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz new file mode 100644 index 00000000000..55dde2493e4 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4144805bd8fabc74f3eea39a910dbd5c24017b844c44640efda49e3b0febe693 +size 112076612 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi new file mode 100644 index 00000000000..114d43936c5 --- /dev/null +++ 
b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3484a38abb76952b02863099c383eae26d50f44514c5045992f63cc3294ebe8 +size 114295 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz new file mode 100644 index 00000000000..f75a07bd09c --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00212a6387eba259a2d060eef08f50f3de512a155ed4e746d38530310a582e14 +size 134260565 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi new file mode 100644 index 00000000000..475b5ba83a0 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c99412c88d072d494e545f56acdf621f6c960cbb8f2d734532cf9d5d11e83104 +size 133485 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 new file mode 100644 index 00000000000..6f17056f47d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba3af854cf35cffa95393038075dc3dd8907d0987896ecc15854fb928756359 +size 30408 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf new file mode 100644 index 00000000000..67a8e58fe29 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1d20489c2ff9b0ccba12a24c84d5d9fd61d62d8ffbb416593559120461b8140 +size 171038 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx new file mode 100644 index 00000000000..36818418317 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73701b5eeb43593c3d3cbdf2f8c4383e6ec6dd04c2b47086a263d234c463f2a5 +size 114263 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 new file mode 100644 index 00000000000..c9ee58d80f2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:511858197b92a96bac14d64e884a02497c958da24ada5fcebf5bd49664d78b59 +size 30512 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..409f5b378bb --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35158eb4516e8db64479ad493c5cbe225f0a04afbae6c7145bdc3fe02a2d4162 +size 38160 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf new file mode 100644 index 00000000000..c33bd749e80 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0232e0dbb277e6a5e736d01e0b17c1d0e7f0254c89f200e693393ab0962293e6 +size 171044 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx new file mode 100644 index 00000000000..737a04bb05b --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6f669764aef593e112d21f7c6414cf2a54ced0fae9be140440e7e3e19055eac +size 114265 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 new file mode 100644 index 00000000000..687decbfa2d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e836468e49819f47679eb4368f8cd52626df2e5e879dfce4314586b4a708198 +size 146832 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf new file mode 100644 index 00000000000..fef16673a21 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3060969881dbc006d167f09817924d38b6345e25976ac53880f624d94aea68e9 +size 193277 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx new file mode 100644 index 00000000000..96624e70f39 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e477367d5b1f891ed1ef171acd096aa6ceabd2feda0c6699f68260442a1750 +size 114298 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 new file 
mode 100644 index 00000000000..803746a075d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3250c8e597f71a6a2f83047a50b96d518ba350d74c3322b1d8a740256d1c4635 +size 147088 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..a6d81581282 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4415f0d40620998e808113f6bcef97ba5ab9ff8cd8148a1f330d8c21b0c08a36 +size 32304 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf new file mode 100644 index 00000000000..3dbb5880865 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf5ee7adacf635c73d7493b99cc8df19a31acbbec991fbe5173e7cd6b405491 +size 193281 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx new file mode 100644 index 00000000000..a092ca99b1c --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92e591c5ec161a5ec44d76b780882194ccd6a22075ef9905079493d87a8be12a +size 114300 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 new file mode 100644 index 00000000000..e4b20d5259d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f1a99968e68b93349dcd1b6a9671d97a060e33bbcb384fcf79d4505e0a038a5 +size 174096 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf new file mode 100644 index 00000000000..1b2a380111c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ea0ee6bca7622ae8670c2f8ee2930a3223ba5d3edd89871c8e4b5cf3cf96f9f +size 196269 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx new file mode 100644 index 00000000000..c5cab723e7d --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bff89ecb61fef3a81c0a7fd58b397d9b7a4a62d1cd282beb50bb3c6f5b2564c +size 114496 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 new file mode 100644 index 00000000000..d974c3905c0 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f90c42658e62b3717f607ed44ffed1d570b506a8d07636070bee2b4a5dea2aba +size 174480 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..bf70828b1e1 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:095a6624af5e354ad07a9695db1bdc06785e7088b30bf4696a09c64829ee2e1d +size 32080 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf new file mode 100644 index 00000000000..201b4860fa1 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fa4160640da9143000d5f3b2497ca20c02c0944ca53cfa03b1a63d935b2cf2e +size 196279 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx new file mode 100644 index 00000000000..5985428efb3 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5119e3b8e791052ee7c71d354fe2fedf432e5b42a4fd07ccd37561e68b871d1 +size 114498 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 new file mode 100644 index 00000000000..acc759367e6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:091efef8c8ac199e66b04dc610fab56b63d37c0943986f63e37e5c72b7fe2f37 +size 31088 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf new file mode 100644 index 00000000000..c33bd749e80 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0232e0dbb277e6a5e736d01e0b17c1d0e7f0254c89f200e693393ab0962293e6 +size 171044 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx new file mode 100644 index 00000000000..a93f309d0fa --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40eab65c4b0806c0f14f0178fafdda682d113bb044d94d14803f2b0b212d0d7e +size 114266 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 new file mode 100644 index 00000000000..acc759367e6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:091efef8c8ac199e66b04dc610fab56b63d37c0943986f63e37e5c72b7fe2f37 +size 31088 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..32572d5a954 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57c1cb118d8a46cf9f27fcf0dbc64e3de43b3350ae5a738571837f4145b2e8fc +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf new file mode 100644 index 00000000000..c33bd749e80 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0232e0dbb277e6a5e736d01e0b17c1d0e7f0254c89f200e693393ab0962293e6 +size 171044 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx new file mode 100644 index 00000000000..f09bcbfe1ab --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f360ae4d112ea29bea0e19a83cb13fe552785a9f1ee3473fb2b7f6cb05f50d6 +size 114268 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 new file mode 100644 index 00000000000..6f4d614aa06 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e58024a7fa06ff7483f12c7b50908ecbf96ad703b8194cddfe99e7c60be5c9 +size 153048 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf new file mode 100644 index 00000000000..665f9422ec8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fada922eff03dbd88bbe2e8d593a5cd194e657babb98db49b6c14adae0c2f9de +size 193313 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx new file mode 100644 index 00000000000..435819c3ea5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69c40e76562c2e3d8af9818b0cdeee958f6b5851d4022be55cd9ab71acf9c8b9 +size 114301 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 new file mode 100644 index 00000000000..6f4d614aa06 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e58024a7fa06ff7483f12c7b50908ecbf96ad703b8194cddfe99e7c60be5c9 +size 153048 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..d63b214b697 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4498cdde82d8b9a03dda5c5057227b0978f1da70b09b26f174b10970840597c +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf new file mode 100644 index 00000000000..665f9422ec8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fada922eff03dbd88bbe2e8d593a5cd194e657babb98db49b6c14adae0c2f9de +size 193313 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx new file mode 100644 index 00000000000..bfc8b5c6560 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e1ababc168beb7e1acd58011d0870d886f3c0b50e8d1e62a6c9611a3a7dfe2e +size 114303 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 new file mode 100644 index 00000000000..730b64eae2f --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f05dd6973390ce9a2069fa8593107853d38dedf88a8db87a8500dbb01329fc9 +size 180992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf new file mode 100644 index 00000000000..abec25cff9d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84b539e52f7e0f24e840845ce3d090e4f55e7444d50c2a44177bb62041fb172a +size 196311 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx new file mode 100644 index 00000000000..0af9bd98f39 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d528a399f3d230ee8275e71e0878c1b2d132d010c57e78f94576004b87297425 +size 114502 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 new file mode 100644 index 00000000000..75691a1e949 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b00a4acfd4c6cc01e3539498bde4bab2279cffc88e8de6217e26e9db64179132 +size 180992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..3fcef28ea81 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ced17111b7819b00fdf48da669e717538ebca89913ae2ee09833dc4f5ef6890 +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf new file mode 100644 index 00000000000..abec25cff9d --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84b539e52f7e0f24e840845ce3d090e4f55e7444d50c2a44177bb62041fb172a +size 196311 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx new file mode 100644 index 00000000000..65a56af440f --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d7cc7e967cad1911f637d20172745d65adc0a0f804c578e85378028cc80a044 +size 114504 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf new file mode 100644 index 00000000000..5bb2ef3ab94 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea6cbe230a5a18f3447cfd5d29ce2787fd4a625128ab147ce0a1b207e577d50 +size 2013818 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx new file mode 100644 index 00000000000..6926fb95f58 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f40a26b8528447a9d1b1154643cbd682a154e91a67e6dda58cf11a620a1af3dc +size 5387 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz new file mode 100644 index 00000000000..4157ac3128e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5804bcbfb060e10c3aa841a4a92acfbafbf1b24c88c87fceaa0d9089eee699e +size 127853 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi new file mode 100644 index 00000000000..bc59b8a6e25 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d09b831af6a1b8585c26da1b29d131f8983f121703c5131a6596a1e81e0408f +size 2141 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz new file mode 100644 index 00000000000..5a556e7a0d7 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c6559d5c1567042ddb0fb05d7a5b7d9a07c56c61d8d21adfa85c15bf44e24fa +size 132259 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi new file mode 100644 index 00000000000..a7a45835346 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf9f97389369ac5e5a41420e58aaa3fa0a5f5edc21a6bd04b7e18c5bc21c914 +size 2542 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz new file mode 100644 index 00000000000..187e5f24e86 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b610a0aeccbec80b69572abcb89e1d3c5e96bc7df22b38b8dccf0b3c6b0ed1b5 +size 45717 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi new file mode 100644 index 00000000000..582b14d068e --- 
/dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91926cca5a1c36a336f54ba918d0fe0581a6f6e89421a971c78c39aa9e5dd3e6 +size 2040 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz new file mode 100644 index 00000000000..38011d42e49 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5063e401c67443ce0c12c1534b3b1284fe690c826c8987d0430e516193d062ce +size 49655 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi new file mode 100644 index 00000000000..27bd4edcff5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a738a699beff718443d36022bf9fb35686498f63d7f8e5c40f79ef26e3d5908 +size 2465 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 new file mode 100644 index 00000000000..947ccf6cdf6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ecfde9d89634bdeab65242b0b9fc64d2c3607cec0b4533ac2e6b8f71c8fde1e +size 736656 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 new file mode 100644 index 00000000000..c52cc89bfe4 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0746acbff61c8ef95225e010964b69f48465f35185a1f6af576bb53ab726314d +size 35136 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf new file mode 100644 index 00000000000..d08cc10df21 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fc110dfc24a356c280ffd772af354a4cee06eb46e8e2f321638d1faa882b17bb +size 2227437 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx new file mode 100644 index 00000000000..01ca0f42445 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67ee6520f4abc8f9f659e3704e2cf45318a46a963297eec9c5534cb352584718 +size 119222 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 new file mode 100644 index 00000000000..bfca590ccc5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39872e8a5516b5480015190f646a94dc22eeb4b58b337c46bbab83d67fd5b789 +size 822288 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 new file mode 100644 index 
00000000000..9c9224beed2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a5b0cd60e2dc9387ceedce1e42bebaac932af2364228944f00be0626bad167e +size 38440 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf new file mode 100644 index 00000000000..8af03459e62 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bb3acb4c30df8259c64e5c47ba7ffaf416fa6f7fa271e8cbd216a555ee62a22 +size 2243539 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx new file mode 100644 index 00000000000..485769c9598 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29bdcaa085904edb984ca9ab2724efc9142e63256291cead4e2407080cf87196 +size 119227 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 new file mode 100644 index 00000000000..fbf0990ee70 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b25607c74d197a7116421014925ad4dcc10c326e561b193b1e2eb71152598369 +size 766368 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 new file mode 100644 index 00000000000..ee4850c9acb --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afb44cc0b2f1c821d4b79f4c0145edc5fc662d06ce13239fd2077e1d1e045783 +size 34960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf new file mode 100644 index 00000000000..e46bbcf2a15 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:993e2d40dea8558c001a7321a4bbe4804877b2de36c3a266416310446c915ccb +size 2226076 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx new file mode 100644 index 00000000000..9be1548020d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a56007a28f971a86349be709cb2b5ce3821ef5f3ae19ff0f9dcd2841021a510 +size 119225 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 new file mode 100644 index 00000000000..1378a5e61da --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38fb5c443979d9468de740c26c1e3b2d8f27938c1ffb43ebf48ae1bef94196b3 +size 829672 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 new file mode 100644 index 00000000000..58244d511a7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb53ebfca7a737737a1d01ff541d414c3cef07d507b3e360d360079239d723a +size 37720 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf new file mode 100644 index 00000000000..4af1921ce48 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05f85d264a457cdd81896bde03f51b2369343da5ade21b1c8df183a2b7e8f974 +size 2242450 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx new file mode 100644 index 00000000000..34133ce42ad --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cba3840238dc7d6c7d85eeda892da51abdccf1e80c60b9030fa781da42d16b9f +size 119230 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..449a7e34730 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2e3672445358ec0259fc96c72a2c32781e21b584d869c49a6356bb1869a577d +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..c262ca0bc24 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b439960dcb18984ea43a0cc6f918eeb54ea796de72730264039081cb8b32ae4 +size 356566 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..114008b6174 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a174a894dee69236e7767bff18a752981e14b9d891efe05843137bcf4b67cffd +size 506091 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..794bef0ac90 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e6805ffd01e5b0420f44cd35a1e41faa96c4a4252c6487ef000dd98290d99e +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..2f8fda2a056 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:11c14051f3c1d2ab1481a0d77f2f127d5cb5fa5f0978879f0225a83c86e70456 +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..b803f7adaa2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d1b3de26c39fa919cede168b2b5e5ce560a1c10fae5db82da4108467ac96ce +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 100644 index 00000000000..79698645b6e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5955e6395e1e3ca4843a395bd87b0664cbe3d931f27a1a9f68f54f93a6825b +size 371024 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..00f799db2fe --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066e3f2e993cf5485d3c650cb10bc99a8b082f7aac4ee427e2eda925af758b14 +size 514533 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..3a9c7c2ef55 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24c3ed12e070196e9178f2da8876d58f2d0211d3bc424291bb9754f10fecaa78 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..c522bdbdc76 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:63ac52c99cb736b74a4ddadd3dfa529a7a5f0524f6a4733181a5f83c98daca79 +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..058663f5a25 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:843f2fcfb16e5ff1cc69f135129881e560e19adcf4ae20d43a41e4f86ae0cea9 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl new file mode 100644 index 00000000000..b49deaadde6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2b2d3350bc640f19bc81f527ea45f78ff0c6069fb7acf45524ee26e69f4668c +size 506091 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 new file mode 100644 index 
00000000000..083847575cc --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3479fc04ae522a9e5ea4558b484a723df7f40b07401196e36fdfb1412733b99 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..1e4e7af4c24 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219b339ad1fe955f6fa20280b4a0a993d6c4dcfa5e7be025cf62bf9937a57e7d +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..00f799db2fe --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066e3f2e993cf5485d3c650cb10bc99a8b082f7aac4ee427e2eda925af758b14 +size 514533 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..02ddb4fb358 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b2c47a9586f1a4c4c37a62c902d91683392bf37e575f925266815ddc9bcc6e8 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..11e985264af --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e52232e30ee87672170c3c6d38d1d422ddafcd3219081a912b7227d69f445e7 +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl new file mode 100644 index 00000000000..12118c548f9 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d5205329aba6be796f5a640f71598bcd9f10b31f871e613aa63d7dccfcc8c2 +size 130367 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl new file mode 100644 index 00000000000..5b629aeed4e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6872a50edc608ffc632d5de1849384e05f916d0df64d391c68fc92abe0918d2e +size 235813 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..7a2b7786b01 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8669a0d4ca10acfc5d969021d06bbafa2efc14af057ca968d365a2fe9dedac45 +size 2880 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..69d2a8d5d22 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:511e968d47dacb11b03439678f2a6f3bf54cde4c0e158d9ab99bfcb9478b5e10 +size 2472 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..e703b28a4cb --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78b502beb500979c2d8e24f5f60629489da00984c0935929643a3dbade703086 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..cc692c3e418 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a71085c8ff3f9266434b07166abae0977028cf6c3609656e557fa924ab37d22 +size 356566 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..6c73d0e2d5b --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0c0d98b5a10f80b43e857781e3eb2d066d5b46bbf8aace671c1e848cfe7a7a +size 506091 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..129cbdc80d2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe6ddc2bad139260fc01d783ebb08613449898568dc01542f836395d7666c64 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..ccf01a0c470 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14e3e8e2a0eab4992bf72bca0600109fada5f93848d244d1b1c097de2aeaa2ab +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..ead5e9f41bd --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e37fcda4dfd551f30a889a98862e9f6ed42a305c14d1f552a07567aa9b18441 +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl new file mode 100644 index 00000000000..06ba7f87603 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28c4176bea3fec56b992ecb0259d191ebe7238615a8a803eaf1c2c9a61adccf5 +size 133823 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl new file mode 100644 index 00000000000..86600de5f2c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de142d06f85e0303ff046377c3eb51742b8556cc3531d1880d7d37a571a1ae7 +size 240055 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..8dc0a46dec7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c47e4959c7436c4aea44694ea2c401ab96624843b5f682691c65ac9435d3656 +size 2880 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..8182344fc6d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e738c016efdfd9adacb2df9021f655bd0107a6a73cc8c788066b94368eca1c83 +size 2472 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..8b8393f2fa8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35da3cdd364d94815492009ef56b545a1221fd8ec71003d5c1a52174b6156613 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 
100644 index 00000000000..79698645b6e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5955e6395e1e3ca4843a395bd87b0664cbe3d931f27a1a9f68f54f93a6825b +size 371024 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..00f799db2fe --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066e3f2e993cf5485d3c650cb10bc99a8b082f7aac4ee427e2eda925af758b14 +size 514533 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..eb8ac9e0552 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:367eb408b122fd23b49ee9c464e28f188851406b16be96a004b58999a26c87d8 +size 5984 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..0ef22ca8754 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08457ce521c97cd9fe6a4a8be79d4b7f74c155382d8d43d3db0963ef3e1b15c0 +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..8a9731e5846 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ccfe56dcd1890b70464a5a9c61e550f1f06a340d8d40c8c3637b573a6af6ae +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl new file mode 100644 index 00000000000..28519a5a7f5 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4e1bba0d4e80002b36ae9c6b593207bd2e400a4a88f0f2d69bdcfd7665f18fe +size 235812 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..57490375485 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f85b85807a022ff37529099727d0475e9e4fe6c796d6ef69ec25037e4c4ffff +size 2880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..f678f3ac45a --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca24136189690867a758ff1102ddd1168d49379bd5cf31c5e809f3049088ca4e +size 4960 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl new file mode 100644 index 00000000000..f30afeafbb7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aaf6d8e05d906b80771f1165c75792451b86a52f7759774ee55175bea288c6d +size 506090 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..b3b1cff2b66 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d93a6ff01ea21375c0b1998197c6bc2d25ef876d050abee095497a939883c0f2 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..a51ead9a242 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:829946ab39fde054c01fd38c14907141eee1c2fb4af5ba7dc93a1dc432ea3483 +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl new file mode 100644 index 00000000000..86600de5f2c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de142d06f85e0303ff046377c3eb51742b8556cc3531d1880d7d37a571a1ae7 +size 240055 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..a74970fd2fc --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ada773b0af0a3031c8f8a7cf51144929db6cff082d834aeffe17a796e48da7a +size 2880 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..3c889afe1a3 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b8a31718b25adc789a14d952403706a497e799f677fce94c0e5abcb8f3ca75a +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..00f799db2fe --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066e3f2e993cf5485d3c650cb10bc99a8b082f7aac4ee427e2eda925af758b14 +size 514533 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 
00000000000..a7a57c5e6e0 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b72e1038e89fc1e16e17a32194e59bc90cb2f479e60ed31ad891fc4821f6fd6 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..1af4242bbec --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7687aa701fe0fdb52d86a27ccc12a6cc8bb2b57906ea2335146396bfae47ea1b +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..f99de98d4fb --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7d481a06e9c5d27b8a322712818731b508ecb09309cacff1c5f24df1077d975 +size 368366 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..21c88876f37 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f4f417b26d9478fb36ac5372f1634140a4114f09d207a434f8d582a5936e9b9 +size 556675 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..17e3ab4ef97 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048b4690d9f1fdce1e3dbabf995e8e306965ea8e00c21a92129d67e0e3b8fb5c +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..323cf93db29 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e4033c901c80e97d4668d38e40b6677ec68d5d6f960d498c90d6311c971d6ae +size 3168 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..9638837ac77 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6eb7db94e1e78d2aa4e2bad08eb92e0df1ae88ef29e27f7f81362ac56e4faa +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 100644 index 00000000000..02016de49a7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:373207a645b1caa64fe15fd4fec77556c87c11f0f304b8e4920094a59cae89eb +size 359135 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..ac1fe518303 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4 +size 525312 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..6972268a95c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:443ed54e4403ba517c2370e18f95200f9b9dc4648c914424344626bacab6c4f2 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..90420cf2917 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cd7216a8a13adb6640b1e91d939600bc664202964b0cc74d76d8070c3422b75 +size 3168 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..b8c18492487 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f0b956ad8c517d885254ba6f392713e098a92aeeccc699c5a17f53c1504f119 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl new file mode 100644 index 00000000000..a9a9086bb5e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cf2f338bb130d899eca8ceab17339f6f166958642101de301fb8458b11131a1 +size 556675 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..486c29310ee --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:537cb3879bde1388c6eca2035682a0c7b771ce790b523649e65f167a23eed255 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..8fe0c3fa585 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b24f7a4b85ad16f63043285fb31cd580e3498629bab0ef257a98d7ec318471 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..ac1fe518303 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4 +size 525312 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..075038efb0f --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:742c4374a393e2f86ad836001576d77210dc62cdf11cf8748abffe4399c02d0f +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..4ae3f77a6dd --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298e36701a70226f3720cd40cd6ca8f37404a807cbd193c23d764141f5594f36 +size 2664 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl new file mode 100644 index 00000000000..8c7e18e918e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff46e688731825bb03a6c9504ed7847c998c129a1e13c507a14f7adb56d733ad +size 108247 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl new file mode 100644 index 00000000000..2aba7f5c93a --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe5ca4414318a540488afbdea99293e0f67b02725839c13faa5b3ff39b959e7e +size 259163 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..004a5bdb157 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7243a9e90836489a979905a386811a688bd07968c115063351b77bf91c72efc3 +size 2880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..9cc88998aef --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00393e27ad2c98fd4c36a003d7dcbc175d31d8346ace63e52b458df76a8d7457 +size 2496 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..a52edeb5b40 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b3040e1926e7b59c623e2b930e16556d26dd91c4d586120f7db156f3f2f14fe +size 4960 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..df2c423b7d9 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4cbb64dbf96f8cbc10908aed4c8ec2e3fcf01af7a3512c90e48aa743af3bf28 +size 368366 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..9192d59204c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86e04d419ae8a211acbead86c467cd8c3578c3312f31fdc5600eaefd040ffc32 +size 556675 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..483bfe123ec --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8007c65df2a62f1b12cb2bd9d0818ea106edf2fc18610318e1e09def0f1bd77a +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..b7ab0c0c576 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6860775947699fe2df688d0c17321de501ef18c12c87dbb430221ba1c27e56b +size 3168 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..624a515e7f8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:223ee064aed2de5a2e0c7a773b08d730666fa28b49181984dc8f16d07233b2ef +size 2664 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl new file mode 100644 index 00000000000..54aa186cf73 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0b377bad272e94bf11bca14ed7a7dd3c67296f347509d272b9538b695579199 +size 132823 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl new file mode 100644 index 00000000000..1142e1f4599 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426cec56e16eec10f47c824797c035772d1ebf3cf4f73972e8a541deca622cd3 +size 248813 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 new file mode 100644 
index 00000000000..c91fcf083b5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f80beea7f769595a56aa8af6d90335626fa5c93fe5f61761e998f59e774f3104 +size 2880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..64fbf36622e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ba9bf669a916ee10750cf8cf78968af0372a5d272640be9fe3fba97fc4e6059 +size 2496 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..6a831bf1e51 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:dce6eb6a9175112940d97573e3e99818a041f4c8311bd0d790b11bad7153cc90 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 100644 index 00000000000..02016de49a7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:373207a645b1caa64fe15fd4fec77556c87c11f0f304b8e4920094a59cae89eb +size 359135 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..ac1fe518303 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4 +size 525312 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..45b09bcdafc --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9885e1ace48f972b88d808fe9dcf31aae828b0f327909c3922305d48659c9516 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..3baaddc00b8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60dfec8ee105371c8b69b4c6790f6920c1f94adfad1bca58fa353a016224c3c4 +size 3168 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..9937d988ad3 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c9b6a3bf6461653330c84b6ed491fbaf065e87c00b707409009a48e55dc3546 +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl new file mode 100644 index 00000000000..7ab5b7071f4 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb95c34fdfa9bb4126d27355e39708de5d273694ce2edf639a02c324608bfdd2 +size 259163 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..61cd73704ad --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b1c27c9bb2587fed510eb347fe53332828c2ee0f1cc6f22f8e259d275c5f877 +size 2880 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..91c0e50cbd1 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb11237cffa3309f53869996111de5be91efacc8b210d43101535d90d6195a4 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl new file mode 100644 index 00000000000..786b299758d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9113bec7467a7b597e6af65108fd19f2260510fc0174fa5bf071ca1b837e0b28 +size 556675 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..7f5f3e9342d --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21a219096d12b0122e0e7dc7962d26911289d2e17377d606bb504cb3bf87daa6 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..8a91db1f5b2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e3fc94a375bde9284a219c1c2eb2b204731af0c4fce15426f7903f6fed43ecb +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl new file mode 100644 index 00000000000..1142e1f4599 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426cec56e16eec10f47c824797c035772d1ebf3cf4f73972e8a541deca622cd3 +size 248813 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..e0fdf55eb20 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ae00383d5302dfb129e6e7c0c61cb115e74fade18b0b62bba6284f876fa8277 +size 2880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..1250aeddd2a --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a2fdb209c8814f63837d30bca8883ed79063d66fe22516f4dc6cb54542d743e +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl new file 
mode 100644 index 00000000000..ac1fe518303 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4 +size 525312 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..4f95113bc2e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c063a077702f5979525286ff8bda09d9f6133e251babf27d0ccb9e384578fc0 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json new file mode 100644 index 00000000000..6fbb7d105da --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddca401b3f0fdceedc96946c8ced9870984f1ae34ce5e5626cc4b08152639532 +size 23