From e929409612b358ec9b05af33ab1b345ac0cd170e Mon Sep 17 00:00:00 2001
From: orlicohen <107129422+orlicohen@users.noreply.github.com>
Date: Thu, 30 Jun 2022 15:52:19 -0400
Subject: [PATCH] VariantsToTable: Include all fields when none are specified
(#7911)
VariantsToTable now outputs all fields declared in the VCF header when no fields are selected.
Added integration tests to cover this new functionality.
Fixes #7677
---
.../walkers/variantutils/VariantsToTable.java | 47 ++++++-
.../VariantsToTableIntegrationTest.java | 41 ++++++
.../VCFWithGenotypes_1000G.phase3.snippet.vcf | 48 +++++++
...typesWithFormatField_dbsnp_138.snippet.vcf | 118 ++++++++++++++++++
.../VCFWithoutGenotypes_dbsnp_138.snippet.vcf | 117 +++++++++++++++++
.../expected.noFieldsSpecifiedNoSamples.table | 4 +
...xpected.noFieldsSpecifiedWithSamples.table | 4 +
7 files changed, 373 insertions(+), 6 deletions(-)
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithGenotypes_1000G.phase3.snippet.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithoutGenotypesWithFormatField_dbsnp_138.snippet.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithoutGenotypes_dbsnp_138.snippet.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/expected.noFieldsSpecifiedNoSamples.table
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/expected.noFieldsSpecifiedWithSamples.table
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java
index c8ae76086a4..db5d4e2e63c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java
@@ -2,9 +2,7 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.vcf.VCFConstants;
-import htsjdk.variant.vcf.VCFHeader;
-import htsjdk.variant.vcf.VCFHeaderLineCount;
+import htsjdk.variant.vcf.*;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Advanced;
@@ -12,7 +10,6 @@
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import picard.cmdline.programgroups.VariantEvaluationProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
@@ -38,7 +35,8 @@
* This tool extracts specified fields for each variant in a VCF file to a tab-delimited table, which may be easier
* to work with than a VCF. By default, the tool only extracts PASS or . (unfiltered) variants in the VCF file. Filtered variants may be
* included in the output by adding the --show-filtered flag. The tool can extract both INFO (i.e. site-level) fields and
- * FORMAT (i.e. sample-level) fields.
+ * FORMAT (i.e. sample-level) fields. If the tool is run without specifying any fields, it defaults to including the
+ * standard VCF columns (except INFO itself) plus all INFO and FORMAT fields declared in the VCF header.
*
*
* INFO/site-level fields
@@ -100,6 +98,12 @@
* 1 65068538 SNP 49,0 35,4
* 1 111146235 SNP 69,1 77,4
*
+ *
+ * gatk VariantsToTable \
+ * -V input.vcf \
+ * -O output.table
+ *
+ * would produce a file that includes all fields declared in the VCF header.
*
* Notes
*
@@ -212,9 +216,39 @@ public void onTraversalStart() {
inputHeader = getHeaderForVariants();
outputStream = createPrintStream();
+ // if no fields specified, default to include all fields listed in header into table
+ if(fieldsToTake.isEmpty() && genotypeFieldsToTake.isEmpty() && asFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()){
+ logger.warn("No fields were specified. All fields declared in the VCF header will be included in the output table.");
+
+ // add all mandatory VCF fields (except INFO)
+ for(VCFHeader.HEADER_FIELDS headerField : VCFHeader.HEADER_FIELDS.values()){
+ if(!headerField.name().equals(VCFHeader.HEADER_FIELDS.INFO.name())) {
+ fieldsToTake.add(headerField.name());
+ }
+ }
+
+ // add all INFO fields present in VCF header
+ for (final VCFInfoHeaderLine infoLine : inputHeader.getInfoHeaderLines()) {
+ fieldsToTake.add(infoLine.getID());
+ }
+
+ // add all FORMAT fields present in VCF header
+ for (final VCFFormatHeaderLine formatLine : inputHeader.getFormatHeaderLines()) {
+ // ensure GT field listed as first FORMAT field
+ if(formatLine.getID().equals(VCFConstants.GENOTYPE_KEY)) {
+ genotypeFieldsToTake.add(0, formatLine.getID());
+ }
+ else {
+ genotypeFieldsToTake.add(formatLine.getID());
+ }
+ }
+ }
+
+ // if no genotype fields were requested (or declared in the header when defaulting), set samples to empty
if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()) {
samples = Collections.emptySortedSet();
- } else {
+ }
+ else {
final Map vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
samples = VcfUtils.getSortedSampleSet(vcfHeaders, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
@@ -238,6 +272,7 @@ public void onTraversalStart() {
outputStream.println("RecordID\tSample\tVariable\tValue");
} else {
final List fields = new ArrayList<>();
+
fields.addAll(fieldsToTake);
fields.addAll(asFieldsToTake);
fields.addAll(createGenotypeFields());
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java
index b28094cf72e..10c922a267a 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java
@@ -5,6 +5,7 @@
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.testng.annotations.Test;
+import java.io.File;
import java.io.IOException;
import java.util.Arrays;
@@ -236,4 +237,44 @@ public void testMoltenOutputWithMultipleAlleles() throws IOException {
spec.setTrimWhiteSpace(false);
spec.executeTest("testMoltenOutputWithMultipleAlleles", this);
}
+
+ @Test
+ public void testNoFieldsSpecifiedNoSamples() throws IOException {
+ final File inputFile = new File(getToolTestDataDir(), "VCFWithoutGenotypes_dbsnp_138.snippet.vcf");
+ final File outputFile = createTempFile("noFieldsSpecifiedOutput", ".table");
+ final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedNoSamples.table");
+ // Only the input and output are passed: no field arguments, so the tool must pick its fields from the VCF header.
+ final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(),
+ "-O", outputFile.getAbsolutePath()};
+ runCommandLine(args);
+ // The table written by the tool must match the checked-in expected table.
+ IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
+ }
+
+ @Test
+ public void testNoFieldsSpecifiedWithSamples() throws IOException {
+ final File inputFile = new File(getToolTestDataDir(), "VCFWithGenotypes_1000G.phase3.snippet.vcf");
+ final File outputFile = createTempFile("noFieldsSpecifiedWithSamplesOutput", ".table");
+ final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedWithSamples.table");
+ // No field arguments are given; this input has FORMAT header lines and sample columns.
+ final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(),
+ "-O", outputFile.getAbsolutePath()};
+ runCommandLine(args);
+ // The table written by the tool must match the checked-in expected table.
+ IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
+ }
+
+ @Test
+ public void testNoFieldsSpecifiedFormatFieldInHeaderNoSamples() throws IOException {
+ final File inputFile = new File(getToolTestDataDir(), "VCFWithoutGenotypesWithFormatField_dbsnp_138.snippet.vcf");
+ final File outputFile = createTempFile("noFieldsSpecifiedNoSamplesOutput", ".table");
+ final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedNoSamples.table");
+ // No field arguments are given; this input declares a FORMAT header line but has no sample columns.
+ final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(),
+ "-O", outputFile.getAbsolutePath()};
+ runCommandLine(args);
+ // Output is compared against the same expected table as testNoFieldsSpecifiedNoSamples.
+ IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
+ }
+
}
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithGenotypes_1000G.phase3.snippet.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithGenotypes_1000G.phase3.snippet.vcf
new file mode 100644
index 00000000000..eaf1060e2e7
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithGenotypes_1000G.phase3.snippet.vcf
@@ -0,0 +1,48 @@
+##fileformat=VCFv4.2
+##FILTER=
+##FILTER=
+##FILTER=
+##FILTER=
+##FILTER=
+##FILTER=
+##FILTER=
+##FILTER=
+##FILTER=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+##GATKCommandLine=
+##GATKCommandLine=
+##GATKVersion=2.5-191-g02f8427
+##HaplotypeCaller="analysis_type=HaplotypeCaller input_file=[/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.bam.list] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/humgen/1kg/processing/production_wgs_final/chr20/.queue/scatterGather/call.for.1000G-1-sg/temp_0001_of_1000/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/humgen/1kg/reference/human_g1k_v37_decoy.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=200 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub graphOutput=null bamOutput=null bam_compression=null disable_bam_indexing=null generate_md5=null simplifyBAM=null bamWriterType=CALLED_HAPLOTYPES 
dbsnp=(RodBinding name= source=UNBOUND) comp=[] annotation=[ClippingRankSumTest, DepthPerSampleHC] excludeAnnotation=[SpanningDeletions, TandemRepeatAnnotator] heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=10.0 standard_min_confidence_threshold_for_emitting=10.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=6 input_prior=[] contamination_fraction_to_filter=0.05 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null useDebruijnAssembler=false minKmerForDebruijnAssembler=11 onlyUseKmerSizeForDebruijnAssembler=-1 kmerSize=[10, 25] dontIncreaseKmerSizesForCycles=false numPruningSamples=3 maxPathsPerSample=8 dontRecoverDanglingTails=false minPruning=2 gcpHMM=10 includeUmappedReads=false useAllelesTrigger=false useFilteredReadsForAnnotations=false phredScaledGlobalReadMismappingRate=45 maxNumHaplotypesInPopulation=25 mergeVariantsViaLD=false pair_hmm_implementation=LOGLESS_CACHING keepRG=null justDetermineActiveRegions=false dontGenotype=false errorCorrectKmers=false debug=false debugGraphTransformations=false useLowQualityBasesForAssembly=false dontTrimActiveRegions=false dontUseSoftClippedBases=false captureAssemblyFailureBAM=false allowCyclesInKmerGraphToGeneratePaths=false errorCorrectReads=false kmerLengthForReadErrorCorrection=25 minObservationsForKmerToBeSolid=20 activityProfileOut=null activeRegionOut=null activeRegionIn=null activeRegionExtension=null forceActive=false activeRegionMaxSize=null bandPassSigma=null min_mapping_quality_score=20 filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##contig=
+##contig=
+##source=SelectVariants
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 HG00099
+20 10000054 . CTTTG C 504.42 PASS AC=0;AF=0.00;AN=6;BaseQRankSum=-0.975;ClippingRankSum=-2.925;DP=22;FS=1.899;InbreedingCoeff=0.0592;MQ=59.27;MQ0=0;MQRankSum=-3.212;QD=2.43;ReadPosRankSum=-0.264;VQSLOD=5.10;culprit=FS GT:AD:DP:GQ:PL 0/0:2,0:2:6:0,6,119 0/0:10,0:10:29:0,29,592 0/0:10,0:10:30:0,30,598
+20 10000107 . T C 263.95 PASS AC=0;AF=0.00;AN=6;BaseQRankSum=-0.444;ClippingRankSum=-3.132;DP=25;FS=0.948;InbreedingCoeff=-0.0102;MQ=59.19;MQ0=0;MQRankSum=2.292;POSITIVE_TRAIN_SITE;QD=10.56;ReadPosRankSum=0.055;VQSLOD=7.76;culprit=FS GT:AD:DP:GQ:PL 0/0:5,0:5:15:0,15,387 0/0:13,0:13:42:0,42,786 0/0:7,0:7:24:0,24,548
+20 10000117 . C T 329458.17 PASS AC=1;AF=0.167;AN=6;BaseQRankSum=10.505;ClippingRankSum=-20.658;DP=28;FS=8.305;InbreedingCoeff=0.1727;MQ=59.17;MQ0=0;MQRankSum=2.689;POSITIVE_TRAIN_SITE;QD=25.46;ReadPosRankSum=-4.688;VQSLOD=3.19;culprit=ReadPosRankSum GT:AD:DP:GQ:PL 0/0:5,0:5:15:0,15,189 0/1:8,8:16:99:254,0,231 0/0:7,0:7:21:0,21,271
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithoutGenotypesWithFormatField_dbsnp_138.snippet.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithoutGenotypesWithFormatField_dbsnp_138.snippet.vcf
new file mode 100644
index 00000000000..a595955ce8b
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithoutGenotypesWithFormatField_dbsnp_138.snippet.vcf
@@ -0,0 +1,118 @@
+##fileformat=VCFv4.2
+##FILTER=
+##FORMAT=
+##GATKCommandLine.SelectVariants=
+##GATKCommandLine=
+##GATKCommandLine=
+##INFO=
+##INFO=
+##INFO=
+##INFO== 1% and for which 2 or more founders contribute to that minor allele frequency.">
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##dbSNP_BUILD_ID=138
+##fileDate=20130806
+##phasing=partial
+##source=SelectVariants
+##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf
+#CHROM POS ID REF ALT QUAL FILTER INFO
+20 10000092 rs183046704 T G . . CAF=[0.9991,0.0009183];COMMON=0;KGPROD;KGPhase1;RS=183046704;RSPOS=10000092;SAO=0;SSR=0;VC=SNV;VP=0x050000000001000014000100;WGT=1;dbSNPBuildID=135
+20 10000107 rs188245245 T C . . CAF=[0.9982,0.001837];COMMON=0;KGPROD;KGPhase1;RS=188245245;RSPOS=10000107;SAO=0;SSR=0;VC=SNV;VP=0x050000000001000014000100;WGT=1;dbSNPBuildID=135
+20 10000117 rs4816203 C T . . CAF=[0.3682,0.6318];COMMON=1;G5;G5A;GNO;KGPROD;KGPhase1;KGPilot123;OTHERKG;RS=4816203;RSPOS=10000117;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=0x05010000000117011e000100;WGT=1;dbSNPBuildID=111
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithoutGenotypes_dbsnp_138.snippet.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithoutGenotypes_dbsnp_138.snippet.vcf
new file mode 100644
index 00000000000..628a1d05d1e
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/VCFWithoutGenotypes_dbsnp_138.snippet.vcf
@@ -0,0 +1,117 @@
+##fileformat=VCFv4.2
+##FILTER=
+##GATKCommandLine.SelectVariants=
+##GATKCommandLine=
+##GATKCommandLine=
+##INFO=
+##INFO=
+##INFO=
+##INFO== 1% and for which 2 or more founders contribute to that minor allele frequency.">
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=