responding to review comments & added integration test

broadinstitute · Jun 29, 2022 · 9b956ba · 9b956ba
1 parent 8a33af3
commit 9b956ba
Show file tree

Hide file tree

Showing 6 changed files with 315 additions and 8 deletions.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java
@@ -35,7 +35,8 @@
  *     This tool extracts specified fields for each variant in a VCF file to a tab-delimited table, which may be easier
  *     to work with than a VCF. By default, the tool only extracts PASS or . (unfiltered) variants in the VCF file. Filtered variants may be
  *     included in the output by adding the --show-filtered flag. The tool can extract both INFO (i.e. site-level) fields and
- *     FORMAT (i.e. sample-level) fields.
+ *     FORMAT (i.e. sample-level) fields. If the tool is run without specifying any fields, it defaults to including all fields
+ *     declared in the VCF header.
  * </p>
  *
  * <h4>INFO/site-level fields</h4>
@@ -97,6 +98,12 @@
  *     1      65068538   SNP    49,0          35,4
  *     1      111146235  SNP    69,1          77,4
  * </pre>
+ * <pre>
+ *     gatk VariantsToTable \
+ *     -V input.vcf \
+ *     -O output.table
+ * </pre>
+ * <p>would produce a file that includes all fields declared in the VCF header.</p>
  *
  * <h3>Notes</h3>
  * <ul>
@@ -211,7 +218,7 @@ public void onTraversalStart() {
 
         // if no fields are specified, default to including all fields declared in the header in the table
         if(fieldsToTake.isEmpty() && genotypeFieldsToTake.isEmpty() && asFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()){
-            logger.warn("No fields were specified. All fields will be included in output table.");
+            logger.warn("No fields were specified. All fields declared in the VCF header will be included in the output table.");
 
             // add all mandatory VCF fields (except INFO)
             for(VCFHeader.HEADER_FIELDS headerField : VCFHeader.HEADER_FIELDS.values()){
@@ -228,7 +235,7 @@ public void onTraversalStart() {
             // add all FORMAT fields present in VCF header
             for (final VCFFormatHeaderLine formatLine : inputHeader.getFormatHeaderLines()) {
                 // ensure GT field listed as first FORMAT field
-                if(formatLine.getID().equals("GT")) {
+                if(formatLine.getID().equals(VCFConstants.GENOTYPE_KEY)) {
                     genotypeFieldsToTake.add(0, formatLine.getID());
                 }
                 else {
@@ -238,7 +245,7 @@ public void onTraversalStart() {
         }
 
         // if no genotype fields are to be taken (none specified, or none added from the header by default), set samples to empty
-        if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty() && (!fieldsToTake.isEmpty() || !asFieldsToTake.isEmpty())) {
+        if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()) {
                 samples = Collections.emptySortedSet();
         }
         else {

diff --git a/.../broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java b/.../broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java
@@ -240,8 +240,8 @@ public void testMoltenOutputWithMultipleAlleles() throws IOException {
 
     @Test
     public void testNoFieldsSpecified() throws IOException {
-        final File inputFile = new File(getToolTestDataDir(), "extraheaderlinesdeleted_dbsnp_138.snippet.vcf");
-        final File outputFile = createTempFile(getToolTestDataDir(), "noFieldsSpecifiedOutput.table");
+        final File inputFile = new File(getToolTestDataDir(), "VCFWithoutGenotypes_dbsnp_138.snippet.vcf");
+        final File outputFile = createTempFile("noFieldsSpecifiedOutput", ".table");
         final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecified.table");
 
         final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(),
@@ -253,8 +253,8 @@ public void testNoFieldsSpecified() throws IOException {
 
     @Test
     public void testNoFieldsSpecifiedWithSamples() throws IOException {
-        final File inputFile = new File(getToolTestDataDir(), "1000G.phase3.snippet.vcf");
-        final File outputFile = createTempFile(getToolTestDataDir(), "noFieldsSpecifiedWithSamplesOutput.table");
+        final File inputFile = new File(getToolTestDataDir(), "VCFWithGenotypes_1000G.phase3.snippet.vcf");
+        final File outputFile = createTempFile("noFieldsSpecifiedWithSamplesOutput", ".table");
         final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedWithSamples.table");
 
         final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(),
@@ -264,4 +264,17 @@ public void testNoFieldsSpecifiedWithSamples() throws IOException {
         IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
     }
 
+    @Test
+    public void testNoFieldsSpecifiedFormatFieldInHeaderNoSamples() throws IOException {
+        final File inputFile = new File(getToolTestDataDir(), "VCFWithoutGenotypesWithFormatField_dbsnp_138.snippet.vcf");
+        final File outputFile = createTempFile("noFieldsSpecifiedNoSamplesOutput", ".table");
+        final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedNoSamples.table");
+
+        final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(),
+                "-O", outputFile.getAbsolutePath()};
+        runCommandLine(args);
+
+        IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
+    }
+
 }
diff --git a/...nder/tools/walkers/variantutils/VariantsToTable/VCFWithGenotypes_1000G.phase3.snippet.vcf b/...nder/tools/walkers/variantutils/VariantsToTable/VCFWithGenotypes_1000G.phase3.snippet.vcf
@@ -0,0 +1,48 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=LowQual,Description="Low quality">
+##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -1.0597 <= x < 0.1687">
+##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -5.8589 <= x < -1.0597">
+##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -6.5565 <= x < -5.8589">
+##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="Truth sensitivity tranche level for INDEL model at VQS Lod < -70.77">
+##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -70.77 <= x < -6.5565">
+##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -96.2455 <= x < -14.2763">
+##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -437.9355">
+##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -437.9355 <= x < -96.2455">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##GATKCommandLine=<ID=ApplyRecalibration,Version=2.6-20-g0728857,Date="Mon Jul 01 11:58:36 EDT 2013",Epoch=1372694316825,CommandLineOptions="analysis_type=ApplyRecalibration input_file=[] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[20] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/humgen/1kg/reference/human_g1k_v37.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=5 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false input=[(RodBinding name=input source=/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.lowpass.hc.recal.vcf)] recal_file=(RodBinding name=recal_file source=/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.lowpass.hc.recal.recal) tranches_file=/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.lowpass.hc.recal.tranches out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub 
sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub ts_filter_level=99.0 ignore_filter=null mode=INDEL filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
+##GATKCommandLine=<ID=SelectVariants,CommandLine="SelectVariants --output src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/1000G.phase3.snippet.vcf --sample-name HG00096 --sample-name HG00097 --sample-name HG00099 --variant src/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf --intervals 20:10000054-10000117 --invertSelect false --exclude-non-variants false --exclude-filtered false --preserve-alleles false --remove-unused-alternates false --restrict-alleles-to ALL --keep-original-ac false --keep-original-dp false --mendelian-violation false --invert-mendelian-violation false --mendelian-violation-qual-threshold 0.0 --select-random-fraction 0.0 --remove-fraction-genotypes 0.0 --fully-decode false --max-indel-size 2147483647 --min-indel-size 0 --max-filtered-genotypes 2147483647 --min-filtered-genotypes 0 --max-fraction-filtered-genotypes 1.0 --min-fraction-filtered-genotypes 0.0 --max-nocall-number 2147483647 --max-nocall-fraction 1.0 --set-filtered-gt-to-nocall false --allow-nonoverlapping-command-line-samples false --suppress-reference-path false --genomicsdb-max-alternate-alleles 50 --call-genotypes false --genomicsdb-use-bcf-codec false --genomicsdb-shared-posixfs-optimizations false --genomicsdb-use-gcs-hdfs-connector false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --max-variants-per-shard 0 --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false 
--use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays  --disable-tool-default-read-filters false",Version="4.2.6.1-22-gd4f083d-SNAPSHOT",Date="June 21, 2022 1:46:50 PM EDT">
+##GATKVersion=2.5-191-g02f8427
+##HaplotypeCaller="analysis_type=HaplotypeCaller input_file=[/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.bam.list] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/humgen/1kg/processing/production_wgs_final/chr20/.queue/scatterGather/call.for.1000G-1-sg/temp_0001_of_1000/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/humgen/1kg/reference/human_g1k_v37_decoy.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=200 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub graphOutput=null bamOutput=null bam_compression=null disable_bam_indexing=null generate_md5=null simplifyBAM=null bamWriterType=CALLED_HAPLOTYPES 
dbsnp=(RodBinding name= source=UNBOUND) comp=[] annotation=[ClippingRankSumTest, DepthPerSampleHC] excludeAnnotation=[SpanningDeletions, TandemRepeatAnnotator] heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=10.0 standard_min_confidence_threshold_for_emitting=10.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=6 input_prior=[] contamination_fraction_to_filter=0.05 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null useDebruijnAssembler=false minKmerForDebruijnAssembler=11 onlyUseKmerSizeForDebruijnAssembler=-1 kmerSize=[10, 25] dontIncreaseKmerSizesForCycles=false numPruningSamples=3 maxPathsPerSample=8 dontRecoverDanglingTails=false minPruning=2 gcpHMM=10 includeUmappedReads=false useAllelesTrigger=false useFilteredReadsForAnnotations=false phredScaledGlobalReadMismappingRate=45 maxNumHaplotypesInPopulation=25 mergeVariantsViaLD=false pair_hmm_implementation=LOGLESS_CACHING keepRG=null justDetermineActiveRegions=false dontGenotype=false errorCorrectKmers=false debug=false debugGraphTransformations=false useLowQualityBasesForAssembly=false dontTrimActiveRegions=false dontUseSoftClippedBases=false captureAssemblyFailureBAM=false allowCyclesInKmerGraphToGeneratePaths=false errorCorrectReads=false kmerLengthForReadErrorCorrection=25 minObservationsForKmerToBeSolid=20 activityProfileOut=null activeRegionOut=null activeRegionIn=null activeRegionExtension=null forceActive=false activeRegionMaxSize=null bandPassSigma=null min_mapping_quality_score=20 filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
+##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
+##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
+##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
+##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
+##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
+##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
+##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
+##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
+##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
+##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
+##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
+##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
+##contig=<ID=20,length=63025520>
+##contig=<ID=21,length=48129895>
+##source=SelectVariants
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG00096	HG00097	HG00099
+20	10000054	.	CTTTG	C	504.42	PASS	AC=0;AF=0.00;AN=6;BaseQRankSum=-0.975;ClippingRankSum=-2.925;DP=22;FS=1.899;InbreedingCoeff=0.0592;MQ=59.27;MQ0=0;MQRankSum=-3.212;QD=2.43;ReadPosRankSum=-0.264;VQSLOD=5.10;culprit=FS	GT:AD:DP:GQ:PL	0/0:2,0:2:6:0,6,119	0/0:10,0:10:29:0,29,592	0/0:10,0:10:30:0,30,598
+20	10000107	.	T	C	263.95	PASS	AC=0;AF=0.00;AN=6;BaseQRankSum=-0.444;ClippingRankSum=-3.132;DP=25;FS=0.948;InbreedingCoeff=-0.0102;MQ=59.19;MQ0=0;MQRankSum=2.292;POSITIVE_TRAIN_SITE;QD=10.56;ReadPosRankSum=0.055;VQSLOD=7.76;culprit=FS	GT:AD:DP:GQ:PL	0/0:5,0:5:15:0,15,387	0/0:13,0:13:42:0,42,786	0/0:7,0:7:24:0,24,548
+20	10000117	.	C	T	329458.17	PASS	AC=1;AF=0.167;AN=6;BaseQRankSum=10.505;ClippingRankSum=-20.658;DP=28;FS=8.305;InbreedingCoeff=0.1727;MQ=59.17;MQ0=0;MQRankSum=2.689;POSITIVE_TRAIN_SITE;QD=25.46;ReadPosRankSum=-4.688;VQSLOD=3.19;culprit=ReadPosRankSum	GT:AD:DP:GQ:PL	0/0:5,0:5:15:0,15,189	0/1:8,8:16:99:254,0,231	0/0:7,0:7:21:0,21,271