Skip to content

Commit

Permalink
responding to review comments & added integration test
Browse files Browse the repository at this point in the history
  • Loading branch information
orlicohen committed Jun 29, 2022
1 parent 8a33af3 commit 9b956ba
Show file tree
Hide file tree
Showing 6 changed files with 315 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
* This tool extracts specified fields for each variant in a VCF file to a tab-delimited table, which may be easier
* to work with than a VCF. By default, the tool only extracts PASS or . (unfiltered) variants in the VCF file. Filtered variants may be
* included in the output by adding the --show-filtered flag. The tool can extract both INFO (i.e. site-level) fields and
* FORMAT (i.e. sample-level) fields.
* FORMAT (i.e. sample-level) fields. If the tool is run without specifying any fields, it defaults to include all fields
* declared in the VCF header.
* </p>
*
* <h4>INFO/site-level fields</h4>
Expand Down Expand Up @@ -97,6 +98,12 @@
* 1 65068538 SNP 49,0 35,4
* 1 111146235 SNP 69,1 77,4
* </pre>
* <pre>
* gatk VariantsToTable \
* -V input.vcf \
* -O output.table
* </pre>
* <p>would produce a file that includes all fields declared in the VCF header.</p>
*
* <h3>Notes</h3>
* <ul>
Expand Down Expand Up @@ -211,7 +218,7 @@ public void onTraversalStart() {

// if no fields specified, default to include all fields listed in header into table
if(fieldsToTake.isEmpty() && genotypeFieldsToTake.isEmpty() && asFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()){
logger.warn("No fields were specified. All fields will be included in output table.");
logger.warn("No fields were specified. All fields declared in the VCF header will be included in the output table.");

// add all mandatory VCF fields (except INFO)
for(VCFHeader.HEADER_FIELDS headerField : VCFHeader.HEADER_FIELDS.values()){
Expand All @@ -228,7 +235,7 @@ public void onTraversalStart() {
// add all FORMAT fields present in VCF header
for (final VCFFormatHeaderLine formatLine : inputHeader.getFormatHeaderLines()) {
// ensure GT field listed as first FORMAT field
if(formatLine.getID().equals("GT")) {
if(formatLine.getID().equals(VCFConstants.GENOTYPE_KEY)) {
genotypeFieldsToTake.add(0, formatLine.getID());
}
else {
Expand All @@ -238,7 +245,7 @@ public void onTraversalStart() {
}

// if fields specified, but none are genotype fields, set samples to empty
if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty() && (!fieldsToTake.isEmpty() || !asFieldsToTake.isEmpty())) {
if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()) {
samples = Collections.emptySortedSet();
}
else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,8 @@ public void testMoltenOutputWithMultipleAlleles() throws IOException {

@Test
public void testNoFieldsSpecified() throws IOException {
final File inputFile = new File(getToolTestDataDir(), "extraheaderlinesdeleted_dbsnp_138.snippet.vcf");
final File outputFile = createTempFile(getToolTestDataDir(), "noFieldsSpecifiedOutput.table");
final File inputFile = new File(getToolTestDataDir(), "VCFWithoutGenotypes_dbsnp_138.snippet.vcf");
final File outputFile = createTempFile("noFieldsSpecifiedOutput", ".table");
final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecified.table");

final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(),
Expand All @@ -253,8 +253,8 @@ public void testNoFieldsSpecified() throws IOException {

@Test
public void testNoFieldsSpecifiedWithSamples() throws IOException {
final File inputFile = new File(getToolTestDataDir(), "1000G.phase3.snippet.vcf");
final File outputFile = createTempFile(getToolTestDataDir(), "noFieldsSpecifiedWithSamplesOutput.table");
final File inputFile = new File(getToolTestDataDir(), "VCFWithGenotypes_1000G.phase3.snippet.vcf");
final File outputFile = createTempFile("noFieldsSpecifiedWithSamplesOutput", ".table");
final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedWithSamples.table");

final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(),
Expand All @@ -264,4 +264,17 @@ public void testNoFieldsSpecifiedWithSamples() throws IOException {
IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
}

@Test
public void testNoFieldsSpecifiedFormatFieldInHeaderNoSamples() throws IOException {
final File inputFile = new File(getToolTestDataDir(), "VCFWithoutGenotypesWithFormatField_dbsnp_138.snippet.vcf");
final File outputFile = createTempFile("noFieldsSpecifiedNoSamplesOutput", ".table");
final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedNoSamples.table");

final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(),
"-O", outputFile.getAbsolutePath()};
runCommandLine(args);

IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
##fileformat=VCFv4.2
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -1.0597 <= x < 0.1687">
##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -5.8589 <= x < -1.0597">
##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -6.5565 <= x < -5.8589">
##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="Truth sensitivity tranche level for INDEL model at VQS Lod < -70.77">
##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -70.77 <= x < -6.5565">
##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -96.2455 <= x < -14.2763">
##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -437.9355">
##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -437.9355 <= x < -96.2455">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##GATKCommandLine=<ID=ApplyRecalibration,Version=2.6-20-g0728857,Date="Mon Jul 01 11:58:36 EDT 2013",Epoch=1372694316825,CommandLineOptions="analysis_type=ApplyRecalibration input_file=[] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[20] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/humgen/1kg/reference/human_g1k_v37.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=5 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false input=[(RodBinding name=input source=/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.lowpass.hc.recal.vcf)] recal_file=(RodBinding name=recal_file source=/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.lowpass.hc.recal.recal) tranches_file=/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.lowpass.hc.recal.tranches out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub ts_filter_level=99.0 ignore_filter=null mode=INDEL filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
##GATKCommandLine=<ID=SelectVariants,CommandLine="SelectVariants --output src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/1000G.phase3.snippet.vcf --sample-name HG00096 --sample-name HG00097 --sample-name HG00099 --variant src/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf --intervals 20:10000054-10000117 --invertSelect false --exclude-non-variants false --exclude-filtered false --preserve-alleles false --remove-unused-alternates false --restrict-alleles-to ALL --keep-original-ac false --keep-original-dp false --mendelian-violation false --invert-mendelian-violation false --mendelian-violation-qual-threshold 0.0 --select-random-fraction 0.0 --remove-fraction-genotypes 0.0 --fully-decode false --max-indel-size 2147483647 --min-indel-size 0 --max-filtered-genotypes 2147483647 --min-filtered-genotypes 0 --max-fraction-filtered-genotypes 1.0 --min-fraction-filtered-genotypes 0.0 --max-nocall-number 2147483647 --max-nocall-fraction 1.0 --set-filtered-gt-to-nocall false --allow-nonoverlapping-command-line-samples false --suppress-reference-path false --genomicsdb-max-alternate-alleles 50 --call-genotypes false --genomicsdb-use-bcf-codec false --genomicsdb-shared-posixfs-optimizations false --genomicsdb-use-gcs-hdfs-connector false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --max-variants-per-shard 0 --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false",Version="4.2.6.1-22-gd4f083d-SNAPSHOT",Date="June 21, 2022 1:46:50 PM EDT">
##GATKVersion=2.5-191-g02f8427
##HaplotypeCaller="analysis_type=HaplotypeCaller input_file=[/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.bam.list] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/humgen/1kg/processing/production_wgs_final/chr20/.queue/scatterGather/call.for.1000G-1-sg/temp_0001_of_1000/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/humgen/1kg/reference/human_g1k_v37_decoy.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=200 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub graphOutput=null bamOutput=null bam_compression=null disable_bam_indexing=null generate_md5=null simplifyBAM=null bamWriterType=CALLED_HAPLOTYPES dbsnp=(RodBinding name= source=UNBOUND) comp=[] annotation=[ClippingRankSumTest, DepthPerSampleHC] excludeAnnotation=[SpanningDeletions, TandemRepeatAnnotator] heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=10.0 standard_min_confidence_threshold_for_emitting=10.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=6 input_prior=[] contamination_fraction_to_filter=0.05 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null useDebruijnAssembler=false minKmerForDebruijnAssembler=11 onlyUseKmerSizeForDebruijnAssembler=-1 kmerSize=[10, 25] dontIncreaseKmerSizesForCycles=false numPruningSamples=3 maxPathsPerSample=8 dontRecoverDanglingTails=false minPruning=2 gcpHMM=10 includeUmappedReads=false useAllelesTrigger=false useFilteredReadsForAnnotations=false phredScaledGlobalReadMismappingRate=45 maxNumHaplotypesInPopulation=25 mergeVariantsViaLD=false pair_hmm_implementation=LOGLESS_CACHING keepRG=null justDetermineActiveRegions=false dontGenotype=false errorCorrectKmers=false debug=false debugGraphTransformations=false useLowQualityBasesForAssembly=false dontTrimActiveRegions=false dontUseSoftClippedBases=false captureAssemblyFailureBAM=false allowCyclesInKmerGraphToGeneratePaths=false errorCorrectReads=false kmerLengthForReadErrorCorrection=25 minObservationsForKmerToBeSolid=20 activityProfileOut=null activeRegionOut=null activeRegionIn=null activeRegionExtension=null forceActive=false activeRegionMaxSize=null bandPassSigma=null min_mapping_quality_score=20 filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
##contig=<ID=20,length=63025520>
##contig=<ID=21,length=48129895>
##source=SelectVariants
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 HG00099
20 10000054 . CTTTG C 504.42 PASS AC=0;AF=0.00;AN=6;BaseQRankSum=-0.975;ClippingRankSum=-2.925;DP=22;FS=1.899;InbreedingCoeff=0.0592;MQ=59.27;MQ0=0;MQRankSum=-3.212;QD=2.43;ReadPosRankSum=-0.264;VQSLOD=5.10;culprit=FS GT:AD:DP:GQ:PL 0/0:2,0:2:6:0,6,119 0/0:10,0:10:29:0,29,592 0/0:10,0:10:30:0,30,598
20 10000107 . T C 263.95 PASS AC=0;AF=0.00;AN=6;BaseQRankSum=-0.444;ClippingRankSum=-3.132;DP=25;FS=0.948;InbreedingCoeff=-0.0102;MQ=59.19;MQ0=0;MQRankSum=2.292;POSITIVE_TRAIN_SITE;QD=10.56;ReadPosRankSum=0.055;VQSLOD=7.76;culprit=FS GT:AD:DP:GQ:PL 0/0:5,0:5:15:0,15,387 0/0:13,0:13:42:0,42,786 0/0:7,0:7:24:0,24,548
20 10000117 . C T 329458.17 PASS AC=1;AF=0.167;AN=6;BaseQRankSum=10.505;ClippingRankSum=-20.658;DP=28;FS=8.305;InbreedingCoeff=0.1727;MQ=59.17;MQ0=0;MQRankSum=2.689;POSITIVE_TRAIN_SITE;QD=25.46;ReadPosRankSum=-4.688;VQSLOD=3.19;culprit=ReadPosRankSum GT:AD:DP:GQ:PL 0/0:5,0:5:15:0,15,189 0/1:8,8:16:99:254,0,231 0/0:7,0:7:21:0,21,271
Loading

0 comments on commit 9b956ba

Please sign in to comment.