Skip to content

Commit

Permalink
# This is a combination of 2 commits.
Browse files Browse the repository at this point in the history
# This is the 1st commit message:

Updated data source inputs to accept NIO paths for backing files.

Now you can specify a URL in the backing file areas of the configuration
files for Funcotator data sources and the backing files will be read by
the FuncotationDataSourceFactories.

This effectively enables use of data sources in the cloud or a mix of
local- and cloud-based data sources through a config file change.

This update will enable gnomAD annotations (once the data sources are
updated to point at the gnomAD files on Google Cloud).

Added in cloud data sources to test with.

Minor refactoring of LocatableXsvFuncotationFactory: each instance now
supports only one file at a time instead of multiple files.

Fixes #5348

# This is the commit message #2:

Added in more cloud data sources.

The new cloud dataset contains local data sources and a pointer to the
gnomAD Google Cloud bucket.
  • Loading branch information
jonn-smith committed Nov 19, 2018
1 parent 197c4cb commit 05e0c15
Show file tree
Hide file tree
Showing 93 changed files with 856 additions and 266 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
###############################################################################

#Setup variables for the script:
UNALIASED_SCRIPT_NAME=$( readlink "${BASH_SOURCE[0]}" || echo "${BASH_SOURCE[0]}" )
UNALIASED_SCRIPT_NAME=$( python -c "import os;print os.path.realpath(\"${BASH_SOURCE[0]}\")" )
SCRIPTDIR="$( cd "$( dirname "${UNALIASED_SCRIPT_NAME}" )" && pwd )"
SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )
MINARGS=2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
###############################################################################

#Setup variables for the script:
UNALIASED_SCRIPT_NAME=$( readlink "${BASH_SOURCE[0]}" || echo "${BASH_SOURCE[0]}" )
UNALIASED_SCRIPT_NAME=$( python -c "import os;print os.path.realpath(\"${BASH_SOURCE[0]}\")" )
SCRIPTDIR="$( cd "$( dirname "${UNALIASED_SCRIPT_NAME}" )" && pwd )"
SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )
MINARGS=2
Expand Down
20 changes: 15 additions & 5 deletions scripts/funcotator/testing/testFuncotator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ HG38=/Users/jonn/Development/references/Homo_sapiens_assembly38.fasta

function simpleUsage()
{
echo -e "Usage: $SCRIPTNAME [-c] [-u] [-t] [-19|-38] [-MAF|-VCF] [-AOU]"
echo -e "Usage: $SCRIPTNAME [-c] [-cloud] [-u] [-t] [-19|-38] [-MAF|-VCF] [-AOU]"
echo -e "Build and run Funcotator."
}

Expand All @@ -87,6 +87,7 @@ function usage()
echo -e " -38 run with hg38 data sources/reference/input file"
echo -e " -MAF create MAF output"
echo -e " -VCF create VCF output (default)"
echo -e " -cloud use cloud data sources"
echo -e " -AOU use the All of Us/Clinical Pipeline data sources"
echo -e " -M REF_VER REFERENCE INPUT DATA_SOURCES run in MANUAL mode, providing all necessary input"
echo -e " REF_VER - a string for the reference version"
Expand Down Expand Up @@ -146,7 +147,8 @@ trap at_exit EXIT
function assertInputFilesExist() {
assertFileExists ${INPUT}
assertFileExists ${REF}
assertDirectoryExists ${DATA_SOURCES_PATH}

[[ ! -d $DATA_SOURCES_PATH ]] && error "Warning: Data sources may not exist ${DATA_SOURCES_PATH}" && error "Ignore this if data sources directory is in the cloud."
}

################################################################################
Expand Down Expand Up @@ -176,6 +178,9 @@ while [ $# -gt 0 ] ; do
-AOU)
useAOUDataSources=true
;;
-cloud)
useCloudDataSources=true
;;
-t)
doRunLargeTests=true
;;
Expand Down Expand Up @@ -267,7 +272,7 @@ if [[ $r -eq 0 ]] && ${doRunLargeTests} ; then
INPUT=/Users/jonn/Development/NON_PUBLIC/0816201804HC0_R01C01.vcf
#INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSet1.vcf
#INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSet2.vcf
INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestHg19Large.vcf
#INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestHg19Large.vcf
#INPUT=/Users/jonn/Development/gatk/hg38_trio_liftoverb37.vcf
#INPUT=/Users/jonn/Development/gatk/tmp.vcf
#INPUT=/Users/jonn/Development/data_to_run/problem_samples/splice_site_should_not_be_splice_site/error_case.vcf
Expand All @@ -279,18 +284,23 @@ if [[ $r -eq 0 ]] && ${doRunLargeTests} ; then
else
INPUT=/Users/jonn/Development/FUNCOTATOR_LARGE_TEST_INPUTS/hg38_trio.vcf
#INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSetHG38.vcf
#INPUT=/Users/jonn/Development/tmp/cohort24_23_seg.subset.vcf
REF=$HG38
fi

# Use the AOU data sources if we need them:
$useAOUDataSources && DATA_SOURCES_PATH=/Users/jonn/Development/funcotator_dataSources.vAoU3
$useAOUDataSources && echo "Using AOU data sources." && DATA_SOURCES_PATH=/Users/jonn/Development/funcotator_dataSources.vAoU3

# Use cloud data sources if we need them:
$useCloudDataSources && echo "Using cloud data sources." && DATA_SOURCES_PATH=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/funcotator_dataSources_cloud_gnomad/
#$useCloudDataSources && echo "Using cloud data sources." && DATA_SOURCES_PATH=gs://hellbender/test/resources/large/funcotatorDataSourceCollection/funcotator_dataSources_cloud/

OUT_FORMAT_LOWER=$( echo "${OUT_FORMAT}" | tr 'A-Z' 'a-z' )
OUT_FILE_NAME=FUNCOTATOR_OUT.${OUT_FORMAT_LOWER}

assertInputFilesExist

${GATKDIR}/gatk Funcotator \
time ${GATKDIR}/gatk Funcotator \
-V ${INPUT} \
-O ${OUT_FILE_NAME} \
-R ${REF} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*;

import java.io.File;
import java.io.IOException;
Expand All @@ -32,6 +31,8 @@
import java.util.Optional;
import java.util.function.Function;

import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.createExportConfiguration;

/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
* on the genome in a format supported by our file parsing framework, Tribble. Examples of Features are
Expand Down Expand Up @@ -276,6 +277,9 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
this.queryLookaheadBases = queryLookaheadBases;
}

/**
 * Reports this data source's query-cache statistics, labeled with this
 * source's name. Package-private debugging aid (delegates to the cache's
 * own statistics reporting).
 */
final void printCacheStats() {
final String sourceName = getName();
queryCache.printCacheStatistics(sourceName);
}

@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
Expand Down Expand Up @@ -332,17 +336,19 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
private static <T extends Feature> AbstractFeatureReader<T, ?> getTribbleFeatureReader(final FeatureInput<T> featureInput, final FeatureCodec<T, ?> codec, final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper, final Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper) {
Utils.nonNull(codec);
try {
final String absolutePath = IOUtils.getPath(featureInput.getFeaturePath()).toAbsolutePath().toUri().toString();
// Must get the path to the data file from the codec here:
final String absoluteRawPath = IOUtils.getPath(featureInput.getFeaturePath()).toAbsolutePath().toUri().toString();
final String absoluteProcessedPath = IOUtils.getPath(codec.getPathToDataFile(featureInput.getFeaturePath())).toAbsolutePath().toUri().toString();

// Instruct the reader factory to not require an index. We will require one ourselves as soon as
// a query by interval is attempted.
final boolean requireIndex = false;

// Only apply the wrappers if the feature input is on Google Cloud Storage
if (BucketUtils.isCloudStorageUrl(absolutePath)) {
return AbstractFeatureReader.getFeatureReader(absolutePath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
if (BucketUtils.isCloudStorageUrl(absoluteProcessedPath)) {
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
} else {
return AbstractFeatureReader.getFeatureReader(absolutePath, null, codec, requireIndex, Function.identity(), Function.identity());
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Function.identity(), Function.identity());
}
} catch (final TribbleException e) {
throw new GATKException("Error initializing feature reader for path " + featureInput.getFeaturePath(), e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,13 @@ public String getFeaturePath() {
return featureFile;
}

/**
 * Gets the raw key/value pairs that were supplied when this {@link FeatureInput} was created.
 *
 * NOTE(review): this returns a direct reference to the internal map — if that map is
 * mutable, callers can modify this FeatureInput's state through it. Consider returning an
 * unmodifiable view; confirm how keyValueMap is constructed before relying on immutability.
 *
 * @return The key/value {@link Map} (String to String) as supplied to create the data in this {@link FeatureInput}.
 */
public Map<String, String> getKeyValueMap() {
return keyValueMap;
}

/**
* FeatureInputs will be hashed by the engine, so make an effort to produce a reasonable hash code
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,12 @@ private void initializeFeatureSources( final int featureQueryLookahead, final Co
}
}

/**
 * Prints query-cache statistics for every {@link FeatureDataSource} managed by this
 * {@link FeatureManager}. Debugging/profiling aid.
 *
 * Uses a wildcard element type rather than the raw {@code FeatureDataSource} type, which
 * removes the need for the {@code unchecked}/{@code rawtypes} suppression (the statistics
 * call does not depend on the source's feature type parameter).
 */
public void dumpAllFeatureCacheStats() {
for ( final FeatureDataSource<?> f : featureSources.values() ) {
f.printCacheStats();
}
}

/**
* Add the feature data source to the given feature input.
Expand Down Expand Up @@ -450,7 +456,7 @@ private <T extends Feature> FeatureDataSource<T> lookupDataSource( final Feature
public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath, final Class<? extends Feature> featureType ) {
// Make sure Path exists/is readable
if ( ! Files.isReadable(featurePath) ) {
throw new UserException.CouldNotReadInputFile(featurePath);
throw new UserException.CouldNotReadInputFile(featurePath.toUri().toString());
}

// Gather all discovered codecs that claim to be able to decode the given file according to their
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ public abstract class GATKTool extends CommandLineProgram {
/**
* Our source of Feature data (null if no source of Features was provided)
*/
FeatureManager features;
public FeatureManager features;

/**
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ public static final class NoSuitableCodecs extends UserException {
private static final long serialVersionUID = 0L;

public NoSuitableCodecs(final Path file) {
super("Cannot read " + file + " because no suitable codecs found");
super("Cannot read " + file.toUri().toString() + " because no suitable codecs found");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.readers.LineIterator;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.xsvLocatableTable.XsvLocatableTableCodec;
Expand Down Expand Up @@ -38,6 +40,7 @@ public class AnnotatedIntervalCodec extends AsciiFeatureCodec<AnnotatedInterval>
public static final String START_COL_COMMENT = "_StartHeader=";
public static final String END_COL_COMMENT = "_EndHeader=";

private Path configFilePath;
private XsvLocatableTableCodec xsvLocatableTableCodec;
private AnnotatedIntervalHeader header;

Expand All @@ -46,9 +49,10 @@ public AnnotatedIntervalCodec() {
xsvLocatableTableCodec = new XsvLocatableTableCodec();
}

public AnnotatedIntervalCodec(final Path overrideConfigFile) {
public AnnotatedIntervalCodec(final Path configFilePath) {
super(AnnotatedInterval.class);
xsvLocatableTableCodec = new XsvLocatableTableCodec(overrideConfigFile);
this.configFilePath = configFilePath;
xsvLocatableTableCodec = new XsvLocatableTableCodec(configFilePath);
}

@Override
Expand Down Expand Up @@ -78,8 +82,8 @@ public AnnotatedIntervalHeader readActualHeader(final LineIterator reader) {
}

@Override
public boolean canDecode(final String path) {
return (path.endsWith(".seg") || path.endsWith(".maf") || path.endsWith(".maf.annotated")) && xsvLocatableTableCodec.canDecodeMinusExtensionChecks(path);
public boolean canDecode(final String pathString) {
return (pathString.endsWith(".seg") || pathString.endsWith(".maf") || pathString.endsWith(".maf.annotated")) && xsvLocatableTableCodec.canDecodeFileChecks(configFilePath.toUri().toString(), pathString);
}

/**
Expand All @@ -98,10 +102,15 @@ public static AnnotatedIntervalHeader createHeaderForWriter(final Path outputCon
Utils.nonNull(outputConfigFile);

//TODO: Change this so that it outputs the first in the list.
final Properties headerNameProperties = XsvLocatableTableCodec.getAndValidateConfigFileContents(outputConfigFile);
final String contigColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(XsvLocatableTableCodec.CONFIG_FILE_CONTIG_COLUMN_KEY));
final String startColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(XsvLocatableTableCodec.CONFIG_FILE_START_COLUMN_KEY));
final String endColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(XsvLocatableTableCodec.CONFIG_FILE_END_COLUMN_KEY));
final Pair<Boolean, Properties> validityAndPropertiesPair = XsvLocatableTableCodec.getAndValidateConfigFileContentsOnPath(outputConfigFile, true);
final boolean isValid = validityAndPropertiesPair.getLeft();
final Properties headerNameProperties = validityAndPropertiesPair.getRight();
if ( !isValid ) {
throw new UserException.BadInput("Error: invalid configuration file given: " + outputConfigFile.toUri().toString());
}
final String contigColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_CONTIG_COLUMN));
final String startColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_START_COLUMN));
final String endColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_END_COLUMN));

XsvLocatableTableCodec.validateLocatableColumnName(contigColumnName);
XsvLocatableTableCodec.validateLocatableColumnName(startColumnName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,13 @@ public boolean requiresReference() {
@Override
public void onTraversalStart() {

logger.info("Validating Sequence Dictionaries...");
if (seqValidationArguments.performSequenceDictionaryValidation()) {
// Ensure that the reference dictionary is a superset of the variant dictionary:
checkReferenceDictionaryIsSupersetOfVariantDictionary();
}

logger.info("Processing user transcripts/defaults/overrides...");
// Next set up our transcript list:
final Set<String> finalUserTranscriptIdSet = FuncotatorEngine.processTranscriptList(funcotatorArgs.userTranscriptIdSet);

Expand All @@ -242,11 +244,13 @@ public void onTraversalStart() {
// Get the header for our variants:
final VCFHeader vcfHeader = getHeaderForVariants();

logger.info("Initializing data sources...");
// Initialize all of our data sources:
// Sort data sources to make them process in the same order each time:
funcotatorArgs.dataSourceDirectories.sort(Comparator.naturalOrder());
final Map<Path, Properties> configData = DataSourceUtils.getAndValidateDataSourcesFromPaths(funcotatorArgs.referenceVersion, funcotatorArgs.dataSourceDirectories);

logger.info("Finalizing data sources (this step can be long if data sources are cloud-based)...");
// Create the data sources from the input:
// This will also create and register the FeatureInputs (created by the Data Sources)
// with the GATK Engine, so we do not have to plumb them in after the fact.
Expand All @@ -260,6 +264,7 @@ public void onTraversalStart() {
new FlankSettings(funcotatorArgs.fivePrimeFlankSize, funcotatorArgs.threePrimeFlankSize)
);

logger.info("Initializing Funcotator Engine...");
// Create our engine to do our work and drive this Funcotation train!
funcotatorEngine = new FuncotatorEngine(
funcotatorArgs,
Expand Down
Loading

0 comments on commit 05e0c15

Please sign in to comment.