Skip to content

Commit

Permalink
# This is a combination of 2 commits.
Browse files Browse the repository at this point in the history
# This is the 1st commit message:

Updated data source inputs to accept NIO paths for backing files.

Now you can specify a URL in the backing file areas of the configuration
files for Funcotator data sources and the backing files will be read by
the FuncotationDataSourceFactories.

This effectively enables use of data sources in the cloud or a mix of
local- and cloud-based data sources through a config file change.

This update will enable gnomAD annotations (once the data sources are
updated to point at the gnomAD files on Google Cloud).

Added in cloud data sources to test with.

Minor refactoring of LocatableXsvFuncotationFactory: each instance now
supports only one file at a time instead of multiple files.

Fixes #5348

# This is the commit message #2:

Added in more cloud data sources.

The new cloud dataset contains local data sources and a pointer to the
gnomAD Google Cloud bucket.
  • Loading branch information
jonn-smith committed Nov 19, 2018
1 parent 197c4cb commit 05e0c15
Show file tree
Hide file tree
Showing 93 changed files with 856 additions and 266 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
###############################################################################

#Setup variables for the script:
UNALIASED_SCRIPT_NAME=$( readlink "${BASH_SOURCE[0]}" || echo "${BASH_SOURCE[0]}" )
UNALIASED_SCRIPT_NAME=$( python -c "import os;print os.path.realpath(\"${BASH_SOURCE[0]}\")" )
SCRIPTDIR="$( cd "$( dirname "${UNALIASED_SCRIPT_NAME}" )" && pwd )"
SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )
MINARGS=2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
###############################################################################

#Setup variables for the script:
UNALIASED_SCRIPT_NAME=$( readlink "${BASH_SOURCE[0]}" || echo "${BASH_SOURCE[0]}" )
UNALIASED_SCRIPT_NAME=$( python -c "import os;print os.path.realpath(\"${BASH_SOURCE[0]}\")" )
SCRIPTDIR="$( cd "$( dirname "${UNALIASED_SCRIPT_NAME}" )" && pwd )"
SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )
MINARGS=2
Expand Down
20 changes: 15 additions & 5 deletions scripts/funcotator/testing/testFuncotator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ HG38=/Users/jonn/Development/references/Homo_sapiens_assembly38.fasta

function simpleUsage()
{
echo -e "Usage: $SCRIPTNAME [-c] [-u] [-t] [-19|-38] [-MAF|-VCF] [-AOU]"
echo -e "Usage: $SCRIPTNAME [-c] [-cloud] [-u] [-t] [-19|-38] [-MAF|-VCF] [-AOU]"
echo -e "Build and run Funcotator."
}

Expand All @@ -87,6 +87,7 @@ function usage()
echo -e " -38 run with hg38 data sources/reference/input file"
echo -e " -MAF create MAF output"
echo -e " -VCF create VCF output (default)"
echo -e " -cloud use cloud data sources"
echo -e " -AOU use the All of Us/Clinical Pipeline data sources"
echo -e " -M REF_VER REFERENCE INPUT DATA_SOURCES run in MANUAL mode, providing all necessary input"
echo -e " REF_VER - a string for the reference version"
Expand Down Expand Up @@ -146,7 +147,8 @@ trap at_exit EXIT
function assertInputFilesExist() {
assertFileExists ${INPUT}
assertFileExists ${REF}
assertDirectoryExists ${DATA_SOURCES_PATH}

[[ ! -d $DATA_SOURCES_PATH ]] && error "Warning: Data sources may not exist ${DATA_SOURCES_PATH}" && error "Ignore this if data sources directory is in the cloud."
}

################################################################################
Expand Down Expand Up @@ -176,6 +178,9 @@ while [ $# -gt 0 ] ; do
-AOU)
useAOUDataSources=true
;;
-cloud)
useCloudDataSources=true
;;
-t)
doRunLargeTests=true
;;
Expand Down Expand Up @@ -267,7 +272,7 @@ if [[ $r -eq 0 ]] && ${doRunLargeTests} ; then
INPUT=/Users/jonn/Development/NON_PUBLIC/0816201804HC0_R01C01.vcf
#INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSet1.vcf
#INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSet2.vcf
INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestHg19Large.vcf
#INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestHg19Large.vcf
#INPUT=/Users/jonn/Development/gatk/hg38_trio_liftoverb37.vcf
#INPUT=/Users/jonn/Development/gatk/tmp.vcf
#INPUT=/Users/jonn/Development/data_to_run/problem_samples/splice_site_should_not_be_splice_site/error_case.vcf
Expand All @@ -279,18 +284,23 @@ if [[ $r -eq 0 ]] && ${doRunLargeTests} ; then
else
INPUT=/Users/jonn/Development/FUNCOTATOR_LARGE_TEST_INPUTS/hg38_trio.vcf
#INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSetHG38.vcf
#INPUT=/Users/jonn/Development/tmp/cohort24_23_seg.subset.vcf
REF=$HG38
fi

# Use the AOU data sources if we need them:
$useAOUDataSources && DATA_SOURCES_PATH=/Users/jonn/Development/funcotator_dataSources.vAoU3
$useAOUDataSources && echo "Using AOU data sources." && DATA_SOURCES_PATH=/Users/jonn/Development/funcotator_dataSources.vAoU3

# Use cloud data sources if we need them:
$useCloudDataSources && echo "Using cloud data sources." && DATA_SOURCES_PATH=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/funcotator_dataSources_cloud_gnomad/
#$useCloudDataSources && echo "Using cloud data sources." && DATA_SOURCES_PATH=gs://hellbender/test/resources/large/funcotatorDataSourceCollection/funcotator_dataSources_cloud/

OUT_FORMAT_LOWER=$( echo "${OUT_FORMAT}" | tr 'A-Z' 'a-z' )
OUT_FILE_NAME=FUNCOTATOR_OUT.${OUT_FORMAT_LOWER}

assertInputFilesExist

${GATKDIR}/gatk Funcotator \
time ${GATKDIR}/gatk Funcotator \
-V ${INPUT} \
-O ${OUT_FILE_NAME} \
-R ${REF} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*;

import java.io.File;
import java.io.IOException;
Expand All @@ -32,6 +31,8 @@
import java.util.Optional;
import java.util.function.Function;

import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.createExportConfiguration;

/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
* on the genome in a format supported by our file parsing framework, Tribble. Examples of Features are
Expand Down Expand Up @@ -276,6 +277,9 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
this.queryLookaheadBases = queryLookaheadBases;
}

/**
 * Reports this data source's query-cache statistics, labeled with this
 * source's name. Package-private debugging aid (delegates to the cache's
 * own statistics reporting).
 */
final void printCacheStats() {
final String sourceName = getName();
queryCache.printCacheStatistics(sourceName);
}

@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
Expand Down Expand Up @@ -332,17 +336,19 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
private static <T extends Feature> AbstractFeatureReader<T, ?> getTribbleFeatureReader(final FeatureInput<T> featureInput, final FeatureCodec<T, ?> codec, final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper, final Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper) {
Utils.nonNull(codec);
try {
final String absolutePath = IOUtils.getPath(featureInput.getFeaturePath()).toAbsolutePath().toUri().toString();
// Must get the path to the data file from the codec here:
final String absoluteRawPath = IOUtils.getPath(featureInput.getFeaturePath()).toAbsolutePath().toUri().toString();
final String absoluteProcessedPath = IOUtils.getPath(codec.getPathToDataFile(featureInput.getFeaturePath())).toAbsolutePath().toUri().toString();

// Instruct the reader factory to not require an index. We will require one ourselves as soon as
// a query by interval is attempted.
final boolean requireIndex = false;

// Only apply the wrappers if the feature input is on Google Cloud Storage
if (BucketUtils.isCloudStorageUrl(absolutePath)) {
return AbstractFeatureReader.getFeatureReader(absolutePath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
if (BucketUtils.isCloudStorageUrl(absoluteProcessedPath)) {
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
} else {
return AbstractFeatureReader.getFeatureReader(absolutePath, null, codec, requireIndex, Function.identity(), Function.identity());
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Function.identity(), Function.identity());
}
} catch (final TribbleException e) {
throw new GATKException("Error initializing feature reader for path " + featureInput.getFeaturePath(), e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,13 @@ public String getFeaturePath() {
return featureFile;
}

/**
 * Gets the raw key/value pairs that were supplied when this {@link FeatureInput} was created.
 *
 * NOTE(review): this returns a direct reference to the internal map — if that map is
 * mutable, callers can modify this FeatureInput's state through it. Consider returning an
 * unmodifiable view; confirm how keyValueMap is constructed before relying on immutability.
 *
 * @return The key/value {@link Map} (String to String) as supplied to create the data in this {@link FeatureInput}.
 */
public Map<String, String> getKeyValueMap() {
return keyValueMap;
}

/**
* FeatureInputs will be hashed by the engine, so make an effort to produce a reasonable hash code
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,12 @@ private void initializeFeatureSources( final int featureQueryLookahead, final Co
}
}

/**
 * Prints query-cache statistics for every {@link FeatureDataSource} managed by this
 * {@link FeatureManager}. Debugging/profiling aid.
 *
 * Uses a wildcard element type rather than the raw {@code FeatureDataSource} type, which
 * removes the need for the {@code unchecked}/{@code rawtypes} suppression (the statistics
 * call does not depend on the source's feature type parameter).
 */
public void dumpAllFeatureCacheStats() {
for ( final FeatureDataSource<?> f : featureSources.values() ) {
f.printCacheStats();
}
}

/**
* Add the feature data source to the given feature input.
Expand Down Expand Up @@ -450,7 +456,7 @@ private <T extends Feature> FeatureDataSource<T> lookupDataSource( final Feature
public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath, final Class<? extends Feature> featureType ) {
// Make sure Path exists/is readable
if ( ! Files.isReadable(featurePath) ) {
throw new UserException.CouldNotReadInputFile(featurePath);
throw new UserException.CouldNotReadInputFile(featurePath.toUri().toString());
}

// Gather all discovered codecs that claim to be able to decode the given file according to their
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ public abstract class GATKTool extends CommandLineProgram {
/**
* Our source of Feature data (null if no source of Features was provided)
*/
FeatureManager features;
public FeatureManager features;

/**
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ public static final class NoSuitableCodecs extends UserException {
private static final long serialVersionUID = 0L;

public NoSuitableCodecs(final Path file) {
super("Cannot read " + file + " because no suitable codecs found");
super("Cannot read " + file.toUri().toString() + " because no suitable codecs found");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.readers.LineIterator;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.xsvLocatableTable.XsvLocatableTableCodec;
Expand Down Expand Up @@ -38,6 +40,7 @@ public class AnnotatedIntervalCodec extends AsciiFeatureCodec<AnnotatedInterval>
public static final String START_COL_COMMENT = "_StartHeader=";
public static final String END_COL_COMMENT = "_EndHeader=";

private Path configFilePath;
private XsvLocatableTableCodec xsvLocatableTableCodec;
private AnnotatedIntervalHeader header;

Expand All @@ -46,9 +49,10 @@ public AnnotatedIntervalCodec() {
xsvLocatableTableCodec = new XsvLocatableTableCodec();
}

public AnnotatedIntervalCodec(final Path overrideConfigFile) {
public AnnotatedIntervalCodec(final Path configFilePath) {
super(AnnotatedInterval.class);
xsvLocatableTableCodec = new XsvLocatableTableCodec(overrideConfigFile);
this.configFilePath = configFilePath;
xsvLocatableTableCodec = new XsvLocatableTableCodec(configFilePath);
}

@Override
Expand Down Expand Up @@ -78,8 +82,8 @@ public AnnotatedIntervalHeader readActualHeader(final LineIterator reader) {
}

@Override
public boolean canDecode(final String path) {
return (path.endsWith(".seg") || path.endsWith(".maf") || path.endsWith(".maf.annotated")) && xsvLocatableTableCodec.canDecodeMinusExtensionChecks(path);
public boolean canDecode(final String pathString) {
return (pathString.endsWith(".seg") || pathString.endsWith(".maf") || pathString.endsWith(".maf.annotated")) && xsvLocatableTableCodec.canDecodeFileChecks(configFilePath.toUri().toString(), pathString);
}

/**
Expand All @@ -98,10 +102,15 @@ public static AnnotatedIntervalHeader createHeaderForWriter(final Path outputCon
Utils.nonNull(outputConfigFile);

//TODO: Change this so that it outputs the first in the list.
final Properties headerNameProperties = XsvLocatableTableCodec.getAndValidateConfigFileContents(outputConfigFile);
final String contigColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(XsvLocatableTableCodec.CONFIG_FILE_CONTIG_COLUMN_KEY));
final String startColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(XsvLocatableTableCodec.CONFIG_FILE_START_COLUMN_KEY));
final String endColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(XsvLocatableTableCodec.CONFIG_FILE_END_COLUMN_KEY));
final Pair<Boolean, Properties> validityAndPropertiesPair = XsvLocatableTableCodec.getAndValidateConfigFileContentsOnPath(outputConfigFile, true);
final boolean isValid = validityAndPropertiesPair.getLeft();
final Properties headerNameProperties = validityAndPropertiesPair.getRight();
if ( !isValid ) {
throw new UserException.BadInput("Error: invalid configuration file given: " + outputConfigFile.toUri().toString());
}
final String contigColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_CONTIG_COLUMN));
final String startColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_START_COLUMN));
final String endColumnName = determineOutputColumnFromList(headerNameProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_END_COLUMN));

XsvLocatableTableCodec.validateLocatableColumnName(contigColumnName);
XsvLocatableTableCodec.validateLocatableColumnName(startColumnName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,13 @@ public boolean requiresReference() {
@Override
public void onTraversalStart() {

logger.info("Validating Sequence Dictionaries...");
if (seqValidationArguments.performSequenceDictionaryValidation()) {
// Ensure that the reference dictionary is a superset of the variant dictionary:
checkReferenceDictionaryIsSupersetOfVariantDictionary();
}

logger.info("Processing user transcripts/defaults/overrides...");
// Next set up our transcript list:
final Set<String> finalUserTranscriptIdSet = FuncotatorEngine.processTranscriptList(funcotatorArgs.userTranscriptIdSet);

Expand All @@ -242,11 +244,13 @@ public void onTraversalStart() {
// Get the header for our variants:
final VCFHeader vcfHeader = getHeaderForVariants();

logger.info("Initializing data sources...");
// Initialize all of our data sources:
// Sort data sources to make them process in the same order each time:
funcotatorArgs.dataSourceDirectories.sort(Comparator.naturalOrder());
final Map<Path, Properties> configData = DataSourceUtils.getAndValidateDataSourcesFromPaths(funcotatorArgs.referenceVersion, funcotatorArgs.dataSourceDirectories);

logger.info("Finalizing data sources (this step can be long if data sources are cloud-based)...");
// Create the data sources from the input:
// This will also create and register the FeatureInputs (created by the Data Sources)
// with the GATK Engine, so we do not have to plumb them in after the fact.
Expand All @@ -260,6 +264,7 @@ public void onTraversalStart() {
new FlankSettings(funcotatorArgs.fivePrimeFlankSize, funcotatorArgs.threePrimeFlankSize)
);

logger.info("Initializing Funcotator Engine...");
// Create our engine to do our work and drive this Funcotation train!
funcotatorEngine = new FuncotatorEngine(
funcotatorArgs,
Expand Down
Loading

0 comments on commit 05e0c15

Please sign in to comment.