+ *
* Add new samples to an existing genomicsdb workspace.
* In the incremental import case, no intervals are specified in the command because the tool will use the same
* intervals used in the initial import. Sample map is also supported for incremental import.
@@ -409,14 +423,7 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
// Intervals from command line (merged if specified)
private List intervals;
- // Sorted mapping between sample names and corresponding GVCF file name
- //
- // IMPORTANT: This must be sorted or it will result in sample name swaps in the output database.
- // This happens because the callset json is generated independently from the import process
- // each imported batch is then sorted, so if we have an unsorted list we'll end up with different global vs batch
- // sorting.
- // We preemptively sort here so we will have consistent sorting.
- private SortedMap sampleNameToVcfPath = new TreeMap<>();
+ private SampleNameMap sampleNameMap;
// Needed as smartMergeHeaders() returns a set of VCF header lines
private Set mergedHeaderLines = null;
@@ -511,16 +518,6 @@ private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importe
}
}
- private static void assertVariantFileIsCompressedAndIndexed(final Path path) {
- if (!path.toString().toLowerCase().endsWith(FileExtensions.COMPRESSED_VCF)) {
- throw new UserException("Input variant files must be block compressed vcfs when using " +
- BYPASS_FEATURE_READER + ", but " + path.toString() + " does not end with " +
- "the standard file extension " + FileExtensions.COMPRESSED_VCF);
- }
- Path indexPath = path.resolveSibling(path.getFileName() + FileExtensions.COMPRESSED_VCF_INDEX);
- IOUtils.assertFileIsReadable(indexPath);
- }
-
/**
* sets the values of mergedHeaderLines, mergedHeaderSequenceDictionary, and sampleNameToVcfPath
*/
@@ -529,23 +526,20 @@ private void initializeHeaderAndSampleMappings() {
if (variantPaths != null && variantPaths.size() > 0) {
// -V was specified
final List headers = new ArrayList<>(variantPaths.size());
+ sampleNameMap = new SampleNameMap();
for (final String variantPathString : variantPaths) {
final Path variantPath = IOUtils.getPath(variantPathString);
if (bypassFeatureReader) {
- assertVariantFileIsCompressedAndIndexed(variantPath);
+ GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath);
}
- final VCFHeader header = getHeaderFromPath(variantPath);
+ final VCFHeader header = getHeaderFromPath(variantPath, null);
Utils.validate(header != null, "Null header was found in " + variantPath + ".");
assertGVCFHasOnlyOneSample(variantPathString, header);
headers.add(header);
final String sampleName = header.getGenotypeSamples().get(0);
try {
- final URI previousPath = sampleNameToVcfPath.put(sampleName, new URI(variantPathString));
- if (previousPath != null) {
- throw new UserException("Duplicate sample: " + sampleName + ". Sample was found in both "
- + variantPath.toUri() + " and " + previousPath + ".");
- }
+ sampleNameMap.addSample(sampleName, new URI(variantPathString));
}
catch(final URISyntaxException e) {
throw new UserException("Malformed URI "+e.toString(), e);
@@ -561,9 +555,14 @@ private void initializeHeaderAndSampleMappings() {
//it's VERY IMPORTANT that this map is Sorted according to String's natural ordering, if it is not
//the resulting database will have incorrect sample names
//see https://github.com/broadinstitute/gatk/issues/3682 for more information
- sampleNameToVcfPath = loadSampleNameMapFileInSortedOrder(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader);
- final Path firstHeaderPath = IOUtils.getPath(sampleNameToVcfPath.entrySet().iterator().next().getValue().toString());
- final VCFHeader header = getHeaderFromPath(firstHeaderPath);
+ // The SampleNameMap class guarantees that the samples will be sorted correctly.
+ sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader);
+
+ final String firstSample = sampleNameMap.getSampleNameToVcfPath().entrySet().iterator().next().getKey();
+ final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample);
+ final Path firstVCFIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(firstSample);
+ final VCFHeader header = getHeaderFromPath(firstVCFPath, firstVCFIndexPath);
+
//getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering
mergedHeaderLines = new LinkedHashSet(header.getMetaDataInInputOrder());
mergedHeaderSequenceDictionary = header.getSequenceDictionary();
@@ -592,10 +591,17 @@ else if (getIntervalsFromExistingWorkspace){
if ( mergedHeaderSequenceDictionary == null) {
throw new UserException("The merged vcf header has no sequence dictionary. Please provide a header that contains a sequence dictionary.");
}
+
+ // If any indices were specified in the sample name map file, make sure
+ // that --bypass-feature-reader wasn't also specified:
+ if ( sampleNameMap != null && sampleNameMap.indicesSpecified() && bypassFeatureReader ) {
+ throw new UserException("Indices were specified for some VCFs in the sample name map file, but --" + BYPASS_FEATURE_READER +
+ " was also specified. Specifying explicit indices is not supported when running with --" + BYPASS_FEATURE_READER);
+ }
}
- private VCFHeader getHeaderFromPath(final Path variantPath) {
- try(final FeatureReader reader = getReaderFromPath(variantPath)) {
+ private VCFHeader getHeaderFromPath(final Path variantPath, final Path variantIndexPath) {
+ try(final FeatureReader reader = getReaderFromPath(variantPath, variantIndexPath)) {
return (VCFHeader) reader.getHeader();
} catch (final IOException e) {
throw new UserException("Error while reading vcf header from " + variantPath.toUri(), e);
@@ -610,85 +616,7 @@ private static void assertGVCFHasOnlyOneSample(final String variantPath, final V
}
}
- /**
- * Load a tab delimited new line separated file of sample name to URI mapping:
- * this maintains the keys in the same order that they appeared in the file
- *
- * This tool should only call {@link #loadSampleNameMapFileInSortedOrder(Path)}.
- * This non-sorting overload is exposed for testing purposes only.
- *
- * ex:
- *
- * Sample1\tpathToSample1.vcf\n
- * Sample2\tpathTosample2.vcf\n
- * ...
- *
- * The sample names must be unique.
- * @param sampleToFileMapPath path to the mapping file
- * @return map of sample name to corresponding file, the map will be ordered according to the order in the input file
- */
- @VisibleForTesting
- static LinkedHashMap loadSampleNameMapFile(final Path sampleToFileMapPath) {
- return loadSampleNameMapFile(sampleToFileMapPath, false);
- }
-
- private static LinkedHashMap loadSampleNameMapFile(final Path sampleToFileMapPath,
- final boolean checkVcfIsCompressedAndIndexed) {
- try {
- final List lines = Files.readAllLines(sampleToFileMapPath);
- if (lines.isEmpty()) {
- throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file");
- }
- final LinkedHashMap sampleToFilename = new LinkedHashMap<>();
- for ( final String line : lines) {
- final String[] split = line.split("\\t",-1);
- if (split.length != 2) {
- throw new UserException.BadInput("Expected a file with 2 fields per line in the format\nSample\tFile\n but found line: \""
- + line +"\" with "+split.length+" fields");
- }
- if ( !split[0].trim().equals(split[0]) || split[0].trim().isEmpty()
- || split[1].trim().isEmpty()) {
- throw new UserException.BadInput("Expected a file of format\nSample\tFile\n but found line: '" + line + "'\nValid sample names must be non-empty strings that cannot begin or end with whitespace and valid file names must be non-empty and not all whitespace");
- }
- final String sample = split[0];
- final String path = split[1].trim();
- try {
- final URI oldPath = sampleToFilename.put(sample, new URI(path));
- if (oldPath != null){
- throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + path + "\n" + oldPath );
- }
- if (checkVcfIsCompressedAndIndexed) {
- assertVariantFileIsCompressedAndIndexed(IOUtils.getPath(path));
- }
- }
- catch(final URISyntaxException e) {
- throw new UserException("Malformed URI "+e.toString());
- }
- }
- return sampleToFilename;
- } catch (final IOException e) {
- throw new UserException.CouldNotReadInputFile(sampleToFileMapPath, "exception while reading sample->filename mapping file", e);
- }
- }
-
- /**
- * load a tab delimited new line separated file of sample name to URI mapping:
- *
- * ex:
- * Sample1\tpathToSample1.vcf\n
- * Sample2\tpathTosample2.vcf\n
- * ...
- *
- * The sample names must be unique.
- * @param sampleToFileMapPath path to the mapping file
- * @param checkVcfIsCompressedAndIndexed boolean indicating whether to check vcf is compressed and indexed
- * @return map of sample name to corresponding file, sorted by sample name
- */
- public static SortedMap loadSampleNameMapFileInSortedOrder(final Path sampleToFileMapPath,
- final boolean checkVcfIsCompressedAndIndexed){
- return new TreeMap<>(loadSampleNameMapFile(sampleToFileMapPath, checkVcfIsCompressedAndIndexed));
- }
/**
* write out interval list to file
@@ -757,11 +685,11 @@ private Void logMessageOnBatchCompletion(final BatchCompletionCallbackFunctionAr
logger.info("Done importing batch " + arg.batchCount + "/" + arg.totalBatchCount);
logger.debug("List of samples imported in batch " + arg.batchCount + ":");
int index = 0;
- final int sampleCount = sampleNameToVcfPath.size();
+ final int sampleCount = sampleNameMap.getNumSamples();
final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? sampleCount : batchSize;
final int startBatch = (arg.batchCount - 1) * updatedBatchSize;
final int stopBatch = arg.batchCount * updatedBatchSize;
- for(String key : sampleNameToVcfPath.keySet()) {
+ for(String key : sampleNameMap.getSampleNamesInSortedOrder()) {
index++;
if (index <= startBatch || index > stopBatch) {
continue;
@@ -817,7 +745,7 @@ private List generatePartitionListFromI
private List generateIntervalListFromVidMap() {
try {
GenomicsDBVidMapProto.VidMappingPB vidMapPB =
- org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getProtobufVidMappingFromJsonFile(vidMapJSONFile);
+ GATKGenomicsDBUtils.getProtobufVidMappingFromJsonFile(vidMapJSONFile);
List partitions = Arrays.asList(GenomicsDBUtils.listGenomicsDBArrays(workspace));
return partitions.stream().flatMap(partition -> {
@@ -863,7 +791,7 @@ private ImportConfig createImportConfig(final int batchSize) {
importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation);
importConfigurationBuilder.setEnableSharedPosixfsOptimizations(sharedPosixFSOptimizations);
ImportConfig importConfig = new ImportConfig(importConfigurationBuilder.build(), validateSampleToReaderMap, true,
- batchSize, mergedHeaderLines, sampleNameToVcfPath, bypassFeatureReader ? null : this::createSampleToReaderMap,
+ batchSize, mergedHeaderLines, sampleNameMap.getSampleNameToVcfPath(), bypassFeatureReader ? null : this::createSampleToReaderMap,
doIncrementalImport);
importConfig.setOutputCallsetmapJsonFile(callsetMapJSONFile);
importConfig.setOutputVidmapJsonFile(vidMapJSONFile);
@@ -891,7 +819,7 @@ public void traverse() {
// Force the progress meter to update after every batch
progressMeter.setRecordsBetweenTimeChecks(1L);
- final int sampleCount = sampleNameToVcfPath.size();
+ final int sampleCount = sampleNameMap.getNumSamples();
final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? sampleCount : batchSize;
final ImportConfig importConfig = createImportConfig(updatedBatchSize);
@@ -899,7 +827,7 @@ public void traverse() {
try {
importer = new GenomicsDBImporter(importConfig);
// Modify importer directly from updateImportProtobufVidMapping.
- org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateImportProtobufVidMapping(importer);
+ GATKGenomicsDBUtils.updateImportProtobufVidMapping(importer);
if (mergeContigsIntoNumPartitions != 0) {
if (!doIncrementalImport) {
assertIntervalsCoverEntireContigs(importer, intervals);
@@ -952,8 +880,9 @@ private SortedMap> getFeatureReadersInPara
final String sampleName = sampleNames.get(i);
futures.put(sampleName, inputPreloadExecutorService.submit(() -> {
final Path variantPath = IOUtils.getPath(sampleNametoPath.get(sampleName).toString());
+ final Path variantIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(sampleName);
try {
- return new InitializedQueryWrapper(getReaderFromPath(variantPath), intervals.get(0));
+ return new InitializedQueryWrapper(getReaderFromPath(variantPath, variantIndexPath), intervals.get(0));
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile("Couldn't read file: " + variantPath.toUri(), e);
}
@@ -980,7 +909,9 @@ private SortedMap> getFeatureReadersSerial
final List sampleNames = new ArrayList<>(sampleNameToPath.keySet());
for(int i = lowerSampleIndex; i < sampleNameToPath.size() && i < lowerSampleIndex+batchSize; ++i) {
final String sampleName = sampleNames.get(i);
- final FeatureReader reader = getReaderFromPath(IOUtils.getPath(sampleNameToPath.get(sampleName).toString()));
+ final Path variantPath = IOUtils.getPath(sampleNameToPath.get(sampleName).toString());
+ final Path variantIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(sampleName);
+ final FeatureReader reader = getReaderFromPath(variantPath, variantIndexPath);
sampleToReaderMap.put(sampleName, reader);
}
logger.info("Importing batch " + this.batchCount + " with " + sampleToReaderMap.size() + " samples");
@@ -993,10 +924,13 @@ private SortedMap> getFeatureReadersSerial
* @return Feature reader
* @param variantPath
*/
- private FeatureReader getReaderFromPath(final Path variantPath) {
+ private FeatureReader getReaderFromPath(final Path variantPath, final Path variantIndexPath) {
+ // TODO: we repeatedly convert between URI, Path, and String in this tool. Is this necessary?
final String variantURI = variantPath.toAbsolutePath().toUri().toString();
+ final String variantIndexURI = variantIndexPath == null ? null : variantIndexPath.toAbsolutePath().toUri().toString();
+
try {
- final FeatureReader reader = AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), true,
+ final FeatureReader reader = AbstractFeatureReader.getFeatureReader(variantURI, variantIndexURI, new VCFCodec(), true,
BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer));
@@ -1058,7 +992,7 @@ public VariantContext next() {
*/
private String overwriteCreateOrCheckWorkspace() {
String workspaceDir = genomicsDBGetAbsolutePath(workspace);
- // From JavaDoc for GenomicsDBUtils.createTileDBWorkspacevid
+ // From JavaDoc for GATKGenomicsDBUtils.createTileDBWorkspace
// returnCode = 0 : OK. If overwriteExistingWorkspace is true and the workspace exists, it is deleted first.
// returnCode = -1 : path was not a directory
// returnCode = -2 : failed to create workspace
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java
new file mode 100644
index 00000000000..0448201d850
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java
@@ -0,0 +1,274 @@
+package org.broadinstitute.hellbender.tools.genomicsdb;
+
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+/**
+ * A class to hold the mappings of sample names to VCF / VCF index paths. Used by GenomicsDBImport.
+ *
+ * This class can be constructed from a textual file containing lines in the format:
+ *
+ * Sample\tVCF
+ * or:
+ * Sample\tVCF\tIndex
+ *
+ * The sample names may have internal whitespace, but not leading/trailing whitespace.
+ * The VCF and Index URIs may have leading/trailing whitespace, which is ignored.
+ *
+ * The third Index column is optional. It is permitted to specify the index for some samples
+ * and not others. If an index is not specified for a sample, its location is inferred from
+ * the VCF URI.
+ *
+ * It is also possible to construct an empty SampleNameMap using the no-arg constructor, and
+ * add sample mappings one at a time using addSample().
+ */
+public final class SampleNameMap {
+ // Sorted mapping between sample names and corresponding GVCF file name
+ //
+ // IMPORTANT: This must be sorted or it will result in sample name swaps in the output database.
+ // This happens because the callset json is generated independently from the import process;
+ // each imported batch is then sorted, so if we have an unsorted list we'll end up with different
+ // global vs batch sorting.
+ // We preemptively sort here so we will have consistent sorting.
+ private SortedMap sampleNameToVcfPath;
+
+ // Mapping between sample names and corresponding VCF index path
+ //
+ // This Map contains only indices specified explicitly via the sample name map file.
+ // If an explicit index is not specified for a given sample, it will not have an
+ // entry in this Map, and the index path will be automatically inferred based on
+ // the location of the VCF.
+ //
+ // The ordering of the entries in this Map does not actually matter, since it's not
+ // directly exposed, and is used only for individual lookups via getVCFIndexForSample()
+ private SortedMap sampleNameToVcfIndexPath;
+
+ /**
+ * Create an empty SampleNameMap. Samples can be added later using addSample()
+ */
+ public SampleNameMap() {
+ sampleNameToVcfPath = new TreeMap<>();
+ sampleNameToVcfIndexPath = new TreeMap<>();
+ }
+
+ /**
+ * Create a SampleNameMap from a textual file containing the sample mappings. The
+ * lines in this file must be in the format:
+ *
+ * Sample\tVCF
+ * or:
+ * Sample\tVCF\tIndex
+ *
+ * The sample names may have internal whitespace, but not leading/trailing whitespace.
+ * The VCF and Index URIs may have leading/trailing whitespace, which is ignored.
+ *
+ * The third Index column is optional. It is permitted to specify the index for some samples
+ * and not others. If an index is not specified for a sample, its location is inferred from
+ * the VCF URI.
+ *
+ * @param sampleMapFilePath Path to the file containing the sample name mappings to load
+ */
+ public SampleNameMap(final Path sampleMapFilePath) {
+ this(sampleMapFilePath, false);
+ }
+
+ /**
+ * Create a SampleNameMap from a textual file containing the sample mappings. The
+ * lines in this file must be in the format:
+ *
+ * SampleName1\tVCF
+ * or:
+ * SampleName1\tVCF\tIndex
+ *
+ * The sample names may have internal whitespace, but not leading/trailing whitespace.
+ * The VCF and Index URIs may have leading/trailing whitespace, which is ignored.
+ *
+ * The third Index column is optional. It is permitted to specify the index for some samples
+ * and not others. If an index is not specified for a sample, its location is inferred from
+ * the VCF URI.
+ *
+ * @param sampleMapFilePath Path to the file containing the sample name mappings to load
+ * @param checkVcfIsCompressedAndIndexed If true, check each VCF to make sure it's compressed and indexed
+ */
+ public SampleNameMap(final Path sampleMapFilePath, final boolean checkVcfIsCompressedAndIndexed) {
+ sampleNameToVcfPath = new TreeMap<>();
+ sampleNameToVcfIndexPath = new TreeMap<>();
+
+ loadSampleNameMapFile(sampleMapFilePath, checkVcfIsCompressedAndIndexed);
+ }
+
+ private void loadSampleNameMapFile(final Path sampleToFileMapPath, final boolean checkVcfIsCompressedAndIndexed) {
+ try {
+ final List lines = Files.readAllLines(sampleToFileMapPath);
+ if (lines.isEmpty()) {
+ throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file");
+ }
+
+ for (final String line : lines) {
+ final String[] split = line.split("\\t",-1);
+ if (split.length != 2 && split.length != 3) {
+ throw new UserException.BadInput("Sample name map file must have 2 or 3 fields per line in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\nbut found line: \""
+ + line +"\" with "+split.length+" fields");
+ }
+ if ( ! sampleNameIsLegal(split[0]) || split[1].trim().isEmpty()) {
+ throw new UserException.BadInput("Sample name map file must have lines in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\n but found line: '" + line + "'\nValid sample names must be non-empty strings that cannot begin or end with whitespace and valid file names must be non-empty and not all whitespace");
+ }
+ final String sample = split[0];
+ final String vcfPath = split[1].trim();
+
+ String vcfIndexPath = null;
+ if ( split.length == 3 ) {
+ vcfIndexPath = split[2].trim();
+
+ if ( vcfIndexPath.isEmpty() ) {
+ throw new UserException.BadInput("Found a line in the sample name map file with an empty or all-whitespace value for the index:\n" + "\"" + line + "\"");
+ }
+ }
+
+ try {
+ final URI existingVCFPath = sampleNameToVcfPath.put(sample, new URI(vcfPath));
+ if (existingVCFPath != null){
+ throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + vcfPath + "\n" + existingVCFPath);
+ }
+
+ if ( vcfIndexPath != null ) {
+ final URI existingVCFIndexPath = sampleNameToVcfIndexPath.put(sample, new URI(vcfIndexPath));
+ if (existingVCFIndexPath != null) {
+ throw new UserException.BadInput("Found two indices for the same sample: " + sample + "\n" + vcfIndexPath + "\n" + existingVCFIndexPath);
+ }
+ }
+
+ if (checkVcfIsCompressedAndIndexed) {
+ GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(IOUtils.getPath(vcfPath), vcfIndexPath == null ? null : IOUtils.getPath(vcfIndexPath));
+ }
+ }
+ catch(final URISyntaxException e) {
+ throw new UserException("Malformed URI: " + e.toString());
+ }
+ }
+ } catch (final IOException e) {
+ throw new UserException.CouldNotReadInputFile(sampleToFileMapPath, "exception while reading sample->filename mapping file", e);
+ }
+ }
+
+ /**
+ * Tests whether the sample name is legal. Sample names must be non-empty, and
+ * may have internal whitespace but not leading/trailing whitespace.
+ *
+ * @param sampleName sample name to test
+ * @return true if sampleName is legal, otherwise false
+ */
+ private boolean sampleNameIsLegal(final String sampleName) {
+ return sampleName != null &&
+ ! sampleName.trim().isEmpty() &&
+ sampleName.trim().equals(sampleName);
+ }
+
+ /**
+ * Add a new sample mapping
+ *
+ * @param sampleName name of the sample
+ * @param vcfPath path to the VCF for the sample
+ */
+ public void addSample(final String sampleName, final URI vcfPath) {
+ addSample(sampleName, vcfPath, null);
+ }
+
+ /**
+ * Add a new sample mapping
+ *
+ * @param sampleName name of the sample
+ * @param vcfPath path to the VCF for the sample (not null)
+ * @param vcfIndexPath path to the index for the sample (may be null)
+ */
+ public void addSample(final String sampleName, final URI vcfPath, final URI vcfIndexPath) {
+ if ( ! sampleNameIsLegal(sampleName) ) {
+ throw new UserException.BadInput("Sample name " + sampleName + " is not legal. Sample names must be non-empty and not contain leading or trailing whitespace");
+ }
+ if ( vcfPath == null ) {
+ throw new UserException.BadInput("VCF path for sample " + sampleName + " was null");
+ }
+
+ final URI previousPath = sampleNameToVcfPath.put(sampleName, vcfPath);
+ if (previousPath != null) {
+ throw new UserException.BadInput("Duplicate sample: " + sampleName + ". Sample was found in both "
+ + vcfPath + " and " + previousPath + ".");
+ }
+
+ if (vcfIndexPath != null) {
+ final URI previousIndexPath = sampleNameToVcfIndexPath.put(sampleName, vcfIndexPath);
+ if (previousIndexPath != null) {
+ throw new UserException.BadInput("For sample " + sampleName + ", attempted to specify multiple indices: " + vcfIndexPath + " and " + previousIndexPath);
+ }
+ }
+ }
+
+ /**
+ * @return The full mapping of sample names -> VCF paths, with the sample names in sorted order
+ */
+ public SortedMap getSampleNameToVcfPath() {
+ return sampleNameToVcfPath;
+ }
+
+ /**
+ * @param sample sample name
+ * @return the VCF associated with that sample name, as a URI
+ */
+ public URI getVCFForSample(final String sample) {
+ return sampleNameToVcfPath.get(sample);
+ }
+
+ /**
+ * @param sample sample name
+ * @return the VCF associated with that sample name, as a Path
+ */
+ public Path getVCFForSampleAsPath(final String sample) {
+ final URI vcfURI = sampleNameToVcfPath.get(sample);
+ return vcfURI == null ? null : IOUtils.getPath(vcfURI.toString());
+ }
+
+ /**
+ * @param sample sample name
+ * @return the VCF index associated with that sample name, as a URI, or null if no index
+ */
+ public URI getVCFIndexForSample(final String sample) {
+ return sampleNameToVcfIndexPath.get(sample);
+ }
+
+ /**
+ * @param sample sample name
+ * @return the VCF index associated with that sample name, as a Path, or null if no index
+ */
+ public Path getVCFIndexForSampleAsPath(final String sample) {
+ final URI vcfIndexURI = sampleNameToVcfIndexPath.get(sample);
+ return vcfIndexURI == null ? null : IOUtils.getPath(vcfIndexURI.toString());
+ }
+
+ /**
+ * @return number of samples in this Map
+ */
+ public int getNumSamples() {
+ return sampleNameToVcfPath.size();
+ }
+
+ /**
+ * @return a List of the sample names in this Map in sorted order
+ */
+ public List getSampleNamesInSortedOrder() {
+ return new ArrayList<>(sampleNameToVcfPath.keySet());
+ }
+
+ /**
+ * @return true if an index was specified for at least one sample, otherwise false
+ */
+ public boolean indicesSpecified() {
+ return ! sampleNameToVcfIndexPath.isEmpty();
+ }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java
index 27539d6bef0..17ba65831ea 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java
@@ -2,10 +2,13 @@
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.util.FileExtensions;
import htsjdk.samtools.util.IntervalList;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.CloseableTribbleIterator;
import htsjdk.tribble.FeatureReader;
+import htsjdk.tribble.index.Index;
+import htsjdk.tribble.index.IndexFactory;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.Allele;
@@ -24,6 +27,7 @@
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
+import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
@@ -47,6 +51,7 @@
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.BaseTest;
import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
+import org.broadinstitute.hellbender.tools.IndexFeatureFile;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
@@ -65,8 +70,11 @@
@Test(groups = {"variantcalling"})
public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTest {
private static final String HG_00096 = largeFileTestDir + "gvcfs/HG00096.g.vcf.gz";
+ private static final String HG_00096_SAMPLE_NAME = "HG00096";
private static final String HG_00268 = largeFileTestDir + "gvcfs/HG00268.g.vcf.gz";
+ private static final String HG_00268_SAMPLE_NAME = "HG00268";
private static final String NA_19625 = largeFileTestDir + "gvcfs/NA19625.g.vcf.gz";
+ private static final String NA_19625_SAMPLE_NAME = "NA19625";
//The following 3 files were obtained by running CombineGVCFs on the above 3 files (separately). This introduces spanning
//deletions in the files. Hence, these files can be used to test for spanning deletions in the input VCF.
private static final String HG_00096_after_combine_gvcfs = largeFileTestDir + "gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz";
@@ -879,6 +887,224 @@ private static File getSampleMapFile(final Map mapping){
.collect(Collectors.joining("\n")));
}
+ @DataProvider
+ public Object[][] dataForTestExplicitIndicesInSampleNameMap() {
+ final Map originalVCFsInOrder = new LinkedHashMap<>();
+ originalVCFsInOrder.put(HG_00096_SAMPLE_NAME, new File(HG_00096));
+ originalVCFsInOrder.put(HG_00268_SAMPLE_NAME, new File(HG_00268));
+ originalVCFsInOrder.put(NA_19625_SAMPLE_NAME, new File(NA_19625));
+
+ final Map originalVCFsOutOfOrder = new LinkedHashMap<>();
+ originalVCFsOutOfOrder.put(NA_19625_SAMPLE_NAME, new File(NA_19625));
+ originalVCFsOutOfOrder.put(HG_00268_SAMPLE_NAME, new File(HG_00268));
+ originalVCFsOutOfOrder.put(HG_00096_SAMPLE_NAME, new File(HG_00096));
+
+ return new Object[][] {
+ // All VCFs have explicit indices, samples in order, TABIX index
+ { originalVCFsInOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), false },
+
+ // All VCFs have explicit indices, samples in order, TRIBBLE index
+ { originalVCFsInOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), true },
+
+ // Some VCFs have explicit indices, samples in order, TABIX index
+ { originalVCFsInOrder, Arrays.asList(HG_00268_SAMPLE_NAME), false },
+
+ // Some VCFs have explicit indices, samples in order, TRIBBLE index
+ { originalVCFsInOrder, Arrays.asList(HG_00268_SAMPLE_NAME), true },
+
+ // All VCFs have explicit indices, samples out of order, TABIX index
+ { originalVCFsOutOfOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), false },
+
+ // All VCFs have explicit indices, samples out of order, TRIBBLE index
+ { originalVCFsOutOfOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), true },
+
+ // Some VCFs have explicit indices, samples out of order, TABIX index
+ { originalVCFsOutOfOrder, Arrays.asList(HG_00268_SAMPLE_NAME), false },
+
+ // Some VCFs have explicit indices, samples out of order, TRIBBLE index
+ { originalVCFsOutOfOrder, Arrays.asList(HG_00268_SAMPLE_NAME), true }
+ };
+ }
+
+ // Test that we can handle explicit index files from a sample name map locally.
+ // The cloud version of this test is separate.
+ // Note that this test decompresses/reindexes its GVCFs on-the-fly as necessary in order
+ // to avoid our having to check uncompressed VCFs in to our repo
+ @Test(dataProvider = "dataForTestExplicitIndicesInSampleNameMap")
+ public void testExplicitIndicesInSampleNameMap(final Map originalVCFs, final List samplesWithExplicitIndices, final boolean useTribbleIndex) throws IOException {
+ final String workspace = createTempDir("testExplicitIndicesInSampleNameMap").getAbsolutePath() + "/workspace";
+ final File vcfDir = createTempDir("testExplicitIndicesInSampleNameMap_vcfs");
+ final File indexDir = createTempDir("testExplicitIndicesInSampleNameMap_indices");
+ Assert.assertNotEquals(vcfDir, indexDir,
+ "testExplicitIndicesInSampleNameMap failed to create separate directories for the vcfs and their indices");
+
+ final StringBuilder sampleNameMapContents = new StringBuilder();
+
+ for ( final Map.Entry originalVCFEntry : originalVCFs.entrySet() ) {
+ final String sampleName = originalVCFEntry.getKey();
+ final File originalVCFFile = originalVCFEntry.getValue();
+ final boolean createExplicitIndex = samplesWithExplicitIndices.contains(sampleName);
+
+ final Path originalVCFPath = originalVCFFile.toPath();
+ final String uncompressedVCFName = originalVCFFile.getName().replaceAll("\\.gz$", "");
+ Path vcfDestination = new File(vcfDir, originalVCFFile.getName()).toPath();
+ if ( useTribbleIndex ) {
+ vcfDestination = new File(vcfDir, uncompressedVCFName).toPath();
+ IOUtils.gunzip(originalVCFPath.toAbsolutePath().toFile(), vcfDestination.toAbsolutePath().toFile());
+ } else {
+ Files.copy(originalVCFPath, vcfDestination);
+ }
+
+ final File originalVCFIndexFile = new File(originalVCFFile.getAbsolutePath() + FileExtensions.TABIX_INDEX);
+ Assert.assertTrue(originalVCFIndexFile.exists());
+ final File thisVCFIndexDir = createExplicitIndex ? indexDir : vcfDir;
+ Path vcfIndexDestination = new File(thisVCFIndexDir, originalVCFIndexFile.getName()).toPath();
+ if ( useTribbleIndex ) {
+ vcfIndexDestination = new File(thisVCFIndexDir, uncompressedVCFName + FileExtensions.TRIBBLE_INDEX).toPath();
+ final Index inMemoryIndex = IndexFactory.createLinearIndex(vcfDestination, new VCFCodec(), IndexFeatureFile.OPTIMAL_GVCF_INDEX_BIN_SIZE);
+ inMemoryIndex.write(vcfIndexDestination);
+ } else {
+ Files.copy(originalVCFIndexFile.toPath(), vcfIndexDestination);
+ }
+
+ if ( createExplicitIndex ) {
+ sampleNameMapContents.append(String.format("%s\t%s\t%s\n", sampleName, vcfDestination.toAbsolutePath().toString(), vcfIndexDestination.toAbsolutePath().toString()));
+ } else {
+ sampleNameMapContents.append(String.format("%s\t%s\n", sampleName, vcfDestination.toAbsolutePath().toString()));
+ }
+ }
+
+ final File sampleNameMapFile = IOUtils.writeTempFile(sampleNameMapContents.toString(), "testExplicitIndicesInSampleNameMap_samplemap", ".txt");
+
+ final ArgumentsBuilder args = new ArgumentsBuilder();
+ args.add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleNameMapFile.getAbsolutePath())
+ .addInterval(INTERVAL.get(0))
+ .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace);
+ runCommandLine(args);
+
+ checkJSONFilesAreWritten(workspace);
+ checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
+ checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true);
+ }
+
+ @DataProvider
+ public Object[][] dataForTestExplicitIndicesInSampleNameMapInTheCloud() {
+ final String GVCFS_WITH_INDICES_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcfs_with_indices/";
+ final String GVCFS_WITHOUT_INDICES_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcfs_without_indices/";
+ final String GVCF_INDICES_ONLY_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcf_indices_only/";
+
+ final String HG00096_COMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00096.g.vcf.gz";
+ final String HG00096_COMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00096.g.vcf.gz";
+ final String HG00096_COMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00096.g.vcf.gz.tbi";
+ final String HG00096_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00096.g.vcf";
+ final String HG00096_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00096.g.vcf";
+ final String HG00096_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00096.g.vcf.idx";
+
+ final String HG00268_COMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00268.g.vcf.gz";
+ final String HG00268_COMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00268.g.vcf.gz";
+ final String HG00268_COMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00268.g.vcf.gz.tbi";
+ final String HG00268_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00268.g.vcf";
+ final String HG00268_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00268.g.vcf";
+ final String HG00268_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00268.g.vcf.idx";
+
+ final String NA19625_COMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "NA19625.g.vcf.gz";
+ final String NA19625_COMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "NA19625.g.vcf.gz";
+ final String NA19625_COMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "NA19625.g.vcf.gz.tbi";
+ final String NA19625_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "NA19625.g.vcf";
+ final String NA19625_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "NA19625.g.vcf";
+ final String NA19625_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "NA19625.g.vcf.idx";
+
+ return new Object[][] {
+ // All VCFs have explicit indices, samples in order, TABIX index
+ {
+ HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_NO_INDEX + "\t" + HG00096_COMPRESSED_INDEX + "\n" +
+ HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" +
+ NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_NO_INDEX + "\t" + NA19625_COMPRESSED_INDEX + "\n"
+ },
+
+ // All VCFs have explicit indices, samples in order, TRIBBLE index
+ {
+ HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_NO_INDEX + "\t" + HG00096_UNCOMPRESSED_INDEX + "\n" +
+ HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" +
+ NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_NO_INDEX + "\t" + NA19625_UNCOMPRESSED_INDEX + "\n"
+ },
+
+ // Some VCFs have explicit indices, samples in order, TABIX index
+ {
+ HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_WITH_INDEX + "\n" +
+ HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" +
+ NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_WITH_INDEX + "\n"
+ },
+
+ // Some VCFs have explicit indices, samples in order, TRIBBLE index
+ {
+ HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_WITH_INDEX + "\n" +
+ HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" +
+ NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_WITH_INDEX + "\n"
+ },
+
+ // All VCFs have explicit indices, samples out of order, TABIX index
+ {
+ NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_NO_INDEX + "\t" + NA19625_COMPRESSED_INDEX + "\n" +
+ HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" +
+ HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_NO_INDEX + "\t" + HG00096_COMPRESSED_INDEX + "\n"
+ },
+
+ // All VCFs have explicit indices, samples out of order, TRIBBLE index
+ {
+ NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_NO_INDEX + "\t" + NA19625_UNCOMPRESSED_INDEX + "\n" +
+ HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" +
+ HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_NO_INDEX + "\t" + HG00096_UNCOMPRESSED_INDEX + "\n"
+ },
+
+ // Some VCFs have explicit indices, samples out of order, TABIX index
+ {
+ NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_WITH_INDEX + "\n" +
+ HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" +
+ HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_WITH_INDEX + "\n"
+ },
+
+ // Some VCFs have explicit indices, samples out of order, TRIBBLE index
+ {
+ NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_WITH_INDEX + "\n" +
+ HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" +
+ HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_WITH_INDEX + "\n"
+ }
+ };
+ }
+
+ // Test that we can handle explicit index files from a sample name map in the cloud
+ @Test(dataProvider = "dataForTestExplicitIndicesInSampleNameMapInTheCloud", groups = {"bucket"})
+ public void testExplicitIndicesInSampleNameMapInTheCloud(final String sampleNameMapContents) throws IOException {
+ final String workspace = createTempDir("testExplicitIndicesInSampleNameMapInTheCloud").getAbsolutePath() + "/workspace";
+ final File sampleNameMapFile = IOUtils.writeTempFile(sampleNameMapContents, "testExplicitIndicesInSampleNameMapInTheCloud_samplemap", ".txt");
+
+ final ArgumentsBuilder args = new ArgumentsBuilder();
+ args.add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleNameMapFile.getAbsolutePath())
+ .addInterval(INTERVAL.get(0))
+ .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace);
+ runCommandLine(args);
+
+ checkJSONFilesAreWritten(workspace);
+ checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
+ checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true);
+ }
+
+ // This test guards against the possibility of someone accidentally putting an index file into
+ // the "gvcfs_without_indices" bucket directory used by testExplicitIndicesInSampleNameMapInTheCloud()
+ @Test(groups = {"bucket"})
+ public void testUnindexedCloudGVCFsAreActuallyUnindexed() throws IOException {
+ final String GVCFS_WITHOUT_INDICES_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcfs_without_indices/";
+ final Path bucketPath = IOUtils.getPath(GVCFS_WITHOUT_INDICES_BUCKET);
+
+ Files.list(bucketPath).forEach(file -> {
+ Assert.assertFalse(file.endsWith(FileExtensions.TABIX_INDEX),
+ "Found a TABIX index in bucket " + GVCFS_WITHOUT_INDICES_BUCKET);
+ Assert.assertFalse(file.endsWith(FileExtensions.TRIBBLE_INDEX),
+ "Found a Tribble index in bucket " + GVCFS_WITHOUT_INDICES_BUCKET);
+ });
+ }
+
@DataProvider
public static Iterator