From 19778c169837dad0e3b00104dbc6630e916ef7ed Mon Sep 17 00:00:00 2001 From: droazen Date: Tue, 11 Oct 2022 14:51:56 -0400 Subject: [PATCH] GenomicsDBImport: add ability to specify explicit index locations via the sample name map file (#7967) The sample name map file accepted by GenomicsDBImport can now optionally contain a third column giving an explicit path to an index for the corresponding VCF. It is allowed to specify an explicit index in some lines of the sample name map and not others. Added comprehensive unit and integration tests. --- .../hellbender/engine/FeatureDataSource.java | 2 +- ...sDBUtils.java => GATKGenomicsDBUtils.java} | 19 +- .../tools/genomicsdb/GenomicsDBImport.java | 180 ++++------ .../tools/genomicsdb/SampleNameMap.java | 274 ++++++++++++++++ .../GenomicsDBImportIntegrationTest.java | 232 ++++++++++++- .../genomicsdb/GenomicsDBImportUnitTest.java | 111 ------- .../genomicsdb/SampleNameMapUnitTest.java | 310 ++++++++++++++++++ 7 files changed, 889 insertions(+), 239 deletions(-) rename src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/{GenomicsDBUtils.java => GATKGenomicsDBUtils.java} (94%) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java delete mode 100644 src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMapUnitTest.java diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java index 21f4529601f..bd6fad2f6d2 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java @@ -36,7 +36,7 @@ import java.util.Optional; import java.util.function.Function; -import static 
org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.createExportConfiguration; +import static org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.createExportConfiguration; import static org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream.BCI_FILE_EXTENSION; /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GATKGenomicsDBUtils.java similarity index 94% rename from src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java rename to src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GATKGenomicsDBUtils.java index f81e825e39e..e0eb251501e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GATKGenomicsDBUtils.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.genomicsdb; import com.googlecode.protobuf.format.JsonFormat; +import htsjdk.samtools.util.FileExtensions; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils; import org.broadinstitute.hellbender.utils.Utils; @@ -12,6 +13,7 @@ import org.genomicsdb.model.GenomicsDBVidMapProto; import java.io.IOException; +import java.nio.file.Path; import java.util.HashMap; import java.util.Map; @@ -28,7 +30,7 @@ * https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api * https://developers.google.com/protocol-buffers/docs/reference/java-generated */ -public class GenomicsDBUtils { +public class GATKGenomicsDBUtils { private static final String SUM = "sum"; private static final String ELEMENT_WISE_SUM = "element_wise_sum"; @@ -338,5 +340,20 @@ public static String genomicsDBApppendPaths(String parentPath, String path) { } } + public static void assertVariantFileIsCompressedAndIndexed(final Path vcfPath) 
{ + assertVariantFileIsCompressedAndIndexed(vcfPath, null); + } + + public static void assertVariantFileIsCompressedAndIndexed(final Path vcfPath, final Path optionalVCFindexPath) { + if (!vcfPath.toString().toLowerCase().endsWith(FileExtensions.COMPRESSED_VCF)) { + throw new UserException("Input variant files must be block compressed vcfs when using " + + GenomicsDBImport.BYPASS_FEATURE_READER + ", but " + vcfPath.toString() + " does not end with " + + "the standard file extension " + FileExtensions.COMPRESSED_VCF); + } + Path indexPath = optionalVCFindexPath != null ? + optionalVCFindexPath : + vcfPath.resolveSibling(vcfPath.getFileName() + FileExtensions.COMPRESSED_VCF_INDEX); + IOUtils.assertFileIsReadable(indexPath); + } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java index f68309565cf..fd7dedaab6d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java @@ -1,12 +1,10 @@ package org.broadinstitute.hellbender.tools.genomicsdb; -import com.google.common.annotations.VisibleForTesting; import com.google.common.util.concurrent.ThreadFactoryBuilder; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalList; import htsjdk.samtools.util.Locatable; -import htsjdk.samtools.util.FileExtensions; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.CloseableTribbleIterator; import htsjdk.tribble.FeatureReader; @@ -69,8 +67,8 @@ import java.util.concurrent.ThreadFactory; import java.util.stream.Collectors; -import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.genomicsDBGetAbsolutePath; -import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.genomicsDBApppendPaths; +import static 
org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.genomicsDBGetAbsolutePath; +import static org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.genomicsDBApppendPaths; /** * Import single-sample GVCFs into GenomicsDB before joint genotyping. @@ -139,6 +137,22 @@ * sample3 sample3.vcf.gz * * + * The sample name map file may optionally contain a third column with an explicit index path/URI for each VCF: + * + *  <pre>
+ *  sample1      sample1.vcf.gz      sample1.vcf.gz.tbi
+ *  sample2      sample2.vcf.gz      sample2.vcf.gz.tbi
+ *  sample3      sample3.vcf.gz      sample3.vcf.gz.tbi
+ *  </pre>
+ * + * It is also possible to specify an explicit index for only a subset of the samples: + * + *  <pre>
+ *  sample1      sample1.vcf.gz
+ *  sample2      sample2.vcf.gz      sample2.vcf.gz.tbi
+ *  sample3      sample3.vcf.gz
+ *  </pre>
+ * * Add new samples to an existing genomicsdb workspace. * In the incremental import case, no intervals are specified in the command because the tool will use the same * intervals used in the initial import. Sample map is also supported for incremental import. @@ -409,14 +423,7 @@ public int getDefaultCloudIndexPrefetchBufferSize() { // Intervals from command line (merged if specified) private List intervals; - // Sorted mapping between sample names and corresponding GVCF file name - // - // IMPORTANT: This must be sorted or it will result in sample name swaps in the output database. - // This happens because the callset json is generated independently from the import process - // each imported batch is then sorted, so if we have an unsorted list we'll end up with different global vs batch - // sorting. - // We preemptively sort here so we will have consistent sorting. - private SortedMap sampleNameToVcfPath = new TreeMap<>(); + private SampleNameMap sampleNameMap; // Needed as smartMergeHeaders() returns a set of VCF header lines private Set mergedHeaderLines = null; @@ -511,16 +518,6 @@ private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importe } } - private static void assertVariantFileIsCompressedAndIndexed(final Path path) { - if (!path.toString().toLowerCase().endsWith(FileExtensions.COMPRESSED_VCF)) { - throw new UserException("Input variant files must be block compressed vcfs when using " + - BYPASS_FEATURE_READER + ", but " + path.toString() + " does not end with " + - "the standard file extension " + FileExtensions.COMPRESSED_VCF); - } - Path indexPath = path.resolveSibling(path.getFileName() + FileExtensions.COMPRESSED_VCF_INDEX); - IOUtils.assertFileIsReadable(indexPath); - } - /** * sets the values of mergedHeaderLines, mergedHeaderSequenceDictionary, and sampleNameToVcfPath */ @@ -529,23 +526,20 @@ private void initializeHeaderAndSampleMappings() { if (variantPaths != null && variantPaths.size() > 0) { // -V was specified final 
List headers = new ArrayList<>(variantPaths.size()); + sampleNameMap = new SampleNameMap(); for (final String variantPathString : variantPaths) { final Path variantPath = IOUtils.getPath(variantPathString); if (bypassFeatureReader) { - assertVariantFileIsCompressedAndIndexed(variantPath); + GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath); } - final VCFHeader header = getHeaderFromPath(variantPath); + final VCFHeader header = getHeaderFromPath(variantPath, null); Utils.validate(header != null, "Null header was found in " + variantPath + "."); assertGVCFHasOnlyOneSample(variantPathString, header); headers.add(header); final String sampleName = header.getGenotypeSamples().get(0); try { - final URI previousPath = sampleNameToVcfPath.put(sampleName, new URI(variantPathString)); - if (previousPath != null) { - throw new UserException("Duplicate sample: " + sampleName + ". Sample was found in both " - + variantPath.toUri() + " and " + previousPath + "."); - } + sampleNameMap.addSample(sampleName, new URI(variantPathString)); } catch(final URISyntaxException e) { throw new UserException("Malformed URI "+e.toString(), e); @@ -561,9 +555,14 @@ private void initializeHeaderAndSampleMappings() { //it's VERY IMPORTANT that this map is Sorted according to String's natural ordering, if it is not //the resulting database will have incorrect sample names //see https://github.com/broadinstitute/gatk/issues/3682 for more information - sampleNameToVcfPath = loadSampleNameMapFileInSortedOrder(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader); - final Path firstHeaderPath = IOUtils.getPath(sampleNameToVcfPath.entrySet().iterator().next().getValue().toString()); - final VCFHeader header = getHeaderFromPath(firstHeaderPath); + // The SampleNameMap class guarantees that the samples will be sorted correctly. 
+ sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader); + + final String firstSample = sampleNameMap.getSampleNameToVcfPath().entrySet().iterator().next().getKey(); + final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample); + final Path firstVCFIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(firstSample); + final VCFHeader header = getHeaderFromPath(firstVCFPath, firstVCFIndexPath); + //getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering mergedHeaderLines = new LinkedHashSet(header.getMetaDataInInputOrder()); mergedHeaderSequenceDictionary = header.getSequenceDictionary(); @@ -592,10 +591,17 @@ else if (getIntervalsFromExistingWorkspace){ if ( mergedHeaderSequenceDictionary == null) { throw new UserException("The merged vcf header has no sequence dictionary. Please provide a header that contains a sequence dictionary."); } + + // If any indices were specified in the sample name map file, make sure + // that --bypass-feature-reader wasn't also specified: + if ( sampleNameMap != null && sampleNameMap.indicesSpecified() && bypassFeatureReader ) { + throw new UserException("Indices were specified for some VCFs in the sample name map file, but --" + BYPASS_FEATURE_READER + + " was also specified. 
Specifying explicit indices is not supported when running with --" + BYPASS_FEATURE_READER); + } } - private VCFHeader getHeaderFromPath(final Path variantPath) { - try(final FeatureReader reader = getReaderFromPath(variantPath)) { + private VCFHeader getHeaderFromPath(final Path variantPath, final Path variantIndexPath) { + try(final FeatureReader reader = getReaderFromPath(variantPath, variantIndexPath)) { return (VCFHeader) reader.getHeader(); } catch (final IOException e) { throw new UserException("Error while reading vcf header from " + variantPath.toUri(), e); @@ -610,85 +616,7 @@ private static void assertGVCFHasOnlyOneSample(final String variantPath, final V } } - /** - * Load a tab delimited new line separated file of sample name to URI mapping: - * this maintains the keys in the same order that they appeared in the file - * - * This tool should only call {@link #loadSampleNameMapFileInSortedOrder(Path)}. - * This non-sorting overload is exposed for testing purposes only. - * - * ex: - * - * Sample1\tpathToSample1.vcf\n - * Sample2\tpathTosample2.vcf\n - * ... - * - * The sample names must be unique. 
- * @param sampleToFileMapPath path to the mapping file - * @return map of sample name to corresponding file, the map will be ordered according to the order in the input file - */ - @VisibleForTesting - static LinkedHashMap loadSampleNameMapFile(final Path sampleToFileMapPath) { - return loadSampleNameMapFile(sampleToFileMapPath, false); - } - - private static LinkedHashMap loadSampleNameMapFile(final Path sampleToFileMapPath, - final boolean checkVcfIsCompressedAndIndexed) { - try { - final List lines = Files.readAllLines(sampleToFileMapPath); - if (lines.isEmpty()) { - throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file"); - } - final LinkedHashMap sampleToFilename = new LinkedHashMap<>(); - for ( final String line : lines) { - final String[] split = line.split("\\t",-1); - if (split.length != 2) { - throw new UserException.BadInput("Expected a file with 2 fields per line in the format\nSample\tFile\n but found line: \"" - + line +"\" with "+split.length+" fields"); - } - if ( !split[0].trim().equals(split[0]) || split[0].trim().isEmpty() - || split[1].trim().isEmpty()) { - throw new UserException.BadInput("Expected a file of format\nSample\tFile\n but found line: '" + line + "'\nValid sample names must be non-empty strings that cannot begin or end with whitespace and valid file names must be non-empty and not all whitespace"); - } - final String sample = split[0]; - final String path = split[1].trim(); - try { - final URI oldPath = sampleToFilename.put(sample, new URI(path)); - if (oldPath != null){ - throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + path + "\n" + oldPath ); - } - if (checkVcfIsCompressedAndIndexed) { - assertVariantFileIsCompressedAndIndexed(IOUtils.getPath(path)); - } - } - catch(final URISyntaxException e) { - throw new UserException("Malformed URI "+e.toString()); - } - } - return sampleToFilename; - } catch (final IOException e) { - throw 
new UserException.CouldNotReadInputFile(sampleToFileMapPath, "exception while reading sample->filename mapping file", e); - } - } - - /** - * load a tab delimited new line separated file of sample name to URI mapping: - * - * ex: - * Sample1\tpathToSample1.vcf\n - * Sample2\tpathTosample2.vcf\n - * ... - * - * The sample names must be unique. - * @param sampleToFileMapPath path to the mapping file - * @param checkVcfIsCompressedAndIndexed boolean indicating whether to check vcf is compressed and indexed - * @return map of sample name to corresponding file, sorted by sample name - */ - public static SortedMap loadSampleNameMapFileInSortedOrder(final Path sampleToFileMapPath, - final boolean checkVcfIsCompressedAndIndexed){ - return new TreeMap<>(loadSampleNameMapFile(sampleToFileMapPath, checkVcfIsCompressedAndIndexed)); - } /** * write out interval list to file @@ -757,11 +685,11 @@ private Void logMessageOnBatchCompletion(final BatchCompletionCallbackFunctionAr logger.info("Done importing batch " + arg.batchCount + "/" + arg.totalBatchCount); logger.debug("List of samples imported in batch " + arg.batchCount + ":"); int index = 0; - final int sampleCount = sampleNameToVcfPath.size(); + final int sampleCount = sampleNameMap.getNumSamples(); final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? 
sampleCount : batchSize; final int startBatch = (arg.batchCount - 1) * updatedBatchSize; final int stopBatch = arg.batchCount * updatedBatchSize; - for(String key : sampleNameToVcfPath.keySet()) { + for(String key : sampleNameMap.getSampleNamesInSortedOrder()) { index++; if (index <= startBatch || index > stopBatch) { continue; @@ -817,7 +745,7 @@ private List generatePartitionListFromI private List generateIntervalListFromVidMap() { try { GenomicsDBVidMapProto.VidMappingPB vidMapPB = - org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getProtobufVidMappingFromJsonFile(vidMapJSONFile); + GATKGenomicsDBUtils.getProtobufVidMappingFromJsonFile(vidMapJSONFile); List partitions = Arrays.asList(GenomicsDBUtils.listGenomicsDBArrays(workspace)); return partitions.stream().flatMap(partition -> { @@ -863,7 +791,7 @@ private ImportConfig createImportConfig(final int batchSize) { importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation); importConfigurationBuilder.setEnableSharedPosixfsOptimizations(sharedPosixFSOptimizations); ImportConfig importConfig = new ImportConfig(importConfigurationBuilder.build(), validateSampleToReaderMap, true, - batchSize, mergedHeaderLines, sampleNameToVcfPath, bypassFeatureReader ? null : this::createSampleToReaderMap, + batchSize, mergedHeaderLines, sampleNameMap.getSampleNameToVcfPath(), bypassFeatureReader ? null : this::createSampleToReaderMap, doIncrementalImport); importConfig.setOutputCallsetmapJsonFile(callsetMapJSONFile); importConfig.setOutputVidmapJsonFile(vidMapJSONFile); @@ -891,7 +819,7 @@ public void traverse() { // Force the progress meter to update after every batch progressMeter.setRecordsBetweenTimeChecks(1L); - final int sampleCount = sampleNameToVcfPath.size(); + final int sampleCount = sampleNameMap.getNumSamples(); final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? 
sampleCount : batchSize; final ImportConfig importConfig = createImportConfig(updatedBatchSize); @@ -899,7 +827,7 @@ public void traverse() { try { importer = new GenomicsDBImporter(importConfig); // Modify importer directly from updateImportProtobufVidMapping. - org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateImportProtobufVidMapping(importer); + GATKGenomicsDBUtils.updateImportProtobufVidMapping(importer); if (mergeContigsIntoNumPartitions != 0) { if (!doIncrementalImport) { assertIntervalsCoverEntireContigs(importer, intervals); @@ -952,8 +880,9 @@ private SortedMap> getFeatureReadersInPara final String sampleName = sampleNames.get(i); futures.put(sampleName, inputPreloadExecutorService.submit(() -> { final Path variantPath = IOUtils.getPath(sampleNametoPath.get(sampleName).toString()); + final Path variantIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(sampleName); try { - return new InitializedQueryWrapper(getReaderFromPath(variantPath), intervals.get(0)); + return new InitializedQueryWrapper(getReaderFromPath(variantPath, variantIndexPath), intervals.get(0)); } catch (final IOException e) { throw new UserException.CouldNotReadInputFile("Couldn't read file: " + variantPath.toUri(), e); } @@ -980,7 +909,9 @@ private SortedMap> getFeatureReadersSerial final List sampleNames = new ArrayList<>(sampleNameToPath.keySet()); for(int i = lowerSampleIndex; i < sampleNameToPath.size() && i < lowerSampleIndex+batchSize; ++i) { final String sampleName = sampleNames.get(i); - final FeatureReader reader = getReaderFromPath(IOUtils.getPath(sampleNameToPath.get(sampleName).toString())); + final Path variantPath = IOUtils.getPath(sampleNameToPath.get(sampleName).toString()); + final Path variantIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(sampleName); + final FeatureReader reader = getReaderFromPath(variantPath, variantIndexPath); sampleToReaderMap.put(sampleName, reader); } logger.info("Importing batch " + this.batchCount + " with " + 
sampleToReaderMap.size() + " samples"); @@ -993,10 +924,13 @@ private SortedMap> getFeatureReadersSerial * @return Feature reader * @param variantPath */ - private FeatureReader getReaderFromPath(final Path variantPath) { + private FeatureReader getReaderFromPath(final Path variantPath, final Path variantIndexPath) { + // TODO: we repeatedly convert between URI, Path, and String in this tool. Is this necessary? final String variantURI = variantPath.toAbsolutePath().toUri().toString(); + final String variantIndexURI = variantIndexPath == null ? null : variantIndexPath.toAbsolutePath().toUri().toString(); + try { - final FeatureReader reader = AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), true, + final FeatureReader reader = AbstractFeatureReader.getFeatureReader(variantURI, variantIndexURI, new VCFCodec(), true, BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer), BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer)); @@ -1058,7 +992,7 @@ public VariantContext next() { */ private String overwriteCreateOrCheckWorkspace() { String workspaceDir = genomicsDBGetAbsolutePath(workspace); - // From JavaDoc for GenomicsDBUtils.createTileDBWorkspacevid + // From JavaDoc for GATKGenomicsDBUtils.createTileDBWorkspacevid // returnCode = 0 : OK. If overwriteExistingWorkspace is true and the workspace exists, it is deleted first. 
// returnCode = -1 : path was not a directory // returnCode = -2 : failed to create workspace diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java new file mode 100644 index 00000000000..0448201d850 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java @@ -0,0 +1,274 @@ +package org.broadinstitute.hellbender.tools.genomicsdb; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; + +/** + * A class to hold the mappings of sample names to VCF / VCF index paths. Used by GenomicsDBImport. + * + * This class can be constructed from a textual file containing lines in the format: + * + * Sample\tVCF + * or: + * Sample\tVCF\tIndex + * + * The sample names may have internal whitespace, but not leading/trailing whitespace. + * The VCF and Index URIs may have leading/trailing whitespace, which is ignored. + * + * The third Index column is optional. It is permitted to specify the index for some samples + * and not others. If an index is not specified for a sample, its location is inferred from + * the VCF URI. + * + * It is also possible to construct an empty SampleNameMap using the no-arg constructor, and + * add sample mappings one at a time using addSample(). + */ +public final class SampleNameMap { + // Sorted mapping between sample names and corresponding GVCF file name + // + // IMPORTANT: This must be sorted or it will result in sample name swaps in the output database. 
+ // This happens because the callset json is generated independently from the import process + // each imported batch is then sorted, so if we have an unsorted list we'll end up with different + // global vs batch sorting. + // We preemptively sort here so we will have consistent sorting. + private SortedMap sampleNameToVcfPath; + + // Mapping between sample names and corresponding VCF index path + // + // This Map contains only indices specified explicitly via the sample name map file. + // If an explicit index is not specified for a given sample, it will not have an + // entry in this Map, and the index path will be automatically inferred based on + // the location of the VCF. + // + // The ordering of the entries in this Map does not actually matter, since it's not + // directly exposed, and is used only for individual lookups via getVCFIndexForSample() + private SortedMap sampleNameToVcfIndexPath; + + /** + * Create an empty SampleNameMap. Samples can be added later using addSample() + */ + public SampleNameMap() { + sampleNameToVcfPath = new TreeMap<>(); + sampleNameToVcfIndexPath = new TreeMap<>(); + } + + /** + * Create a SampleNameMap from a textual file containing the sample mappings. The + * lines in this file must be in the format: + * + * Sample\tVCF + * or: + * Sample\tVCF\tIndex + * + * The sample names may have internal whitespace, but not leading/trailing whitespace. + * The VCF and Index URIs may have leading/trailing whitespace, which is ignored. + * + * The third Index column is optional. It is permitted to specify the index for some samples + * and not others. If an index is not specified for a sample, its location is inferred from + * the VCF URI. + * + * @param sampleMapFilePath Path to the file containing the sample name mappings to load + */ + public SampleNameMap(final Path sampleMapFilePath) { + this(sampleMapFilePath, false); + } + + /** + * Create a SampleNameMap from a textual file containing the sample mappings. 
The + * lines in this file must be in the format: + * + * SampleName1\tVCF + * or: + * SampleName1\tVCF\tIndex + * + * The sample names may have internal whitespace, but not leading/trailing whitespace. + * The VCF and Index URIs may have leading/trailing whitespace, which is ignored. + * + * The third Index column is optional. It is permitted to specify the index for some samples + * and not others. If an index is not specified for a sample, its location is inferred from + * the VCF URI. + * + * @param sampleMapFilePath Path to the file containing the sample name mappings to load + * @param checkVcfIsCompressedAndIndexed If true, check each VCF to make sure it's compressed and indexed + */ + public SampleNameMap(final Path sampleMapFilePath, final boolean checkVcfIsCompressedAndIndexed) { + sampleNameToVcfPath = new TreeMap<>(); + sampleNameToVcfIndexPath = new TreeMap<>(); + + loadSampleNameMapFile(sampleMapFilePath, checkVcfIsCompressedAndIndexed); + } + + private void loadSampleNameMapFile(final Path sampleToFileMapPath, final boolean checkVcfIsCompressedAndIndexed) { + try { + final List lines = Files.readAllLines(sampleToFileMapPath); + if (lines.isEmpty()) { + throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file"); + } + + for (final String line : lines) { + final String[] split = line.split("\\t",-1); + if (split.length != 2 && split.length != 3) { + throw new UserException.BadInput("Sample name map file must have 2 or 3 fields per line in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\nbut found line: \"" + + line +"\" with "+split.length+" fields"); + } + if ( ! 
sampleNameIsLegal(split[0]) || split[1].trim().isEmpty()) { + throw new UserException.BadInput("Sample name map file must have lines in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\n but found line: '" + line + "'\nValid sample names must be non-empty strings that cannot begin or end with whitespace and valid file names must be non-empty and not all whitespace"); + } + final String sample = split[0]; + final String vcfPath = split[1].trim(); + + String vcfIndexPath = null; + if ( split.length == 3 ) { + vcfIndexPath = split[2].trim(); + + if ( vcfIndexPath.isEmpty() ) { + throw new UserException.BadInput("Found a line in the sample name map file with an empty or all-whitespace value for the index:\n" + "\"" + line + "\""); + } + } + + try { + final URI existingVCFPath = sampleNameToVcfPath.put(sample, new URI(vcfPath)); + if (existingVCFPath != null){ + throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + vcfPath + "\n" + existingVCFPath); + } + + if ( vcfIndexPath != null ) { + final URI existingVCFIndexPath = sampleNameToVcfIndexPath.put(sample, new URI(vcfIndexPath)); + if (existingVCFIndexPath != null) { + throw new UserException.BadInput("Found two indices for the same sample: " + sample + "\n" + vcfIndexPath + "\n" + existingVCFIndexPath); + } + } + + if (checkVcfIsCompressedAndIndexed) { + GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(IOUtils.getPath(vcfPath), vcfIndexPath == null ? null : IOUtils.getPath(vcfIndexPath)); + } + } + catch(final URISyntaxException e) { + throw new UserException("Malformed URI: " + e.toString()); + } + } + } catch (final IOException e) { + throw new UserException.CouldNotReadInputFile(sampleToFileMapPath, "exception while reading sample->filename mapping file", e); + } + } + + /** + * Tests whether the sample name is legal. Sample names must be non-empty, and + * may have internal whitespace but not leading/trailing whitespace. 
+ * + * @param sampleName sample name to test + * @return true if sampleName is legal, otherwise false + */ + private boolean sampleNameIsLegal(final String sampleName) { + return sampleName != null && + ! sampleName.trim().isEmpty() && + sampleName.trim().equals(sampleName); + } + + /** + * Add a new sample mapping + * + * @param sampleName name of the sample + * @param vcfPath path to the VCF for the sample + */ + public void addSample(final String sampleName, final URI vcfPath) { + addSample(sampleName, vcfPath, null); + } + + /** + * Add a new sample mapping + * + * @param sampleName name of the sample + * @param vcfPath path to the VCF for the sample (not null) + * @param vcfIndexPath path to the index for the sample (may be null) + */ + public void addSample(final String sampleName, final URI vcfPath, final URI vcfIndexPath) { + if ( ! sampleNameIsLegal(sampleName) ) { + throw new UserException.BadInput("Sample name " + sampleName + " is not legal. Sample names must be non-empty and not contain leading or trailing whitespace"); + } + if ( vcfPath == null ) { + throw new UserException.BadInput("VCF path for sample " + sampleName + " was null"); + } + + final URI previousPath = sampleNameToVcfPath.put(sampleName, vcfPath); + if (previousPath != null) { + throw new UserException.BadInput("Duplicate sample: " + sampleName + ". 
Sample was found in both " + + vcfPath + " and " + previousPath + "."); + } + + if (vcfIndexPath != null) { + final URI previousIndexPath = sampleNameToVcfIndexPath.put(sampleName, vcfIndexPath); + if (previousIndexPath != null) { + throw new UserException.BadInput("For sample " + sampleName + ", attempted to specify multiple indices: " + vcfIndexPath + " and " + previousIndexPath); + } + } + } + + /** + * @return The full mapping of sample names -> VCF paths, with the sample names in sorted order + */ + public SortedMap getSampleNameToVcfPath() { + return sampleNameToVcfPath; + } + + /** + * @param sample sample name + * @return the VCF associated with that sample name, as a URI + */ + public URI getVCFForSample(final String sample) { + return sampleNameToVcfPath.get(sample); + } + + /** + * @param sample sample name + * @return the VCF associated with that sample name, as a Path + */ + public Path getVCFForSampleAsPath(final String sample) { + final URI vcfURI = sampleNameToVcfPath.get(sample); + return vcfURI == null ? null : IOUtils.getPath(vcfURI.toString()); + } + + /** + * @param sample sample name + * @return the VCF index associated with that sample name, as a URI, or null if no index + */ + public URI getVCFIndexForSample(final String sample) { + return sampleNameToVcfIndexPath.get(sample); + } + + /** + * @param sample sample name + * @return the VCF index associated with that sample name, as a Path, or null if no index + */ + public Path getVCFIndexForSampleAsPath(final String sample) { + final URI vcfIndexURI = sampleNameToVcfIndexPath.get(sample); + return vcfIndexURI == null ? 
null : IOUtils.getPath(vcfIndexURI.toString()); + } + + /** + * @return number of samples in this Map + */ + public int getNumSamples() { + return sampleNameToVcfPath.size(); + } + + /** + * @return a List of the sample names in this Map in sorted order + */ + public List getSampleNamesInSortedOrder() { + return new ArrayList<>(sampleNameToVcfPath.keySet()); + } + + /** + * @return true if an index was specified for at least one sample, otherwise false + */ + public boolean indicesSpecified() { + return ! sampleNameToVcfIndexPath.isEmpty(); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java index 27539d6bef0..17ba65831ea 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java @@ -2,10 +2,13 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.IntervalList; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.CloseableTribbleIterator; import htsjdk.tribble.FeatureReader; +import htsjdk.tribble.index.Index; +import htsjdk.tribble.index.IndexFactory; import htsjdk.tribble.readers.LineIterator; import htsjdk.variant.bcf2.BCF2Codec; import htsjdk.variant.variantcontext.Allele; @@ -24,6 +27,7 @@ import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import java.util.ArrayList; @@ -47,6 +51,7 @@ import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import org.broadinstitute.hellbender.testutils.BaseTest; import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import 
org.broadinstitute.hellbender.tools.IndexFeatureFile; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; @@ -65,8 +70,11 @@ @Test(groups = {"variantcalling"}) public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTest { private static final String HG_00096 = largeFileTestDir + "gvcfs/HG00096.g.vcf.gz"; + private static final String HG_00096_SAMPLE_NAME = "HG00096"; private static final String HG_00268 = largeFileTestDir + "gvcfs/HG00268.g.vcf.gz"; + private static final String HG_00268_SAMPLE_NAME = "HG00268"; private static final String NA_19625 = largeFileTestDir + "gvcfs/NA19625.g.vcf.gz"; + private static final String NA_19625_SAMPLE_NAME = "NA19625"; //The following 3 files were obtained by running CombineGVCFs on the above 3 files (separately). This introduces spanning //deletions in the files. Hence, these files can be used to test for spanning deletions in the input VCF. 
private static final String HG_00096_after_combine_gvcfs = largeFileTestDir + "gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz"; @@ -879,6 +887,224 @@ private static File getSampleMapFile(final Map mapping){ .collect(Collectors.joining("\n"))); } + @DataProvider + public Object[][] dataForTestExplicitIndicesInSampleNameMap() { + final Map originalVCFsInOrder = new LinkedHashMap<>(); + originalVCFsInOrder.put(HG_00096_SAMPLE_NAME, new File(HG_00096)); + originalVCFsInOrder.put(HG_00268_SAMPLE_NAME, new File(HG_00268)); + originalVCFsInOrder.put(NA_19625_SAMPLE_NAME, new File(NA_19625)); + + final Map originalVCFsOutOfOrder = new LinkedHashMap<>(); + originalVCFsOutOfOrder.put(NA_19625_SAMPLE_NAME, new File(NA_19625)); + originalVCFsOutOfOrder.put(HG_00268_SAMPLE_NAME, new File(HG_00268)); + originalVCFsOutOfOrder.put(HG_00096_SAMPLE_NAME, new File(HG_00096)); + + return new Object[][] { + // All VCFs have explicit indices, samples in order, TABIX index + { originalVCFsInOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), false }, + + // All VCFs have explicit indices, samples in order, TRIBBLE index + { originalVCFsInOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), true }, + + // Some VCFs have explicit indices, samples in order, TABIX index + { originalVCFsInOrder, Arrays.asList(HG_00268_SAMPLE_NAME), false }, + + // Some VCFs have explicit indices, samples in order, TRIBBLE index + { originalVCFsInOrder, Arrays.asList(HG_00268_SAMPLE_NAME), true }, + + // All VCFs have explicit indices, samples out of order, TABIX index + { originalVCFsOutOfOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), false }, + + // All VCFs have explicit indices, samples out of order, TRIBBLE index + { originalVCFsOutOfOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), true }, + + // Some VCFs have explicit indices, samples out of order, TABIX 
index + { originalVCFsOutOfOrder, Arrays.asList(HG_00268_SAMPLE_NAME), false }, + + // Some VCFs have explicit indices, samples out of order, TRIBBLE index + { originalVCFsOutOfOrder, Arrays.asList(HG_00268_SAMPLE_NAME), true } + }; + } + + // Test that we can handle explicit index files from a sample name map locally. + // The cloud version of this test is separate. + // Note that this test decompresses/reindexes its GVCFs on-the-fly as necessary in order + // to avoid our having to check uncompressed VCFs in to our repo + @Test(dataProvider = "dataForTestExplicitIndicesInSampleNameMap") + public void testExplicitIndicesInSampleNameMap(final Map originalVCFs, final List samplesWithExplicitIndices, final boolean useTribbleIndex) throws IOException { + final String workspace = createTempDir("testExplicitIndicesInSampleNameMap").getAbsolutePath() + "/workspace"; + final File vcfDir = createTempDir("testExplicitIndicesInSampleNameMap_vcfs"); + final File indexDir = createTempDir("testExplicitIndicesInSampleNameMap_indices"); + Assert.assertNotEquals(vcfDir, indexDir, + "testExplicitIndicesInSampleNameMap failed to create separate directories for the vcfs and their indices"); + + final StringBuilder sampleNameMapContents = new StringBuilder(); + + for ( final Map.Entry originalVCFEntry : originalVCFs.entrySet() ) { + final String sampleName = originalVCFEntry.getKey(); + final File originalVCFFile = originalVCFEntry.getValue(); + final boolean createExplicitIndex = samplesWithExplicitIndices.contains(sampleName); + + final Path originalVCFPath = originalVCFFile.toPath(); + final String uncompressedVCFName = originalVCFFile.getName().replaceAll("\\.gz$", ""); + Path vcfDestination = new File(vcfDir, originalVCFFile.getName()).toPath(); + if ( useTribbleIndex ) { + vcfDestination = new File(vcfDir, uncompressedVCFName).toPath(); + IOUtils.gunzip(originalVCFPath.toAbsolutePath().toFile(), vcfDestination.toAbsolutePath().toFile()); + } else { + Files.copy(originalVCFPath, 
vcfDestination); + } + + final File originalVCFIndexFile = new File(originalVCFFile.getAbsolutePath() + FileExtensions.TABIX_INDEX); + Assert.assertTrue(originalVCFIndexFile.exists()); + final File thisVCFIndexDir = createExplicitIndex ? indexDir : vcfDir; + Path vcfIndexDestination = new File(thisVCFIndexDir, originalVCFIndexFile.getName()).toPath(); + if ( useTribbleIndex ) { + vcfIndexDestination = new File(thisVCFIndexDir, uncompressedVCFName + FileExtensions.TRIBBLE_INDEX).toPath(); + final Index inMemoryIndex = IndexFactory.createLinearIndex(vcfDestination, new VCFCodec(), IndexFeatureFile.OPTIMAL_GVCF_INDEX_BIN_SIZE); + inMemoryIndex.write(vcfIndexDestination); + } else { + Files.copy(originalVCFIndexFile.toPath(), vcfIndexDestination); + } + + if ( createExplicitIndex ) { + sampleNameMapContents.append(String.format("%s\t%s\t%s\n", sampleName, vcfDestination.toAbsolutePath().toString(), vcfIndexDestination.toAbsolutePath().toString())); + } else { + sampleNameMapContents.append(String.format("%s\t%s\n", sampleName, vcfDestination.toAbsolutePath().toString())); + } + } + + final File sampleNameMapFile = IOUtils.writeTempFile(sampleNameMapContents.toString(), "testExplicitIndicesInSampleNameMap_samplemap", ".txt"); + + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleNameMapFile.getAbsolutePath()) + .addInterval(INTERVAL.get(0)) + .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); + runCommandLine(args); + + checkJSONFilesAreWritten(workspace); + checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); + checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true); + } + + @DataProvider + public Object[][] dataForTestExplicitIndicesInSampleNameMapInTheCloud() { + final String GVCFS_WITH_INDICES_BUCKET = 
"gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcfs_with_indices/"; + final String GVCFS_WITHOUT_INDICES_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcfs_without_indices/"; + final String GVCF_INDICES_ONLY_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcf_indices_only/"; + + final String HG00096_COMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00096.g.vcf.gz"; + final String HG00096_COMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00096.g.vcf.gz"; + final String HG00096_COMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00096.g.vcf.gz.tbi"; + final String HG00096_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00096.g.vcf"; + final String HG00096_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00096.g.vcf"; + final String HG00096_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00096.g.vcf.idx"; + + final String HG00268_COMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00268.g.vcf.gz"; + final String HG00268_COMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00268.g.vcf.gz"; + final String HG00268_COMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00268.g.vcf.gz.tbi"; + final String HG00268_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00268.g.vcf"; + final String HG00268_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00268.g.vcf"; + final String HG00268_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00268.g.vcf.idx"; + + final String NA19625_COMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "NA19625.g.vcf.gz"; + final String NA19625_COMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "NA19625.g.vcf.gz"; + final String NA19625_COMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "NA19625.g.vcf.gz.tbi"; + final String NA19625_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "NA19625.g.vcf"; + final String NA19625_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + 
"NA19625.g.vcf"; + final String NA19625_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "NA19625.g.vcf.idx"; + + return new Object[][] { + // All VCFs have explicit indices, samples in order, TABIX index + { + HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_NO_INDEX + "\t" + HG00096_COMPRESSED_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" + + NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_NO_INDEX + "\t" + NA19625_COMPRESSED_INDEX + "\n" + }, + + // All VCFs have explicit indices, samples in order, TRIBBLE index + { + HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_NO_INDEX + "\t" + HG00096_UNCOMPRESSED_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" + + NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_NO_INDEX + "\t" + NA19625_UNCOMPRESSED_INDEX + "\n" + }, + + // Some VCFs have explicit indices, samples in order, TABIX index + { + HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_WITH_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" + + NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_WITH_INDEX + "\n" + }, + + // Some VCFs have explicit indices, samples in order, TRIBBLE index + { + HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_WITH_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" + + NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_WITH_INDEX + "\n" + }, + + // All VCFs have explicit indices, samples out of order, TABIX index + { + NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_NO_INDEX + "\t" + NA19625_COMPRESSED_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" + + HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_NO_INDEX + "\t" + HG00096_COMPRESSED_INDEX + "\n" + }, + + // All VCFs have explicit 
indices, samples out of order, TRIBBLE index + { + NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_NO_INDEX + "\t" + NA19625_UNCOMPRESSED_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" + + HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_NO_INDEX + "\t" + HG00096_UNCOMPRESSED_INDEX + "\n" + }, + + // Some VCFs have explicit indices, samples out of order, TABIX index + { + NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_WITH_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" + + HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_WITH_INDEX + "\n" + }, + + // Some VCFs have explicit indices, samples out of order, TRIBBLE index + { + NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_WITH_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" + + HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_WITH_INDEX + "\n" + } + }; + } + + // Test that we can handle explicit index files from a sample name map in the cloud + @Test(dataProvider = "dataForTestExplicitIndicesInSampleNameMapInTheCloud", groups = {"bucket"}) + public void testExplicitIndicesInSampleNameMapInTheCloud(final String sampleNameMapContents) throws IOException { + final String workspace = createTempDir("testExplicitIndicesInSampleNameMapInTheCloud").getAbsolutePath() + "/workspace"; + final File sampleNameMapFile = IOUtils.writeTempFile(sampleNameMapContents, "testExplicitIndicesInSampleNameMapInTheCloud_samplemap", ".txt"); + + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleNameMapFile.getAbsolutePath()) + .addInterval(INTERVAL.get(0)) + .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); + runCommandLine(args); + + checkJSONFilesAreWritten(workspace); + checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, 
b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); + checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true); + } + + // This test guards against the possibility of someone accidentally putting an index file into + // the "gvcfs_without_indices" bucket directory used by testExplicitIndicesInSampleNameMapInTheCloud() + @Test(groups = {"bucket"}) + public void testUnindexedCloudGVCFsAreActuallyUnindexed() throws IOException { + final String GVCFS_WITHOUT_INDICES_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcfs_without_indices/"; + final Path bucketPath = IOUtils.getPath(GVCFS_WITHOUT_INDICES_BUCKET); + + Files.list(bucketPath).forEach(file -> { // compare names as Strings: Path.endsWith(String) only matches whole path elements, never a bare extension + Assert.assertFalse(file.toString().endsWith(FileExtensions.TABIX_INDEX), + "Found a TABIX index in bucket " + GVCFS_WITHOUT_INDICES_BUCKET); + Assert.assertFalse(file.toString().endsWith(FileExtensions.TRIBBLE_INDEX), + "Found a Tribble index in bucket " + GVCFS_WITHOUT_INDICES_BUCKET); + }); + } + @DataProvider public static Iterator getRenameCombinations() { final Map noRemapping = new LinkedHashMap<>(); @@ -1099,15 +1325,15 @@ private static FeatureReader getGenomicsDBFeatureReader( .setGenerateArrayNameFromPartitionBounds(true); GenomicsDBVidMapProto.VidMappingPB vidMapPB = null; try { - vidMapPB = org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getProtobufVidMappingFromJsonFile(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME)); + vidMapPB = GATKGenomicsDBUtils.getProtobufVidMappingFromJsonFile(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME)); } catch (final IOException e) { throw new UserException("Could not open vid json file "+GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME, e); } HashMap fieldNameToIndexInVidFieldsList = - org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB); +
GATKGenomicsDBUtils.getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB); - vidMapPB = org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, + vidMapPB = GATKGenomicsDBUtils.updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum"); if(vidMapPB != null) { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java deleted file mode 100644 index 0a92a8a8944..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java +++ /dev/null @@ -1,111 +0,0 @@ -package org.broadinstitute.hellbender.tools.genomicsdb; - -import htsjdk.tribble.FeatureReader; -import htsjdk.variant.variantcontext.VariantContext; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.utils.io.IOUtils; -import org.broadinstitute.hellbender.GATKBaseTest; -import org.genomicsdb.importer.GenomicsDBImporter; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.*; -import java.util.LinkedHashMap; -import java.util.Map; -import java.net.URI; -import java.net.URISyntaxException; - -public class GenomicsDBImportUnitTest extends GATKBaseTest { - - private static final String ORDERED_SAMPLE_MAP = "Sample1\tfile1\n" + - "Sample2\tfile2\n" + - "Sample3\tfile3"; - - private static final String UNORDERED_SAMPLE_MAP = "Sample3\tfile3\n" + - "Sample2\tfile2\n" + - "Sample1\tfile1\n"; - - @DataProvider - public Object[][] getBadSampleNameMapFiles(){ - return new Object[][]{ - {"Sample1\tsamplePath\n" - +"Sample1\tsamplePath"}, // duplicate sample name - {""}, 
// empty file - {"Sample1\tSample2\tFile"}, // 3 columns - {"Sample1\t"}, // 1 column - {"Sample1"}, // 1 column no delimiter - {"\tfile"}, // empty first token - {" \tfile"}, // first token only whitespace - {"Sample1\tfile1\t"}, // extra tab - {"Sample1\nfile"}, // newline instead of tab - {"\t"}, // only tab - {"Sample1 file1"}, // 1 column - {" name name\tfile1"}, // preceding whitespace - {"name name \tfile1"}, // trailing whitespace - }; - } - - @Test(dataProvider = "getBadSampleNameMapFiles", expectedExceptions = UserException.BadInput.class) - public void testBadInputFiles(final String text){ - final File sampleFile = IOUtils.writeTempFile(text, "badSampleMapping", ".txt"); - GenomicsDBImport.loadSampleNameMapFile(sampleFile.toPath() ); - } - - @DataProvider - public Object[][] getGoodSampleNameMapFileSyntax(){ - return new Object[][]{ - // Note: none of these files are real, these are just valid files syntactically - {"Sample1\tsamplePath1 \n" - +"Sample2\tsamplePath2", new String[][] {{"Sample1","samplePath1"},{"Sample2","samplePath2"}}}, // normal sample names - {"Sample1 001\tFile", new String[][] {{"Sample1 001","File"}}}, // sample names with whitespace - {"name name\tfile1 ", new String[][] {{"name name","file1"}}}, // trailing whitespace second column - {"name name\t file1 ", new String[][] {{"name name","file1"}}} // leading and trailing whitespace second colum - }; - } - - @Test(dataProvider = "getGoodSampleNameMapFileSyntax") - public void testValidSampleFiles(final String text, final String[][] expectedEntries){ - final File sampleFile = IOUtils.writeTempFile(text, "goodSampleMapping", ".txt"); - final LinkedHashMap outputMap = GenomicsDBImport.loadSampleNameMapFile(sampleFile.toPath()); - Assert.assertEquals(outputMap.size(),expectedEntries.length); - - Arrays.stream(expectedEntries).forEach(s -> { Assert.assertTrue(outputMap.containsKey(s[0])); - Assert.assertEquals(outputMap.get(s[0]).toString(),s[1]);}); - } - - @Test - public void 
testLoadSampleNameMapFilePreservesOrder(){ - final File sampleFile = IOUtils.writeTempFile(UNORDERED_SAMPLE_MAP, "badSampleMapping", ".txt"); - final LinkedHashMap unsortedMap = GenomicsDBImport.loadSampleNameMapFile(sampleFile.toPath()); - Assert.assertEquals(new ArrayList<>(unsortedMap.keySet()), Arrays.asList("Sample3", "Sample2", "Sample1")); - } - - @DataProvider - public Object[][] getSampleMaps(){ - return new Object[][]{ - {ORDERED_SAMPLE_MAP}, - {UNORDERED_SAMPLE_MAP} - }; - } - - @Test(dataProvider = "getSampleMaps") - public void testLoadSampleNameMapFileInSortedOrder(final String sampleMapText){ - final File sampleFile = IOUtils.writeTempFile(sampleMapText, "sampleMapping", ".txt"); - final Map expected = new LinkedHashMap<>(); - try { - expected.put("Sample1", new URI("file1")); - expected.put("Sample2", new URI("file2")); - expected.put("Sample3", new URI("file3")); - } - catch(URISyntaxException e) { - throw new RuntimeException("Malformed URI "+e.toString()); - } - final Map actual = GenomicsDBImport.loadSampleNameMapFileInSortedOrder(sampleFile.toPath(), false); - Assert.assertEquals(actual, expected); - Assert.assertEquals(actual.keySet().iterator().next(), "Sample1"); - } -} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMapUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMapUnitTest.java new file mode 100644 index 00000000000..94d393bef61 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMapUnitTest.java @@ -0,0 +1,310 @@ +package org.broadinstitute.hellbender.tools.genomicsdb; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.utils.text.XReadLines; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import 
java.io.File; +import java.io.IOException; +import java.util.*; +import java.util.LinkedHashMap; +import java.util.Map; +import java.net.URI; +import java.net.URISyntaxException; + +public class SampleNameMapUnitTest extends GATKBaseTest { + + private static final String ORDERED_SAMPLE_MAP = "Sample1\tfile1\n" + + "Sample2\tfile2\n" + + "Sample3\tfile3"; + + private static final String UNORDERED_SAMPLE_MAP = "Sample3\tfile3\n" + + "Sample2\tfile2\n" + + "Sample1\tfile1\n"; + + @DataProvider + public Object[][] getBadSampleNameMapFiles(){ + return new Object[][]{ + {"Sample1\tsamplePath\n" + +"Sample1\tsamplePath"}, // duplicate sample name + {""}, // empty file + {"Sample1\t"}, // 1 column + {"Sample1"}, // 1 column no delimiter + {"\tfile"}, // empty first token + {" \tfile"}, // first token only whitespace + {"Sample1\tfile1\t"}, // extra tab + {"Sample1\nfile"}, // newline instead of tab + {"\t"}, // only tab + {"Sample1 file1"}, // 1 column, internal whitespace + {" Sample1\tfile1"}, // preceding whitespace + {"Sample1 \tfile1"}, // trailing whitespace + {"Sample1\tfile1\t"}, // empty index + {"Sample1\tfile1\t "}, // all-whitespace index + {"Sample1\tfile1\tindex1\textraColumn"}, // 4 columns + {"Sample1\tfile1\tindex1\t"} // 4 columns, blank 4th column + }; + } + + @Test(dataProvider = "getBadSampleNameMapFiles", expectedExceptions = UserException.BadInput.class) + public void testBadInputFiles(final String text){ + final File sampleFile = IOUtils.writeTempFile(text, "badSampleMapping", ".txt"); + final SampleNameMap sampleMap = new SampleNameMap(sampleFile.toPath()); + } + + @DataProvider + public Object[][] getGoodSampleNameMapFiles(){ + return new Object[][]{ + // Note: none of these files are real, these are just valid files syntactically + + // normal sample names, no explicit indices + {"Sample1\tsamplePath1\n" + + "Sample2\tsamplePath2", + new String[][] { + {"Sample1", "samplePath1"}, + {"Sample2", "samplePath2"}}}, + + // normal sample names, 
explicit indices for all files + {"Sample1\tsamplePath1\tindexPath1\n" + + "Sample2\tsamplePath2\tindexPath2", + new String[][] { + {"Sample1", "samplePath1", "indexPath1"}, + {"Sample2", "samplePath2", "indexPath2"}}}, + + // normal sample names, explicit indices for some files but not others + {"Sample1\tsamplePath1\n" + + "Sample2\tsamplePath2\tindexPath2", + new String[][] { + {"Sample1", "samplePath1"}, + {"Sample2", "samplePath2", "indexPath2"}}}, + + // sample names with internal whitespace + {"Sample1 001\tFile", + new String[][] { + {"Sample1 001", "File"}} + }, + + // leading and trailing whitespace second column + {"name name\t file1 ", + new String[][] { + {"name name", "file1"}} + }, + + // leading and trailing whitespace third column + {"name name\tfile1\t index1 ", + new String[][] { + {"name name", "file1", "index1"}} + }, + + // leading and trailing whitespace second and third columns + {"name name\t file1 \t index1 ", + new String[][] { + {"name name", "file1", "index1"}} + }, + }; + } + + @Test(dataProvider = "getGoodSampleNameMapFiles") + public void testValidSampleFiles(final String text, final String[][] expectedEntries){ + final File sampleFile = IOUtils.writeTempFile(text, "goodSampleMapping", ".txt"); + + final SampleNameMap sampleMap = new SampleNameMap(sampleFile.toPath()); + final SortedMap outputMap = sampleMap.getSampleNameToVcfPath(); + + Assert.assertEquals(outputMap.size(),expectedEntries.length, + "Wrong number of entries in the Map returned by getSampleNameToVcfPath()"); + Assert.assertEquals(sampleMap.getNumSamples(), expectedEntries.length, + "Wrong number of samples returned by getNumSamples()"); + boolean expectedIndicesFound = false; + + for ( final String[] expected : expectedEntries ) { + Assert.assertTrue(outputMap.containsKey(expected[0])); + + Assert.assertEquals(outputMap.get(expected[0]).toString(),expected[1]); + Assert.assertEquals(sampleMap.getVCFForSample(expected[0]).toString(), expected[1], + "Wrong VCF returned 
by getVCFForSample() for sample " + expected[0]); + Assert.assertEquals(sampleMap.getVCFForSampleAsPath(expected[0]).toString(), expected[1], + "Wrong VCF returned by getVCFForSampleAsPath() for sample " + expected[0]); + + if ( expected.length == 3 ) { + expectedIndicesFound = true; + + Assert.assertNotNull(sampleMap.getVCFIndexForSample(expected[0]), + "No index returned by getVCFIndexForSample() for sample " + expected[0]); + Assert.assertNotNull(sampleMap.getVCFIndexForSampleAsPath(expected[0]), + "No index returned by getVCFIndexForSampleAsPath() for sample " + expected[0]); + + Assert.assertEquals(sampleMap.getVCFIndexForSample(expected[0]).toString(), expected[2], + "Wrong index returned by getVCFIndexForSample() for sample " + expected[0]); + Assert.assertEquals(sampleMap.getVCFIndexForSampleAsPath(expected[0]).toString(), expected[2], + "Wrong index returned by getVCFIndexForSampleAsPath() for sample " + expected[0]); + } else { + Assert.assertNull(sampleMap.getVCFIndexForSample(expected[0]), + "Index unexpectedly returned by getVCFIndexForSample() for sample " + expected[0]); + Assert.assertNull(sampleMap.getVCFIndexForSampleAsPath(expected[0]), + "Index unexpectedly returned by getVCFIndexForSampleAsPath() for sample " + expected[0]); + } + } + + Assert.assertEquals(sampleMap.indicesSpecified(), expectedIndicesFound, + "Wrong value returned by indicesSpecified()"); + + } + + // Test to ensure that the "unsorted" map used in subsequent tests is actually unsorted, + // to guard against future modifications + @Test + public void testUnorderedSampleMapIsActuallyUnordered() throws IOException { + final File sampleFile = IOUtils.writeTempFile(UNORDERED_SAMPLE_MAP, "badSampleMapping", ".txt"); + final List expectedSampleOrdering = Arrays.asList("Sample3", "Sample2", "Sample1"); + + try ( final XReadLines lineReader = new XReadLines(sampleFile) ) { + int lineNumber = 0; + for ( final String line : lineReader ) { + final String sampleFromFile = line.split("\\t", -1)[0]; +
Assert.assertEquals(sampleFromFile, expectedSampleOrdering.get(lineNumber)); + ++lineNumber; + } + } + } + + @DataProvider + public Object[][] getSampleMapsForOrderingTest(){ + final Map expectedMap = new LinkedHashMap<>(); + try { + expectedMap.put("Sample1", new URI("file1")); + expectedMap.put("Sample2", new URI("file2")); + expectedMap.put("Sample3", new URI("file3")); + } + catch(URISyntaxException e) { + throw new RuntimeException("Malformed URI "+e.toString(), e); + } + + final List expectedSampleOrdering = Arrays.asList("Sample1", "Sample2", "Sample3"); + + return new Object[][]{ + {ORDERED_SAMPLE_MAP, expectedMap, expectedSampleOrdering}, + {UNORDERED_SAMPLE_MAP, expectedMap, expectedSampleOrdering} + }; + } + + @Test(dataProvider = "getSampleMapsForOrderingTest") + public void testSampleOrdering(final String sampleMapText, final Map expectedMap, final List expectedSampleOrdering){ + final File sampleFile = IOUtils.writeTempFile(sampleMapText, "sampleMapping", ".txt"); + + final SampleNameMap sampleMap = new SampleNameMap(sampleFile.toPath()); + final SortedMap actualMap = sampleMap.getSampleNameToVcfPath(); + + Assert.assertEquals(actualMap, expectedMap); + Assert.assertEquals(sampleMap.getNumSamples(), expectedSampleOrdering.size(), "Wrong number of samples returned by getNumSamples()"); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().size(), expectedSampleOrdering.size(), "Wrong number of samples returned by getSampleNamesInSortedOrder()"); + + final Iterator actualSamplesFromMap = actualMap.keySet().iterator(); + final Iterator actualSamplesFromGetter = sampleMap.getSampleNamesInSortedOrder().iterator(); + + for ( final String expectedSample : expectedSampleOrdering ) { + Assert.assertEquals(actualSamplesFromMap.next(), expectedSample, + "Wrong sample found in Map returned by getSampleNameToVcfPath()"); + Assert.assertEquals(actualSamplesFromGetter.next(), expectedSample, + "Wrong sample found in List returned by getSampleNamesInSortedOrder()"); + } +
Assert.assertFalse(actualSamplesFromMap.hasNext()); + Assert.assertFalse(actualSamplesFromGetter.hasNext()); + } + + @Test + public void testIncrementalAddition() throws URISyntaxException { + // Use the no-arg constructor to start with an empty SampleNameMap, then + // add samples incrementally: + final SampleNameMap sampleMap = new SampleNameMap(); + Assert.assertEquals(sampleMap.getNumSamples(), 0); + Assert.assertTrue(sampleMap.getSampleNamesInSortedOrder().isEmpty()); + Assert.assertTrue(sampleMap.getSampleNameToVcfPath().isEmpty()); + Assert.assertFalse(sampleMap.indicesSpecified()); + + sampleMap.addSample("Sample3", new URI("file3")); + Assert.assertEquals(sampleMap.getNumSamples(), 1); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().size(), 1); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(0), "Sample3"); + Assert.assertEquals(sampleMap.getSampleNameToVcfPath().size(), 1); + Assert.assertEquals(new ArrayList<>(sampleMap.getSampleNameToVcfPath().keySet()), Arrays.asList("Sample3")); + Assert.assertEquals(sampleMap.getVCFForSample("Sample3").toString(), "file3"); + Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample3").toString(), "file3"); + Assert.assertFalse(sampleMap.indicesSpecified()); + + sampleMap.addSample("Sample1", new URI("file1"), new URI("index1")); + Assert.assertEquals(sampleMap.getNumSamples(), 2); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().size(), 2); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(0), "Sample1"); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(1), "Sample3"); + Assert.assertEquals(sampleMap.getSampleNameToVcfPath().size(), 2); + Assert.assertEquals(new ArrayList<>(sampleMap.getSampleNameToVcfPath().keySet()), Arrays.asList("Sample1", "Sample3")); + Assert.assertEquals(sampleMap.getVCFForSample("Sample1").toString(), "file1"); + Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample1").toString(), "file1"); + 
Assert.assertEquals(sampleMap.getVCFForSample("Sample3").toString(), "file3"); + Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample3").toString(), "file3"); + Assert.assertTrue(sampleMap.indicesSpecified()); + Assert.assertEquals(sampleMap.getVCFIndexForSample("Sample1").toString(), "index1"); + Assert.assertEquals(sampleMap.getVCFIndexForSampleAsPath("Sample1").toString(), "index1"); + Assert.assertNull(sampleMap.getVCFIndexForSample("Sample3")); + Assert.assertNull(sampleMap.getVCFIndexForSampleAsPath("Sample3")); + + sampleMap.addSample("Sample2", new URI("file2")); + Assert.assertEquals(sampleMap.getNumSamples(), 3); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().size(), 3); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(0), "Sample1"); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(1), "Sample2"); + Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(2), "Sample3"); + Assert.assertEquals(sampleMap.getSampleNameToVcfPath().size(), 3); + Assert.assertEquals(new ArrayList<>(sampleMap.getSampleNameToVcfPath().keySet()), Arrays.asList("Sample1", "Sample2", "Sample3")); + Assert.assertEquals(sampleMap.getVCFForSample("Sample1").toString(), "file1"); + Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample1").toString(), "file1"); + Assert.assertEquals(sampleMap.getVCFForSample("Sample2").toString(), "file2"); + Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample2").toString(), "file2"); + Assert.assertEquals(sampleMap.getVCFForSample("Sample3").toString(), "file3"); + Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample3").toString(), "file3"); + Assert.assertTrue(sampleMap.indicesSpecified()); + Assert.assertEquals(sampleMap.getVCFIndexForSample("Sample1").toString(), "index1"); + Assert.assertEquals(sampleMap.getVCFIndexForSampleAsPath("Sample1").toString(), "index1"); + Assert.assertNull(sampleMap.getVCFIndexForSample("Sample3")); + 
Assert.assertNull(sampleMap.getVCFIndexForSampleAsPath("Sample3")); + Assert.assertNull(sampleMap.getVCFIndexForSample("Sample2")); + Assert.assertNull(sampleMap.getVCFIndexForSampleAsPath("Sample2")); + } + + @DataProvider + public Object[][] badInputsToAddSample() { + return new Object[][] { + { " Sample1", "vcf1" }, + { "Sample1 ", "vcf1" }, + { " Sample1 ", "vcf1" }, + { "", "vcf1" }, + { " ", "vcf1" }, + { null, "vcf1" }, + { "Sample1", null} + }; + } + + @Test(dataProvider = "badInputsToAddSample", expectedExceptions = UserException.BadInput.class) + public void testBadInputToAddSample(final String sampleName, final String vcf) throws URISyntaxException { + final URI vcfURI = vcf != null ? new URI(vcf) : null; + final SampleNameMap sampleMap = new SampleNameMap(); + sampleMap.addSample(sampleName, vcfURI); + } + + @Test(expectedExceptions = UserException.BadInput.class) + public void testAddDuplicateSample() throws URISyntaxException { + final SampleNameMap sampleMap = new SampleNameMap(); + sampleMap.addSample("Sample1", new URI("vcf1")); + sampleMap.addSample("Sample1", new URI("vcf1alt")); + } + + @Test(expectedExceptions = UserException.class) + public void testCheckVcfIsCompressedAndIndexed() { + final File sampleFile = IOUtils.writeTempFile(ORDERED_SAMPLE_MAP, "goodSampleMapping", ".txt"); + final SampleNameMap sampleMap = new SampleNameMap(sampleFile.toPath(), true); + } +}