diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java index 21f4529601f..bd6fad2f6d2 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java @@ -36,7 +36,7 @@ import java.util.Optional; import java.util.function.Function; -import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.createExportConfiguration; +import static org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.createExportConfiguration; import static org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream.BCI_FILE_EXTENSION; /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GATKGenomicsDBUtils.java similarity index 94% rename from src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java rename to src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GATKGenomicsDBUtils.java index f81e825e39e..e0eb251501e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GATKGenomicsDBUtils.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.genomicsdb; import com.googlecode.protobuf.format.JsonFormat; +import htsjdk.samtools.util.FileExtensions; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils; import org.broadinstitute.hellbender.utils.Utils; @@ -12,6 +13,7 @@ import org.genomicsdb.model.GenomicsDBVidMapProto; import java.io.IOException; +import java.nio.file.Path; import java.util.HashMap; import java.util.Map; @@ -28,7 +30,7 @@ * https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api * https://developers.google.com/protocol-buffers/docs/reference/java-generated */ -public class GenomicsDBUtils { +public class GATKGenomicsDBUtils { private static final String SUM = "sum"; private static final String ELEMENT_WISE_SUM = "element_wise_sum"; @@ -338,5 +340,20 @@ public static String genomicsDBApppendPaths(String parentPath, String path) { } } + public static void assertVariantFileIsCompressedAndIndexed(final Path vcfPath) { + assertVariantFileIsCompressedAndIndexed(vcfPath, null); + } + + public static void assertVariantFileIsCompressedAndIndexed(final Path vcfPath, final Path optionalVCFindexPath) { + if (!vcfPath.toString().toLowerCase().endsWith(FileExtensions.COMPRESSED_VCF)) { + throw new UserException("Input variant files must be block compressed vcfs when using " + + GenomicsDBImport.BYPASS_FEATURE_READER + ", but " + vcfPath.toString() + " does not end with " + + "the standard file extension " + FileExtensions.COMPRESSED_VCF); + } + Path indexPath = optionalVCFindexPath != null ? 
+ optionalVCFindexPath : + vcfPath.resolveSibling(vcfPath.getFileName() + FileExtensions.COMPRESSED_VCF_INDEX); + IOUtils.assertFileIsReadable(indexPath); + } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java index f68309565cf..fd7dedaab6d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java @@ -1,12 +1,10 @@ package org.broadinstitute.hellbender.tools.genomicsdb; -import com.google.common.annotations.VisibleForTesting; import com.google.common.util.concurrent.ThreadFactoryBuilder; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalList; import htsjdk.samtools.util.Locatable; -import htsjdk.samtools.util.FileExtensions; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.CloseableTribbleIterator; import htsjdk.tribble.FeatureReader; @@ -69,8 +67,8 @@ import java.util.concurrent.ThreadFactory; import java.util.stream.Collectors; -import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.genomicsDBGetAbsolutePath; -import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.genomicsDBApppendPaths; +import static org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.genomicsDBGetAbsolutePath; +import static org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.genomicsDBApppendPaths; /** * Import single-sample GVCFs into GenomicsDB before joint genotyping. @@ -139,6 +137,22 @@ * sample3 sample3.vcf.gz * * + * The sample name map file may optionally contain a third column with an explicit index path/URI for each VCF: + * + *
+ *  sample1      sample1.vcf.gz      sample1.vcf.gz.tbi
+ *  sample2      sample2.vcf.gz      sample2.vcf.gz.tbi
+ *  sample3      sample3.vcf.gz      sample3.vcf.gz.tbi
+ * </pre>
+ *
+ * It is also possible to specify an explicit index for only a subset of the samples:
+ *
+ * <pre>
+ *  sample1      sample1.vcf.gz
+ *  sample2      sample2.vcf.gz      sample2.vcf.gz.tbi
+ *  sample3      sample3.vcf.gz
+ * </pre>
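A minimal, self-contained sketch (an editorial illustration, not part of this patch) of how a 2- or 3-column sample name map line is interpreted. It mirrors the parsing rules of the new SampleNameMap class introduced later in this diff; the class name SampleMapLineSketch and the sample/file names are hypothetical:

// Illustrative sketch only: one 2-column and one 3-column sample name map line,
// handled the same way the new SampleNameMap parsing does below.
import java.util.Optional;

public class SampleMapLineSketch {
    public static void main(String[] args) {
        final String[] exampleLines = {
                "sample1\tsample1.vcf.gz",                      // index location inferred from the VCF path
                "sample2\tsample2.vcf.gz\tsample2.vcf.gz.tbi"   // explicit index in the third column
        };
        for (final String line : exampleLines) {
            final String[] fields = line.split("\\t", -1);
            if (fields.length != 2 && fields.length != 3) {
                throw new IllegalArgumentException("Expected 2 or 3 tab-separated fields: " + line);
            }
            final String sample = fields[0];        // may contain internal, but not leading/trailing, whitespace
            final String vcf = fields[1].trim();    // leading/trailing whitespace around paths is ignored
            final Optional<String> index = fields.length == 3
                    ? Optional.of(fields[2].trim())
                    : Optional.empty();
            System.out.printf("%s -> vcf=%s, index=%s%n", sample, vcf, index.orElse("(inferred)"));
        }
    }
}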
+ * * Add new samples to an existing genomicsdb workspace. * In the incremental import case, no intervals are specified in the command because the tool will use the same * intervals used in the initial import. Sample map is also supported for incremental import. @@ -409,14 +423,7 @@ public int getDefaultCloudIndexPrefetchBufferSize() { // Intervals from command line (merged if specified) private List intervals; - // Sorted mapping between sample names and corresponding GVCF file name - // - // IMPORTANT: This must be sorted or it will result in sample name swaps in the output database. - // This happens because the callset json is generated independently from the import process - // each imported batch is then sorted, so if we have an unsorted list we'll end up with different global vs batch - // sorting. - // We preemptively sort here so we will have consistent sorting. - private SortedMap sampleNameToVcfPath = new TreeMap<>(); + private SampleNameMap sampleNameMap; // Needed as smartMergeHeaders() returns a set of VCF header lines private Set mergedHeaderLines = null; @@ -511,16 +518,6 @@ private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importe } } - private static void assertVariantFileIsCompressedAndIndexed(final Path path) { - if (!path.toString().toLowerCase().endsWith(FileExtensions.COMPRESSED_VCF)) { - throw new UserException("Input variant files must be block compressed vcfs when using " + - BYPASS_FEATURE_READER + ", but " + path.toString() + " does not end with " + - "the standard file extension " + FileExtensions.COMPRESSED_VCF); - } - Path indexPath = path.resolveSibling(path.getFileName() + FileExtensions.COMPRESSED_VCF_INDEX); - IOUtils.assertFileIsReadable(indexPath); - } - /** * sets the values of mergedHeaderLines, mergedHeaderSequenceDictionary, and sampleNameToVcfPath */ @@ -529,23 +526,20 @@ private void initializeHeaderAndSampleMappings() { if (variantPaths != null && variantPaths.size() > 0) { // -V was specified final List headers = new ArrayList<>(variantPaths.size()); + sampleNameMap = new SampleNameMap(); for (final String variantPathString : variantPaths) { final Path variantPath = IOUtils.getPath(variantPathString); if (bypassFeatureReader) { - assertVariantFileIsCompressedAndIndexed(variantPath); + GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath); } - final VCFHeader header = getHeaderFromPath(variantPath); + final VCFHeader header = getHeaderFromPath(variantPath, null); Utils.validate(header != null, "Null header was found in " + variantPath + "."); assertGVCFHasOnlyOneSample(variantPathString, header); headers.add(header); final String sampleName = header.getGenotypeSamples().get(0); try { - final URI previousPath = sampleNameToVcfPath.put(sampleName, new URI(variantPathString)); - if (previousPath != null) { - throw new UserException("Duplicate sample: " + sampleName + ". 
Sample was found in both " - + variantPath.toUri() + " and " + previousPath + "."); - } + sampleNameMap.addSample(sampleName, new URI(variantPathString)); } catch(final URISyntaxException e) { throw new UserException("Malformed URI "+e.toString(), e); @@ -561,9 +555,14 @@ private void initializeHeaderAndSampleMappings() { //it's VERY IMPORTANT that this map is Sorted according to String's natural ordering, if it is not //the resulting database will have incorrect sample names //see https://github.com/broadinstitute/gatk/issues/3682 for more information - sampleNameToVcfPath = loadSampleNameMapFileInSortedOrder(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader); - final Path firstHeaderPath = IOUtils.getPath(sampleNameToVcfPath.entrySet().iterator().next().getValue().toString()); - final VCFHeader header = getHeaderFromPath(firstHeaderPath); + // The SampleNameMap class guarantees that the samples will be sorted correctly. + sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader); + + final String firstSample = sampleNameMap.getSampleNameToVcfPath().entrySet().iterator().next().getKey(); + final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample); + final Path firstVCFIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(firstSample); + final VCFHeader header = getHeaderFromPath(firstVCFPath, firstVCFIndexPath); + //getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering mergedHeaderLines = new LinkedHashSet(header.getMetaDataInInputOrder()); mergedHeaderSequenceDictionary = header.getSequenceDictionary(); @@ -592,10 +591,17 @@ else if (getIntervalsFromExistingWorkspace){ if ( mergedHeaderSequenceDictionary == null) { throw new UserException("The merged vcf header has no sequence dictionary. Please provide a header that contains a sequence dictionary."); } + + // If any indices were specified in the sample name map file, make sure + // that --bypass-feature-reader wasn't also specified: + if ( sampleNameMap != null && sampleNameMap.indicesSpecified() && bypassFeatureReader ) { + throw new UserException("Indices were specified for some VCFs in the sample name map file, but --" + BYPASS_FEATURE_READER + + " was also specified. Specifying explicit indices is not supported when running with --" + BYPASS_FEATURE_READER); + } } - private VCFHeader getHeaderFromPath(final Path variantPath) { - try(final FeatureReader reader = getReaderFromPath(variantPath)) { + private VCFHeader getHeaderFromPath(final Path variantPath, final Path variantIndexPath) { + try(final FeatureReader reader = getReaderFromPath(variantPath, variantIndexPath)) { return (VCFHeader) reader.getHeader(); } catch (final IOException e) { throw new UserException("Error while reading vcf header from " + variantPath.toUri(), e); @@ -610,85 +616,7 @@ private static void assertGVCFHasOnlyOneSample(final String variantPath, final V } } - /** - * Load a tab delimited new line separated file of sample name to URI mapping: - * this maintains the keys in the same order that they appeared in the file - * - * This tool should only call {@link #loadSampleNameMapFileInSortedOrder(Path)}. - * This non-sorting overload is exposed for testing purposes only. - * - * ex: - * - * Sample1\tpathToSample1.vcf\n - * Sample2\tpathTosample2.vcf\n - * ... - * - * The sample names must be unique. 
- * @param sampleToFileMapPath path to the mapping file - * @return map of sample name to corresponding file, the map will be ordered according to the order in the input file - */ - @VisibleForTesting - static LinkedHashMap loadSampleNameMapFile(final Path sampleToFileMapPath) { - return loadSampleNameMapFile(sampleToFileMapPath, false); - } - - private static LinkedHashMap loadSampleNameMapFile(final Path sampleToFileMapPath, - final boolean checkVcfIsCompressedAndIndexed) { - try { - final List lines = Files.readAllLines(sampleToFileMapPath); - if (lines.isEmpty()) { - throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file"); - } - final LinkedHashMap sampleToFilename = new LinkedHashMap<>(); - for ( final String line : lines) { - final String[] split = line.split("\\t",-1); - if (split.length != 2) { - throw new UserException.BadInput("Expected a file with 2 fields per line in the format\nSample\tFile\n but found line: \"" - + line +"\" with "+split.length+" fields"); - } - if ( !split[0].trim().equals(split[0]) || split[0].trim().isEmpty() - || split[1].trim().isEmpty()) { - throw new UserException.BadInput("Expected a file of format\nSample\tFile\n but found line: '" + line + "'\nValid sample names must be non-empty strings that cannot begin or end with whitespace and valid file names must be non-empty and not all whitespace"); - } - final String sample = split[0]; - final String path = split[1].trim(); - try { - final URI oldPath = sampleToFilename.put(sample, new URI(path)); - if (oldPath != null){ - throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + path + "\n" + oldPath ); - } - if (checkVcfIsCompressedAndIndexed) { - assertVariantFileIsCompressedAndIndexed(IOUtils.getPath(path)); - } - } - catch(final URISyntaxException e) { - throw new UserException("Malformed URI "+e.toString()); - } - } - return sampleToFilename; - } catch (final IOException e) { - throw new UserException.CouldNotReadInputFile(sampleToFileMapPath, "exception while reading sample->filename mapping file", e); - } - } - - /** - * load a tab delimited new line separated file of sample name to URI mapping: - * - * ex: - * Sample1\tpathToSample1.vcf\n - * Sample2\tpathTosample2.vcf\n - * ... - * - * The sample names must be unique. - * @param sampleToFileMapPath path to the mapping file - * @param checkVcfIsCompressedAndIndexed boolean indicating whether to check vcf is compressed and indexed - * @return map of sample name to corresponding file, sorted by sample name - */ - public static SortedMap loadSampleNameMapFileInSortedOrder(final Path sampleToFileMapPath, - final boolean checkVcfIsCompressedAndIndexed){ - return new TreeMap<>(loadSampleNameMapFile(sampleToFileMapPath, checkVcfIsCompressedAndIndexed)); - } /** * write out interval list to file @@ -757,11 +685,11 @@ private Void logMessageOnBatchCompletion(final BatchCompletionCallbackFunctionAr logger.info("Done importing batch " + arg.batchCount + "/" + arg.totalBatchCount); logger.debug("List of samples imported in batch " + arg.batchCount + ":"); int index = 0; - final int sampleCount = sampleNameToVcfPath.size(); + final int sampleCount = sampleNameMap.getNumSamples(); final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? 
sampleCount : batchSize; final int startBatch = (arg.batchCount - 1) * updatedBatchSize; final int stopBatch = arg.batchCount * updatedBatchSize; - for(String key : sampleNameToVcfPath.keySet()) { + for(String key : sampleNameMap.getSampleNamesInSortedOrder()) { index++; if (index <= startBatch || index > stopBatch) { continue; @@ -817,7 +745,7 @@ private List generatePartitionListFromI private List generateIntervalListFromVidMap() { try { GenomicsDBVidMapProto.VidMappingPB vidMapPB = - org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getProtobufVidMappingFromJsonFile(vidMapJSONFile); + GATKGenomicsDBUtils.getProtobufVidMappingFromJsonFile(vidMapJSONFile); List partitions = Arrays.asList(GenomicsDBUtils.listGenomicsDBArrays(workspace)); return partitions.stream().flatMap(partition -> { @@ -863,7 +791,7 @@ private ImportConfig createImportConfig(final int batchSize) { importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation); importConfigurationBuilder.setEnableSharedPosixfsOptimizations(sharedPosixFSOptimizations); ImportConfig importConfig = new ImportConfig(importConfigurationBuilder.build(), validateSampleToReaderMap, true, - batchSize, mergedHeaderLines, sampleNameToVcfPath, bypassFeatureReader ? null : this::createSampleToReaderMap, + batchSize, mergedHeaderLines, sampleNameMap.getSampleNameToVcfPath(), bypassFeatureReader ? null : this::createSampleToReaderMap, doIncrementalImport); importConfig.setOutputCallsetmapJsonFile(callsetMapJSONFile); importConfig.setOutputVidmapJsonFile(vidMapJSONFile); @@ -891,7 +819,7 @@ public void traverse() { // Force the progress meter to update after every batch progressMeter.setRecordsBetweenTimeChecks(1L); - final int sampleCount = sampleNameToVcfPath.size(); + final int sampleCount = sampleNameMap.getNumSamples(); final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? sampleCount : batchSize; final ImportConfig importConfig = createImportConfig(updatedBatchSize); @@ -899,7 +827,7 @@ public void traverse() { try { importer = new GenomicsDBImporter(importConfig); // Modify importer directly from updateImportProtobufVidMapping. 
- org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateImportProtobufVidMapping(importer); + GATKGenomicsDBUtils.updateImportProtobufVidMapping(importer); if (mergeContigsIntoNumPartitions != 0) { if (!doIncrementalImport) { assertIntervalsCoverEntireContigs(importer, intervals); @@ -952,8 +880,9 @@ private SortedMap> getFeatureReadersInPara final String sampleName = sampleNames.get(i); futures.put(sampleName, inputPreloadExecutorService.submit(() -> { final Path variantPath = IOUtils.getPath(sampleNametoPath.get(sampleName).toString()); + final Path variantIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(sampleName); try { - return new InitializedQueryWrapper(getReaderFromPath(variantPath), intervals.get(0)); + return new InitializedQueryWrapper(getReaderFromPath(variantPath, variantIndexPath), intervals.get(0)); } catch (final IOException e) { throw new UserException.CouldNotReadInputFile("Couldn't read file: " + variantPath.toUri(), e); } @@ -980,7 +909,9 @@ private SortedMap> getFeatureReadersSerial final List sampleNames = new ArrayList<>(sampleNameToPath.keySet()); for(int i = lowerSampleIndex; i < sampleNameToPath.size() && i < lowerSampleIndex+batchSize; ++i) { final String sampleName = sampleNames.get(i); - final FeatureReader reader = getReaderFromPath(IOUtils.getPath(sampleNameToPath.get(sampleName).toString())); + final Path variantPath = IOUtils.getPath(sampleNameToPath.get(sampleName).toString()); + final Path variantIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(sampleName); + final FeatureReader reader = getReaderFromPath(variantPath, variantIndexPath); sampleToReaderMap.put(sampleName, reader); } logger.info("Importing batch " + this.batchCount + " with " + sampleToReaderMap.size() + " samples"); @@ -993,10 +924,13 @@ private SortedMap> getFeatureReadersSerial * @return Feature reader * @param variantPath */ - private FeatureReader getReaderFromPath(final Path variantPath) { + private FeatureReader getReaderFromPath(final Path variantPath, final Path variantIndexPath) { + // TODO: we repeatedly convert between URI, Path, and String in this tool. Is this necessary? final String variantURI = variantPath.toAbsolutePath().toUri().toString(); + final String variantIndexURI = variantIndexPath == null ? null : variantIndexPath.toAbsolutePath().toUri().toString(); + try { - final FeatureReader reader = AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), true, + final FeatureReader reader = AbstractFeatureReader.getFeatureReader(variantURI, variantIndexURI, new VCFCodec(), true, BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer), BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer)); @@ -1058,7 +992,7 @@ public VariantContext next() { */ private String overwriteCreateOrCheckWorkspace() { String workspaceDir = genomicsDBGetAbsolutePath(workspace); - // From JavaDoc for GenomicsDBUtils.createTileDBWorkspacevid + // From JavaDoc for GATKGenomicsDBUtils.createTileDBWorkspacevid // returnCode = 0 : OK. If overwriteExistingWorkspace is true and the workspace exists, it is deleted first. 
// returnCode = -1 : path was not a directory // returnCode = -2 : failed to create workspace diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java new file mode 100644 index 00000000000..0448201d850 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMap.java @@ -0,0 +1,274 @@ +package org.broadinstitute.hellbender.tools.genomicsdb; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; + +/** + * A class to hold the mappings of sample names to VCF / VCF index paths. Used by GenomicsDBImport. + * + * This class can be constructed from a textual file containing lines in the format: + * + * Sample\tVCF + * or: + * Sample\tVCF\tIndex + * + * The sample names may have internal whitespace, but not leading/trailing whitespace. + * The VCF and Index URIs may have leading/trailing whitespace, which is ignored. + * + * The third Index column is optional. It is permitted to specify the index for some samples + * and not others. If an index is not specified for a sample, its location is inferred from + * the VCF URI. + * + * It is also possible to construct an empty SampleNameMap using the no-arg constructor, and + * add sample mappings one at a time using addSample(). + */ +public final class SampleNameMap { + // Sorted mapping between sample names and corresponding GVCF file name + // + // IMPORTANT: This must be sorted or it will result in sample name swaps in the output database. + // This happens because the callset json is generated independently from the import process + // each imported batch is then sorted, so if we have an unsorted list we'll end up with different + // global vs batch sorting. + // We preemptively sort here so we will have consistent sorting. + private SortedMap sampleNameToVcfPath; + + // Mapping between sample names and corresponding VCF index path + // + // This Map contains only indices specified explicitly via the sample name map file. + // If an explicit index is not specified for a given sample, it will not have an + // entry in this Map, and the index path will be automatically inferred based on + // the location of the VCF. + // + // The ordering of the entries in this Map does not actually matter, since it's not + // directly exposed, and is used only for individual lookups via getVCFIndexForSample() + private SortedMap sampleNameToVcfIndexPath; + + /** + * Create an empty SampleNameMap. Samples can be added later using addSample() + */ + public SampleNameMap() { + sampleNameToVcfPath = new TreeMap<>(); + sampleNameToVcfIndexPath = new TreeMap<>(); + } + + /** + * Create a SampleNameMap from a textual file containing the sample mappings. The + * lines in this file must be in the format: + * + * Sample\tVCF + * or: + * Sample\tVCF\tIndex + * + * The sample names may have internal whitespace, but not leading/trailing whitespace. + * The VCF and Index URIs may have leading/trailing whitespace, which is ignored. + * + * The third Index column is optional. It is permitted to specify the index for some samples + * and not others. If an index is not specified for a sample, its location is inferred from + * the VCF URI. 
+ * + * @param sampleMapFilePath Path to the file containing the sample name mappings to load + */ + public SampleNameMap(final Path sampleMapFilePath) { + this(sampleMapFilePath, false); + } + + /** + * Create a SampleNameMap from a textual file containing the sample mappings. The + * lines in this file must be in the format: + * + * SampleName1\tVCF + * or: + * SampleName1\tVCF\tIndex + * + * The sample names may have internal whitespace, but not leading/trailing whitespace. + * The VCF and Index URIs may have leading/trailing whitespace, which is ignored. + * + * The third Index column is optional. It is permitted to specify the index for some samples + * and not others. If an index is not specified for a sample, its location is inferred from + * the VCF URI. + * + * @param sampleMapFilePath Path to the file containing the sample name mappings to load + * @param checkVcfIsCompressedAndIndexed If true, check each VCF to make sure it's compressed and indexed + */ + public SampleNameMap(final Path sampleMapFilePath, final boolean checkVcfIsCompressedAndIndexed) { + sampleNameToVcfPath = new TreeMap<>(); + sampleNameToVcfIndexPath = new TreeMap<>(); + + loadSampleNameMapFile(sampleMapFilePath, checkVcfIsCompressedAndIndexed); + } + + private void loadSampleNameMapFile(final Path sampleToFileMapPath, final boolean checkVcfIsCompressedAndIndexed) { + try { + final List lines = Files.readAllLines(sampleToFileMapPath); + if (lines.isEmpty()) { + throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file"); + } + + for (final String line : lines) { + final String[] split = line.split("\\t",-1); + if (split.length != 2 && split.length != 3) { + throw new UserException.BadInput("Sample name map file must have 2 or 3 fields per line in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\nbut found line: \"" + + line +"\" with "+split.length+" fields"); + } + if ( ! sampleNameIsLegal(split[0]) || split[1].trim().isEmpty()) { + throw new UserException.BadInput("Sample name map file must have lines in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\n but found line: '" + line + "'\nValid sample names must be non-empty strings that cannot begin or end with whitespace and valid file names must be non-empty and not all whitespace"); + } + final String sample = split[0]; + final String vcfPath = split[1].trim(); + + String vcfIndexPath = null; + if ( split.length == 3 ) { + vcfIndexPath = split[2].trim(); + + if ( vcfIndexPath.isEmpty() ) { + throw new UserException.BadInput("Found a line in the sample name map file with an empty or all-whitespace value for the index:\n" + "\"" + line + "\""); + } + } + + try { + final URI existingVCFPath = sampleNameToVcfPath.put(sample, new URI(vcfPath)); + if (existingVCFPath != null){ + throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + vcfPath + "\n" + existingVCFPath); + } + + if ( vcfIndexPath != null ) { + final URI existingVCFIndexPath = sampleNameToVcfIndexPath.put(sample, new URI(vcfIndexPath)); + if (existingVCFIndexPath != null) { + throw new UserException.BadInput("Found two indices for the same sample: " + sample + "\n" + vcfIndexPath + "\n" + existingVCFIndexPath); + } + } + + if (checkVcfIsCompressedAndIndexed) { + GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(IOUtils.getPath(vcfPath), vcfIndexPath == null ? 
null : IOUtils.getPath(vcfIndexPath)); + } + } + catch(final URISyntaxException e) { + throw new UserException("Malformed URI: " + e.toString()); + } + } + } catch (final IOException e) { + throw new UserException.CouldNotReadInputFile(sampleToFileMapPath, "exception while reading sample->filename mapping file", e); + } + } + + /** + * Tests whether the sample name is legal. Sample names must be non-empty, and + * may have internal whitespace but not leading/trailing whitespace. + * + * @param sampleName sample name to test + * @return true if sampleName is legal, otherwise false + */ + private boolean sampleNameIsLegal(final String sampleName) { + return sampleName != null && + ! sampleName.trim().isEmpty() && + sampleName.trim().equals(sampleName); + } + + /** + * Add a new sample mapping + * + * @param sampleName name of the sample + * @param vcfPath path to the VCF for the sample + */ + public void addSample(final String sampleName, final URI vcfPath) { + addSample(sampleName, vcfPath, null); + } + + /** + * Add a new sample mapping + * + * @param sampleName name of the sample + * @param vcfPath path to the VCF for the sample (not null) + * @param vcfIndexPath path to the index for the sample (may be null) + */ + public void addSample(final String sampleName, final URI vcfPath, final URI vcfIndexPath) { + if ( ! sampleNameIsLegal(sampleName) ) { + throw new UserException.BadInput("Sample name " + sampleName + " is not legal. Sample names must be non-empty and not contain leading or trailing whitespace"); + } + if ( vcfPath == null ) { + throw new UserException.BadInput("VCF path for sample " + sampleName + " was null"); + } + + final URI previousPath = sampleNameToVcfPath.put(sampleName, vcfPath); + if (previousPath != null) { + throw new UserException.BadInput("Duplicate sample: " + sampleName + ". Sample was found in both " + + vcfPath + " and " + previousPath + "."); + } + + if (vcfIndexPath != null) { + final URI previousIndexPath = sampleNameToVcfIndexPath.put(sampleName, vcfIndexPath); + if (previousIndexPath != null) { + throw new UserException.BadInput("For sample " + sampleName + ", attempted to specify multiple indices: " + vcfIndexPath + " and " + previousIndexPath); + } + } + } + + /** + * @return The full mapping of sample names -> VCF paths, with the sample names in sorted order + */ + public SortedMap getSampleNameToVcfPath() { + return sampleNameToVcfPath; + } + + /** + * @param sample sample name + * @return the VCF associated with that sample name, as a URI + */ + public URI getVCFForSample(final String sample) { + return sampleNameToVcfPath.get(sample); + } + + /** + * @param sample sample name + * @return the VCF associated with that sample name, as a Path + */ + public Path getVCFForSampleAsPath(final String sample) { + final URI vcfURI = sampleNameToVcfPath.get(sample); + return vcfURI == null ? null : IOUtils.getPath(vcfURI.toString()); + } + + /** + * @param sample sample name + * @return the VCF index associated with that sample name, as a URI, or null if no index + */ + public URI getVCFIndexForSample(final String sample) { + return sampleNameToVcfIndexPath.get(sample); + } + + /** + * @param sample sample name + * @return the VCF index associated with that sample name, as a Path, or null if no index + */ + public Path getVCFIndexForSampleAsPath(final String sample) { + final URI vcfIndexURI = sampleNameToVcfIndexPath.get(sample); + return vcfIndexURI == null ? 
null : IOUtils.getPath(vcfIndexURI.toString()); + } + + /** + * @return number of samples in this Map + */ + public int getNumSamples() { + return sampleNameToVcfPath.size(); + } + + /** + * @return a List of the sample names in this Map in sorted order + */ + public List getSampleNamesInSortedOrder() { + return new ArrayList<>(sampleNameToVcfPath.keySet()); + } + + /** + * @return true if an index was specified for at least one sample, otherwise false + */ + public boolean indicesSpecified() { + return ! sampleNameToVcfIndexPath.isEmpty(); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java index 27539d6bef0..17ba65831ea 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java @@ -2,10 +2,13 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.IntervalList; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.CloseableTribbleIterator; import htsjdk.tribble.FeatureReader; +import htsjdk.tribble.index.Index; +import htsjdk.tribble.index.IndexFactory; import htsjdk.tribble.readers.LineIterator; import htsjdk.variant.bcf2.BCF2Codec; import htsjdk.variant.variantcontext.Allele; @@ -24,6 +27,7 @@ import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import java.util.ArrayList; @@ -47,6 +51,7 @@ import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import org.broadinstitute.hellbender.testutils.BaseTest; import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import org.broadinstitute.hellbender.tools.IndexFeatureFile; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; @@ -65,8 +70,11 @@ @Test(groups = {"variantcalling"}) public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTest { private static final String HG_00096 = largeFileTestDir + "gvcfs/HG00096.g.vcf.gz"; + private static final String HG_00096_SAMPLE_NAME = "HG00096"; private static final String HG_00268 = largeFileTestDir + "gvcfs/HG00268.g.vcf.gz"; + private static final String HG_00268_SAMPLE_NAME = "HG00268"; private static final String NA_19625 = largeFileTestDir + "gvcfs/NA19625.g.vcf.gz"; + private static final String NA_19625_SAMPLE_NAME = "NA19625"; //The following 3 files were obtained by running CombineGVCFs on the above 3 files (separately). This introduces spanning //deletions in the files. Hence, these files can be used to test for spanning deletions in the input VCF. 
private static final String HG_00096_after_combine_gvcfs = largeFileTestDir + "gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz"; @@ -879,6 +887,224 @@ private static File getSampleMapFile(final Map mapping){ .collect(Collectors.joining("\n"))); } + @DataProvider + public Object[][] dataForTestExplicitIndicesInSampleNameMap() { + final Map originalVCFsInOrder = new LinkedHashMap<>(); + originalVCFsInOrder.put(HG_00096_SAMPLE_NAME, new File(HG_00096)); + originalVCFsInOrder.put(HG_00268_SAMPLE_NAME, new File(HG_00268)); + originalVCFsInOrder.put(NA_19625_SAMPLE_NAME, new File(NA_19625)); + + final Map originalVCFsOutOfOrder = new LinkedHashMap<>(); + originalVCFsOutOfOrder.put(NA_19625_SAMPLE_NAME, new File(NA_19625)); + originalVCFsOutOfOrder.put(HG_00268_SAMPLE_NAME, new File(HG_00268)); + originalVCFsOutOfOrder.put(HG_00096_SAMPLE_NAME, new File(HG_00096)); + + return new Object[][] { + // All VCFs have explicit indices, samples in order, TABIX index + { originalVCFsInOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), false }, + + // All VCFs have explicit indices, samples in order, TRIBBLE index + { originalVCFsInOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), true }, + + // Some VCFs have explicit indices, samples in order, TABIX index + { originalVCFsInOrder, Arrays.asList(HG_00268_SAMPLE_NAME), false }, + + // Some VCFs have explicit indices, samples in order, TRIBBLE index + { originalVCFsInOrder, Arrays.asList(HG_00268_SAMPLE_NAME), true }, + + // All VCFs have explicit indices, samples out of order, TABIX index + { originalVCFsOutOfOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), false }, + + // All VCFs have explicit indices, samples out of order, TRIBBLE index + { originalVCFsOutOfOrder, Arrays.asList(HG_00096_SAMPLE_NAME, HG_00268_SAMPLE_NAME, NA_19625_SAMPLE_NAME), true }, + + // Some VCFs have explicit indices, samples out of order, TABIX index + { originalVCFsOutOfOrder, Arrays.asList(HG_00268_SAMPLE_NAME), false }, + + // Some VCFs have explicit indices, samples out of order, TRIBBLE index + { originalVCFsOutOfOrder, Arrays.asList(HG_00268_SAMPLE_NAME), true } + }; + } + + // Test that we can handle explicit index files from a sample name map locally. + // The cloud version of this test is separate. 
+ // Note that this test decompresses/reindexes its GVCFs on-the-fly as necessary in order + // to avoid our having to check uncompressed VCFs in to our repo + @Test(dataProvider = "dataForTestExplicitIndicesInSampleNameMap") + public void testExplicitIndicesInSampleNameMap(final Map originalVCFs, final List samplesWithExplicitIndices, final boolean useTribbleIndex) throws IOException { + final String workspace = createTempDir("testExplicitIndicesInSampleNameMap").getAbsolutePath() + "/workspace"; + final File vcfDir = createTempDir("testExplicitIndicesInSampleNameMap_vcfs"); + final File indexDir = createTempDir("testExplicitIndicesInSampleNameMap_indices"); + Assert.assertNotEquals(vcfDir, indexDir, + "testExplicitIndicesInSampleNameMap failed to create separate directories for the vcfs and their indices"); + + final StringBuilder sampleNameMapContents = new StringBuilder(); + + for ( final Map.Entry originalVCFEntry : originalVCFs.entrySet() ) { + final String sampleName = originalVCFEntry.getKey(); + final File originalVCFFile = originalVCFEntry.getValue(); + final boolean createExplicitIndex = samplesWithExplicitIndices.contains(sampleName); + + final Path originalVCFPath = originalVCFFile.toPath(); + final String uncompressedVCFName = originalVCFFile.getName().replaceAll("\\.gz$", ""); + Path vcfDestination = new File(vcfDir, originalVCFFile.getName()).toPath(); + if ( useTribbleIndex ) { + vcfDestination = new File(vcfDir, uncompressedVCFName).toPath(); + IOUtils.gunzip(originalVCFPath.toAbsolutePath().toFile(), vcfDestination.toAbsolutePath().toFile()); + } else { + Files.copy(originalVCFPath, vcfDestination); + } + + final File originalVCFIndexFile = new File(originalVCFFile.getAbsolutePath() + FileExtensions.TABIX_INDEX); + Assert.assertTrue(originalVCFIndexFile.exists()); + final File thisVCFIndexDir = createExplicitIndex ? 
indexDir : vcfDir; + Path vcfIndexDestination = new File(thisVCFIndexDir, originalVCFIndexFile.getName()).toPath(); + if ( useTribbleIndex ) { + vcfIndexDestination = new File(thisVCFIndexDir, uncompressedVCFName + FileExtensions.TRIBBLE_INDEX).toPath(); + final Index inMemoryIndex = IndexFactory.createLinearIndex(vcfDestination, new VCFCodec(), IndexFeatureFile.OPTIMAL_GVCF_INDEX_BIN_SIZE); + inMemoryIndex.write(vcfIndexDestination); + } else { + Files.copy(originalVCFIndexFile.toPath(), vcfIndexDestination); + } + + if ( createExplicitIndex ) { + sampleNameMapContents.append(String.format("%s\t%s\t%s\n", sampleName, vcfDestination.toAbsolutePath().toString(), vcfIndexDestination.toAbsolutePath().toString())); + } else { + sampleNameMapContents.append(String.format("%s\t%s\n", sampleName, vcfDestination.toAbsolutePath().toString())); + } + } + + final File sampleNameMapFile = IOUtils.writeTempFile(sampleNameMapContents.toString(), "testExplicitIndicesInSampleNameMap_samplemap", ".txt"); + + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleNameMapFile.getAbsolutePath()) + .addInterval(INTERVAL.get(0)) + .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); + runCommandLine(args); + + checkJSONFilesAreWritten(workspace); + checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); + checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true); + } + + @DataProvider + public Object[][] dataForTestExplicitIndicesInSampleNameMapInTheCloud() { + final String GVCFS_WITH_INDICES_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcfs_with_indices/"; + final String GVCFS_WITHOUT_INDICES_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcfs_without_indices/"; + final String GVCF_INDICES_ONLY_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcf_indices_only/"; + + final String HG00096_COMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00096.g.vcf.gz"; + final String HG00096_COMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00096.g.vcf.gz"; + final String HG00096_COMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00096.g.vcf.gz.tbi"; + final String HG00096_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00096.g.vcf"; + final String HG00096_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00096.g.vcf"; + final String HG00096_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00096.g.vcf.idx"; + + final String HG00268_COMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00268.g.vcf.gz"; + final String HG00268_COMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00268.g.vcf.gz"; + final String HG00268_COMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00268.g.vcf.gz.tbi"; + final String HG00268_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "HG00268.g.vcf"; + final String HG00268_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "HG00268.g.vcf"; + final String HG00268_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "HG00268.g.vcf.idx"; + + final String NA19625_COMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "NA19625.g.vcf.gz"; + final String NA19625_COMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "NA19625.g.vcf.gz"; + final String NA19625_COMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "NA19625.g.vcf.gz.tbi"; + final String 
NA19625_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "NA19625.g.vcf"; + final String NA19625_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "NA19625.g.vcf"; + final String NA19625_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "NA19625.g.vcf.idx"; + + return new Object[][] { + // All VCFs have explicit indices, samples in order, TABIX index + { + HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_NO_INDEX + "\t" + HG00096_COMPRESSED_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" + + NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_NO_INDEX + "\t" + NA19625_COMPRESSED_INDEX + "\n" + }, + + // All VCFs have explicit indices, samples in order, TRIBBLE index + { + HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_NO_INDEX + "\t" + HG00096_UNCOMPRESSED_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" + + NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_NO_INDEX + "\t" + NA19625_UNCOMPRESSED_INDEX + "\n" + }, + + // Some VCFs have explicit indices, samples in order, TABIX index + { + HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_WITH_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" + + NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_WITH_INDEX + "\n" + }, + + // Some VCFs have explicit indices, samples in order, TRIBBLE index + { + HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_WITH_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" + + NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_WITH_INDEX + "\n" + }, + + // All VCFs have explicit indices, samples out of order, TABIX index + { + NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_NO_INDEX + "\t" + NA19625_COMPRESSED_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" + + HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_NO_INDEX + "\t" + HG00096_COMPRESSED_INDEX + "\n" + }, + + // All VCFs have explicit indices, samples out of order, TRIBBLE index + { + NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_NO_INDEX + "\t" + NA19625_UNCOMPRESSED_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" + + HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_NO_INDEX + "\t" + HG00096_UNCOMPRESSED_INDEX + "\n" + }, + + // Some VCFs have explicit indices, samples out of order, TABIX index + { + NA_19625_SAMPLE_NAME + "\t" + NA19625_COMPRESSED_WITH_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_COMPRESSED_NO_INDEX + "\t" + HG00268_COMPRESSED_INDEX + "\n" + + HG_00096_SAMPLE_NAME + "\t" + HG00096_COMPRESSED_WITH_INDEX + "\n" + }, + + // Some VCFs have explicit indices, samples out of order, TRIBBLE index + { + NA_19625_SAMPLE_NAME + "\t" + NA19625_UNCOMPRESSED_WITH_INDEX + "\n" + + HG_00268_SAMPLE_NAME + "\t" + HG00268_UNCOMPRESSED_NO_INDEX + "\t" + HG00268_UNCOMPRESSED_INDEX + "\n" + + HG_00096_SAMPLE_NAME + "\t" + HG00096_UNCOMPRESSED_WITH_INDEX + "\n" + } + }; + } + + // Test that we can handle explicit index files from a sample name map in the cloud + @Test(dataProvider = "dataForTestExplicitIndicesInSampleNameMapInTheCloud", groups = {"bucket"}) + public void testExplicitIndicesInSampleNameMapInTheCloud(final String sampleNameMapContents) throws IOException { + final String workspace = 
createTempDir("testExplicitIndicesInSampleNameMapInTheCloud").getAbsolutePath() + "/workspace"; + final File sampleNameMapFile = IOUtils.writeTempFile(sampleNameMapContents, "testExplicitIndicesInSampleNameMapInTheCloud_samplemap", ".txt"); + + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleNameMapFile.getAbsolutePath()) + .addInterval(INTERVAL.get(0)) + .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); + runCommandLine(args); + + checkJSONFilesAreWritten(workspace); + checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); + checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true); + } + + // This test guards against the possibility of someone accidentally putting an index file into + // the "gvcfs_without_indices" bucket directory used by testExplicitIndicesInSampleNameMapInTheCloud() + @Test(groups = {"bucket"}) + public void testUnindexedCloudGVCFsAreActuallyUnindexed() throws IOException { + final String GVCFS_WITHOUT_INDICES_BUCKET = "gs://hellbender/test/resources/org/broadinstitute/hellbender/tools/genomicsdb/gvcfs_without_indices/"; + final Path bucketPath = IOUtils.getPath(GVCFS_WITHOUT_INDICES_BUCKET); + + Files.list(bucketPath).forEach(file -> { + Assert.assertFalse(file.endsWith(FileExtensions.TABIX_INDEX), + "Found a TABIX index in bucket " + GVCFS_WITHOUT_INDICES_BUCKET); + Assert.assertFalse(file.endsWith(FileExtensions.TRIBBLE_INDEX), + "Found a Tribble index in bucket " + GVCFS_WITHOUT_INDICES_BUCKET); + }); + } + @DataProvider public static Iterator getRenameCombinations() { final Map noRemapping = new LinkedHashMap<>(); @@ -1099,15 +1325,15 @@ private static FeatureReader getGenomicsDBFeatureReader( .setGenerateArrayNameFromPartitionBounds(true); GenomicsDBVidMapProto.VidMappingPB vidMapPB = null; try { - vidMapPB = org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getProtobufVidMappingFromJsonFile(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME)); + vidMapPB = GATKGenomicsDBUtils.getProtobufVidMappingFromJsonFile(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME)); } catch (final IOException e) { throw new UserException("Could not open vid json file "+GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME, e); } HashMap fieldNameToIndexInVidFieldsList = - org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB); + GATKGenomicsDBUtils.getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB); - vidMapPB = org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, + vidMapPB = GATKGenomicsDBUtils.updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum"); if(vidMapPB != null) { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java deleted file mode 100644 index 0a92a8a8944..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java +++ /dev/null @@ -1,111 +0,0 @@ -package org.broadinstitute.hellbender.tools.genomicsdb; - -import htsjdk.tribble.FeatureReader; -import 
htsjdk.variant.variantcontext.VariantContext; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.utils.io.IOUtils; -import org.broadinstitute.hellbender.GATKBaseTest; -import org.genomicsdb.importer.GenomicsDBImporter; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.*; -import java.util.LinkedHashMap; -import java.util.Map; -import java.net.URI; -import java.net.URISyntaxException; - -public class GenomicsDBImportUnitTest extends GATKBaseTest { - - private static final String ORDERED_SAMPLE_MAP = "Sample1\tfile1\n" + - "Sample2\tfile2\n" + - "Sample3\tfile3"; - - private static final String UNORDERED_SAMPLE_MAP = "Sample3\tfile3\n" + - "Sample2\tfile2\n" + - "Sample1\tfile1\n"; - - @DataProvider - public Object[][] getBadSampleNameMapFiles(){ - return new Object[][]{ - {"Sample1\tsamplePath\n" - +"Sample1\tsamplePath"}, // duplicate sample name - {""}, // empty file - {"Sample1\tSample2\tFile"}, // 3 columns - {"Sample1\t"}, // 1 column - {"Sample1"}, // 1 column no delimiter - {"\tfile"}, // empty first token - {" \tfile"}, // first token only whitespace - {"Sample1\tfile1\t"}, // extra tab - {"Sample1\nfile"}, // newline instead of tab - {"\t"}, // only tab - {"Sample1 file1"}, // 1 column - {" name name\tfile1"}, // preceding whitespace - {"name name \tfile1"}, // trailing whitespace - }; - } - - @Test(dataProvider = "getBadSampleNameMapFiles", expectedExceptions = UserException.BadInput.class) - public void testBadInputFiles(final String text){ - final File sampleFile = IOUtils.writeTempFile(text, "badSampleMapping", ".txt"); - GenomicsDBImport.loadSampleNameMapFile(sampleFile.toPath() ); - } - - @DataProvider - public Object[][] getGoodSampleNameMapFileSyntax(){ - return new Object[][]{ - // Note: none of these files are real, these are just valid files syntactically - {"Sample1\tsamplePath1 \n" - +"Sample2\tsamplePath2", new String[][] {{"Sample1","samplePath1"},{"Sample2","samplePath2"}}}, // normal sample names - {"Sample1 001\tFile", new String[][] {{"Sample1 001","File"}}}, // sample names with whitespace - {"name name\tfile1 ", new String[][] {{"name name","file1"}}}, // trailing whitespace second column - {"name name\t file1 ", new String[][] {{"name name","file1"}}} // leading and trailing whitespace second colum - }; - } - - @Test(dataProvider = "getGoodSampleNameMapFileSyntax") - public void testValidSampleFiles(final String text, final String[][] expectedEntries){ - final File sampleFile = IOUtils.writeTempFile(text, "goodSampleMapping", ".txt"); - final LinkedHashMap outputMap = GenomicsDBImport.loadSampleNameMapFile(sampleFile.toPath()); - Assert.assertEquals(outputMap.size(),expectedEntries.length); - - Arrays.stream(expectedEntries).forEach(s -> { Assert.assertTrue(outputMap.containsKey(s[0])); - Assert.assertEquals(outputMap.get(s[0]).toString(),s[1]);}); - } - - @Test - public void testLoadSampleNameMapFilePreservesOrder(){ - final File sampleFile = IOUtils.writeTempFile(UNORDERED_SAMPLE_MAP, "badSampleMapping", ".txt"); - final LinkedHashMap unsortedMap = GenomicsDBImport.loadSampleNameMapFile(sampleFile.toPath()); - Assert.assertEquals(new ArrayList<>(unsortedMap.keySet()), Arrays.asList("Sample3", "Sample2", "Sample1")); - } - - @DataProvider - public Object[][] getSampleMaps(){ - return new Object[][]{ - {ORDERED_SAMPLE_MAP}, - {UNORDERED_SAMPLE_MAP} - }; - 
     }
-
-    @Test(dataProvider = "getSampleMaps")
-    public void testLoadSampleNameMapFileInSortedOrder(final String sampleMapText){
-        final File sampleFile = IOUtils.writeTempFile(sampleMapText, "sampleMapping", ".txt");
-        final Map<String, URI> expected = new LinkedHashMap<>();
-        try {
-            expected.put("Sample1", new URI("file1"));
-            expected.put("Sample2", new URI("file2"));
-            expected.put("Sample3", new URI("file3"));
-        }
-        catch(URISyntaxException e) {
-            throw new RuntimeException("Malformed URI "+e.toString());
-        }
-        final Map<String, URI> actual = GenomicsDBImport.loadSampleNameMapFileInSortedOrder(sampleFile.toPath(), false);
-        Assert.assertEquals(actual, expected);
-        Assert.assertEquals(actual.keySet().iterator().next(), "Sample1");
-    }
-}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMapUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMapUnitTest.java
new file mode 100644
index 00000000000..94d393bef61
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/SampleNameMapUnitTest.java
@@ -0,0 +1,310 @@
+package org.broadinstitute.hellbender.tools.genomicsdb;
+
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
+import org.broadinstitute.hellbender.GATKBaseTest;
+import org.broadinstitute.hellbender.utils.text.XReadLines;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+public class SampleNameMapUnitTest extends GATKBaseTest {
+
+    private static final String ORDERED_SAMPLE_MAP = "Sample1\tfile1\n" +
+            "Sample2\tfile2\n" +
+            "Sample3\tfile3";
+
+    private static final String UNORDERED_SAMPLE_MAP = "Sample3\tfile3\n" +
+            "Sample2\tfile2\n" +
+            "Sample1\tfile1\n";
+
+    @DataProvider
+    public Object[][] getBadSampleNameMapFiles(){
+        return new Object[][]{
+                {"Sample1\tsamplePath\n" +
+                 "Sample1\tsamplePath"},                  // duplicate sample name
+                {""},                                     // empty file
+                {"Sample1\t"},                            // 1 column
+                {"Sample1"},                              // 1 column no delimiter
+                {"\tfile"},                               // empty first token
+                {" \tfile"},                              // first token only whitespace
+                {"Sample1\tfile1\t"},                     // extra tab
+                {"Sample1\nfile"},                        // newline instead of tab
+                {"\t"},                                   // only tab
+                {"Sample1 file1"},                        // 1 column, internal whitespace
+                {" Sample1\tfile1"},                      // preceding whitespace
+                {"Sample1 \tfile1"},                      // trailing whitespace
+                {"Sample1\tfile1\t"},                     // empty index
+                {"Sample1\tfile1\t "},                    // all-whitespace index
+                {"Sample1\tfile1\tindex1\textraColumn"},  // 4 columns
+                {"Sample1\tfile1\tindex1\t"}              // 4 columns, blank 4th column
+        };
+    }
+
+    @Test(dataProvider = "getBadSampleNameMapFiles", expectedExceptions = UserException.BadInput.class)
+    public void testBadInputFiles(final String text){
+        final File sampleFile = IOUtils.writeTempFile(text, "badSampleMapping", ".txt");
+        final SampleNameMap sampleMap = new SampleNameMap(sampleFile.toPath());
+    }
+
+    @DataProvider
+    public Object[][] getGoodSampleNameMapFiles(){
+        return new Object[][]{
+                // Note: none of these files are real, these are just valid files syntactically
+
+                // normal sample names, no explicit indices
+                {"Sample1\tsamplePath1\n" +
+                 "Sample2\tsamplePath2",
+                 new String[][] {
+                         {"Sample1", "samplePath1"},
+                         {"Sample2", "samplePath2"}}},
+
+                // normal sample names, explicit indices for all files
+                {"Sample1\tsamplePath1\tindexPath1\n" +
+                 "Sample2\tsamplePath2\tindexPath2",
+                 new String[][] {
+                         {"Sample1", "samplePath1", "indexPath1"},
+                         {"Sample2", "samplePath2", "indexPath2"}}},
+
+                // normal sample names, explicit indices for some files but not others
+                {"Sample1\tsamplePath1\n" +
+                 "Sample2\tsamplePath2\tindexPath2",
+                 new String[][] {
+                         {"Sample1", "samplePath1"},
+                         {"Sample2", "samplePath2", "indexPath2"}}},
+
+                // sample names with internal whitespace
+                {"Sample1 001\tFile",
+                 new String[][] {
+                         {"Sample1 001", "File"}}
+                },
+
+                // leading and trailing whitespace second column
+                {"name name\t file1 ",
+                 new String[][] {
+                         {"name name", "file1"}}
+                },
+
+                // leading and trailing whitespace third column
+                {"name name\tfile1\t index1 ",
+                 new String[][] {
+                         {"name name", "file1", "index1"}}
+                },
+
+                // leading and trailing whitespace second and third columns
+                {"name name\t file1 \t index1 ",
+                 new String[][] {
+                         {"name name", "file1", "index1"}}
+                },
+        };
+    }
+
+    @Test(dataProvider = "getGoodSampleNameMapFiles")
+    public void testValidSampleFiles(final String text, final String[][] expectedEntries){
+        final File sampleFile = IOUtils.writeTempFile(text, "goodSampleMapping", ".txt");
+
+        final SampleNameMap sampleMap = new SampleNameMap(sampleFile.toPath());
+        final SortedMap<String, URI> outputMap = sampleMap.getSampleNameToVcfPath();
+
+        Assert.assertEquals(outputMap.size(), expectedEntries.length,
+                "Wrong number of entries in the Map returned by getSampleNameToVcfPath()");
+        Assert.assertEquals(sampleMap.getNumSamples(), expectedEntries.length,
+                "Wrong number of samples returned by getNumSamples()");
+        boolean expectedIndicesFound = false;
+
+        for ( final String[] expected : expectedEntries ) {
+            Assert.assertTrue(outputMap.containsKey(expected[0]));
+
+            Assert.assertEquals(outputMap.get(expected[0]).toString(), expected[1]);
+            Assert.assertEquals(sampleMap.getVCFForSample(expected[0]).toString(), expected[1],
+                    "Wrong VCF returned by getVCFForSample() for sample " + expected[0]);
+            Assert.assertEquals(sampleMap.getVCFForSampleAsPath(expected[0]).toString(), expected[1],
+                    "Wrong VCF returned by getVCFForSampleAsPath() for sample " + expected[0]);
+
+            if ( expected.length == 3 ) {
+                expectedIndicesFound = true;
+
+                Assert.assertNotNull(sampleMap.getVCFIndexForSample(expected[0]),
+                        "No index returned by getVCFIndexForSample() for sample " + expected[0]);
+                Assert.assertNotNull(sampleMap.getVCFIndexForSampleAsPath(expected[0]),
+                        "No index returned by getVCFIndexForSampleAsPath() for sample " + expected[0]);
+
+                Assert.assertEquals(sampleMap.getVCFIndexForSample(expected[0]).toString(), expected[2],
+                        "Wrong index returned by getVCFIndexForSample() for sample " + expected[0]);
+                Assert.assertEquals(sampleMap.getVCFIndexForSampleAsPath(expected[0]).toString(), expected[2],
+                        "Wrong index returned by getVCFIndexForSampleAsPath() for sample " + expected[0]);
+            } else {
+                Assert.assertNull(sampleMap.getVCFIndexForSample(expected[0]),
+                        "Index unexpectedly returned by getVCFIndexForSample() for sample " + expected[0]);
+                Assert.assertNull(sampleMap.getVCFIndexForSampleAsPath(expected[0]),
+                        "Index unexpectedly returned by getVCFIndexForSampleAsPath() for sample " + expected[0]);
+            }
+        }
+
+        Assert.assertEquals(sampleMap.indicesSpecified(), expectedIndicesFound,
+                "Wrong value returned by indicesSpecified()");
+    }
+
+    // Test to ensure that the "unsorted" map used in subsequent tests is actually unsorted,
+    // to guard against future modifications
+    @Test
+    public void testUnorderedSampleMapIsActuallyUnordered() throws IOException {
+        final File sampleFile = IOUtils.writeTempFile(UNORDERED_SAMPLE_MAP, "badSampleMapping", ".txt");
+        final List<String> expectedSampleOrdering = Arrays.asList("Sample3", "Sample2", "Sample1");
+
+        try ( final XReadLines lineReader = new XReadLines(sampleFile) ) {
+            int lineNumber = 0;
+            for ( final String line : lineReader ) {
+                final String sampleFromFile = line.split("\\t", -1)[0];
+                Assert.assertEquals(sampleFromFile, expectedSampleOrdering.get(lineNumber));
+                ++lineNumber;
+            }
+        }
+    }
+
+    @DataProvider
+    public Object[][] getSampleMapsForOrderingTest(){
+        final Map<String, URI> expectedMap = new LinkedHashMap<>();
+        try {
+            expectedMap.put("Sample1", new URI("file1"));
+            expectedMap.put("Sample2", new URI("file2"));
+            expectedMap.put("Sample3", new URI("file3"));
+        }
+        catch(URISyntaxException e) {
+            throw new RuntimeException("Malformed URI " + e.toString());
+        }
+
+        final List<String> expectedSampleOrdering = Arrays.asList("Sample1", "Sample2", "Sample3");
+
+        return new Object[][]{
+                {ORDERED_SAMPLE_MAP, expectedMap, expectedSampleOrdering},
+                {UNORDERED_SAMPLE_MAP, expectedMap, expectedSampleOrdering}
+        };
+    }
+
+    @Test(dataProvider = "getSampleMapsForOrderingTest")
+    public void testSampleOrdering(final String sampleMapText, final Map<String, URI> expectedMap, final List<String> expectedSampleOrdering){
+        final File sampleFile = IOUtils.writeTempFile(sampleMapText, "sampleMapping", ".txt");
+
+        final SampleNameMap sampleMap = new SampleNameMap(sampleFile.toPath());
+        final SortedMap<String, URI> actualMap = sampleMap.getSampleNameToVcfPath();
+
+        Assert.assertEquals(actualMap, expectedMap);
+        Assert.assertEquals(sampleMap.getNumSamples(), expectedSampleOrdering.size(), "Wrong number of samples returned by getNumSamples()");
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().size(), expectedSampleOrdering.size(), "Wrong number of samples returned by getSampleNamesInSortedOrder()");
+
+        final Iterator<String> actualSamplesFromMap = actualMap.keySet().iterator();
+        final Iterator<String> actualSamplesFromGetter = sampleMap.getSampleNamesInSortedOrder().iterator();
+
+        for ( final String expectedSample : expectedSampleOrdering ) {
+            Assert.assertEquals(actualSamplesFromMap.next(), expectedSample,
+                    "Wrong sample found in Map returned by getSampleNameToVcfPath()");
+            Assert.assertEquals(actualSamplesFromGetter.next(), expectedSample,
+                    "Wrong sample found in List returned by getSampleNamesInSortedOrder()");
+        }
+        Assert.assertFalse(actualSamplesFromMap.hasNext());
+        Assert.assertFalse(actualSamplesFromGetter.hasNext());
+    }
+
+    @Test
+    public void testIncrementalAddition() throws URISyntaxException {
+        // Use the no-arg constructor to start with an empty SampleNameMap, then
+        // add samples incrementally:
+        final SampleNameMap sampleMap = new SampleNameMap();
+        Assert.assertEquals(sampleMap.getNumSamples(), 0);
+        Assert.assertTrue(sampleMap.getSampleNamesInSortedOrder().isEmpty());
+        Assert.assertTrue(sampleMap.getSampleNameToVcfPath().isEmpty());
+        Assert.assertFalse(sampleMap.indicesSpecified());
+
+        sampleMap.addSample("Sample3", new URI("file3"));
+        Assert.assertEquals(sampleMap.getNumSamples(), 1);
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().size(), 1);
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(0), "Sample3");
+        Assert.assertEquals(sampleMap.getSampleNameToVcfPath().size(), 1);
+        Assert.assertEquals(new ArrayList<>(sampleMap.getSampleNameToVcfPath().keySet()), Arrays.asList("Sample3"));
+        Assert.assertEquals(sampleMap.getVCFForSample("Sample3").toString(), "file3");
+        Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample3").toString(), "file3");
+        Assert.assertFalse(sampleMap.indicesSpecified());
+
+        sampleMap.addSample("Sample1", new URI("file1"), new URI("index1"));
+        Assert.assertEquals(sampleMap.getNumSamples(), 2);
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().size(), 2);
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(0), "Sample1");
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(1), "Sample3");
+        Assert.assertEquals(sampleMap.getSampleNameToVcfPath().size(), 2);
+        Assert.assertEquals(new ArrayList<>(sampleMap.getSampleNameToVcfPath().keySet()), Arrays.asList("Sample1", "Sample3"));
+        Assert.assertEquals(sampleMap.getVCFForSample("Sample1").toString(), "file1");
+        Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample1").toString(), "file1");
+        Assert.assertEquals(sampleMap.getVCFForSample("Sample3").toString(), "file3");
+        Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample3").toString(), "file3");
+        Assert.assertTrue(sampleMap.indicesSpecified());
+        Assert.assertEquals(sampleMap.getVCFIndexForSample("Sample1").toString(), "index1");
+        Assert.assertEquals(sampleMap.getVCFIndexForSampleAsPath("Sample1").toString(), "index1");
+        Assert.assertNull(sampleMap.getVCFIndexForSample("Sample3"));
+        Assert.assertNull(sampleMap.getVCFIndexForSampleAsPath("Sample3"));
+
+        sampleMap.addSample("Sample2", new URI("file2"));
+        Assert.assertEquals(sampleMap.getNumSamples(), 3);
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().size(), 3);
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(0), "Sample1");
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(1), "Sample2");
+        Assert.assertEquals(sampleMap.getSampleNamesInSortedOrder().get(2), "Sample3");
+        Assert.assertEquals(sampleMap.getSampleNameToVcfPath().size(), 3);
+        Assert.assertEquals(new ArrayList<>(sampleMap.getSampleNameToVcfPath().keySet()), Arrays.asList("Sample1", "Sample2", "Sample3"));
+        Assert.assertEquals(sampleMap.getVCFForSample("Sample1").toString(), "file1");
+        Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample1").toString(), "file1");
+        Assert.assertEquals(sampleMap.getVCFForSample("Sample2").toString(), "file2");
+        Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample2").toString(), "file2");
+        Assert.assertEquals(sampleMap.getVCFForSample("Sample3").toString(), "file3");
+        Assert.assertEquals(sampleMap.getVCFForSampleAsPath("Sample3").toString(), "file3");
+        Assert.assertTrue(sampleMap.indicesSpecified());
+        Assert.assertEquals(sampleMap.getVCFIndexForSample("Sample1").toString(), "index1");
+        Assert.assertEquals(sampleMap.getVCFIndexForSampleAsPath("Sample1").toString(), "index1");
+        Assert.assertNull(sampleMap.getVCFIndexForSample("Sample3"));
+        Assert.assertNull(sampleMap.getVCFIndexForSampleAsPath("Sample3"));
+        Assert.assertNull(sampleMap.getVCFIndexForSample("Sample2"));
+        Assert.assertNull(sampleMap.getVCFIndexForSampleAsPath("Sample2"));
+    }
+
+    @DataProvider
+    public Object[][] badInputsToAddSample() {
+        return new Object[][] {
+                { " Sample1", "vcf1" },
+                { "Sample1 ", "vcf1" },
+                { " Sample1 ", "vcf1" },
+                { "", "vcf1" },
+                { " ", "vcf1" },
+                { null, "vcf1" },
+                { "Sample1", null }
+        };
+    }
+
+    @Test(dataProvider = "badInputsToAddSample", expectedExceptions = UserException.BadInput.class)
+    public void testBadInputToAddSample(final String sampleName, final String vcf) throws URISyntaxException {
+        final URI vcfURI = vcf != null ? new URI(vcf) : null;
+        final SampleNameMap sampleMap = new SampleNameMap();
+        sampleMap.addSample(sampleName, vcfURI);
+    }
+
+    @Test(expectedExceptions = UserException.BadInput.class)
+    public void testAddDuplicateSample() throws URISyntaxException {
+        final SampleNameMap sampleMap = new SampleNameMap();
+        sampleMap.addSample("Sample1", new URI("vcf1"));
+        sampleMap.addSample("Sample1", new URI("vcf1alt"));
+    }
+
+    @Test(expectedExceptions = UserException.class)
+    public void testCheckVcfIsCompressedAndIndexed() {
+        final File sampleFile = IOUtils.writeTempFile(ORDERED_SAMPLE_MAP, "goodSampleMapping", ".txt");
+        final SampleNameMap sampleMap = new SampleNameMap(sampleFile.toPath(), true);
+    }
+}