Allow GenomicsDBImport to connect to az:// files without interference
* GATK's lack of support for az:// URIs means that although GenomicsDB can
  natively read them, parts of the Java code crash when they are passed through
* Add --avoid-nio and --header parameters; these disable all of the Java
  code's interaction with the az:// links and simply pass them through to
  GenomicsDB. This disables some safeguards but allows operating on files
  in Azure (see the sketch after this list)
* Move GenomicsDB to the released version 1.5.1 for improved Azure support

* There are no direct tests on Azure since we do not yet have any
  infrastructure to generate the necessary tokens
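
A minimal sketch of how the new flags combine (mirroring the integration
test added in this commit; the workspace path, sample-map, and header file
names below are hypothetical):

    final ArgumentsBuilder args = ArgumentsBuilder.create()
        // the workspace is created by the tool; any writable path works
        .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, "/tmp/genomicsdb-workspace")
        .addInterval("chr20")
        // --avoid-nio: the Java code never opens the inputs itself;
        // az:// URIs are handed straight to GenomicsDB
        .addFlag(GenomicsDBImport.AVOID_NIO)
        // each sample-map line is sample<TAB>az://<...>
        .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, "azure.sample_map")
        // --header: a locally readable VCF header stands in for the
        // az:// headers the Java code can no longer inspect
        .add(GenomicsDBImport.VCF_HEADER_OVERRIDE, "local_header.vcf");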

---------

Co-authored-by: Nalini Ganapati <[email protected]>
3 people committed Dec 8, 2023
1 parent e2c5fab commit de45517
Showing 9 changed files with 345 additions and 80 deletions.
4 changes: 2 additions & 2 deletions build.gradle
@@ -63,7 +63,7 @@ final barclayVersion = System.getProperty('barclay.version','5.0.0')
final sparkVersion = System.getProperty('spark.version', '3.3.1')
final hadoopVersion = System.getProperty('hadoop.version', '3.3.6')
final disqVersion = System.getProperty('disq.version','0.3.8')
- final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.0')
+ final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.1')
final bigQueryVersion = System.getProperty('bigQuery.version', '2.35.0')
final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.47.0')
final guavaVersion = System.getProperty('guava.version', '32.1.3-jre')
@@ -976,7 +976,7 @@ ossIndexAudit {
outputFormat = 'DEFAULT' // Optional, other values are: 'DEPENDENCY_GRAPH' prints dependency graph showing direct/transitive dependencies, 'JSON_CYCLONE_DX_1_4' prints a CycloneDX 1.4 SBOM in JSON format.
showAll = false // if true prints all dependencies. By default is false, meaning only dependencies with vulnerabilities will be printed.
printBanner = true // if true will print ASCII text banner. By default is true.

// ossIndexAudit can be configured to exclude vulnerabilities from matching
// excludeVulnerabilityIds = ['39d74cc8-457a-4e57-89ef-a258420138c5'] // list containing ids of vulnerabilities to be ignored
// excludeCoordinates = ['commons-fileupload:commons-fileupload:1.3'] // list containing coordinate of components which if vulnerable should be ignored

Large diffs are not rendered by default.

src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java
@@ -101,7 +101,7 @@ public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTes
//This file was obtained from combined.gatk3.7.g.vcf.gz by dropping all the samples
private static final String COMBINED_SITES_ONLY = largeFileTestDir + "gvcfs/combined.gatk3.7_sites_only.g.vcf.gz";
private static final String INTERVAL_PICARD_STYLE_EXPECTED = toolsTestDir + "GenomicsDBImport/interval_expected.interval_list";
- private static final String MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS_PICARD_STYLE_EXPECTED =
+ private static final String MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS_PICARD_STYLE_EXPECTED =
toolsTestDir + "GenomicsDBImport/multiple_non_adjacent_intervals_combine_gvcfs_expected.interval_list";
private static final String MERGED_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED =
toolsTestDir + "GenomicsDBImport/chr20_chr21_merged_contigs_expected.interval_list";
@@ -383,6 +383,74 @@ public void testGenomicsDbImportThrowsOnMnp() throws IOException {
}
}

@DataProvider
public Object[][] getInvalidArgsForAvoidNio(){
final ArgumentsBuilder baseArgs = ArgumentsBuilder.create()
.add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, createTempFile())
.addInterval("fake")
.addFlag(GenomicsDBImport.AVOID_NIO);
return new Object[][]{
{baseArgs, CommandLineException.MissingArgument.class}, //no input
{baseArgs.copy()
.addVCF("fake.vcf"), CommandLineException.class
}, //not allowed with variant, we should have some sort of mutex exception...
{baseArgs.copy()
.add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, "fake.samplenames"), CommandLineException.MissingArgument.class
}, //missing header
{baseArgs.copy()
.add(GenomicsDBImport.VCF_HEADER_OVERRIDE, "fake.vcf"), CommandLineException.MissingArgument.class
}, //missing input
{baseArgs.copy()
.add(GenomicsDBImport.VCF_HEADER_OVERRIDE, "fake.vcf")
.addVCF("fake.vcf"), CommandLineException.class // can't use with -V
}
};
}

@Test(dataProvider = "getInvalidArgsForAvoidNio")
public void testInvalidArgumentCombinationsWithAvoidNio(ArgumentsBuilder args, Class<? extends Exception> expectedException){
Assert.assertThrows(expectedException, () -> runCommandLine(args));
}

/*
* this is a test that can be run locally if you enable it and fill in the SAS token with one from
* https://app.terra.bio/#workspaces/axin-pipeline-testing-20230927/gatk-azure-testing
*
* it's basically an example of how to run the tool on Azure
*
* note that the http url for Azure files looks like this:
*
* https://<bucket_name>.blob.core.windows.net/<user_name>/<filepath>?<sas token>
* the SAS token generally includes the '?'
*
* to restructure it into an az:// link you move the username:
* az://<user_name>@<bucket_name>.blob.core.windows.net/<filepath>
*
*/
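// Illustrative sketch only (not part of this commit): the https -> az://
// restructuring described in the comment above, assuming the first path
// segment is the user name and the trailing query string is the SAS token.
// The SAS token is not embedded in the az:// URI; it is supplied separately,
// e.g. via the AZURE_STORAGE_SAS_TOKEN environment variable used below.
private static String restructureToAzUri(final String httpsUrl) {
    final java.net.URI uri = java.net.URI.create(httpsUrl);
    final String[] pathParts = uri.getPath().split("/", 3); // ["", <user_name>, <filepath>]
    return String.format("az://%s@%s/%s", pathParts[1], uri.getHost(), pathParts[2]);
}
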
@Test(enabled = false, groups={"cloud","azure"})
public void testImportFromAzure(){

final String SAS_TOKEN="put a sas token in me";

final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
final String sample = "NA19625";
final String azLocation = "az://lzb25a77f5eadb0fa72a2ae7.blob.core.windows.net/sc-62528cd7-3299-4440-8c17-10f458e589d3/NA19625.g.vcf.gz";
final String sampleMapText = String.format("%s\t%s\n", sample, azLocation);
final File sampleMappingFile = IOUtils.writeTempFile(sampleMapText, "sampleMapping", ".txt");

final ArgumentsBuilder args = ArgumentsBuilder.create()
.add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace)
.addInterval("chr20")
.addFlag(GenomicsDBImport.AVOID_NIO)
.add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleMappingFile)
.addFlag(GenomicsDBImport.BYPASS_FEATURE_READER)
.add(GenomicsDBImport.VCF_HEADER_OVERRIDE, GENOMICSDB_TEST_DIR + "azureHeader.vcf");
Map<String, String> environment = new HashMap<>(System.getenv());
final String sasTokenEnvVariable = "AZURE_STORAGE_SAS_TOKEN";
environment.put(sasTokenEnvVariable, SAS_TOKEN);
runToolInNewJVM(GenomicsDBImport.class.getSimpleName(), args, environment);
}

private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals,
final String expectedCombinedVCF,
final String referenceFile) throws IOException {
@@ -408,7 +476,7 @@ private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, f
final boolean testAll,
final boolean produceGTField,
final boolean sitesOnlyQuery) throws IOException {
- testGenomicsDBImporterWithGenotypes(vcfInputs, intervals, expectedCombinedVCF, referenceFile, testAll, produceGTField,
+ testGenomicsDBImporterWithGenotypes(vcfInputs, intervals, expectedCombinedVCF, referenceFile, testAll, produceGTField,
sitesOnlyQuery, false);
}

@@ -456,11 +524,11 @@ private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, fin

private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals,
final String referenceFile, final String[] CombineGVCFArgs,
- final int numVCFReaderThreadsInImporter, final int chrsToPartitions,
+ final int numVCFReaderThreadsInImporter, final int chrsToPartitions,
final boolean useNativeReader) throws IOException {
final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";

- writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, numVCFReaderThreadsInImporter, false, false, false,
+ writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, numVCFReaderThreadsInImporter, false, false, false,
chrsToPartitions, useNativeReader);
checkJSONFilesAreWritten(workspace);
for(SimpleInterval currInterval : intervals) {
@@ -504,7 +572,7 @@ public void testGenomicsDBAlleleSpecificAnnotationsInTheMiddleOfSpanningDeletion

@Test
public void testGenomicsDBNoRemapMissingToNonRef() throws IOException {
- testGenomicsDBAgainstCombineGVCFs(Arrays.asList(COMBINEGVCFS_TEST_DIR+"NA12878.AS.NON_REF_remap_check.chr20snippet.g.vcf",
+ testGenomicsDBAgainstCombineGVCFs(Arrays.asList(COMBINEGVCFS_TEST_DIR+"NA12878.AS.NON_REF_remap_check.chr20snippet.g.vcf",
COMBINEGVCFS_TEST_DIR+"NA12892.AS.chr20snippet.g.vcf"),
new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("20", 10433313, 10700000))),
b37_reference_20_21,
@@ -671,14 +739,14 @@ private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleIn
}

private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace,
- final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads,
+ final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads,
final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental) {
- writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, useBufferSize, bufferSizePerSample, threads, mergeIntervals,
+ writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, useBufferSize, bufferSizePerSample, threads, mergeIntervals,
overwriteWorkspace, incremental, 0, false);
}

private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace,
- final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads,
+ final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads,
final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental,
final int chrsToPartitions, final boolean useNativeReader) {
final ArgumentsBuilder args = new ArgumentsBuilder();
@@ -1013,7 +1081,7 @@ public Object[][] dataForTestExplicitIndicesInSampleNameMapInTheCloud() {
final String NA19625_UNCOMPRESSED_WITH_INDEX = GVCFS_WITH_INDICES_BUCKET + "NA19625.g.vcf";
final String NA19625_UNCOMPRESSED_NO_INDEX = GVCFS_WITHOUT_INDICES_BUCKET + "NA19625.g.vcf";
final String NA19625_UNCOMPRESSED_INDEX = GVCF_INDICES_ONLY_BUCKET + "NA19625.g.vcf.idx";

return new Object[][] {
// All VCFs have explicit indices, samples in order, TABIX index
{
@@ -1371,7 +1439,7 @@ public void testIncrementalMustHaveExistingWorkspace() {
writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace + "workspace2", 0, false, 0, 1, false, false, true);
}

- private void testIncrementalImport(final int stepSize, final List<SimpleInterval> intervals, final String workspace,
+ private void testIncrementalImport(final int stepSize, final List<SimpleInterval> intervals, final String workspace,
final int batchSize, final boolean produceGTField, final boolean useVCFCodec, final String expected,
final int chrsToPartitions, final boolean useNativeReader) throws IOException {
testIncrementalImport(stepSize, intervals, workspace, batchSize, produceGTField, useVCFCodec, expected,
@@ -1384,7 +1452,7 @@ private void testIncrementalImport(final int stepSize, final List<SimpleInterval
throws IOException {
for(int i=0; i<LOCAL_GVCFS.size(); i+=stepSize) {
int upper = Math.min(i+stepSize, LOCAL_GVCFS.size());
- writeToGenomicsDB(LOCAL_GVCFS.subList(i, upper), intervals, workspace, batchSize, false, 0, 1, false, false, i!=0,
+ writeToGenomicsDB(LOCAL_GVCFS.subList(i, upper), intervals, workspace, batchSize, false, 0, 1, false, false, i!=0,
chrsToPartitions, (i == 0 && useNativeReaderInitial) || (i > 0 && useNativeReader));
checkJSONFilesAreWritten(workspace);
}
@@ -1434,7 +1502,7 @@ public void testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervalsNative

@Test(expectedExceptions = {UserException.class}, expectedExceptionsMessageRegExp=".*must be block compressed.*")
public void testGenomicsDBImportNativeReaderNoCompressedVcf() throws IOException {
- testGenomicsDBImporterWithGenotypes(Arrays.asList(NA_12878_PHASED), MULTIPLE_INTERVALS, NA_12878_PHASED, b37_reference_20_21,
+ testGenomicsDBImporterWithGenotypes(Arrays.asList(NA_12878_PHASED), MULTIPLE_INTERVALS, NA_12878_PHASED, b37_reference_20_21,
false, true, false, true);
}

@@ -1448,14 +1516,14 @@ public void testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervalsMergeC
@Test
public void testGenomicsDBIncrementalAndBatchSize2() throws IOException {
final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace";
- testIncrementalImport(2, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 2, true, false,
+ testIncrementalImport(2, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 2, true, false,
COMBINED_WITH_GENOTYPES, 0, false);
}

@Test
public void testGenomicsDBMultipleIncrementalImports() throws IOException {
final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace";
- testIncrementalImport(1, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 2, true, true,
+ testIncrementalImport(1, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 2, true, true,
COMBINED_WITH_GENOTYPES, 0, false);
}

src/test/java/org/broadinstitute/hellbender/tools/spark/PileupSparkIntegrationTest.java
@@ -25,15 +25,15 @@ public Object[][] shuffleParameters() {
return new Object[][] { { false }, { true } };
}

- private File createTempFile() throws IOException {
+ private File createAndDeleteTempFile() {
final File out = IOUtils.createTempFile("out", ".txt");
out.delete();
return out;
}

@Test(dataProvider = "shuffle")
public void testSimplePileup(boolean useShuffle) throws Exception {
- final File out = createTempFile();
+ final File out = createAndDeleteTempFile();
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addRaw("--input");
args.addRaw(NA12878_20_21_WGS_bam);
@@ -53,7 +53,7 @@ public void testSimplePileup(boolean useShuffle) throws Exception {

@Test(dataProvider = "shuffle")
public void testVerbosePileup(boolean useShuffle) throws Exception {
- final File out = createTempFile();
+ final File out = createAndDeleteTempFile();
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addRaw("--input");
args.addRaw(NA12878_20_21_WGS_bam);
@@ -74,7 +74,7 @@ public void testVerbosePileup(boolean useShuffle) throws Exception {

@Test(dataProvider = "shuffle")
public void testFeaturesPileup(boolean useShuffle) throws Exception {
- final File out = createTempFile();
+ final File out = createAndDeleteTempFile();
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addRaw("--input");
args.addRaw(NA12878_20_21_WGS_bam);
@@ -95,7 +95,7 @@ public void testFeaturesPileup(boolean useShuffle) throws Exception {

@Test(dataProvider = "shuffle")
public void testInsertLengthPileup(boolean useShuffle) throws Exception {
- final File out = createTempFile();
+ final File out = createAndDeleteTempFile();
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addRaw("--input");
args.addRaw(NA12878_20_21_WGS_bam);
@@ -128,7 +128,7 @@ public void testFeaturesPileupHdfs(boolean useShuffle) throws Exception {
cluster.getFileSystem().copyFromLocalFile(new Path(dbsnp_138_b37_20_21_vcf), vcfPath);
cluster.getFileSystem().copyFromLocalFile(new Path(dbsnp_138_b37_20_21_vcf + ".idx"), idxPath);

- final File out = createTempFile();
+ final File out = createAndDeleteTempFile();

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addRaw("--input");
args.addRaw(NA12878_20_21_WGS_bam);