Skip to content

Commit

Permalink
Adding base level comparison modes for CompareReferences tool (#7987)
Browse files Browse the repository at this point in the history
CompareReferences now has the ability to do base-level comparisons between pairs of differing reference sequences.

Running with --base-comparison FULL_ALIGNMENT uses MUMmer on the differing reference sequences to produce a VCF with SNPs and Indels. However, this mode ignores IUPAC / N bases.

Running with --base-comparison FIND_SNPS_ONLY finds single-base differences between differing reference sequences of the same length. This mode can handle IUPAC / N bases correctly.

As part of this change, GATK now packages a distribution of MUMmer for x86_64 Mac and Linux, which can be invoked from within the GATK using the new MummerExecutor class.

Comprehensive unit and integration tests.

Co-authored-by: David Roazen <[email protected]>
  • Loading branch information
orlicohen and droazen committed Aug 27, 2022
1 parent c28e244 commit 993c7f1
Show file tree
Hide file tree
Showing 43 changed files with 5,903 additions and 20 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@ public ReferencePair(ReferenceSequenceTable table, GATKPath reference1, GATKPath
analysis = EnumSet.of(Status.EXACT_MATCH);
}

public GATKPath getRef1() {
return ref1;
}

public GATKPath getRef2() {
return ref2;
}

/**
* Given a Status, add it to the ReferencePair's analysis
*
Expand Down Expand Up @@ -76,12 +84,6 @@ public String getRef1AsString(){
public String getRef2AsString(){
return ReferenceSequenceTable.getReferenceColumnName(ref2);
}
public GATKPath getRef1(){
return ref1;
}
public GATKPath getRef2() {
return ref2;
}

/**
* Displays a ReferencePair's status set
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ public final List<ReferencePair> generateReferencePairs(){
}

/**
* Generate ReferencePairs for comaprison of all references present in the table against a provided dictionary.
* Generate ReferencePairs for comparison of all references present in the table against a provided dictionary.
*
* @return the list of ReferencePairs
*/
Expand All @@ -259,7 +259,7 @@ public final List<ReferencePair> generateReferencePairsAgainstDictionary(GATKPat
return referencePairs;
}

/** Create list ReferencePair for pairwise comparison of references in the table and analyze each pair.
/** Create list of ReferencePair for pairwise comparison of references in the table and analyze each pair.
*
* @return list of ReferencePairs in the table with updated status sets
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ public static boolean runningOnPPCArchitecture() {
return runningOnArchitecture("ppc64le");
}

/**
* @return True if we're running on a 64-bit x86 platform, otherwise false
*/
public static boolean runningOn64BitX86Architecture() {
return runningOnArchitecture("x86_64") || runningOnArchitecture("amd64");
}

/**
* @param architecture Architecture to look for
* @return true if we're running on the specified architecture, otherwise false
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package org.broadinstitute.hellbender.utils.alignment;

import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.lang3.SystemUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.NativeUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.io.Resource;
import org.broadinstitute.hellbender.utils.runtime.ProcessController;
import org.broadinstitute.hellbender.utils.runtime.ProcessOutput;
import org.broadinstitute.hellbender.utils.runtime.ProcessSettings;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.*;

/**
* Class for executing MUMmer alignment pipeline to detect SNPs and INDELs in mismatching sequences.
* The MUMmer pipeline runs nucmer, delta-filter, and show-snps programs, sequentially.
* Used in CompareReferences tool when running in FULL_ALIGNMENT mode.
*/
public final class MummerExecutor {

public static final String MUMMER_X86_64_MAC_BINARIES_ZIPFILE = "MUMmer-4.0.0rc1_mac_x64_64.zip";
public static final String MUMMER_X86_64_LINUX_BINARIES_ZIPFILE = "MUMmer-4.0.0rc1_linux_x64_64.zip";
// TODO: Need to recompile MUMmer 4.0.0rc1 for native M1 architecture as well
// TODO: (however the Mac Intel build should be runnable on M1 platforms with x86 emulation)

private static final Logger logger = LogManager.getLogger(MummerExecutor.class);
private File mummerExecutableDirectory;

/**
* Returns a MummerExecutor pointing to the provided directory
* @param mummerExecutableDirectory File representing the location of the MUMmer executables
*/
public MummerExecutor(File mummerExecutableDirectory){
this.mummerExecutableDirectory = mummerExecutableDirectory;
}

/**
* Returns a MummerExecutor pointing to the unzipped MUMmer executables packaged in GATK
*/
public MummerExecutor(){
final Resource mummerDistribution = new Resource(selectCorrectMummerDistributionForPlatform(), getClass());
mummerExecutableDirectory = prepareMUMmerExecutionDirectory(mummerDistribution);
}

private String selectCorrectMummerDistributionForPlatform() {
if ( NativeUtils.runningOnMac() && NativeUtils.runningOn64BitX86Architecture() ) {
logger.info("Using the x86_64 Mac OS build of MUMmer");
return MUMMER_X86_64_MAC_BINARIES_ZIPFILE;
}
else if ( NativeUtils.runningOnLinux() && NativeUtils.runningOn64BitX86Architecture() ) {
logger.info("Using the x86_64 Linux build of MUMmer");
return MUMMER_X86_64_LINUX_BINARIES_ZIPFILE;
}
else {
throw new UserException("Unable to run MUMmer aligner: GATK does not contain a MUMmer distribution for system architecture " +
SystemUtils.OS_ARCH + " on operating system " + SystemUtils.OS_NAME);
}
}

/**
* Getter method to access the location of the MUMmer executables.
* @return File representing the directory location of the MUMmer executables
*/
public File getMummerExecutableDirectory() {
return mummerExecutableDirectory;
}

/**
* Execute a full alignment of a sequence across a given pair of references to get a File containing the SNPs and INDELs.
* Begins by running nucmer to get a delta file, which is passed to delta-filter. The result of delta filter is a
* delta file filtered to only output alignments of length >= 1; this file is passed to show-snps, which displays SNPs
* and INDELs in a tabular format.
*
* @param fasta1 first reference
* @param fasta2 second reference
* @param outputDirectory directory to output final snps file
* @return the final snps File
*/
public File executeMummer(File fasta1, File fasta2, File outputDirectory){

// NUCMER
logger.debug("Running nucmer.");
File nucmerTempDirectory = IOUtils.createTempDir("nucmerTempDir");
File deltaFile = new File(nucmerTempDirectory, "deltaFile"); // delta file for nucmer output --> input to delta-filter
String[] nucmerArgs = {mummerExecutableDirectory.getAbsolutePath() + "/nucmer", "--mum", "-p", deltaFile.getAbsolutePath(), fasta1.getAbsolutePath(), fasta2.getAbsolutePath()};
ProcessOutput nucmer = runShellCommand(nucmerArgs, null, null,false);

// DELTA-FILTER
logger.debug("Running delta-filter.");
File deltaFilterOutput = IOUtils.createTempFile("deltaFilterOutput", ".delta"); // file for delta filter output --> input to show-snps
String[] deltaFilterArgs = {mummerExecutableDirectory.getAbsolutePath() + "/delta-filter", "-1", deltaFile.getAbsolutePath() + ".delta"};
ProcessOutput deltaFilter = runShellCommand(deltaFilterArgs, null, deltaFilterOutput, false);

// SHOW-SNPS
logger.debug("Running show-snps.");
File showSNPSOutput = new File(outputDirectory, "snps_output.snps");
String[] showSNPsArgs = {mummerExecutableDirectory.getAbsolutePath() + "/show-snps", "-rlTH", deltaFilterOutput.getAbsolutePath()};
ProcessOutput showSNPs = runShellCommand(showSNPsArgs, null, showSNPSOutput, false);

return showSNPSOutput;
}

/**
* Method to run shell commands from GATK.
*
* @param command String[] containing the commandline for execution
* @param environment a Map of environment variables to their values; may be left null if no environment variables needed
* @param stdoutCaptureFile File for output
* @param printStdout boolean to trigger displaying to stdout
* @return the ProcessOutput of the run
*/
public static ProcessOutput runShellCommand(String[] command, Map<String, String> environment, File stdoutCaptureFile, boolean printStdout){
ProcessController processController = ProcessController.getThreadLocal();
final ProcessSettings prs = new ProcessSettings(command);
if(printStdout){
prs.getStderrSettings().printStandard(true);
prs.getStdoutSettings().printStandard(true);
}
if(stdoutCaptureFile != null){
prs.getStdoutSettings().setOutputFile(stdoutCaptureFile);
}
prs.setEnvironment(environment);
final ProcessOutput output = processController.exec(prs);

if(output.getExitValue() != 0){
throw new UserException("Error running " + command[0] + ". Exit with code " + output.getExitValue());
}

return output;
}

// method to unzip and locate the MUMmer binaries packaged within GATK
@VisibleForTesting
static File prepareMUMmerExecutionDirectory(final Resource mummerZipFile){
try{
File tempMUMmerZipFile = IOUtils.writeTempResource(mummerZipFile);
File mummerExecutionDirectory = IOUtils.createTempDir("MUMmerExecutionDirectory");
IOUtils.unzipToFolder(Paths.get(tempMUMmerZipFile.getAbsolutePath()), Paths.get(mummerExecutionDirectory.getAbsolutePath()));
for(File file : mummerExecutionDirectory.listFiles()){
Path path = Paths.get(file.getAbsolutePath());
Files.setPosixFilePermissions(path, PosixFilePermissions.fromString("rwxr-xr-x"));
}
return mummerExecutionDirectory;
}
catch(IOException e){
throw new UserException("Unable to unzip MUMmer binaries.", e);
}
}
}
Loading

0 comments on commit 993c7f1

Please sign in to comment.