Skip to content

Commit

Permalink
Merge pull request #1 from nadegeguiglielmoni/master
Browse files Browse the repository at this point in the history
+ docopt, less required parameters
  • Loading branch information
AntoineHo committed Nov 12, 2020
2 parents d80f2e2 + 33ab33e commit 660b770
Show file tree
Hide file tree
Showing 13 changed files with 850 additions and 510 deletions.
672 changes: 162 additions & 510 deletions Hap.py

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,22 @@ Easy haploidy and size completeness estimation.
## 1. General
This tool helps assess the haploidy and proximity to size completeness.

HapPy computes two values:
- Area Under Curve (AUC) ratio: the ratio of the AUC of the diploid peak and the AUC of the haploid peak
- Total Size Score (TSS): the estimated haploid size completeness

For more information, see:
**Overcoming uncollapsed haplotypes in long-read assemblies of non-model organisms**,
Nadège Guiglielmoni, Antoine Houtain, Alessandro Derzelle, Karine van Doninck, Jean-François Flot,
bioRxiv 2020, doi: https://doi.org/10.1101/2020.03.16.993428

### Requirements:

- sambamba
- scipy
- pandas
- numpy

```
$ python HapPy/Hap.py -h
usage: Hap.py [-h] {depth,estimate} ...
Expand Down Expand Up @@ -59,3 +75,14 @@ optional arguments:
-mp MIN_PEAK, --min-peak MIN_PEAK <INT> Minimum peak height. Default: [150000]
-p, --plot Output plots. Default: False
```

## 4. Example

Here is an example of how to use HapPy. HapPy requires a sorted BAM file as input. Here the PacBio long reads are mapped to the assembly with minimap2, and the output is sorted with samtools. The sorted BAM file is also indexed with samtools. The module depth computes the coverage histogram from the BAM file, and then the module estimate computes the AUC ratio and TSS. Here the maximum x value for the contaminant peak is set to 35, the maximum x value for the diploid peak is set to 120, and the minimum y value for a peak is set to 150000. The expected genome size is set to 102M.

```
minimap2 -ax map-pb assembly.fasta.gz pacbio_reads.fasta.gz --secondary=no | samtools sort -o mapping_LR.map-pb.bam -T tmp.ali
samtools index mapping_LR.map-pb.bam
Hap.py depth mapping_LR.map-pb.bam happy_output
Hap.py estimate --max-contaminant 35 --max-diploid 120 --min-peak 150000 happy_output/mapping_LR.map-pb.bam.hist . 102M
```
Binary file added __pycache__/commands.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/coverage.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/estimate.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/plot.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/utils.cpython-36.pyc
Binary file not shown.
88 changes: 88 additions & 0 deletions commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Based on Rémy Greinhofer (rgreinho) tutorial on subcommands in docopt
# https://github.com/rgreinho/docopt-subcommands-example
# Also based on Cyril Mathey-Doret (cmdoret) commands.py in hicstuff
# https://github.com/koszullab/hicstuff

from docopt import docopt
import sys, os, shutil
import tempfile
from os.path import join, dirname
import coverage as happyc
import estimate as happye


class AbstractCommand:
    """Base class shared by every CLI subcommand.

    Each subclass carries a docopt usage string as its docstring; the
    constructor parses the subcommand's arguments against that string.
    """

    def __init__(self, command_args, global_args):
        """Parse *command_args* against the subclass docstring via docopt."""
        self.args = docopt(self.__doc__, argv=command_args)
        self.global_args = global_args

    def execute(self):
        """Run the subcommand; every subclass must override this."""
        raise NotImplementedError

    def check_output_path(self, path, force=False):
        """Refuse to overwrite *path* unless *force*; create parent dirs otherwise."""
        # Prevent accidental overwriting of an existing result file.
        if os.path.exists(path) and not force:
            raise IOError("Output file already exists. Use --force to overwrite")
        parent = dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)


class Coverage(AbstractCommand):
    """Coverage histogram command
    Compute coverage histogram for mapping file.
    usage:
        coverage [--threads=1] --outdir=DIR <mapping.bam>
    arguments:
        mapping.bam Sorted BAM file after mapping reads to the assembly.
    options:
        -t, --threads=INT Number of parallel threads allocated for
                            sambamba [default: 1].
        -d, --outdir=DIR Path where the .cov and .hist files are written.
    """
    # NOTE: the docstring above doubles as the docopt usage spec that
    # AbstractCommand.__init__ parses — do not reword it without also
    # intending to change the CLI.

    def execute(self):
        # Delegate to coverage.get_cov_hist, which runs sambamba depth on
        # the BAM and writes <outdir>/<bam>.cov and <outdir>/<bam>.hist.
        print("Running coverage module.")
        happyc.get_cov_hist(
            self.args["<mapping.bam>"], self.args["--threads"], self.args["--outdir"]
        )


class Estimate(AbstractCommand):
    """Estimate command
    Compute AUC ratio and TSS from coverage histogram.
    usage:
        estimate [--max-contaminant=INT] [--max-diploid=INT] [--min-peak=INT] --size=INT --outstats=FILE [--plot] <coverage.hist>
    arguments:
        coverage.hist Coverage histogram.
    options:
        -C, --max-contaminant=INT Maximum coverage of contaminants.
        -D, --max-diploid=INT Maximum coverage of the diploid peak.
        -M, --min-peak=INT Minimum peak height.
        -S, --size=INT Estimated haploid genome size.
        -O, --outstats=FILE Path where the AUC ratio and TSS values are written.
        -p, --plot Generate histogram plot.
    """
    # NOTE: the docstring above is the docopt usage spec parsed by
    # AbstractCommand.__init__ — keep its wording in sync with the CLI.

    def execute(self):

        # Delegate to estimate.estimate_haploidy with the parsed CLI options.
        # NOTE(review): the --plot flag is parsed by docopt but never
        # forwarded to estimate_haploidy — confirm whether plotting is
        # wired up elsewhere or silently dropped.
        happye.estimate_haploidy(
            self.args["<coverage.hist>"],
            self.args["--max-contaminant"],
            self.args["--max-diploid"],
            self.args["--min-peak"],
            self.args["--size"],
            self.args["--outstats"],
        )
82 changes: 82 additions & 0 deletions coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os, sys
import pandas as pd
import numpy as np
from utils import *


def get_cov_hist(infile, threads: int, outdir):
    """Compute per-base coverage and a coverage histogram for a BAM file.

    Runs ``sambamba depth base`` on *infile* (a sorted, indexed BAM) and
    writes two files into *outdir*:

    - ``<bam>.cov``  : per-base coverage table produced by sambamba.
    - ``<bam>.hist`` : tab-separated ``coverage<TAB>frequency`` rows,
      truncated at the coverage value below which 99% of bases fall
      (drops the extreme high-coverage tail).

    Existing output files are reused instead of being recomputed.

    :param infile:  path to the sorted BAM file.
    :param threads: number of threads passed to sambamba.
    :param outdir:  directory for the .cov/.hist files (created if missing).
    """

    print("# Hap.py coverage")
    print(
        "Input alignment file:\t{0}\nOutput directory:\t{1}\nNumber of threads:\t{2}".format(
            infile, outdir, threads
        )
    )
    print(
        "===============================================================================\n"
    )

    if os.path.isdir(outdir):
        print("WARNING: Output directory already exists! {}".format(outdir))
    else:
        os.makedirs(outdir)  # Create directory following path

    # --- Step 1: per-base coverage via sambamba (skipped if .cov exists) ---
    coverage_output = os.path.join(outdir, os.path.basename(infile) + ".cov")
    if not os.path.isfile(coverage_output):  # In case no coverage file found
        log("Starting sambamba depth...")
        coverage_output = os.path.abspath(coverage_output)
        dc_sambamba = {"BAM": infile, "threads": threads, "out": coverage_output}
        cmd = "sambamba depth base -t {threads} -o {out} --min-coverage=0 --min-base-quality=0 {BAM}"
        cmd = cmd.format(**dc_sambamba)
        sambamba_returncode = run(cmd)
        # NOTE(review): only a return code of exactly 1 is treated as a
        # failure here — confirm utils.run() cannot return other non-zero
        # codes that should also abort.
        if sambamba_returncode == 1:
            log(
                "ERROR: sambamba command returned: {0}, a common problem is a missing index (.bai) file...".format(
                    sambamba_returncode
                )
            )
            sys.exit(1)
    else:
        log(
            "SKIP: Existing output coverage file found using it instead of running sambamba!"
        )

    # --- Step 2: coverage histogram (skipped if .hist exists) ---
    output = os.path.join(outdir, os.path.basename(infile) + ".hist")
    if not os.path.isfile(output):  # In case no histogram file found
        log("Reading coverage file...")
        output = os.path.abspath(output)

        # Per-base coverage table: one row per reference position.
        df = pd.read_csv(coverage_output, sep="\t", usecols=["REF", "POS", "COV"])
        total_bases = len(df)  # ~length of reference assembly
        # One bin per integer coverage value between 0 and the maximum seen.
        hist, bin_edges = np.histogram(df["COV"], bins=range(0, max(df["COV"])))

        # Find the coverage value below which >99% of all bases fall, so
        # the long high-coverage tail is excluded from the histogram.
        total_summarized = 0
        max_coverage_to_keep = 0
        for freq, cov in zip(hist, bin_edges):
            total_summarized += freq
            if total_summarized / total_bases > 0.99:
                max_coverage_to_keep = cov  # Find max coverage to keep
                break

        # Write "coverage<TAB>frequency" rows up to the 99% cutoff.
        # `with` guarantees the handle is closed even if a write fails
        # (the original opened/closed the file manually).
        with open(output, "w") as f:
            for freq, cov in zip(hist, bin_edges):
                if cov == max_coverage_to_keep:
                    break
                f.write("{}\t{}\n".format(cov, freq))

    else:
        log("SKIP: Existing output histogram file found!")

    log("Finished!")
    # NOTE: the original called sys.exit(0) here, which would kill the
    # interpreter of any caller using this as a library. Returning normally
    # is equivalent for the CLI (the process still exits 0 after main).
Loading

0 comments on commit 660b770

Please sign in to comment.