Merge pull request #2 from Edinburgh-Genome-Foundry/dev

Biopython fix + protein sequence support
Edinburgh-Genome-Foundry · Sep 16, 2020 · b571366
2 parents 0940ba7 + b5f4f05
commit b571366
Show file tree

Hide file tree

Showing 12 changed files with 308 additions and 164 deletions.
diff --git a/LICENCE.txt b/LICENCE.txt
@@ -1,7 +1,4 @@
-The MIT License (MIT)
-[OSI Approved License]
-
-The MIT License (MIT)
+MIT License
 
 Copyright (c) 2018 Edinburgh Genome Foundry
 
@@ -12,13 +9,13 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
 
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.rst b/README.rst
@@ -71,27 +71,40 @@ To write the sequences down as Genbank records, with annotations:
     from crazydoc import records_to_genbank
     records_to_genbank(biopython_records)
 
+Note that ``records_to_genbank()`` truncates the record name to 20 characters
+to fit the GenBank format. Additionally, slashes (``/``) are replaced with
+hyphens (``-``) in the filenames. To read protein sequences, pass ``is_protein=True``:
+
+.. code:: python
+
+    biopython_records = parse_doc_file(protein_path, is_protein=True)
+
+This returns *protein* records, which
+``records_to_genbank(biopython_records, is_protein=True)`` saves with a GenPept
+extension (``.gp``) unless a different one is given with ``extension=``.
+
+
 Installation
--------------
+------------
 
-(soon) You can install crazydoc through PIP
+You can install crazydoc through PIP:
 
 .. code::
 
     sudo pip install crazydoc
 
-Alternatively, you can unzip the sources in a folder and type
+Alternatively, you can unzip the sources in a folder and type:
 
 .. code::
 
     sudo python setup.py install
 
 License = MIT
---------------
+-------------
 
-Crazydoc is an open-source software originally written at the `Edinburgh Genome Foundry <http://genomefoundry.org>`_ by `Zulko <https://github.com/Zulko>`_ and `released on Github <https://github.com/Edinburgh-Genome-Foundry/crazydoc>`_ under the MIT licence (copyright Edinburg Genome Foundry).
+Crazydoc is open-source software originally written at the `Edinburgh Genome Foundry <http://genomefoundry.org>`_ by `Zulko <https://github.com/Zulko>`_ and `released on Github <https://github.com/Edinburgh-Genome-Foundry/crazydoc>`_ under the MIT licence (Copyright 2018 Edinburgh Genome Foundry).
 
-Everyone is welcome to contribute !
+Everyone is welcome to contribute!
 
 More biology software
 ---------------------

diff --git a/crazydoc/CrazydocParser.py b/crazydoc/CrazydocParser.py
@@ -1,7 +1,14 @@
 from docx import Document
 
-from .Observers import (HighlightColor, FontColor, Bold, Italic, UpperCase,
-                        LowerCase, Underline)
+from .Observers import (
+    HighlightColor,
+    FontColor,
+    Bold,
+    Italic,
+    UpperCase,
+    LowerCase,
+    Underline,
+)
 from .biotools import string_is_sequence
 
 
@@ -22,27 +29,33 @@ class CrazydocParser:
       ``highlight_color``, ``font_color``, ``bold``, ``italic``,
       ``upper_case``, ``lower_case``, ``underline``.
     """
+
     observers_dict = {
         _class.name: _class()
-        for _class in (HighlightColor, FontColor, Bold, Italic, UpperCase,
-                       LowerCase, Underline)
+        for _class in (
+            HighlightColor,
+            FontColor,
+            Bold,
+            Italic,
+            UpperCase,
+            LowerCase,
+            Underline,
+        )
     }
 
     def __init__(self, observers):
         self.observers = [
-            self.observers_dict[o] if isinstance(o, str) else o
-            for o in observers
+            self.observers_dict[o] if isinstance(o, str) else o for o in observers
         ]
 
-
-    def _extract_sequence_names_and_runs(self, doc):
+    def _extract_sequence_names_and_runs(self, doc, is_protein=False):
         """Parse the doc, return a list [(sequence_name, sequenceruns), ...]"""
         sequence_name = None
         sequence_paragraphs = []
         reading_sequence = False
         for paragraph in doc.paragraphs:
             stripped = paragraph.text.replace(" ", "")
-            if string_is_sequence(stripped):
+            if string_is_sequence(stripped, is_protein=is_protein):
                 if reading_sequence:
                     sequence_paragraphs[-1][1].append(paragraph)
                 else:
@@ -52,18 +65,17 @@ def _extract_sequence_names_and_runs(self, doc):
                 if reading_sequence:
                     sequence_name = None
                     reading_sequence = False
-                if paragraph.text.startswith('>'):
+                if paragraph.text.startswith(">"):
                     sequence_name = paragraph.text[1:].strip()
-        sequence_paragraphs
         return [
             (name, [run for par in paragraphs for run in par.runs])
             for name, paragraphs in sequence_paragraphs
         ]
 
-    def _msword_runs_to_record(self, runs):
+    def _msword_runs_to_record(self, runs, is_protein=False):
         """Transform a MS Word runs list to a biopython record."""
         records = [
-            observer.msword_runs_to_record(runs)
+            observer.msword_runs_to_record(runs, is_protein=is_protein)
             for observer in self.observers
         ]
         final_record = records[0]
@@ -82,29 +94,32 @@ def _msword_runs_to_record(self, runs):
                     record_features[location] = feature
         return final_record
 
-
-
-    def parse_doc_file(self, filepath=None, doc=None):
+    def parse_doc_file(self, filepath=None, doc=None, is_protein=False):
         """Return a list of records, 1 for each sequence contained in the docx.
 
         Parameters
         ----------
 
         filepath
-          A path to a docx file
+          A path to a docx file.
 
         doc
           A python-docx Document object, which can be provided instead of the
           file path.
+
+        is_protein
+          True if the sequences are protein sequences (default: False).
         """
         if doc is None:
             doc = Document(filepath)
         records = []
-        for name, runs in self._extract_sequence_names_and_runs(doc):
-            record = self._msword_runs_to_record(runs)
+        for name, runs in self._extract_sequence_names_and_runs(
+            doc, is_protein=is_protein
+        ):
+            record = self._msword_runs_to_record(runs, is_protein=is_protein)
             if name is not None:
                 record.id = name
-                record.name = name.replace(' ', '_')
+                record.name = name.replace(" ", "_")
             for observer in self.observers:
                 observer.process_record_features(record)
             records.append(record)

diff --git a/crazydoc/Observers.py b/crazydoc/Observers.py
@@ -1,13 +1,14 @@
 from .conf import conf
 from .biotools import sequence_to_record, sequence_to_annotated_record
 
+
 class StyleObserver:
     """Generic class for observing style-based annotations in sequences.
 
-    The subclasses observe each one particular type of DNA sequence annotation
-    such as the highlight color, bold text, underlines, etc.
-
+    The provided subclasses each observe one particular type of DNA sequence
+    annotation, such as the highlight color, bold text, underlines, etc.
     """
+
     def __init__(self):
         pass
 
@@ -19,31 +20,33 @@ def process_feature(self, feature):
         if self.name not in feature.qualifiers:
             return
         value = feature.qualifiers[self.name]
-        label = ''
-        if 'label' in feature.qualifiers:
-            label = feature.qualifiers['label'] + '; '
+        label = ""
+        if "label" in feature.qualifiers:
+            label = feature.qualifiers["label"] + "; "
         label += self.name
         if not isinstance(value, bool):
             label += ": " + str(value)
-        feature.qualifiers['label'] = label
+        feature.qualifiers["label"] = label
 
     def aggregate_features_from_runs(self, runs):
-        features = [[None, '']]
+        features = [[None, ""]]
         for run in runs:
             value = self.evaluate(run)
-            text = run.text.replace(' ', '')
+            text = run.text.replace(" ", "")
             if value == features[-1][0]:
                 features[-1][1] += text
             else:
                 features.append([value, text])
         return features
 
-    def msword_runs_to_record(self, runs):
+    def msword_runs_to_record(self, runs, is_protein=False):
         feature_records = [
             (
-                sequence_to_annotated_record(text, **{self.name: val})
+                sequence_to_annotated_record(
+                    text, is_protein=is_protein, **{self.name: val}
+                )
                 if val
-                else sequence_to_record(text)
+                else sequence_to_record(text, is_protein=is_protein)
             )
             for (val, text) in self.aggregate_features_from_runs(runs)
         ]
@@ -60,16 +63,16 @@ def process_feature(self, feature):
         if self.name not in feature.qualifiers:
             return
         color = feature.qualifiers[self.name]
-        for field in ['color', 'ApEinfo_revcolor', 'ApEinfo_fwdcolor']:
+        for field in ["color", "ApEinfo_revcolor", "ApEinfo_fwdcolor"]:
             feature.qualifiers[field] = color
 
 
 class CharactersObserver(StyleObserver):
     """Subclass for character-by-character observers."""
 
     def aggregate_features_from_runs(self, runs):
-        features = [[None, '']]
-        text = ''.join([r.text for r in runs])
+        features = [[None, ""]]
+        text = "".join([r.text for r in runs])
         for character in text:
             value = self.evaluate(character)
             if value == features[-1][0]:
@@ -81,7 +84,8 @@ def aggregate_features_from_runs(self, runs):
 
 class Italic(StyleObserver):
     """Captures italic text."""
-    name = 'italic'
+
+    name = "italic"
 
     def evaluate(self, run):
         """Return whether the run has italic style"""
@@ -90,15 +94,18 @@ def evaluate(self, run):
 
 class Bold(StyleObserver):
     """Captures bold text."""
-    name = 'bold'
+
+    name = "bold"
 
     def evaluate(self, run):
         """Return whether the run has bold style"""
         return run.bold
 
+
 class Underline(StyleObserver):
     """Captures underlined text."""
-    name = 'underline'
+
+    name = "underline"
 
     def evaluate(self, run):
         """Return whether the run has underline style"""
@@ -107,42 +114,47 @@ def evaluate(self, run):
 
 class FontColor(ColorObserver):
     """Captures text with non-black font color."""
-    name = 'font_color'
+
+    name = "font_color"
 
     def evaluate(self, run):
         """Return False if no color, else the #ae60bf color."""
         color = str(run.font.color.rgb)
-        if color in ['None', '000000']:
+        if color in ["None", "000000"]:
             return False
         else:
             return "#" + color
 
 
 class HighlightColor(ColorObserver):
     """Captures text with a background-highlighting color."""
-    name = 'highlight_color'
+
+    name = "highlight_color"
 
     def evaluate(self, run):
         """Return False if no background color, else the #ae60bf color."""
         color = run.font.highlight_color
         if color is None:
             return False
         else:
-            return conf['color_theme'][color._member_name]
+            return conf["color_theme"][color._member_name]
+
 
 class UpperCase(CharactersObserver):
     """Captures upper-case text."""
-    name = 'upper_case'
+
+    name = "upper_case"
 
     def evaluate(self, character):
         """Return whether the character is upper"""
-        return (character == character.upper())
+        return character == character.upper()
 
 
 class LowerCase(CharactersObserver):
     """Captures lower-case text."""
-    name = 'lower_case'
 
-    def evaluate(self, character):#
+    name = "lower_case"
+
+    def evaluate(self, character):  #
         """Return whether the character is lower"""
-        return (character == character.lower())
+        return character == character.lower()