Skip to content

Commit

Permalink
Merge pull request #2 from Edinburgh-Genome-Foundry/dev
Browse files Browse the repository at this point in the history
Biopython fix + protein sequence support
  • Loading branch information
veghp committed Sep 16, 2020
2 parents 0940ba7 + b5f4f05 commit b571366
Show file tree
Hide file tree
Showing 12 changed files with 308 additions and 164 deletions.
13 changes: 5 additions & 8 deletions LICENCE.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
The MIT License (MIT)
[OSI Approved License]

The MIT License (MIT)
MIT License

Copyright (c) 2018 Edinburgh Genome Foundry

Expand All @@ -12,13 +9,13 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
25 changes: 19 additions & 6 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,40 @@ To write the sequences down as Genbank records, with annotations:
from crazydoc import records_to_genbank
records_to_genbank(biopython_records)
Note that ``records_to_genbank()`` will truncate the record name to 20 characters,
to fit in the GenBank format. Additionally, slashes (``/``) will be replaced with
hyphens (``-``) in the filenames. To read protein sequences, pass ``is_protein=True``:

.. code:: python
biopython_records = parse_doc_file(protein_path, is_protein=True)
This will return *protein* records, which will be saved with a GenPept extension
(.gp) by ``records_to_genbank(biopython_records, is_protein=True)``,
unless specified otherwise with ``extension=``.


Installation
-------------
------------

(soon) You can install crazydoc through PIP
You can install crazydoc through PIP:

.. code::
sudo pip install crazydoc
Alternatively, you can unzip the sources in a folder and type
Alternatively, you can unzip the sources in a folder and type:

.. code::
sudo python setup.py install
License = MIT
--------------
-------------

Crazydoc is an open-source software originally written at the `Edinburgh Genome Foundry <http://genomefoundry.org>`_ by `Zulko <https://github.com/Zulko>`_ and `released on Github <https://github.com/Edinburgh-Genome-Foundry/crazydoc>`_ under the MIT licence (copyright Edinburg Genome Foundry).
Crazydoc is open-source software originally written at the `Edinburgh Genome Foundry <http://genomefoundry.org>`_ by `Zulko <https://github.com/Zulko>`_ and `released on Github <https://github.com/Edinburgh-Genome-Foundry/crazydoc>`_ under the MIT licence (Copyright 2018 Edinburgh Genome Foundry).

Everyone is welcome to contribute !
Everyone is welcome to contribute!

More biology software
---------------------
Expand Down
55 changes: 35 additions & 20 deletions crazydoc/CrazydocParser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
from docx import Document

from .Observers import (HighlightColor, FontColor, Bold, Italic, UpperCase,
LowerCase, Underline)
from .Observers import (
HighlightColor,
FontColor,
Bold,
Italic,
UpperCase,
LowerCase,
Underline,
)
from .biotools import string_is_sequence


Expand All @@ -22,27 +29,33 @@ class CrazydocParser:
``highlight_color``, ``font_color``, ``bold``, ``italic``,
``upper_case``, ``lower_case``, ``underline``.
"""

# Registry of the built-in observers: one instance of each observer class,
# keyed by that observer's ``name`` attribute, so that observers can be
# requested by name when building a parser.
observers_dict = {
    observer.name: observer
    for observer in (
        HighlightColor(),
        FontColor(),
        Bold(),
        Italic(),
        UpperCase(),
        LowerCase(),
        Underline(),
    )
}

def __init__(self, observers):
    """Store the observers this parser will apply.

    Parameters
    ----------
    observers
      An iterable whose items are either observer names (strings, looked up
      in ``self.observers_dict``) or observer objects, which are kept as-is.
    """
    resolved = []
    for item in observers:
        if isinstance(item, str):
            # A name was given: swap it for the registered observer.
            item = self.observers_dict[item]
        resolved.append(item)
    self.observers = resolved


def _extract_sequence_names_and_runs(self, doc):
def _extract_sequence_names_and_runs(self, doc, is_protein=False):
"""Parse the doc, return a list [(sequence_name, sequenceruns), ...]"""
sequence_name = None
sequence_paragraphs = []
reading_sequence = False
for paragraph in doc.paragraphs:
stripped = paragraph.text.replace(" ", "")
if string_is_sequence(stripped):
if string_is_sequence(stripped, is_protein=is_protein):
if reading_sequence:
sequence_paragraphs[-1][1].append(paragraph)
else:
Expand All @@ -52,18 +65,17 @@ def _extract_sequence_names_and_runs(self, doc):
if reading_sequence:
sequence_name = None
reading_sequence = False
if paragraph.text.startswith('>'):
if paragraph.text.startswith(">"):
sequence_name = paragraph.text[1:].strip()
sequence_paragraphs
return [
(name, [run for par in paragraphs for run in par.runs])
for name, paragraphs in sequence_paragraphs
]

def _msword_runs_to_record(self, runs):
def _msword_runs_to_record(self, runs, is_protein=False):
"""Transform a MS Word runs list to a biopython record."""
records = [
observer.msword_runs_to_record(runs)
observer.msword_runs_to_record(runs, is_protein=is_protein)
for observer in self.observers
]
final_record = records[0]
Expand All @@ -82,29 +94,32 @@ def _msword_runs_to_record(self, runs):
record_features[location] = feature
return final_record



def parse_doc_file(self, filepath=None, doc=None):
def parse_doc_file(self, filepath=None, doc=None, is_protein=False):
"""Return a list of records, 1 for each sequence contained in the docx.
Parameters
----------
filepath
A path to a docx file
A path to a docx file.
doc
A python-docx Document object, which can be provided instead of the
file path.
is_protein
True if the sequences are protein sequences (default: False).
"""
if doc is None:
doc = Document(filepath)
records = []
for name, runs in self._extract_sequence_names_and_runs(doc):
record = self._msword_runs_to_record(runs)
for name, runs in self._extract_sequence_names_and_runs(
doc, is_protein=is_protein
):
record = self._msword_runs_to_record(runs, is_protein=is_protein)
if name is not None:
record.id = name
record.name = name.replace(' ', '_')
record.name = name.replace(" ", "_")
for observer in self.observers:
observer.process_record_features(record)
records.append(record)
Expand Down
66 changes: 39 additions & 27 deletions crazydoc/Observers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from .conf import conf
from .biotools import sequence_to_record, sequence_to_annotated_record


class StyleObserver:
"""Generic class for observing style-based annotations in sequences.
The subclasses observe each one particular type of DNA sequence annotation
such as the highlight color, bold text, underlines, etc.
The provided subclasses each observe one particular type of DNA sequence
annotation, such as the highlight color, bold text, underlines, etc.
"""

def __init__(self):
pass

Expand All @@ -19,31 +20,33 @@ def process_feature(self, feature):
if self.name not in feature.qualifiers:
return
value = feature.qualifiers[self.name]
label = ''
if 'label' in feature.qualifiers:
label = feature.qualifiers['label'] + '; '
label = ""
if "label" in feature.qualifiers:
label = feature.qualifiers["label"] + "; "
label += self.name
if not isinstance(value, bool):
label += ": " + str(value)
feature.qualifiers['label'] = label
feature.qualifiers["label"] = label

def aggregate_features_from_runs(self, runs):
    """Merge consecutive runs that evaluate to the same style value.

    Each run is passed to ``self.evaluate`` and its text is stripped of
    spaces. Runs whose value equals the value of the current segment are
    appended to that segment; a different value opens a new segment.

    Returns a list of ``[value, text]`` pairs. The list always starts with
    a ``[None, ""]`` seed segment, which absorbs leading runs evaluating
    to ``None``.
    """
    segments = [[None, ""]]
    for run in runs:
        style_value = self.evaluate(run)
        chunk = run.text.replace(" ", "")
        current = segments[-1]
        if style_value == current[0]:
            # Same style as the segment being built: extend its text.
            current[1] += chunk
            continue
        segments.append([style_value, chunk])
    return segments

def msword_runs_to_record(self, runs):
def msword_runs_to_record(self, runs, is_protein=False):
feature_records = [
(
sequence_to_annotated_record(text, **{self.name: val})
sequence_to_annotated_record(
text, is_protein=is_protein, **{self.name: val}
)
if val
else sequence_to_record(text)
else sequence_to_record(text, is_protein=is_protein)
)
for (val, text) in self.aggregate_features_from_runs(runs)
]
Expand All @@ -60,16 +63,16 @@ def process_feature(self, feature):
if self.name not in feature.qualifiers:
return
color = feature.qualifiers[self.name]
for field in ['color', 'ApEinfo_revcolor', 'ApEinfo_fwdcolor']:
for field in ["color", "ApEinfo_revcolor", "ApEinfo_fwdcolor"]:
feature.qualifiers[field] = color


class CharactersObserver(StyleObserver):
"""Subclass for character-by-character observers."""

def aggregate_features_from_runs(self, runs):
features = [[None, '']]
text = ''.join([r.text for r in runs])
features = [[None, ""]]
text = "".join([r.text for r in runs])
for character in text:
value = self.evaluate(character)
if value == features[-1][0]:
Expand All @@ -81,7 +84,8 @@ def aggregate_features_from_runs(self, runs):

class Italic(StyleObserver):
"""Captures italic text."""
name = 'italic'

name = "italic"

def evaluate(self, run):
"""Return whether the run has italic style"""
Expand All @@ -90,15 +94,18 @@ def evaluate(self, run):

class Bold(StyleObserver):
    """Observer that annotates the bold stretches of a sequence."""

    name = "bold"

    def evaluate(self, run):
        """Return the ``bold`` attribute of the given run, unchanged."""
        return run.bold


class Underline(StyleObserver):
"""Captures underlined text."""
name = 'underline'

name = "underline"

def evaluate(self, run):
"""Return whether the run has underline style"""
Expand All @@ -107,42 +114,47 @@ def evaluate(self, run):

class FontColor(ColorObserver):
    """Observer that annotates text written in a non-black font color."""

    name = "font_color"

    def evaluate(self, run):
        """Return the run's font color as ``"#RRGGBB"``, or False if none.

        The color is read from ``run.font.color.rgb`` and converted to a
        string. A missing color (``"None"``) and black (``"000000"``) are
        both treated as "no color" and yield False.
        """
        hex_code = str(run.font.color.rgb)
        return False if hex_code in ["None", "000000"] else "#" + hex_code


class HighlightColor(ColorObserver):
    """Captures text with a background-highlighting color."""

    name = "highlight_color"

    def evaluate(self, run):
        """Return False if no background color, else the themed color.

        The highlight value is read from ``run.font.highlight_color``; its
        member name is then looked up in ``conf["color_theme"]``.

        Raises
        ------
        KeyError
          If the member name has no entry in ``conf["color_theme"]``.
        """
        color = run.font.highlight_color
        if color is None:
            return False
        # ``_member_name`` is a private attribute of the enum value object.
        # Standard ``enum`` members do not have it and expose ``name``
        # instead, so fall back to that when the private attribute is absent.
        # NOTE(review): presumably newer python-docx releases return standard
        # enum members here -- confirm against the installed version.
        member_name = getattr(color, "_member_name", None)
        if member_name is None:
            member_name = color.name
        return conf["color_theme"][member_name]


class UpperCase(CharactersObserver):
    """Observer that annotates the upper-case stretches of a sequence."""

    name = "upper_case"

    def evaluate(self, character):
        """Return True if upper-casing the character leaves it unchanged.

        Characters that have no case (digits, punctuation...) are therefore
        also reported as upper-case.
        """
        uppercased = character.upper()
        return character == uppercased


class LowerCase(CharactersObserver):
    """Observer that annotates the lower-case stretches of a sequence."""

    name = "lower_case"

    def evaluate(self, character):
        """Return True if lower-casing the character leaves it unchanged.

        Characters that have no case (digits, punctuation...) are therefore
        also reported as lower-case.
        """
        lowercased = character.lower()
        return character == lowercased
Loading

0 comments on commit b571366

Please sign in to comment.