Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make strip accept substrings in a list #485

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def set_config(self, key, value):
@click.option(
"-strip",
"--strip_text",
help="Characters that should be stripped from a string before"
help="Substrings that should be stripped from a string before"
" assigning it to a cell.",
)
@click.option(
Expand Down
4 changes: 2 additions & 2 deletions camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def read_pdf(
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
strip_text : List, optional (default: [])
Substrings that should be stripped from a string before
assigning it to a cell.
row_tol^ : int, optional (default: 2)
Tolerance parameter used to combine text vertically,
Expand Down
6 changes: 3 additions & 3 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ class Lattice(BaseParser):
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
strip_text : List, optional (default: [])
Substrings that should be stripped from a string before
assigning it to a cell.
line_tol : int, optional (default: 2)
Tolerance parameter used to merge close vertical and horizontal
Expand Down Expand Up @@ -98,7 +98,7 @@ def __init__(
shift_text=["l", "t"],
split_text=False,
flag_size=False,
strip_text="",
strip_text=[],
line_tol=2,
joint_tol=2,
threshold_blocksize=15,
Expand Down
6 changes: 3 additions & 3 deletions camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ class Stream(BaseParser):
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
strip_text : List, optional (default: [])
Substrings that should be stripped from a string before
assigning it to a cell.
edge_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically.
Expand All @@ -63,7 +63,7 @@ def __init__(
columns=None,
split_text=False,
flag_size=False,
strip_text="",
strip_text=[],
edge_tol=50,
row_tol=2,
column_tol=0,
Expand Down
32 changes: 16 additions & 16 deletions camelot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,24 +484,24 @@ def merge_close_lines(ar, line_tol=2):
return ret


def text_strip(text, strip=None):
    """Remove every occurrence of the given substrings from `text`.

    Parameters
    ----------
    text : str
        Text to process and strip.
    strip : list of str or str, optional (default: None)
        Substrings that should be stripped from `text`. A plain string is
        also accepted and is treated as a collection of single characters.
        Empty entries are ignored. ``None`` or any empty value means
        "strip nothing".

    Returns
    -------
    stripped : str
        `text` with all matches removed; `text` unchanged when there is
        nothing to strip.

    """
    if not strip:
        return text

    # Drop empty entries: an empty alternative matches at every position
    # and would shadow every substring listed after it in the pattern.
    # Iterating a plain string yields its characters one by one.
    candidates = {piece for piece in strip if piece}
    if not candidates:
        return text

    # Regex alternation takes the first alternative that matches, so put
    # longer substrings first; otherwise "a" would win over "ab" and leave
    # a stray "b" behind. The secondary key keeps the pattern deterministic.
    ordered = sorted(candidates, key=lambda piece: (-len(piece), piece))
    pattern = "|".join(map(re.escape, ordered))

    return re.sub(pattern, "", text, flags=re.UNICODE)


Expand All @@ -510,7 +510,7 @@ def text_strip(text, strip=""):
# (inspired from sklearn.pipeline.Pipeline)


def flag_font_size(textline, direction, strip_text=""):
def flag_font_size(textline, direction, strip_text=[]):
"""Flags super/subscripts in text by enclosing them with <s></s>.
May give false positives.

Expand All @@ -520,8 +520,8 @@ def flag_font_size(textline, direction, strip_text=""):
List of PDFMiner LTChar objects.
direction : string
Direction of the PDFMiner LTTextLine object.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
strip_text : List, optional (default: [])
Substrings that should be stripped from a string before
assigning it to a cell.

Returns
Expand Down Expand Up @@ -562,7 +562,7 @@ def flag_font_size(textline, direction, strip_text=""):
return text_strip(fstring, strip_text)


def split_textline(table, textline, direction, flag_size=False, strip_text=""):
def split_textline(table, textline, direction, flag_size=False, strip_text=[]):
"""Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns.

Expand All @@ -577,8 +577,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts.)
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
strip_text : List, optional (default: [])
Substrings that should be stripped from a string before
assigning it to a cell.

Returns
Expand Down Expand Up @@ -681,7 +681,7 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):


def get_table_index(
table, t, direction, split_text=False, flag_size=False, strip_text=""
table, t, direction, split_text=False, flag_size=False, strip_text=[]
):
"""Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates.
Expand All @@ -700,8 +700,8 @@ def get_table_index(
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts)
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
strip_text : List, optional (default: [])
Substrings that should be stripped from a string before
assigning it to a cell.

Returns
Expand Down
4 changes: 3 additions & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ def tests(session: Session) -> None:
"coverage[toml]", "pytest", "pygments", *base_requires, *plot_requires
)
try:
session.run("coverage", "run", "--parallel", "-m", "pytest", *session.posargs)
session.run(
"coverage", "run", "--parallel", "-m", "pytest", "--pdb", *session.posargs
)
finally:
if session.interactive:
session.notify("coverage", posargs=[])
Expand Down
6 changes: 5 additions & 1 deletion tests/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,11 @@ def test_stream_strip_text(testdir):
df = pd.DataFrame(data_stream_strip_text)

filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
tables = camelot.read_pdf(filename, flavor="stream", strip_text=[" ", ",", "\n"])
import pdb

pdb.set_trace()

assert_frame_equal(df, tables[0].df)


Expand Down
11 changes: 5 additions & 6 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
import os

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
LAParams,
LTTextBoxHorizontal
)
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

from camelot.utils import bbox_intersection_area
Expand All @@ -16,7 +15,7 @@ def get_text_from_pdf(filename):
"Method to extract text object from pdf"
# https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
# https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
document = open(filename, 'rb')
document = open(filename, "rb")
# Create resource manager
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
Expand Down