camelot-dev · ArifRasim · Jan 19, 2024
diff --git a/camelot/cli.py b/camelot/cli.py
@@ -63,7 +63,7 @@ def set_config(self, key, value):
 @click.option(
     "-strip",
     "--strip_text",
-    help="Characters that should be stripped from a string before"
+    help="Substrings that should be stripped from a string before"
     " assigning it to a cell.",
 )
 @click.option(

diff --git a/camelot/io.py b/camelot/io.py
@@ -52,8 +52,8 @@ def read_pdf(
     flag_size : bool, optional (default: False)
         Flag text based on font size. Useful to detect
         super/subscripts. Adds <s></s> around flagged text.
-    strip_text : str, optional (default: '')
-        Characters that should be stripped from a string before
+    strip_text : list of str, optional (default: [])
+        Substrings that should be stripped from a string before
         assigning it to a cell.
     row_tol^ : int, optional (default: 2)
         Tolerance parameter used to combine text vertically,

diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
@@ -60,8 +60,8 @@ class Lattice(BaseParser):
     flag_size : bool, optional (default: False)
         Flag text based on font size. Useful to detect
         super/subscripts. Adds <s></s> around flagged text.
-    strip_text : str, optional (default: '')
-        Characters that should be stripped from a string before
+    strip_text : list of str, optional (default: [])
+        Substrings that should be stripped from a string before
         assigning it to a cell.
     line_tol : int, optional (default: 2)
         Tolerance parameter used to merge close vertical and horizontal
@@ -98,7 +98,7 @@ def __init__(
         shift_text=["l", "t"],
         split_text=False,
         flag_size=False,
-        strip_text="",
+        strip_text=[],
         line_tol=2,
         joint_tol=2,
         threshold_blocksize=15,

diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
@@ -42,8 +42,8 @@ class Stream(BaseParser):
     flag_size : bool, optional (default: False)
         Flag text based on font size. Useful to detect
         super/subscripts. Adds <s></s> around flagged text.
-    strip_text : str, optional (default: '')
-        Characters that should be stripped from a string before
+    strip_text : list of str, optional (default: [])
+        Substrings that should be stripped from a string before
         assigning it to a cell.
     edge_tol : int, optional (default: 50)
         Tolerance parameter for extending textedges vertically.
@@ -63,7 +63,7 @@ def __init__(
         columns=None,
         split_text=False,
         flag_size=False,
-        strip_text="",
+        strip_text=[],
         edge_tol=50,
         row_tol=2,
         column_tol=0,

diff --git a/camelot/utils.py b/camelot/utils.py
@@ -484,24 +484,24 @@ def merge_close_lines(ar, line_tol=2):
     return ret
 
 
-def text_strip(text, strip=""):
-    """Strips any characters in `strip` that are present in `text`.
+def text_strip(text, strip=None):
+    """Strips any substrings in `strip` that are present in `text`.
     Parameters
     ----------
     text : str
         Text to process and strip.
-    strip : str, optional (default: '')
-        Characters that should be stripped from `text`.
+    strip : list of str, optional (default: None)
+        Substrings to strip from `text`; a plain str is read as characters.
     Returns
     -------
     stripped : str
     """
     if not strip:
         return text
 
-    stripped = re.sub(
-        rf"[{''.join(map(re.escape, strip))}]", "", text, flags=re.UNICODE
-    )
-    return stripped
+    # Longest first: alternation takes the first alternative that matches,
+    # so a shorter entry must not shadow a longer one that starts with it.
+    ordered = sorted(filter(None, strip), key=len, reverse=True)
+    return re.sub("|".join(map(re.escape, ordered)), "", text, flags=re.UNICODE)
 
 
@@ -510,7 +510,7 @@ def text_strip(text, strip=""):
 # (inspired from sklearn.pipeline.Pipeline)
 
 
-def flag_font_size(textline, direction, strip_text=""):
+def flag_font_size(textline, direction, strip_text=[]):
     """Flags super/subscripts in text by enclosing them with <s></s>.
     May give false positives.
 
@@ -520,8 +520,8 @@ def flag_font_size(textline, direction, strip_text=""):
         List of PDFMiner LTChar objects.
     direction : string
         Direction of the PDFMiner LTTextLine object.
-    strip_text : str, optional (default: '')
-        Characters that should be stripped from a string before
+    strip_text : list of str, optional (default: [])
+        Substrings that should be stripped from a string before
         assigning it to a cell.
 
     Returns
@@ -562,7 +562,7 @@ def flag_font_size(textline, direction, strip_text=""):
     return text_strip(fstring, strip_text)
 
 
-def split_textline(table, textline, direction, flag_size=False, strip_text=""):
+def split_textline(table, textline, direction, flag_size=False, strip_text=[]):
     """Splits PDFMiner LTTextLine into substrings if it spans across
     multiple rows/columns.
 
@@ -577,8 +577,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
         Whether or not to highlight a substring using <s></s>
         if its size is different from rest of the string. (Useful for
         super and subscripts.)
-    strip_text : str, optional (default: '')
-        Characters that should be stripped from a string before
+    strip_text : list of str, optional (default: [])
+        Substrings that should be stripped from a string before
         assigning it to a cell.
 
     Returns
@@ -681,7 +681,7 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
 
 
 def get_table_index(
-    table, t, direction, split_text=False, flag_size=False, strip_text=""
+    table, t, direction, split_text=False, flag_size=False, strip_text=[]
 ):
     """Gets indices of the table cell where given text object lies by
     comparing their y and x-coordinates.
@@ -700,8 +700,8 @@ def get_table_index(
         Whether or not to highlight a substring using <s></s>
         if its size is different from rest of the string. (Useful for
         super and subscripts)
-    strip_text : str, optional (default: '')
-        Characters that should be stripped from a string before
+    strip_text : list of str, optional (default: [])
+        Substrings that should be stripped from a string before
         assigning it to a cell.
 
     Returns

diff --git a/noxfile.py b/noxfile.py
@@ -171,7 +171,9 @@ def tests(session: Session) -> None:
         "coverage[toml]", "pytest", "pygments", *base_requires, *plot_requires
     )
     try:
+        # Debugger flags such as --pdb stay opt-in via session.posargs;
+        # hard-coding them drops every failing test into an interactive prompt.
         session.run("coverage", "run", "--parallel", "-m", "pytest", *session.posargs)
     finally:
         if session.interactive:
             session.notify("coverage", posargs=[])

diff --git a/tests/test_stream.py b/tests/test_stream.py
@@ -95,7 +95,7 @@ def test_stream_strip_text(testdir):
     df = pd.DataFrame(data_stream_strip_text)
 
     filename = os.path.join(testdir, "detect_vertical_false.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
+    tables = camelot.read_pdf(filename, flavor="stream", strip_text=[" ", ",", "\n"])
     assert_frame_equal(df, tables[0].df)
 
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -2,11 +2,10 @@
 import os
 
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import (
-    LAParams,
-    LTTextBoxHorizontal
-)
-from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+from pdfminer.layout import LAParams
+from pdfminer.layout import LTTextBoxHorizontal
+from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 
 from camelot.utils import bbox_intersection_area
@@ -16,7 +15,7 @@ def get_text_from_pdf(filename):
     "Method to extract text object from pdf"
     # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
     # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
-    document = open(filename, 'rb')
+    document = open(filename, "rb")
     # Create resource manager
     rsrcmgr = PDFResourceManager()
     # Set parameters for analysis.