remove unused _with_spans metric (#3342)
The table metrics that consider spans are not used and they clutter the
output, so I have removed that code. However, I have left table_as_cells
in the source code, as it may still be useful for users.
plutasnyy committed Jul 8, 2024
1 parent caea73c commit 609a08a
Showing 4 changed files with 12 additions and 28 deletions.
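For context on what this commit removes: each table metric was previously emitted twice per document, once plainly and once with a `_with_spans` suffix (see the `suffixed_table_eval_metrics` list deleted from `evaluate.py` below). Here is a minimal sketch of the header layout before and after the change; the two metric names are an illustrative subset, and the real list lives in the calculator's `supported_metric_names`:

```python
# Sketch of the output column layout before and after this commit.
# The metric names below are an illustrative subset; the full list comes
# from the calculator's supported_metric_names in unstructured/metrics/evaluate.py.
supported_metric_names = ["total_tables", "table_level_acc"]

base_columns = ["filename", "doctype", "connector"]

# Before: every metric also had a duplicated "_with_spans" variant.
headers_before = (
    base_columns
    + supported_metric_names
    + [f"{metric}_with_spans" for metric in supported_metric_names]
)

# After: only the plain metric columns remain.
headers_after = base_columns + supported_metric_names

print(headers_before)
# ['filename', 'doctype', 'connector', 'total_tables', 'table_level_acc',
#  'total_tables_with_spans', 'table_level_acc_with_spans']
print(headers_after)
# ['filename', 'doctype', 'connector', 'total_tables', 'table_level_acc']
```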
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,11 +1,12 @@
-## 0.14.10-dev11
+## 0.14.10-dev12
 
 ### Enhancements
 
 * **Update unstructured-client dependency** Change unstructured-client dependency pin back to
 greater than min version and updated tests that were failing given the update.
 * **`.doc` files are now supported in the `arm64` image.** `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow-on work planned to investigate adding `.ppt` support for `arm64` as well.
 * Add table detection metrics: recall, precision and f1
+* Remove unused _with_spans metrics
 
 ### Features
 
4 changes: 2 additions & 2 deletions test_unstructured/metrics/test_evaluate.py
@@ -115,7 +115,7 @@ def test_text_extraction_evaluation():
             UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
             GOLD_TABLE_STRUCTURE_DIRNAME,
             Path("IRS-2023-Form-1095-A.pdf.json"),
-            23,
+            13,
             {},
         ),
         (
@@ -191,7 +191,7 @@ def test_table_structure_evaluation():
     assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"))
     df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t")
     assert len(df) == 1
-    assert len(df.columns) == 23
+    assert len(df.columns) == 13
     assert df.iloc[0].filename == "IRS-2023-Form-1095-A.pdf"


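The expected column count drops from 23 to 13 because the duplicated `_with_spans` columns are gone. A quick sanity check; the figure of ten metrics is inferred from the two totals rather than stated in the diff:

```python
# Sanity check for the updated column-count assertions, assuming ten table
# metrics (the only count consistent with both the old and new totals).
n_metrics = 10
base_columns = 3  # filename, doctype, connector

columns_before = base_columns + 2 * n_metrics  # plain + "_with_spans" duplicates
columns_after = base_columns + n_metrics       # plain metrics only

assert columns_before == 23
assert columns_after == 13
```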
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
__version__ = "0.14.10-dev11" # pragma: no cover
__version__ = "0.14.10-dev12" # pragma: no cover
31 changes: 7 additions & 24 deletions unstructured/metrics/evaluate.py
@@ -229,35 +229,18 @@ def _process_document(self, doc: Path) -> list:
             source_type="html",
         )
         report_from_html = processor_from_text_as_html.process_file()
 
-        processor_from_table_as_cells = TableEvalProcessor.from_json_files(
-            prediction_file=prediction_file,
-            ground_truth_file=ground_truth_file,
-            cutoff=self.cutoff,
-            source_type="cells",
-        )
-        report_from_cells = processor_from_table_as_cells.process_file()
-        return (
-            [
-                out_filename,
-                doctype,
-                connector,
-            ]
-            + [getattr(report_from_html, metric) for metric in self.supported_metric_names]
-            + [getattr(report_from_cells, metric) for metric in self.supported_metric_names]
-        )
+        return [
+            out_filename,
+            doctype,
+            connector,
+        ] + [getattr(report_from_html, metric) for metric in self.supported_metric_names]
 
     def _generate_dataframes(self, rows):
         # NOTE(mike): this logic should be simplified
-        suffixed_table_eval_metrics = [
-            f"{metric}_with_spans" for metric in self.supported_metric_names
-        ]
-        combined_table_metrics = self.supported_metric_names + suffixed_table_eval_metrics
         headers = [
             "filename",
             "doctype",
             "connector",
-        ] + combined_table_metrics
+        ] + self.supported_metric_names
 
         df = pd.DataFrame(rows, columns=headers)
         has_tables_df = df[df["total_tables"] > 0]
@@ -268,7 +251,7 @@ def _generate_dataframes(self, rows):
             ).reset_index()
         else:
             element_metrics_results = {}
-        for metric in combined_table_metrics:
+        for metric in self.supported_metric_names:
             metric_df = has_tables_df[has_tables_df[metric].notnull()]
             agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose()
             if agg_metric.empty:
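To see the simplified pipeline end to end, here is a self-contained sketch of the post-change `_generate_dataframes` flow: build one header per metric, then aggregate each metric over only the documents that contain tables. The metric names are illustrative, and the inline statistics stand in for the module's private `_mean`/`_stdev`/`_pstdev`/`_count` helpers:

```python
# Runnable sketch of the simplified _generate_dataframes flow. The metric
# names are illustrative; the real list and the _mean/_stdev/_pstdev/_count
# helpers live in unstructured/metrics/evaluate.py.
import pandas as pd

supported_metric_names = ["total_tables", "table_level_acc"]
headers = ["filename", "doctype", "connector"] + supported_metric_names

rows = [
    ["doc1.pdf", "pdf", "local", 2, 0.90],
    ["doc2.pdf", "pdf", "local", 3, 0.75],
    ["doc3.pdf", "pdf", "local", 0, None],  # no tables: excluded from aggregation
]
df = pd.DataFrame(rows, columns=headers)

# Mirrors `has_tables_df = df[df["total_tables"] > 0]` in the real code.
has_tables_df = df[df["total_tables"] > 0]

# The real code uses .agg([_mean, _stdev, _pstdev, _count]); computing the
# same statistics directly keeps the sketch dependency-free.
for metric in supported_metric_names:
    values = has_tables_df[metric].dropna()
    print(
        metric,
        {
            "_mean": values.mean(),
            "_stdev": values.std(ddof=1),   # sample standard deviation
            "_pstdev": values.std(ddof=0),  # population standard deviation
            "_count": int(values.count()),
        },
    )
```

With a single set of metric columns, the all-docs TSV produced by the evaluation now has 13 columns for this test fixture, matching the updated assertions above.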
