From 9b99b89b823ada1c7c9a8a06d268e99cd91e15d3 Mon Sep 17 00:00:00 2001
From: Timothy Willard
Date: Mon, 1 Jul 2024 11:46:20 -0400
Subject: [PATCH 01/19] Overhaul write_df unit tests.

Overhauled the gempyor.utils.write_df unit tests by placing them in a
new file with a class grouping similar fixtures. Added tests for the
NotImplementedError, writing to csv, and writing to parquet.
---
 .../gempyor_pkg/tests/utils/test_write_df.py | 127 ++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 flepimop/gempyor_pkg/tests/utils/test_write_df.py

diff --git a/flepimop/gempyor_pkg/tests/utils/test_write_df.py b/flepimop/gempyor_pkg/tests/utils/test_write_df.py
new file mode 100644
index 000000000..d465f6434
--- /dev/null
+++ b/flepimop/gempyor_pkg/tests/utils/test_write_df.py
@@ -0,0 +1,127 @@
+import os
+from tempfile import NamedTemporaryFile
+from pathlib import Path
+from typing import Callable, Any
+
+import pytest
+import pandas as pd
+
+from gempyor.utils import write_df
+
+
+class TestWriteDf:
+    """
+    Unit tests for the `gempyor.utils.write_df` function.
+    """
+
+    sample_df: pd.DataFrame = pd.DataFrame(
+        {
+            "abc": [1, 2, 3, 4, 5],
+            "def": ["v", "w", "x", "y", "z"],
+            "ghi": [True, False, False, None, True],
+            "jkl": [1.2, 3.4, 5.6, 7.8, 9.0],
+        }
+    )
+
+    def test_raises_not_implemented_error(self) -> None:
+        """
+        Tests that write_df raises a NotImplementedError for unsupported file
+        extensions.
+        """
+        with pytest.raises(
+            expected_exception=NotImplementedError,
+            match="Invalid extension txt. Must be 'csv' or 'parquet'",
+        ) as _:
+            with NamedTemporaryFile(suffix=".txt") as temp_file:
+                write_df(fname=temp_file.name, df=self.sample_df)
+
+    @pytest.mark.parametrize(
+        "fname_transformer,extension",
+        [
+            (lambda x: str(x), ""),
+            (lambda x: x, ""),
+            (lambda x: f"{x.parent}/{x.stem}", "csv"),
+            (lambda x: Path(f"{x.parent}/{x.stem}"), "csv"),
+        ],
+    )
+    def test_write_csv_dataframe(
+        self,
+        fname_transformer: Callable[[os.PathLike], Any],
+        extension: str,
+    ) -> None:
+        """
+        Tests writing a DataFrame to a CSV file.
+
+        Args:
+            fname_transformer: A function that transforms the file name to create the
+                `fname` arg.
+            extension: The file extension to use, provided directly to
+                `gempyor.utils.write_df`.
+        """
+        self._test_write_df(
+            fname_transformer=fname_transformer,
+            df=self.sample_df,
+            extension=extension,
+            suffix=".csv",
+            path_reader=lambda x: pd.read_csv(x, index_col=False),
+        )
+
+    @pytest.mark.parametrize(
+        "fname_transformer,extension",
+        [
+            (lambda x: str(x), ""),
+            (lambda x: x, ""),
+            (lambda x: f"{x.parent}/{x.stem}", "parquet"),
+            (lambda x: Path(f"{x.parent}/{x.stem}"), "parquet"),
+        ],
+    )
+    def test_write_parquet_dataframe(
+        self,
+        fname_transformer: Callable[[os.PathLike], Any],
+        extension: str,
+    ) -> None:
+        """
+        Tests writing a DataFrame to a Parquet file.
+
+        Args:
+            fname_transformer: A function that transforms the file name to create the
+                `fname` arg.
+            extension: The file extension to use, provided directly to
+                `gempyor.utils.write_df`.
+        """
+        self._test_write_df(
+            fname_transformer=fname_transformer,
+            df=self.sample_df,
+            extension=extension,
+            suffix=".parquet",
+            path_reader=lambda x: pd.read_parquet(x),
+        )
+
+    def _test_write_df(
+        self,
+        fname_transformer: Callable[[os.PathLike], Any],
+        df: pd.DataFrame,
+        extension: str,
+        suffix: str | None,
+        path_reader: Callable[[os.PathLike], pd.DataFrame],
+    ) -> None:
+        """
+        Helper method to test writing a DataFrame to a file.
+
+        Args:
+            fname_transformer: A function that transforms the file name.
+            df: The DataFrame to write.
+            extension: The file extension to use.
+            suffix: The suffix to use for the temporary file.
+            path_reader: A function to read the DataFrame from the file.
+        """
+        with NamedTemporaryFile(suffix=suffix) as temp_file:
+            temp_path = Path(temp_file.name)
+            assert temp_path.stat().st_size == 0
+            assert (
+                write_df(fname=fname_transformer(temp_path), df=df, extension=extension)
+                is None
+            )
+            assert temp_path.stat().st_size > 0
+            test_df = path_reader(temp_path)
+            assert test_df.equals(df)

From 6ae23b34ca3c094ff26371acf12a742816363926 Mon Sep 17 00:00:00 2001
From: Timothy Willard
Date: Mon, 1 Jul 2024 12:19:56 -0400
Subject: [PATCH 02/19] Overhaul read_df unit tests

Overhauled the gempyor.utils.read_df unit tests by placing them in a
new file with a class for grouping similar fixtures. Added tests for
the NotImplementedError, reading from csv, and reading from parquet.
---
 .../gempyor_pkg/tests/utils/test_read_df.py  | 128 ++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 flepimop/gempyor_pkg/tests/utils/test_read_df.py

diff --git a/flepimop/gempyor_pkg/tests/utils/test_read_df.py b/flepimop/gempyor_pkg/tests/utils/test_read_df.py
new file mode 100644
index 000000000..d25ed9a59
--- /dev/null
+++ b/flepimop/gempyor_pkg/tests/utils/test_read_df.py
@@ -0,0 +1,128 @@
+import os
+from tempfile import NamedTemporaryFile
+from pathlib import Path
+from typing import Callable, Any
+
+import pytest
+import pandas as pd
+
+from gempyor.utils import read_df
+
+
+class TestReadDf:
+    """
+    Unit tests for the `gempyor.utils.read_df` function.
+    """
+
+    sample_df: pd.DataFrame = pd.DataFrame(
+        {
+            "abc": [1, 2, 3, 4, 5],
+            "def": ["v", "w", "x", "y", "z"],
+            "ghi": [True, False, False, None, True],
+            "jkl": [1.2, 3.4, 5.6, 7.8, 9.0],
+        }
+    )
+
+    def test_raises_not_implemented_error(self) -> None:
+        """
+        Tests that write_df raises a NotImplementedError for unsupported file
+        extensions.
+        """
+        with pytest.raises(
+            expected_exception=NotImplementedError,
+            match="Invalid extension txt. Must be 'csv' or 'parquet'",
+        ) as _:
+            with NamedTemporaryFile(suffix=".txt") as temp_file:
+                read_df(fname=temp_file.name)
+        with pytest.raises(
+            expected_exception=NotImplementedError,
+            match="Invalid extension txt. Must be 'csv' or 'parquet'",
+        ) as _:
+            with NamedTemporaryFile(suffix=".txt") as temp_file:
+                fname = temp_file.name[:-4]
+                read_df(fname=fname, extension="txt")
+
+    @pytest.mark.parametrize(
+        "fname_transformer,extension",
+        [
+            (lambda x: str(x), ""),
+            (lambda x: x, ""),
+            (lambda x: f"{x.parent}/{x.stem}", "csv"),
+            (lambda x: Path(f"{x.parent}/{x.stem}"), "csv"),
+        ],
+    )
+    def test_read_csv_dataframe(
+        self,
+        fname_transformer: Callable[[os.PathLike], Any],
+        extension: str,
+    ) -> None:
+        """
+        Tests reading a DataFrame from a CSV file.
+
+        Args:
+            fname_transformer: A function that transforms the file name to create the
+                `fname` arg.
+            extension: The file extension to use, provided directly to
+                `gempyor.utils.read_df`.
+ """ + self._test_read_df( + fname_transformer=fname_transformer, + extension=extension, + suffix=".csv", + path_writer=lambda p, df: df.to_csv(p, index=False) + ) + + @pytest.mark.parametrize( + "fname_transformer,extension", + [ + (lambda x: str(x), ""), + (lambda x: x, ""), + (lambda x: f"{x.parent}/{x.stem}", "parquet"), + (lambda x: Path(f"{x.parent}/{x.stem}"), "parquet"), + ], + ) + def test_read_parquet_dataframe( + self, + fname_transformer: Callable[[os.PathLike], Any], + extension: str, + ) -> None: + """ + Tests reading a DataFrame from a Parquet file. + + Args: + fname_transformer: A function that transforms the file name to create the + `fname` arg. + extension: The file extension to use, provided directly to + `gempyor.utils.read_df`. + """ + self._test_read_df( + fname_transformer=fname_transformer, + extension=extension, + suffix=".parquet", + path_writer=lambda p, df: df.to_parquet(p, engine='pyarrow', index=False), + ) + + def _test_read_df( + self, + fname_transformer: Callable[[os.PathLike], Any], + extension: str, + suffix: str | None, + path_writer: Callable[[os.PathLike, pd.DataFrame], None], + ) -> None: + """ + Helper method to test writing a DataFrame to a file. + + Args: + fname_transformer: A function that transforms the file name. + extension: The file extension to use. + suffix: The suffix to use for the temporary file. + path_writer: A function to write the DataFrame to the file. + """ + with NamedTemporaryFile(suffix=suffix) as temp_file: + temp_path = Path(temp_file.name) + assert temp_path.stat().st_size == 0 + path_writer(temp_path, self.sample_df) + test_df = read_df(fname=fname_transformer(temp_path), extension=extension) + assert isinstance(test_df, pd.DataFrame) + assert temp_path.stat().st_size > 0 + assert test_df.equals(self.sample_df) From 9ad1abdd9cfc898f69c14144162988533726a5ea Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Mon, 1 Jul 2024 12:38:24 -0400 Subject: [PATCH 03/19] Formatted read_df tests, explicit parquet engine * Formatted the `tests/utils/test_read_df.py` file. * Added `engine="pyarrow"` to `write_df` unit tests. 
--- flepimop/gempyor_pkg/tests/utils/test_read_df.py | 10 +++++----- flepimop/gempyor_pkg/tests/utils/test_write_df.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/flepimop/gempyor_pkg/tests/utils/test_read_df.py b/flepimop/gempyor_pkg/tests/utils/test_read_df.py index d25ed9a59..0b2334602 100644 --- a/flepimop/gempyor_pkg/tests/utils/test_read_df.py +++ b/flepimop/gempyor_pkg/tests/utils/test_read_df.py @@ -41,7 +41,7 @@ def test_raises_not_implemented_error(self) -> None: with NamedTemporaryFile(suffix=".txt") as temp_file: fname = temp_file.name[:-4] read_df(fname=fname, extension="txt") - + @pytest.mark.parametrize( "fname_transformer,extension", [ @@ -69,9 +69,9 @@ def test_read_csv_dataframe( fname_transformer=fname_transformer, extension=extension, suffix=".csv", - path_writer=lambda p, df: df.to_csv(p, index=False) + path_writer=lambda p, df: df.to_csv(p, index=False), ) - + @pytest.mark.parametrize( "fname_transformer,extension", [ @@ -99,9 +99,9 @@ def test_read_parquet_dataframe( fname_transformer=fname_transformer, extension=extension, suffix=".parquet", - path_writer=lambda p, df: df.to_parquet(p, engine='pyarrow', index=False), + path_writer=lambda p, df: df.to_parquet(p, engine="pyarrow", index=False), ) - + def _test_read_df( self, fname_transformer: Callable[[os.PathLike], Any], diff --git a/flepimop/gempyor_pkg/tests/utils/test_write_df.py b/flepimop/gempyor_pkg/tests/utils/test_write_df.py index d465f6434..aad3bc1c4 100644 --- a/flepimop/gempyor_pkg/tests/utils/test_write_df.py +++ b/flepimop/gempyor_pkg/tests/utils/test_write_df.py @@ -94,7 +94,7 @@ def test_write_parquet_dataframe( df=self.sample_df, extension=extension, suffix=".parquet", - path_reader=lambda x: pd.read_parquet(x), + path_reader=lambda x: pd.read_parquet(x, engine="pyarrow"), ) def _test_write_df( From 3d22841f76812f2de65c0eaaf087c9b76e435c57 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Mon, 1 Jul 2024 12:54:11 -0400 Subject: [PATCH 04/19] Reorganized utils.py Moved read_df to be next to write_df in utils.py. --- flepimop/gempyor_pkg/src/gempyor/utils.py | 34 +++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 4d3209061..2f13f0f57 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -38,6 +38,23 @@ def write_df(fname: str, df: pd.DataFrame, extension: str = ""): raise NotImplementedError(f"Invalid extension {extension}. Must be 'csv' or 'parquet'") +def read_df(fname: str, extension: str = "") -> pd.DataFrame: + """Load a dataframe from a file, agnostic to whether it is a parquet or a csv. The extension + can be provided as an argument or it is infered""" + fname = str(fname) + if extension: # Empty strings are falsy in python + fname = f"{fname}.{extension}" + extension = fname.split(".")[-1] + if extension == "csv": + # The converter prevents e.g leading geoid (0600) to be converted as int; and works when the column is absent + df = pd.read_csv(fname, converters={"subpop": lambda x: str(x)}, skipinitialspace=True) + elif extension == "parquet": + df = pa.parquet.read_table(fname).to_pandas() + else: + raise NotImplementedError(f"Invalid extension {extension}. 
Must be 'csv' or 'parquet'") + return df + + def command_safe_run(command, command_name="mycommand", fail_on_fail=True): import subprocess import shlex # using shlex to split the command because it's not obvious https://docs.python.org/3/library/subprocess.html#subprocess.Popen @@ -62,23 +79,6 @@ def command_safe_run(command, command_name="mycommand", fail_on_fail=True): return sr.returncode, stdout, stderr -def read_df(fname: str, extension: str = "") -> pd.DataFrame: - """Load a dataframe from a file, agnostic to whether it is a parquet or a csv. The extension - can be provided as an argument or it is infered""" - fname = str(fname) - if extension: # Empty strings are falsy in python - fname = f"{fname}.{extension}" - extension = fname.split(".")[-1] - if extension == "csv": - # The converter prevents e.g leading geoid (0600) to be converted as int; and works when the column is absent - df = pd.read_csv(fname, converters={"subpop": lambda x: str(x)}, skipinitialspace=True) - elif extension == "parquet": - df = pa.parquet.read_table(fname).to_pandas() - else: - raise NotImplementedError(f"Invalid extension {extension}. Must be 'csv' or 'parquet'") - return df - - def add_method(cls): "Decorator to add a method to a class" From df794e92609d1b56a2b8faf9340d74364c242902 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Mon, 1 Jul 2024 13:02:08 -0400 Subject: [PATCH 05/19] Documented and extended write_df * Documented the gempyor.utils.write_df function using the Google style guide. * Extended write_df to explicitly support os.PathLike types for fname (was implicitly supported) and added support for bytes. * Changed the file manipulation logic to use pathlib rather than manipulating strings in write_df. --- flepimop/gempyor_pkg/src/gempyor/utils.py | 49 ++++++++++++++++------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 2f13f0f57..bace01582 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -16,26 +16,47 @@ from gempyor import file_paths from typing import List, Dict from botocore.exceptions import ClientError +from pathlib import Path logger = logging.getLogger(__name__) config = confuse.Configuration("flepiMoP", read=False) -def write_df(fname: str, df: pd.DataFrame, extension: str = ""): - """write without index, so assume the index has been put a column""" - # cast to str to use .split in case fname is a PosixPath - fname = str(fname) - if extension: # Empty strings are falsy in python - fname = f"{fname}.{extension}" - extension = fname.split(".")[-1] - if extension == "csv": - df.to_csv(fname, index=False) - elif extension == "parquet": - df = pa.Table.from_pandas(df, preserve_index=False) - pa.parquet.write_table(df, fname) - else: - raise NotImplementedError(f"Invalid extension {extension}. Must be 'csv' or 'parquet'") +def write_df( + fname: str | bytes | os.PathLike, + df: pd.DataFrame, + extension: str = "", +) -> None: + """Writes a pandas DataFrame without its index to a file. + + Writes a pandas DataFrame to either a CSV or Parquet file without its index and can + infer which format to use based on the extension given in `fname` or based on + explicit `extension`. + + Args: + fname: The name of the file to write to. + df: A pandas DataFrame whose contents to write, but without its index. + extension: A user specified extension to use for the file if not contained in + `fname` already. 
+ + Returns: + None + + Raises: + NotImplementedError: The given output extension is not supported yet. + """ + # Decipher the path given + fname = fname.decode() if isinstance(fname, bytes) else fname + path = Path(f"{fname}.{extension}") if extension else Path(fname) + # Write df to either a csv or parquet or raise if an invalid extension + if path.suffix == ".csv": + return df.to_csv(path, index=False) + elif path.suffix == ".parquet": + return df.to_parquet(path, index=False, engine="pyarrow") + raise NotImplementedError( + f"Invalid extension {extension}. Must be 'csv' or 'parquet'" + ) def read_df(fname: str, extension: str = "") -> pd.DataFrame: From 19078a03595284ab42d9ef7ee5d4cc0fa7a8e4fe Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Mon, 1 Jul 2024 13:48:39 -0400 Subject: [PATCH 06/19] Add test for read_df with subpop column * Added unit test reading a file with a column called 'subpop', converted to a string when the file is a csv and left unaltered when the file is a parquet file. * Typo in test_raises_not_implemented_error docstring. --- .../gempyor_pkg/tests/utils/test_read_df.py | 42 ++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/flepimop/gempyor_pkg/tests/utils/test_read_df.py b/flepimop/gempyor_pkg/tests/utils/test_read_df.py index 0b2334602..c4c9a14b0 100644 --- a/flepimop/gempyor_pkg/tests/utils/test_read_df.py +++ b/flepimop/gempyor_pkg/tests/utils/test_read_df.py @@ -5,6 +5,7 @@ import pytest import pandas as pd +from pandas.api.types import is_object_dtype, is_numeric_dtype from gempyor.utils import read_df @@ -23,9 +24,16 @@ class TestReadDf: } ) + subpop_df: pd.DataFrame = pd.DataFrame( + { + "subpop": [1, 2, 3, 4], + "value": [5, 6, 7, 8], + } + ) + def test_raises_not_implemented_error(self) -> None: """ - Tests that write_df raises a NotImplementedError for unsupported file + Tests that read_df raises a NotImplementedError for unsupported file extensions. """ with pytest.raises( @@ -102,6 +110,38 @@ def test_read_parquet_dataframe( path_writer=lambda p, df: df.to_parquet(p, engine="pyarrow", index=False), ) + def test_subpop_is_cast_as_str(self) -> None: + """ + Tests that read_df returns an object dtype for the column 'subpop' when reading + a csv file, but not when reading a parquet file. 
+ """ + # First confirm the dtypes of our test DataFrame + assert is_numeric_dtype(self.subpop_df["subpop"]) + assert is_numeric_dtype(self.subpop_df["value"]) + # Test that the subpop column is converted to a string for a csv file + with NamedTemporaryFile(suffix=".csv") as temp_file: + temp_path = Path(temp_file.name) + assert temp_path.stat().st_size == 0 + assert self.subpop_df.to_csv(temp_path, index=False) is None + assert temp_path.stat().st_size > 0 + test_df = read_df(fname=temp_path) + assert isinstance(test_df, pd.DataFrame) + assert is_object_dtype(test_df["subpop"]) + assert is_numeric_dtype(test_df["value"]) + # Test that the subpop column remains unaltered for a parquet file + with NamedTemporaryFile(suffix=".parquet") as temp_file: + temp_path = Path(temp_file.name) + assert temp_path.stat().st_size == 0 + assert ( + self.subpop_df.to_parquet(temp_path, engine="pyarrow", index=False) + is None + ) + assert temp_path.stat().st_size > 0 + test_df = read_df(fname=temp_path) + assert isinstance(test_df, pd.DataFrame) + assert is_numeric_dtype(test_df["subpop"]) + assert is_numeric_dtype(test_df["value"]) + def _test_read_df( self, fname_transformer: Callable[[os.PathLike], Any], From fd4021cf5a3b7955d9aab7cf1cf76724f849fb73 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Mon, 1 Jul 2024 13:59:49 -0400 Subject: [PATCH 07/19] Documented and extended read_df * Documented the gempyor.utils.read_df function using the Google style guide. * Extended read_df to explicitly support os.PathLike types for fname (was implicitly supported) and added support for bytes. * Changed the file manipulation logic to use pathlib rather than manipulating strings in read_df. --- flepimop/gempyor_pkg/src/gempyor/utils.py | 47 +++++++++++++++-------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index bace01582..f59646dc0 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -59,21 +59,38 @@ def write_df( ) -def read_df(fname: str, extension: str = "") -> pd.DataFrame: - """Load a dataframe from a file, agnostic to whether it is a parquet or a csv. The extension - can be provided as an argument or it is infered""" - fname = str(fname) - if extension: # Empty strings are falsy in python - fname = f"{fname}.{extension}" - extension = fname.split(".")[-1] - if extension == "csv": - # The converter prevents e.g leading geoid (0600) to be converted as int; and works when the column is absent - df = pd.read_csv(fname, converters={"subpop": lambda x: str(x)}, skipinitialspace=True) - elif extension == "parquet": - df = pa.parquet.read_table(fname).to_pandas() - else: - raise NotImplementedError(f"Invalid extension {extension}. Must be 'csv' or 'parquet'") - return df +def read_df(fname: str | bytes | os.PathLike, extension: str = "") -> pd.DataFrame: + """Reads a pandas DataFrame from either a CSV or Parquet file. + + Reads a pandas DataFrame to either a CSV or Parquet file and can infer which format + to use based on the extension given in `fname` or based on explicit `extension`. If + the file being read is a csv with a column called 'subpop' then that column will be + cast as a string. + + Args: + fname: The name of the file to read from. + extension: A user specified extension to use for the file if not contained in + `fname` already. + + Returns: + A pandas DataFrame parsed from the file given. 
+ + Raises: + NotImplementedError: The given output extension is not supported yet. + """ + # Decipher the path given + fname = fname.decode() if isinstance(fname, bytes) else fname + path = Path(f"{fname}.{extension}") if extension else Path(fname) + # Read df from either a csv or parquet or raise if an invalid extension + if path.suffix == ".csv": + return pd.read_csv( + path, converters={"subpop": lambda x: str(x)}, skipinitialspace=True + ) + elif path.suffix == ".parquet": + return pd.read_parquet(path, engine="pyarrow") + raise NotImplementedError( + f"Invalid extension {extension}. Must be 'csv' or 'parquet'" + ) def command_safe_run(command, command_name="mycommand", fail_on_fail=True): From c51a509aeb677b09c5035efaabf77812e15d2630 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Mon, 1 Jul 2024 14:06:56 -0400 Subject: [PATCH 08/19] Changed extension type from str to Literal * Changed extension param of read_df/write_df from str to Literal[None, "", "csv", "parquet"]. * Added unit tests for `extension=None` for both read_df/write_df. --- flepimop/gempyor_pkg/src/gempyor/utils.py | 9 ++++++--- flepimop/gempyor_pkg/tests/utils/test_read_df.py | 12 ++++++++---- flepimop/gempyor_pkg/tests/utils/test_write_df.py | 12 ++++++++---- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index f59646dc0..2ddf74f24 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -14,7 +14,7 @@ import logging import boto3 from gempyor import file_paths -from typing import List, Dict +from typing import List, Dict, Literal from botocore.exceptions import ClientError from pathlib import Path @@ -26,7 +26,7 @@ def write_df( fname: str | bytes | os.PathLike, df: pd.DataFrame, - extension: str = "", + extension: Literal[None, "", "csv", "parquet"] = "", ) -> None: """Writes a pandas DataFrame without its index to a file. @@ -59,7 +59,10 @@ def write_df( ) -def read_df(fname: str | bytes | os.PathLike, extension: str = "") -> pd.DataFrame: +def read_df( + fname: str | bytes | os.PathLike, + extension: Literal[None, "", "csv", "parquet"] = "", +) -> pd.DataFrame: """Reads a pandas DataFrame from either a CSV or Parquet file. Reads a pandas DataFrame to either a CSV or Parquet file and can infer which format diff --git a/flepimop/gempyor_pkg/tests/utils/test_read_df.py b/flepimop/gempyor_pkg/tests/utils/test_read_df.py index c4c9a14b0..68345ba29 100644 --- a/flepimop/gempyor_pkg/tests/utils/test_read_df.py +++ b/flepimop/gempyor_pkg/tests/utils/test_read_df.py @@ -1,7 +1,7 @@ import os from tempfile import NamedTemporaryFile from pathlib import Path -from typing import Callable, Any +from typing import Callable, Any, Literal import pytest import pandas as pd @@ -55,6 +55,8 @@ def test_raises_not_implemented_error(self) -> None: [ (lambda x: str(x), ""), (lambda x: x, ""), + (lambda x: str(x), None), + (lambda x: x, None), (lambda x: f"{x.parent}/{x.stem}", "csv"), (lambda x: Path(f"{x.parent}/{x.stem}"), "csv"), ], @@ -62,7 +64,7 @@ def test_raises_not_implemented_error(self) -> None: def test_read_csv_dataframe( self, fname_transformer: Callable[[os.PathLike], Any], - extension: str, + extension: Literal[None, "", "csv", "parquet"], ) -> None: """ Tests reading a DataFrame from a CSV file. 
@@ -85,6 +87,8 @@ def test_read_csv_dataframe( [ (lambda x: str(x), ""), (lambda x: x, ""), + (lambda x: str(x), None), + (lambda x: x, None), (lambda x: f"{x.parent}/{x.stem}", "parquet"), (lambda x: Path(f"{x.parent}/{x.stem}"), "parquet"), ], @@ -92,7 +96,7 @@ def test_read_csv_dataframe( def test_read_parquet_dataframe( self, fname_transformer: Callable[[os.PathLike], Any], - extension: str, + extension: Literal[None, "", "csv", "parquet"], ) -> None: """ Tests reading a DataFrame from a Parquet file. @@ -145,7 +149,7 @@ def test_subpop_is_cast_as_str(self) -> None: def _test_read_df( self, fname_transformer: Callable[[os.PathLike], Any], - extension: str, + extension: Literal[None, "", "csv", "parquet"], suffix: str | None, path_writer: Callable[[os.PathLike, pd.DataFrame], None], ) -> None: diff --git a/flepimop/gempyor_pkg/tests/utils/test_write_df.py b/flepimop/gempyor_pkg/tests/utils/test_write_df.py index aad3bc1c4..22c29240b 100644 --- a/flepimop/gempyor_pkg/tests/utils/test_write_df.py +++ b/flepimop/gempyor_pkg/tests/utils/test_write_df.py @@ -1,7 +1,7 @@ import os from tempfile import NamedTemporaryFile from pathlib import Path -from typing import Callable, Any +from typing import Callable, Any, Literal import pytest import pandas as pd @@ -40,6 +40,8 @@ def test_raises_not_implemented_error(self) -> None: [ (lambda x: str(x), ""), (lambda x: x, ""), + (lambda x: str(x), None), + (lambda x: x, None), (lambda x: f"{x.parent}/{x.stem}", "csv"), (lambda x: Path(f"{x.parent}/{x.stem}"), "csv"), ], @@ -47,7 +49,7 @@ def test_raises_not_implemented_error(self) -> None: def test_write_csv_dataframe( self, fname_transformer: Callable[[os.PathLike], Any], - extension: str, + extension: Literal[None, "", "csv", "parquet"], ) -> None: """ Tests writing a DataFrame to a CSV file. @@ -71,6 +73,8 @@ def test_write_csv_dataframe( [ (lambda x: str(x), ""), (lambda x: x, ""), + (lambda x: str(x), None), + (lambda x: x, None), (lambda x: f"{x.parent}/{x.stem}", "parquet"), (lambda x: Path(f"{x.parent}/{x.stem}"), "parquet"), ], @@ -78,7 +82,7 @@ def test_write_csv_dataframe( def test_write_parquet_dataframe( self, fname_transformer: Callable[[os.PathLike], Any], - extension: str, + extension: Literal[None, "", "csv", "parquet"], ) -> None: """ Tests writing a DataFrame to a Parquet file. @@ -101,7 +105,7 @@ def _test_write_df( self, fname_transformer: Callable[[os.PathLike], Any], df: pd.DataFrame, - extension: str, + extension: Literal[None, "", "csv", "parquet"], suffix: str | None, path_reader: Callable[[os.PathLike], pd.DataFrame], ) -> None: From 18d9498e5ba7bf434832719f71388cbd85cdb961 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Mon, 1 Jul 2024 14:13:29 -0400 Subject: [PATCH 09/19] Formatted/organized utils * Applied formatting (all whitespace) to read_df/write_df functions. * Reorganized the imports in utils to be clearer. 
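As a rough sketch of what the new `Literal` annotation on `extension` permits
(hypothetical file names, written to a throwaway temporary directory):

```
from pathlib import Path
from tempfile import TemporaryDirectory

import pandas as pd

from gempyor.utils import read_df, write_df

df = pd.DataFrame({"abc": [1, 2, 3]})
with TemporaryDirectory() as tmp_dir:
    base = Path(tmp_dir) / "out"
    # A falsy extension (None or "") means "infer the format from fname".
    write_df(fname=f"{base}.csv", df=df, extension="")
    # An explicit extension is appended to fname before writing.
    write_df(fname=base, df=df, extension="csv")
    assert read_df(fname=base, extension="csv").equals(df)
```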
--- flepimop/gempyor_pkg/src/gempyor/utils.py | 45 ++++++++++++----------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 2ddf74f24..e6056b815 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -1,22 +1,25 @@ -import os import datetime import functools +import logging import numbers +import os +from pathlib import Path +import shutil +import subprocess import time +from typing import List, Dict, Literal + +import boto3 +from botocore.exceptions import ClientError import confuse import numpy as np import pandas as pd import pyarrow as pa import scipy.stats import sympy.parsing.sympy_parser -import subprocess -import shutil -import logging -import boto3 + from gempyor import file_paths -from typing import List, Dict, Literal -from botocore.exceptions import ClientError -from pathlib import Path + logger = logging.getLogger(__name__) @@ -24,25 +27,25 @@ def write_df( - fname: str | bytes | os.PathLike, - df: pd.DataFrame, + fname: str | bytes | os.PathLike, + df: pd.DataFrame, extension: Literal[None, "", "csv", "parquet"] = "", ) -> None: """Writes a pandas DataFrame without its index to a file. - + Writes a pandas DataFrame to either a CSV or Parquet file without its index and can - infer which format to use based on the extension given in `fname` or based on + infer which format to use based on the extension given in `fname` or based on explicit `extension`. - + Args: fname: The name of the file to write to. df: A pandas DataFrame whose contents to write, but without its index. extension: A user specified extension to use for the file if not contained in `fname` already. - + Returns: None - + Raises: NotImplementedError: The given output extension is not supported yet. """ @@ -60,24 +63,24 @@ def write_df( def read_df( - fname: str | bytes | os.PathLike, + fname: str | bytes | os.PathLike, extension: Literal[None, "", "csv", "parquet"] = "", ) -> pd.DataFrame: """Reads a pandas DataFrame from either a CSV or Parquet file. - - Reads a pandas DataFrame to either a CSV or Parquet file and can infer which format + + Reads a pandas DataFrame to either a CSV or Parquet file and can infer which format to use based on the extension given in `fname` or based on explicit `extension`. If the file being read is a csv with a column called 'subpop' then that column will be cast as a string. - + Args: fname: The name of the file to read from. extension: A user specified extension to use for the file if not contained in `fname` already. - + Returns: A pandas DataFrame parsed from the file given. - + Raises: NotImplementedError: The given output extension is not supported yet. """ From 0e8241d62101a9ca56029af373befc52f7fac728 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Mon, 1 Jul 2024 14:49:28 -0400 Subject: [PATCH 10/19] Typo in NotImplementedError, corresponding tests * Added missing period in NotImplementedError in read_df/write_df, updated corresponding unit tests. Also use path suffix directly instead of given extension. * Added missing test for write_df with provided extension raising NotImplementedError. 
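A minimal sketch of the error the corrected message produces (the `data.txt`
name is hypothetical):

```
import pandas as pd
import pytest

from gempyor.utils import write_df

df = pd.DataFrame({"abc": [1, 2, 3]})
# The unsupported ".txt" suffix should raise before anything is written.
with pytest.raises(
    NotImplementedError,
    match="Invalid extension txt. Must be 'csv' or 'parquet'.",
):
    write_df(fname="data.txt", df=df)
```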
--- flepimop/gempyor_pkg/src/gempyor/utils.py | 4 ++-- flepimop/gempyor_pkg/tests/utils/test_read_df.py | 4 ++-- flepimop/gempyor_pkg/tests/utils/test_write_df.py | 9 ++++++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index e6056b815..075162ce4 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -58,7 +58,7 @@ def write_df( elif path.suffix == ".parquet": return df.to_parquet(path, index=False, engine="pyarrow") raise NotImplementedError( - f"Invalid extension {extension}. Must be 'csv' or 'parquet'" + f"Invalid extension {path.suffix[1:]}. Must be 'csv' or 'parquet'." ) @@ -95,7 +95,7 @@ def read_df( elif path.suffix == ".parquet": return pd.read_parquet(path, engine="pyarrow") raise NotImplementedError( - f"Invalid extension {extension}. Must be 'csv' or 'parquet'" + f"Invalid extension {path.suffix[1:]}. Must be 'csv' or 'parquet'." ) diff --git a/flepimop/gempyor_pkg/tests/utils/test_read_df.py b/flepimop/gempyor_pkg/tests/utils/test_read_df.py index 68345ba29..7a0a0c581 100644 --- a/flepimop/gempyor_pkg/tests/utils/test_read_df.py +++ b/flepimop/gempyor_pkg/tests/utils/test_read_df.py @@ -38,13 +38,13 @@ def test_raises_not_implemented_error(self) -> None: """ with pytest.raises( expected_exception=NotImplementedError, - match="Invalid extension txt. Must be 'csv' or 'parquet'", + match="Invalid extension txt. Must be 'csv' or 'parquet'.", ) as _: with NamedTemporaryFile(suffix=".txt") as temp_file: read_df(fname=temp_file.name) with pytest.raises( expected_exception=NotImplementedError, - match="Invalid extension txt. Must be 'csv' or 'parquet'", + match="Invalid extension txt. Must be 'csv' or 'parquet'.", ) as _: with NamedTemporaryFile(suffix=".txt") as temp_file: fname = temp_file.name[:-4] diff --git a/flepimop/gempyor_pkg/tests/utils/test_write_df.py b/flepimop/gempyor_pkg/tests/utils/test_write_df.py index 22c29240b..b13e0b948 100644 --- a/flepimop/gempyor_pkg/tests/utils/test_write_df.py +++ b/flepimop/gempyor_pkg/tests/utils/test_write_df.py @@ -30,10 +30,17 @@ def test_raises_not_implemented_error(self) -> None: """ with pytest.raises( expected_exception=NotImplementedError, - match="Invalid extension txt. Must be 'csv' or 'parquet'", + match="Invalid extension txt. Must be 'csv' or 'parquet'.", ) as _: with NamedTemporaryFile(suffix=".txt") as temp_file: write_df(fname=temp_file.name, df=self.sample_df) + with pytest.raises( + expected_exception=NotImplementedError, + match="Invalid extension txt. Must be 'csv' or 'parquet'.", + ) as _: + with NamedTemporaryFile(suffix=".txt") as temp_file: + fname = temp_file.name[:-4] + write_df(fname=fname, df=self.sample_df, extension="txt") @pytest.mark.parametrize( "fname_transformer,extension", From 1b5b92d97371a91112ba28209cda1d9c9b46b2e9 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Fri, 5 Jul 2024 10:39:13 -0400 Subject: [PATCH 11/19] Added unit tests for list_filenames * Created unit tests in tests/utils/test_list_filenames.py, including tests for searching flat and nested folders. * Added relevant documentation to the tests as well. * Created create_directories_with_files pytest class fixture, might need to be extracted into a more general purpose location later. 
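A condensed sketch of the fixture-plus-filter pattern the new tests use, assuming
only a throwaway temporary directory:

```
from pathlib import Path
from tempfile import TemporaryDirectory

from gempyor.utils import list_filenames

with TemporaryDirectory() as tmp_dir:
    for name in ["hosp.csv", "hosp.parquet", "spar.parquet"]:
        Path(tmp_dir, name).touch()
    # Every filter must match as a substring, so this keeps only hosp.parquet.
    found = list_filenames(folder=tmp_dir, filters=["hosp", ".parquet"])
    assert [Path(f).name for f in found] == ["hosp.parquet"]
```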
--- .../tests/utils/test_list_filenames.py | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 flepimop/gempyor_pkg/tests/utils/test_list_filenames.py diff --git a/flepimop/gempyor_pkg/tests/utils/test_list_filenames.py b/flepimop/gempyor_pkg/tests/utils/test_list_filenames.py new file mode 100644 index 000000000..dea2b4cbc --- /dev/null +++ b/flepimop/gempyor_pkg/tests/utils/test_list_filenames.py @@ -0,0 +1,166 @@ +"""Unit tests for the `gempyor.utils.list_filenames` function. + +These tests cover scenarios for finding files in both flat and nested directories. +""" + +from collections.abc import Generator +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest + +from gempyor.utils import list_filenames + + +@pytest.fixture(scope="class") +def create_directories_with_files( + request: pytest.FixtureRequest, +) -> Generator[tuple[TemporaryDirectory, TemporaryDirectory], None, None]: + """Fixture to create temporary directories with files for testing. + + This fixture creates two temporary directories: + - A flat directory with files. + - A nested directory with files organized in subdirectories. + + The directories and files are cleaned up after the tests are run. + + Args: + request: The pytest fixture request object. + + Yields: + tuple: A tuple containing the flat and nested TemporaryDirectory objects. + """ + # Create a flat and nested directories + flat_temp_dir = TemporaryDirectory() + nested_temp_dir = TemporaryDirectory() + # Setup flat directory + for file in ["hosp.csv", "hosp.parquet", "spar.csv", "spar.parquet"]: + Path(f"{flat_temp_dir.name}/{file}").touch() + # Setup nested directory structure + for file in [ + "hpar/chimeric/001.parquet", + "hpar/chimeric/002.parquet", + "hpar/global/001.parquet", + "hpar/global/002.parquet", + "seed/001.csv", + "seed/002.csv", + ]: + path = Path(f"{nested_temp_dir.name}/{file}") + path.parent.mkdir(parents=True, exist_ok=True) + path.touch() + # Yield + request.cls.flat_temp_dir = flat_temp_dir + request.cls.nested_temp_dir = nested_temp_dir + yield (flat_temp_dir, nested_temp_dir) + # Clean up directories on test end + flat_temp_dir.cleanup() + nested_temp_dir.cleanup() + + +@pytest.mark.usefixtures("create_directories_with_files") +class TestListFilenames: + """Unit tests for the `gempyor.utils.list_filenames` function.""" + + @pytest.mark.parametrize( + "filters,expected_basenames", + [ + ([], ["hosp.csv", "hosp.parquet", "spar.csv", "spar.parquet"]), + (["hosp"], ["hosp.csv", "hosp.parquet"]), + (["spar"], ["spar.csv", "spar.parquet"]), + ([".parquet"], ["hosp.parquet", "spar.parquet"]), + ([".csv"], ["hosp.csv", "spar.csv"]), + (["hosp", ".csv"], ["hosp.csv"]), + (["spar", ".parquet"], ["spar.parquet"]), + (["hosp", "spar"], []), + ([".csv", ".parquet"], []), + ], + ) + def test_finds_files_in_flat_directory( + self, + filters: list[str], + expected_basenames: list[str], + ) -> None: + """Test `list_filenames` in a flat directory. + + Args: + filters: List of filters to apply to filenames. + expected_basenames: List of expected filenames that match the filters. 
+ """ + self._test_list_filenames( + folder=self.flat_temp_dir.name, + filters=filters, + expected_basenames=expected_basenames, + ) + + @pytest.mark.parametrize( + "filters,expected_basenames", + [ + ( + [], + [ + "hpar/chimeric/001.parquet", + "hpar/chimeric/002.parquet", + "hpar/global/001.parquet", + "hpar/global/002.parquet", + "seed/001.csv", + "seed/002.csv", + ], + ), + ( + ["hpar"], + [ + "hpar/chimeric/001.parquet", + "hpar/chimeric/002.parquet", + "hpar/global/001.parquet", + "hpar/global/002.parquet", + ], + ), + (["seed"], ["seed/001.csv", "seed/002.csv"]), + (["global"], ["hpar/global/001.parquet", "hpar/global/002.parquet"]), + ( + ["001"], + [ + "hpar/chimeric/001.parquet", + "hpar/global/001.parquet", + "seed/001.csv", + ], + ), + (["hpar", "001"], ["hpar/chimeric/001.parquet", "hpar/global/001.parquet"]), + (["seed", "002"], ["seed/002.csv"]), + ([".tsv"], []), + ], + ) + def test_find_files_in_nested_directory( + self, + filters: list[str], + expected_basenames: list[str], + ) -> None: + """Test `list_filenames` in a nested directory. + + Args: + filters: List of filters to apply to filenames. + expected_basenames: List of expected filenames that match the filters. + """ + self._test_list_filenames( + folder=self.nested_temp_dir.name, + filters=filters, + expected_basenames=expected_basenames, + ) + + def _test_list_filenames( + self, + folder: str, + filters: list[str], + expected_basenames: list[str], + ) -> None: + """Helper method to test `list_filenames`. + + Args: + folder: The directory to search for files. + filters: List of filters to apply to filenames. + expected_basenames: List of expected filenames that match the filters. + """ + files = list_filenames(folder=folder, filters=filters) + assert len(files) == len(expected_basenames) + basenames = [f.removeprefix(f"{folder}/") for f in files] + assert sorted(basenames) == sorted(expected_basenames) From d963fa407e90ec2b4c8552576db23e7f614aa942 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Fri, 5 Jul 2024 11:36:35 -0400 Subject: [PATCH 12/19] Documented & refactored list_filenames * Added more detail to the `gempyor.utils.list_filenames` type hints. * Formatted the documentation to comply with the Google style guide. * Refactored the internals of list_filenames to be single list comprehension instead of a loop of nested conditionals. * Allow `filters` to accept a single string. --- flepimop/gempyor_pkg/src/gempyor/utils.py | 67 ++++++++++++++--------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 075162ce4..afafc5f89 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -309,33 +309,48 @@ def as_random_distribution(self): return functools.partial(np.random.uniform, self.as_evaled_expression(), self.as_evaled_expression(),) -def list_filenames(folder: str = ".", filters: list = []) -> list: - """ - return the list of all filename and path in the provided folders. - If filters [list] is provided, then only the files that contains each of the - substrings in filter will be returned. 
Example to get all hosp file: - ``` - gempyor.utils.list_filenames(folder="model_output/", filters=["hosp"]) - ``` - and be sure we only get parquet: - ``` - gempyor.utils.list_filenames(folder="model_output/", filters=["hosp" , ".parquet"]) - ``` +def list_filenames(folder: str = ".", filters: str | list[str] = []) -> list[str]: + """Return the list of all filenames and paths in the provided folder. + + This function lists all files in the specified folder and its subdirectories. + If filters are provided, only the files containing each of the substrings + in the filters will be returned. + + Example: + To get all files containing "hosp": + ``` + gempyor.utils.list_filenames( + folder="model_output/", + filters=["hosp"], + ) + ``` + + To get only "hosp" files with a ".parquet" extension: + ``` + gempyor.utils.list_filenames( + folder="model_output/", + filters=["hosp", ".parquet"], + ) + ``` + + Args: + folder: The directory to search for files. Defaults to the current directory. + filters: A string or a list of strings to filter filenames. Only files + containing all the provided substrings will be returned. Defaults to an + empty list. + + Returns: + A list of strings representing the paths to the files that match the filters. """ - from pathlib import Path - - fn_list = [] - for f in Path(str(folder)).rglob(f"*"): - if f.is_file(): # not a folder - f = str(f) - if not filters: - fn_list.append(f) - else: - if all(c in f for c in filters): - fn_list.append(str(f)) - else: - pass - return fn_list + filters = list(filters) if not isinstance(filters, list) else filters + filters = filters if len(filters) else [""] + folder = Path(folder) + files = [ + str(file) + for file in folder.rglob("*") + if file.is_file() and all(f in str(file) for f in filters) + ] + return files def rolling_mean_pad(data, window): From 0e710f4b5750ec7a6ddd145b8d32a0fa7a037303 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Fri, 5 Jul 2024 12:02:36 -0400 Subject: [PATCH 13/19] Expanded types and tests for list_filenames * Expanded type support for the `folder` arg of `gempyor.utils.list_filnames` to support bytes and os.PathLike. * Vastly expanded test suite to target new supported types. * Corrected bug when filters was given as a string, uncovered by extensive test suite. --- flepimop/gempyor_pkg/src/gempyor/utils.py | 9 ++- .../tests/utils/test_list_filenames.py | 69 +++++++++++++++++-- 2 files changed, 71 insertions(+), 7 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index afafc5f89..5c894225a 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -309,7 +309,10 @@ def as_random_distribution(self): return functools.partial(np.random.uniform, self.as_evaled_expression(), self.as_evaled_expression(),) -def list_filenames(folder: str = ".", filters: str | list[str] = []) -> list[str]: +def list_filenames( + folder: str | bytes | os.PathLike = ".", + filters: str | list[str] = [], +) -> list[str]: """Return the list of all filenames and paths in the provided folder. This function lists all files in the specified folder and its subdirectories. @@ -342,9 +345,9 @@ def list_filenames(folder: str = ".", filters: str | list[str] = []) -> list[str Returns: A list of strings representing the paths to the files that match the filters. 
""" - filters = list(filters) if not isinstance(filters, list) else filters + filters = [filters] if not isinstance(filters, list) else filters filters = filters if len(filters) else [""] - folder = Path(folder) + folder = Path(folder.decode() if isinstance(folder, bytes) else folder) files = [ str(file) for file in folder.rglob("*") diff --git a/flepimop/gempyor_pkg/tests/utils/test_list_filenames.py b/flepimop/gempyor_pkg/tests/utils/test_list_filenames.py index dea2b4cbc..f24c6aaa6 100644 --- a/flepimop/gempyor_pkg/tests/utils/test_list_filenames.py +++ b/flepimop/gempyor_pkg/tests/utils/test_list_filenames.py @@ -4,6 +4,7 @@ """ from collections.abc import Generator +import os from pathlib import Path from tempfile import TemporaryDirectory @@ -64,11 +65,18 @@ class TestListFilenames: @pytest.mark.parametrize( "filters,expected_basenames", [ + ("", ["hosp.csv", "hosp.parquet", "spar.csv", "spar.parquet"]), ([], ["hosp.csv", "hosp.parquet", "spar.csv", "spar.parquet"]), + ("hosp", ["hosp.csv", "hosp.parquet"]), (["hosp"], ["hosp.csv", "hosp.parquet"]), + ("spar", ["spar.csv", "spar.parquet"]), (["spar"], ["spar.csv", "spar.parquet"]), + (".parquet", ["hosp.parquet", "spar.parquet"]), ([".parquet"], ["hosp.parquet", "spar.parquet"]), + (".csv", ["hosp.csv", "spar.csv"]), ([".csv"], ["hosp.csv", "spar.csv"]), + (".tsv", []), + ([".tsv"], []), (["hosp", ".csv"], ["hosp.csv"]), (["spar", ".parquet"], ["spar.parquet"]), (["hosp", "spar"], []), @@ -77,7 +85,7 @@ class TestListFilenames: ) def test_finds_files_in_flat_directory( self, - filters: list[str], + filters: str | list[str], expected_basenames: list[str], ) -> None: """Test `list_filenames` in a flat directory. @@ -91,10 +99,31 @@ def test_finds_files_in_flat_directory( filters=filters, expected_basenames=expected_basenames, ) + self._test_list_filenames( + folder=self.flat_temp_dir.name.encode(), + filters=filters, + expected_basenames=expected_basenames, + ) + self._test_list_filenames( + folder=Path(self.flat_temp_dir.name), + filters=filters, + expected_basenames=expected_basenames, + ) @pytest.mark.parametrize( "filters,expected_basenames", [ + ( + "", + [ + "hpar/chimeric/001.parquet", + "hpar/chimeric/002.parquet", + "hpar/global/001.parquet", + "hpar/global/002.parquet", + "seed/001.csv", + "seed/002.csv", + ], + ), ( [], [ @@ -106,6 +135,15 @@ def test_finds_files_in_flat_directory( "seed/002.csv", ], ), + ( + "hpar", + [ + "hpar/chimeric/001.parquet", + "hpar/chimeric/002.parquet", + "hpar/global/001.parquet", + "hpar/global/002.parquet", + ], + ), ( ["hpar"], [ @@ -115,8 +153,18 @@ def test_finds_files_in_flat_directory( "hpar/global/002.parquet", ], ), + ("seed", ["seed/001.csv", "seed/002.csv"]), (["seed"], ["seed/001.csv", "seed/002.csv"]), + ("global", ["hpar/global/001.parquet", "hpar/global/002.parquet"]), (["global"], ["hpar/global/001.parquet", "hpar/global/002.parquet"]), + ( + "001", + [ + "hpar/chimeric/001.parquet", + "hpar/global/001.parquet", + "seed/001.csv", + ], + ), ( ["001"], [ @@ -127,12 +175,14 @@ def test_finds_files_in_flat_directory( ), (["hpar", "001"], ["hpar/chimeric/001.parquet", "hpar/global/001.parquet"]), (["seed", "002"], ["seed/002.csv"]), + (["hpar", "001", "global"], ["hpar/global/001.parquet"]), + (".tsv", []), ([".tsv"], []), ], ) def test_find_files_in_nested_directory( self, - filters: list[str], + filters: str | list[str], expected_basenames: list[str], ) -> None: """Test `list_filenames` in a nested directory. 
@@ -146,11 +196,21 @@ def test_find_files_in_nested_directory( filters=filters, expected_basenames=expected_basenames, ) + self._test_list_filenames( + folder=self.nested_temp_dir.name.encode(), + filters=filters, + expected_basenames=expected_basenames, + ) + self._test_list_filenames( + folder=Path(self.nested_temp_dir.name), + filters=filters, + expected_basenames=expected_basenames, + ) def _test_list_filenames( self, - folder: str, - filters: list[str], + folder: str | bytes | os.PathLike, + filters: str | list[str], expected_basenames: list[str], ) -> None: """Helper method to test `list_filenames`. @@ -162,5 +222,6 @@ def _test_list_filenames( """ files = list_filenames(folder=folder, filters=filters) assert len(files) == len(expected_basenames) + folder = folder.decode() if isinstance(folder, bytes) else str(folder) basenames = [f.removeprefix(f"{folder}/") for f in files] assert sorted(basenames) == sorted(expected_basenames) From 2c7a81d78ce8e5cfc19ac9fe2865a02e7b31900d Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Fri, 5 Jul 2024 13:28:27 -0400 Subject: [PATCH 14/19] Added unit tests for get_truncated_normal Added unit tests for `gempyor.utils.get_truncated_normal` function. --- .../tests/utils/test_get_truncated_normal.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 flepimop/gempyor_pkg/tests/utils/test_get_truncated_normal.py diff --git a/flepimop/gempyor_pkg/tests/utils/test_get_truncated_normal.py b/flepimop/gempyor_pkg/tests/utils/test_get_truncated_normal.py new file mode 100644 index 000000000..23e4fad58 --- /dev/null +++ b/flepimop/gempyor_pkg/tests/utils/test_get_truncated_normal.py @@ -0,0 +1,49 @@ +import numpy as np +import pytest +import scipy.stats + +from gempyor.utils import get_truncated_normal + + +class TestGetTruncatedNormal: + """Unit tests for the `gempyor.utils.get_truncated_normal` function.""" + + @pytest.mark.parametrize( + "mean,sd,a,b", + [ + (0.0, 1.0, 0.0, 10.0), + (0.0, 2.0, -4.0, 4.0), + (-5.0, 3.0, -5.0, 10.0), + (-3.25, 1.4, -8.74, 4.89), + (0, 1, 0, 10), + (0, 2, -4, 4), + (-5, 3, -5, 10), + ], + ) + def test_construct_distribution( + self, + mean: float | int, + sd: float | int, + a: float | int, + b: float | int, + ) -> None: + """Test the construction of a truncated normal distribution. + + This test checks whether the `get_truncated_normal` function correctly + constructs a truncated normal distribution with the specified parameters. + It verifies that the returned object is an instance of `rv_frozen`, and that + its support and parameters (mean and standard deviation) are correctly set. + + Args: + mean: The mean of the truncated normal distribution. + sd: The standard deviation of the truncated normal distribution. + a: The lower bound of the truncated normal distribution. + b: The upper bound of the truncated normal distribution. + """ + dist = get_truncated_normal(mean=mean, sd=sd, a=a, b=b) + assert isinstance(dist, scipy.stats._distn_infrastructure.rv_frozen) + lower, upper = dist.support() + assert np.isclose(lower, a) + assert np.isclose(upper, b) + assert np.isclose(dist.kwds.get("loc"), mean) + assert np.isclose(dist.kwds.get("scale"), sd) From 17c715d344023fd4e3a0926739e1da2814859d97 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Fri, 5 Jul 2024 13:36:33 -0400 Subject: [PATCH 15/19] Documented and refactored get_truncated_normal * Documented `gempyor.utils.get_truncated_normal` including adding appropriate type hints. * Refactored the function lightly for legibility. 
--- flepimop/gempyor_pkg/src/gempyor/utils.py | 27 ++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 5c894225a..826727c86 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -252,10 +252,31 @@ def as_evaled_expression(self): raise ValueError(f"expected numeric or string expression [got: {value}]") -def get_truncated_normal(*, mean=0, sd=1, a=0, b=10): - "Returns the truncated normal distribution" +def get_truncated_normal( + mean: float | int = 0, + sd: float | int = 1, + a: float | int = 0, + b: float | int = 10, +) -> scipy.stats._distn_infrastructure.rv_frozen: + """Returns a truncated normal distribution. + + This function constructs a truncated normal distribution with the specified + mean, standard deviation, and bounds. The truncated normal distribution is + a normal distribution bounded within the interval [a, b]. - return scipy.stats.truncnorm((a - mean) / sd, (b - mean) / sd, loc=mean, scale=sd) + Args: + mean: The mean of the truncated normal distribution. Defaults to 0. + sd: The standard deviation of the truncated normal distribution. Defaults to 1. + a: The lower bound of the truncated normal distribution. Defaults to 0. + b: The upper bound of the truncated normal distribution. Defaults to 10. + + Returns: + rv_frozen: A frozen instance of the truncated normal distribution with the + specified parameters. + """ + lower = (a - mean) / sd + upper = (b - mean) / sd + return scipy.stats.truncnorm(lower, upper, loc=mean, scale=sd) def get_log_normal(meanlog, sdlog): From 2aaa6fbb8e7dd27f8e29c41ca9015920ce03130c Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Fri, 5 Jul 2024 14:30:15 -0400 Subject: [PATCH 16/19] Added unit tests for get_log_normal Added unit tests for the `gempyor.utils.get_log_normal` function. --- .../tests/utils/test_get_log_normal.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 flepimop/gempyor_pkg/tests/utils/test_get_log_normal.py diff --git a/flepimop/gempyor_pkg/tests/utils/test_get_log_normal.py b/flepimop/gempyor_pkg/tests/utils/test_get_log_normal.py new file mode 100644 index 000000000..367a7f550 --- /dev/null +++ b/flepimop/gempyor_pkg/tests/utils/test_get_log_normal.py @@ -0,0 +1,49 @@ +import numpy as np +import pytest +import scipy.stats + +from gempyor.utils import get_log_normal + + +class TestGetLogNormal: + """Unit tests for the `gempyor.utils.get_log_normal` function.""" + + @pytest.mark.parametrize( + "meanlog,sdlog", + [ + (1.0, 1.0), + (0.0, 2.0), + (10.0, 30.0), + (0.33, 4.56), + (9.87, 4.21), + (1, 1), + (0, 2), + (10, 30), + ], + ) + def test_construct_distribution( + self, + meanlog: float | int, + sdlog: float | int, + ) -> None: + """Test the construction of a log normal distribution. + + This test checks whether the `get_log_normal` function correctly constructs + a log normal distribution with the specified parameters. It verifies that + the returned object is an instance of `rv_frozen`, and that its support and + parameters (log mean and log standard deviation) are correctly set. + + Args: + mean: The mean of the truncated normal distribution. + sd: The standard deviation of the truncated normal distribution. + a: The lower bound of the truncated normal distribution. + b: The upper bound of the truncated normal distribution. 
+ """ + dist = get_log_normal(meanlog=meanlog, sdlog=sdlog) + assert isinstance(dist, scipy.stats._distn_infrastructure.rv_frozen) + lower, upper = dist.support() + assert np.isclose(lower, 0.0) + assert np.isclose(upper, np.inf) + assert np.isclose(dist.kwds.get("s"), sdlog) + assert np.isclose(dist.kwds.get("scale"), np.exp(meanlog)) + assert np.isclose(dist.kwds.get("loc"), 0.0) From c09b9535ddfc364702bb048fad168fceab71ac3d Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Fri, 5 Jul 2024 14:35:47 -0400 Subject: [PATCH 17/19] Documented get_log_normal Added documentation for `gempyor.utils.get_log_normal` including adding appropriate type hints. --- flepimop/gempyor_pkg/src/gempyor/utils.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 826727c86..98c54a1f1 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -279,8 +279,23 @@ def get_truncated_normal( return scipy.stats.truncnorm(lower, upper, loc=mean, scale=sd) -def get_log_normal(meanlog, sdlog): - "Returns the log normal distribution" +def get_log_normal( + meanlog: float | int, + sdlog: float | int, +) -> scipy.stats._distn_infrastructure.rv_frozen: + """Returns a log normal distribution. + + This function constructs a log normal distribution with the specified + log mean and log standard deviation. + + Args: + meanlog: The log of the mean of the log normal distribution. + sdlog: The log of the standard deviation of the log normal distribution. + + Returns: + rv_frozen: A frozen instance of the log normal distribution with the + specified parameters. + """ return scipy.stats.lognorm(s=sdlog, scale=np.exp(meanlog), loc=0) From e073f883fa028467386f7292f4cf3f2703a82358 Mon Sep 17 00:00:00 2001 From: Timothy Willard Date: Mon, 8 Jul 2024 10:33:06 -0400 Subject: [PATCH 18/19] Added unit tests for rolling_mean_pad * Added unit tests for `gempyor.utils.rolling_mean_pad` in `tests/utils/test_rolling_mean_pad.py`. * Wrote a reference implementation of `rolling_mean_pad` for comparison purposes. 
--- .../tests/utils/test_rolling_mean_pad.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 flepimop/gempyor_pkg/tests/utils/test_rolling_mean_pad.py diff --git a/flepimop/gempyor_pkg/tests/utils/test_rolling_mean_pad.py b/flepimop/gempyor_pkg/tests/utils/test_rolling_mean_pad.py new file mode 100644 index 000000000..94be3394a --- /dev/null +++ b/flepimop/gempyor_pkg/tests/utils/test_rolling_mean_pad.py @@ -0,0 +1,149 @@ +import numpy as np +import numpy.typing as npt +import pytest + +from gempyor.utils import rolling_mean_pad + + +class TestRollingMeanPad: + """Unit tests for the `gempyor.utils.rolling_mean_pad` function.""" + + # Test data for various matrix configurations + test_data = { + # 1x1 matrices + "one_by_one_const": np.array([[1.0]]), + "one_by_one_nan": np.array([[np.nan]]), + "one_by_one_rand": np.random.uniform(size=(1, 1)), + # 1xN matrices + "one_by_many_const": np.arange(start=1.0, stop=6.0).reshape((1, 5)), + "one_by_many_nan": np.repeat(np.nan, 5).reshape((1, 5)), + "one_by_many_rand": np.random.uniform(size=(1, 5)), + # Mx1 matrices + "many_by_one_const": np.arange(start=3.0, stop=9.0).reshape((6, 1)), + "many_by_one_nan": np.repeat(np.nan, 6).reshape((6, 1)), + "many_by_one_rand": np.random.uniform(size=(6, 1)), + # MxN matrices + "many_by_many_const": np.arange(start=1.0, stop=49.0).reshape((12, 4)), + "many_by_many_nan": np.repeat(np.nan, 48).reshape((12, 4)), + "many_by_many_rand": np.random.uniform(size=(12, 4)), + } + + @pytest.mark.parametrize( + "test_data_name,expected_shape,window,put_nans", + [ + # 1x1 matrices + ("one_by_one_const", (1, 1), 3, []), + ("one_by_one_const", (1, 1), 4, []), + ("one_by_one_nan", (1, 1), 3, []), + ("one_by_one_nan", (1, 1), 4, []), + ("one_by_one_rand", (1, 1), 3, []), + ("one_by_one_rand", (1, 1), 4, []), + ("one_by_one_rand", (1, 1), 5, []), + ("one_by_one_rand", (1, 1), 6, []), + # 1xN matrices + ("one_by_many_const", (1, 5), 3, []), + ("one_by_many_const", (1, 5), 4, []), + ("one_by_many_nan", (1, 5), 3, []), + ("one_by_many_nan", (1, 5), 4, []), + ("one_by_many_rand", (1, 5), 3, []), + ("one_by_many_rand", (1, 5), 4, []), + ("one_by_many_rand", (1, 5), 5, []), + ("one_by_many_rand", (1, 5), 6, []), + # Mx1 matrices + ("many_by_one_const", (6, 1), 3, []), + ("many_by_one_const", (6, 1), 4, []), + ("many_by_one_nan", (6, 1), 3, []), + ("many_by_one_nan", (6, 1), 4, []), + ("many_by_one_rand", (6, 1), 3, []), + ("many_by_one_rand", (6, 1), 4, []), + ("many_by_one_rand", (6, 1), 5, []), + ("many_by_one_rand", (6, 1), 6, []), + # MxN matrices + ("many_by_many_const", (12, 4), 3, []), + ("many_by_many_const", (12, 4), 4, []), + ("many_by_many_const", (12, 4), 5, []), + ("many_by_many_const", (12, 4), 6, []), + ("many_by_many_nan", (12, 4), 3, []), + ("many_by_many_nan", (12, 4), 4, []), + ("many_by_many_nan", (12, 4), 5, []), + ("many_by_many_nan", (12, 4), 6, []), + ("many_by_many_rand", (12, 4), 3, []), + ("many_by_many_rand", (12, 4), 4, []), + ("many_by_many_rand", (12, 4), 5, []), + ("many_by_many_rand", (12, 4), 6, []), + ("many_by_many_rand", (12, 4), 7, []), + ("many_by_many_rand", (12, 4), 8, []), + ("many_by_many_rand", (12, 4), 9, []), + ("many_by_many_rand", (12, 4), 10, []), + ("many_by_many_rand", (12, 4), 11, []), + ("many_by_many_rand", (12, 4), 12, []), + ("many_by_many_rand", (12, 4), 13, []), + ("many_by_many_rand", (12, 4), 14, []), + ("many_by_many_rand", (12, 4), 3, [(2, 2), (4, 4)]), + ("many_by_many_rand", (12, 4), 4, [(2, 2), (4, 4)]), + ("many_by_many_rand", (12, 4), 5, 
[(2, 2), (4, 4)]),
+            ("many_by_many_rand", (12, 4), 6, [(2, 2), (4, 4)]),
+            ("many_by_many_rand", (12, 4), 7, [(2, 2), (4, 4)]),
+            ("many_by_many_rand", (12, 4), 8, [(2, 2), (4, 4)]),
+        ],
+    )
+    def test_rolling_mean_pad(
+        self,
+        test_data_name: str,
+        expected_shape: tuple[int, int],
+        window: int,
+        put_nans: list[tuple[int, int]],
+    ) -> None:
+        """Tests the `rolling_mean_pad` function with various configurations of input data.
+
+        Args:
+            test_data_name: The name of the test data set to use.
+            expected_shape: The expected shape of the output array.
+            window: The size of the rolling window.
+            put_nans: Indices, flattened by `np.put`, at which to insert NaNs into the input data.
+
+        Raises:
+            AssertionError: If the shape or contents of the output do not match the
+                expected values.
+        """
+        test_data = self.test_data.get(test_data_name).copy()
+        if put_nans:
+            np.put(test_data, put_nans, np.nan)
+        rolling_mean_data = rolling_mean_pad(test_data, window)
+        rolling_mean_reference = self._rolling_mean_pad_reference(test_data, window)
+        assert rolling_mean_data.shape == expected_shape
+        assert np.isclose(
+            rolling_mean_data, rolling_mean_reference, equal_nan=True
+        ).all()
+
+    def _rolling_mean_pad_reference(
+        self, data: npt.NDArray[np.number], window: int
+    ) -> npt.NDArray[np.number]:
+        """Generates a reference rolling mean with padding.
+
+        This implementation should match the `gempyor.utils.rolling_mean_pad`
+        implementation, but is written for readability. As a result, this
+        reference implementation is extremely slow.
+
+        Args:
+            data: The input array for which to compute the rolling mean.
+            window: The size of the rolling window.
+
+        Returns:
+            An array of the same shape as `data` containing the rolling mean values.
+        """
+        # Setup
+        rows, cols = data.shape
+        output = np.zeros((rows, cols), dtype=data.dtype)
+        # Slow but intuitive triple loop
+        for i in range(rows):
+            for j in range(cols):
+                # On the last row with an even window, shrink the window by one,
+                # so 4 -> 3, while 5 -> 5.
+                sub_window = window - 1 if window % 2 == 0 and i == rows - 1 else window
+                weight = 1.0 / sub_window
+                for offset in range(-((sub_window - 1) // 2), 1 + (sub_window // 2)):
+                    i_star = min(max(i + offset, 0), rows - 1)
+                    output[i, j] += weight * data[i_star, j]
+        # Done
+        return output

From a91fc6df031181b0e537b88e0ca60f9f3df287db Mon Sep 17 00:00:00 2001
From: Timothy Willard
Date: Mon, 8 Jul 2024 14:15:57 -0400
Subject: [PATCH 19/19] Documented `rolling_mean_pad`

* Added type hints for `gempyor.utils.rolling_mean_pad`.
* Expanded the existing docstring and included an example.

---
 flepimop/gempyor_pkg/src/gempyor/utils.py | 37 ++++++++++++++++++++---
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py
index 98c54a1f1..85820d5b2 100644
--- a/flepimop/gempyor_pkg/src/gempyor/utils.py
+++ b/flepimop/gempyor_pkg/src/gempyor/utils.py
@@ -13,6 +13,7 @@
 from botocore.exceptions import ClientError
 import confuse
 import numpy as np
+import numpy.typing as npt
 import pandas as pd
 import pyarrow as pa
 import scipy.stats
@@ -392,16 +393,44 @@ def list_filenames(
     return files
 
 
-def rolling_mean_pad(data, window):
+def rolling_mean_pad(
+    data: npt.NDArray[np.number],
+    window: int,
+) -> npt.NDArray[np.number]:
     """
-    Calculates rolling mean with centered window and pads the edges.
+    Calculates the column-wise rolling mean with a centered window, padding the edges.
 
     Args:
-        data: A NumPy array !!! shape must be (n_days, nsubpops). 
+        data: A two dimensional numpy array where, typically, the rows correspond to
+            time and the columns correspond to subpopulations.
+        window: The window size for the rolling mean.
 
     Returns:
-        A NumPy array with the padded rolling mean (n_days, nsubpops).
+        A two dimensional numpy array that is the same shape as `data`.
+
+    Examples:
+        Below is a brief example showcasing how to smooth a metric, such as
+        hospitalizations, using this function.
+
+        >>> import numpy as np
+        >>> from gempyor.utils import rolling_mean_pad
+        >>> hospitalizations = np.arange(1., 29.).reshape((7, 4))
+        >>> hospitalizations
+        array([[ 1.,  2.,  3.,  4.],
+               [ 5.,  6.,  7.,  8.],
+               [ 9., 10., 11., 12.],
+               [13., 14., 15., 16.],
+               [17., 18., 19., 20.],
+               [21., 22., 23., 24.],
+               [25., 26., 27., 28.]])
+        >>> rolling_mean_pad(hospitalizations, 5)
+        array([[ 3.4,  4.4,  5.4,  6.4],
+               [ 5.8,  6.8,  7.8,  8.8],
+               [ 9. , 10. , 11. , 12. ],
+               [13. , 14. , 15. , 16. ],
+               [17. , 18. , 19. , 20. ],
+               [20.2, 21.2, 22.2, 23.2],
+               [22.6, 23.6, 24.6, 25.6]])
     """
     padding_size = (window - 1) // 2
     padded_data = np.pad(data, ((padding_size, padding_size), (0, 0)), mode="edge")
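As a closing, hedged sketch (not part of the patch series): for an odd
window, the padded rolling mean above should agree, column by column, with
an edge-padded uniform convolution, which makes for a quick independent
check. The data and window size below are arbitrary, and even windows are
excluded because of the special last-row handling described in the
reference implementation:

    >>> import numpy as np
    >>> from gempyor.utils import rolling_mean_pad
    >>> column = np.array([[2.0], [4.0], [8.0], [16.0], [32.0]])  # arbitrary data
    >>> window = 3  # must be odd for this equivalence to hold
    >>> padded = np.pad(column[:, 0], window // 2, mode="edge")
    >>> uniform = np.convolve(padded, np.ones(window) / window, mode="valid")
    >>> bool(np.allclose(rolling_mean_pad(column, window)[:, 0], uniform))
    True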