
Commit

[pre-commit.ci] pre-commit autoupdate (#79)
* [pre-commit.ci] pre-commit autoupdate

updates:
- [github.com/astral-sh/ruff-pre-commit: v0.1.5 → v0.1.9](astral-sh/ruff-pre-commit@v0.1.5...v0.1.9)
- [github.com/pre-commit/mirrors-mypy: v1.6.1 → v1.8.0](pre-commit/mirrors-mypy@v1.6.1...v1.8.0)

* Update GitHub Actions versions

* fix line too long ruff errors

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Janosh Riebesell <[email protected]>
pre-commit-ci[bot] and janosh committed Jan 2, 2024
1 parent 18a5607 commit a7344c1
Showing 20 changed files with 217 additions and 174 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/link-check.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Run markdown link check
         uses: gaurav-nelson/github-action-markdown-link-check@v1
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -13,10 +13,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: 3.8
           cache: pip
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -5,7 +5,7 @@ ci:

 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.5
+    rev: v0.1.9
     hooks:
       - id: ruff
         args: [--fix]
@@ -30,7 +30,7 @@ repos:
         args: [--check-filenames]

   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.6.1
+    rev: v1.8.0
     hooks:
       - id: mypy
         exclude: (tests|examples)/
4 changes: 3 additions & 1 deletion aviary/cgcnn/data.py
@@ -327,7 +327,9 @@ def get_structure_neighbor_info(
         )

         if max_num_nbr is not None:
-            _center_indices, _neighbor_indices, _neighbor_dists = [], [], []
+            _center_indices: list[int] = []
+            _neighbor_indices: list[int] = []
+            _neighbor_dists: list[float] = []

             for _, idx_group in itertools.groupby(  # group by site index
                 zip(site_indices, neighbor_indices, neighbor_dists), key=lambda x: x[0]
             ):
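For context, this hunk feeds a loop that caps each site's neighbor list at `max_num_nbr`. A minimal sketch of that grouping pattern with made-up input lists (whether the real implementation sorts each group by distance before truncating is an assumption here):

```python
import itertools

# hypothetical flat neighbor data: (center site, neighbor site, distance),
# already sorted by center site as itertools.groupby requires
site_indices = [0, 0, 0, 1, 1]
neighbor_indices = [1, 2, 3, 0, 2]
neighbor_dists = [2.1, 2.5, 3.0, 2.1, 2.8]
max_num_nbr = 2

_center_indices: list[int] = []
_neighbor_indices: list[int] = []
_neighbor_dists: list[float] = []

for _, idx_group in itertools.groupby(  # group by site index
    zip(site_indices, neighbor_indices, neighbor_dists), key=lambda x: x[0]
):
    # keep only the max_num_nbr nearest neighbors of each site
    sites, neighbors, dists = zip(*sorted(idx_group, key=lambda x: x[2]))
    _center_indices.extend(sites[:max_num_nbr])
    _neighbor_indices.extend(neighbors[:max_num_nbr])
    _neighbor_dists.extend(dists[:max_num_nbr])

print(_center_indices, _neighbor_indices)  # [0, 0, 1, 1] [1, 2, 0, 2]
```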
23 changes: 13 additions & 10 deletions aviary/cgcnn/model.py
@@ -37,17 +37,18 @@ def __init__(
         """Initialize CrystalGraphConvNet.

         Args:
-            robust (bool): If True, the number of model outputs is doubled. 2nd output for each
-                target will be an estimate for the aleatoric uncertainty (uncertainty inherent to
-                the sample) which can be used with a robust loss function to attenuate the weighting
-                of uncertain samples.
+            robust (bool): If True, the number of model outputs is doubled. 2nd output
+                for each target will be an estimate for the aleatoric uncertainty
+                (uncertainty inherent to the sample) which can be used with a robust
+                loss function to attenuate the weighting of uncertain samples.
             n_targets (list[int]): Number of targets to train on
             elem_emb_len (int): Number of atom features in the input.
             nbr_fea_len (int): Number of bond features.
-            elem_fea_len (int, optional): Number of hidden atom features in the convolutional
-                layers. Defaults to 64.
+            elem_fea_len (int, optional): Number of hidden atom features in the
+                convolutional layers. Defaults to 64.
             n_graph (int, optional): Number of convolutional layers. Defaults to 4.
-            h_fea_len (int, optional): Number of hidden features after pooling. Defaults to 128.
+            h_fea_len (int, optional): Number of hidden features after pooling. Defaults
+                to 128.
             n_trunk (int, optional): Number of hidden layers in trunk after pooling.
                 Defaults to 1.
             n_hidden (int, optional): Number of hidden layers after trunk for each task.
@@ -120,7 +121,9 @@ def forward(


 class DescriptorNetwork(nn.Module):
-    """The Descriptor Network is the message passing section of the CrystalGraphConvNet Model."""
+    """The Descriptor Network is the message passing section of the CrystalGraphConvNet
+    Model.
+    """

     def __init__(
         self,
@@ -134,8 +137,8 @@ def __init__(
         Args:
             elem_emb_len (int): Number of atom features in the input.
             nbr_fea_len (int): Number of bond features.
-            elem_fea_len (int, optional): Number of hidden atom features in the graph convolution
-                layers. Defaults to 64.
+            elem_fea_len (int, optional): Number of hidden atom features in the graph
+                convolution layers. Defaults to 64.
             n_graph (int, optional): Number of graph convolution layers. Defaults to 4.
         """
         super().__init__()
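As background on the `robust` flag: with doubled outputs, each target's second output can feed a heteroscedastic loss. This diff doesn't show aviary's actual loss function, so the following is only an illustrative sketch of one common choice (a Laplace negative log-likelihood):

```python
import torch

def robust_l1_loss(pred: torch.Tensor, log_std: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Down-weight the absolute error of samples the model flags as uncertain
    (large log_std), at the price of an additive log_std penalty.
    """
    return ((pred - target).abs() * torch.exp(-log_std) + log_std).mean()

# with robust=True the model emits 2 numbers per regression target
output = torch.randn(8, 2, requires_grad=True)  # hypothetical batch of 8 samples
pred, log_std = output.unbind(dim=1)
loss = robust_l1_loss(pred, log_std, target=torch.randn(8))
loss.backward()
```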
85 changes: 50 additions & 35 deletions aviary/core.py
@@ -39,14 +39,16 @@ def __init__(
         """Store core model parameters.

         Args:
-            task_dict (dict[str, TaskType]): Map target names to "regression" or "classification".
-            robust (bool): If True, the number of model outputs is doubled. 2nd output for each
-                target will be an estimate for the aleatoric uncertainty (uncertainty inherent to
-                the sample) which can be used with a robust loss function to attenuate the weighting
-                of uncertain samples.
-            epoch (int, optional): Epoch model training will begin/resume from. Defaults to 0.
-            best_val_scores (dict[str, float], optional): Validation score to use for early
-                stopping. Defaults to None.
+            task_dict (dict[str, TaskType]): Map target names to "regression" or
+                "classification".
+            robust (bool): If True, the number of model outputs is doubled. 2nd output
+                for each target will be an estimate for the aleatoric uncertainty
+                (uncertainty inherent to the sample) which can be used with a robust
+                loss function to attenuate the weighting of uncertain samples.
+            epoch (int, optional): Epoch model training will begin/resume from.
+                Defaults to 0.
+            best_val_scores (dict[str, float], optional): Validation score to use for
+                early stopping. Defaults to None.
         """
         super().__init__()
         self.task_dict = task_dict
@@ -79,18 +81,22 @@ def fit(
         Args:
             train_loader (DataLoader): Dataloader containing training data.
             val_loader (DataLoader): Dataloader containing validation data.
-            optimizer (torch.optim.Optimizer): Optimizer used to carry out parameter updates.
+            optimizer (torch.optim.Optimizer): Optimizer used to carry out parameter
+                updates.
             scheduler (torch.optim.lr_scheduler._LRScheduler): Scheduler used to adjust
                 Optimizer during training.
             epochs (int): Number of epochs to train for.
-            loss_dict (dict[str, nn.Module]): Dictionary of losses to apply for each task.
+            loss_dict (dict[str, nn.Module]): Dict of losses to apply for each task.
             normalizer_dict (dict[str, Normalizer]): Dictionary of Normalizers to apply
                 to each task.
             model_name (str): String describing the model.
             run_id (int): Unique identifier of the model run.
-            checkpoint (bool, optional): Whether to save model checkpoints. Defaults to True.
-            writer (SummaryWriter, optional): TensorBoard writer for saving logs. Defaults to None.
-            verbose (bool, optional): Whether to print out intermediate results. Defaults to True.
+            checkpoint (bool, optional): Whether to save model checkpoints.
+                Defaults to True.
+            writer (SummaryWriter, optional): TensorBoard writer for saving logs.
+                Defaults to None.
+            verbose (bool, optional): Whether to print out intermediate results.
+                Defaults to True.
             patience (int, optional): Patience for early stopping. Defaults to None.
         """
         start_epoch = self.epoch
@@ -157,7 +163,8 @@ def fit(
                 self.es_patience += 1
                 if patience and self.es_patience > patience:
                     print(
-                        "Stopping early due to lack of improvement on validation set"
+                        f"No improvement on validation set for {patience} "
+                        "epochs, stopping early"
                     )
                     break
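The reworded message above comes from fit()'s early-stopping branch. A rough standalone sketch of that counter logic (names other than `patience` and the counter are assumptions, since the hunk only shows a fragment):

```python
# minimal early-stopping sketch; the score list stands in for per-epoch
# validation metrics, which this diff only shows the bookkeeping for
patience = 5
es_patience = 0
best_score = float("inf")

for val_score in [3.0, 2.5, 2.6, 2.7, 2.55, 2.8, 2.9, 3.1]:
    if val_score < best_score:  # any improvement resets the counter
        best_score, es_patience = val_score, 0
    else:
        es_patience += 1
        if patience and es_patience > patience:
            print(f"No improvement on validation set for {patience} epochs, stopping early")
            break
```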

@@ -214,20 +221,23 @@ def evaluate(
         """Evaluate the model.

         Args:
-            data_loader (DataLoader): PyTorch Dataloader with the same data format used in fit().
+            data_loader (DataLoader): PyTorch Dataloader with the same data format used
+                in fit().
             loss_dict (dict[str, tuple[TaskType, nn.Module]]): Dictionary of losses
                 to apply for each task.
             optimizer (torch.optim.Optimizer): PyTorch Optimizer
             normalizer_dict (dict[str, Normalizer]): Dictionary of Normalizers to apply
                 to each task.
-            action ("train" | "evaluate", optional): Whether to track gradients depending on
-                whether we are carrying out a training or validation pass. Defaults to "train".
-            verbose (bool, optional): Whether to print out intermediate results. Defaults to False.
+            action ("train" | "evaluate", optional): Whether to track gradients
+                depending on whether we are carrying out a training or validation pass.
+                Defaults to "train".
+            verbose (bool, optional): Whether to print out intermediate results.
+                Defaults to False.
             pbar (bool, optional): Whether to display a progress bar. Defaults to False.

         Returns:
-            dict[str, dict["Loss" | "MAE" | "RMSE" | "Accuracy" | "F1", np.ndarray]]: nested
-                dictionary for each target of metrics averaged over an epoch.
+            dict[str, dict["Loss" | "MAE" | "RMSE" | "Accuracy" | "F1", np.ndarray]]:
+                nested dictionary for each target of metrics averaged over an epoch.
         """
         if action == "evaluate":
             self.eval()
@@ -240,9 +250,9 @@ def evaluate(
             lambda: defaultdict(list)
         )

-        # *_ discards identifiers like material_id and formula which we don't need when training
-        # tqdm(disable=None) means suppress output in non-tty (e.g. CI/log files) but keep in
-        # terminal (i.e. tty mode) https://git.io/JnBOi
+        # *_ discards identifiers like material_id and formula which we don't need when
+        # training. tqdm(disable=None) means suppress output in non-tty (e.g. CI/log
+        # files) but keep in terminal (i.e. tty mode) https://git.io/JnBOi
         for inputs, targets_list, *_ in tqdm(
             data_loader, disable=None if pbar else True
         ):
@@ -298,8 +308,8 @@ def evaluate(

                 epoch_metrics[target_name]["Loss"].append(loss.cpu().item())

-                # NOTE multitasking currently just uses a direct sum of individual target losses
-                # this should be okay but is perhaps sub-optimal
+                # NOTE multitasking currently just uses a direct sum of individual
+                # target losses; this should be okay but is perhaps sub-optimal
                 mixed_loss += loss

             if action == "train":
@@ -335,10 +345,11 @@ def predict(
         """Make model predictions. Supports multi-tasking.

         Args:
-            data_loader (DataLoader): Iterator that yields mini-batches with the same data
-                format used in fit(). To speed up inference, batch size can be set much
-                larger than during training.
-            verbose (bool, optional): Whether to print out intermediate results. Defaults to False.
+            data_loader (DataLoader): Iterator that yields mini-batches with the same
+                data format used in fit(). To speed up inference, batch size can be set
+                much larger than during training.
+            verbose (bool, optional): Whether to print out intermediate results.
+                Defaults to False.

         Returns:
             3 tuples where tuple items correspond to different multitask targets.
@@ -382,7 +393,8 @@ def featurize(self, data_loader: DataLoader) -> np.ndarray:
         this runs only the message-passing part of the model without the ResNet.

         Args:
-            data_loader (DataLoader): PyTorch Dataloader with the same data format used in fit()
+            data_loader (DataLoader): PyTorch Dataloader with the same data format used
+                in fit()

         Returns:
             np.array: 2d array of features
@@ -409,7 +421,8 @@ def num_params(self) -> int:
     def __repr__(self) -> str:
         """Return model name with number of parameters and epochs trained."""
         n_params, n_epochs = self.num_params, self.epoch
-        return f"{type(self).__name__} with {n_params:,} trainable params at {n_epochs:,} epochs"
+        cls_name = type(self).__name__
+        return f"{cls_name} with {n_params:,} trainable params at {n_epochs:,} epochs"


 class Normalizer:
@@ -425,8 +438,8 @@ def fit(self, tensor: Tensor, dim: int = 0, keepdim: bool = False) -> None:

         Args:
             tensor (Tensor): Tensor to determine the mean and standard deviation over.
-            dim (int, optional): Which dimension to take mean and standard deviation over.
-                Defaults to 0.
+            dim (int, optional): Which dimension to take mean and standard deviation
+                over. Defaults to 0.
             keepdim (bool, optional): Whether to keep the reduced dimension in Tensor.
                 Defaults to False.
         """
@@ -495,7 +508,8 @@ def save_checkpoint(
     """Saves a checkpoint and overwrites the best model when is_best = True.

     Args:
-        state (dict[str, Any]): Model parameters and other stateful objects like optimizer.
+        state (dict[str, Any]): Model parameters and other stateful objects like
+            optimizer.
         is_best (bool): Whether the model is the best seen according to validation set.
         model_name (str): String describing the model.
         run_id (int): Unique identifier of the model run.
@@ -542,7 +556,8 @@ def np_softmax(arr: np.ndarray, axis: int = -1) -> np.ndarray:

     Args:
         arr (np.ndarray): Arbitrary dimensional array.
-        axis (int, optional): Dimension over which to take softmax. Defaults to -1 (last).
+        axis (int, optional): Dimension over which to take softmax. Defaults to
+            -1 (last).

     Returns:
         np.ndarray: Same dimension as input array, but specified axis reduced
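The Normalizer.fit signature in the hunk above suggests a mean/std scaler. A minimal sketch of such a class (the norm/denorm methods and attribute names are assumptions; only fit()'s signature appears in this diff):

```python
import torch
from torch import Tensor

class MinimalNormalizer:
    """Sketch of a mean/std normalizer matching the fit() signature above."""

    def fit(self, tensor: Tensor, dim: int = 0, keepdim: bool = False) -> None:
        # store mean and standard deviation along `dim` for later (de)normalization
        self.mean = tensor.mean(dim, keepdim=keepdim)
        self.std = tensor.std(dim, keepdim=keepdim)

    def norm(self, tensor: Tensor) -> Tensor:
        return (tensor - self.mean) / self.std

    def denorm(self, normed: Tensor) -> Tensor:
        return normed * self.std + self.mean

targets = torch.randn(100) * 3 + 7
normalizer = MinimalNormalizer()
normalizer.fit(targets)
assert torch.allclose(normalizer.denorm(normalizer.norm(targets)), targets, atol=1e-6)
```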
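Likewise for np_softmax, whose body this diff doesn't show, a numerically stable implementation consistent with the documented signature might look like this (a sketch, not necessarily aviary's actual code):

```python
import numpy as np

def np_softmax(arr: np.ndarray, axis: int = -1) -> np.ndarray:
    """Softmax over `axis`, shifted by the max for numerical stability."""
    shifted = arr - arr.max(axis=axis, keepdims=True)  # avoids overflow in exp
    exp = np.exp(shifted)
    return exp / exp.sum(axis=axis, keepdims=True)

probs = np_softmax(np.array([[1.0, 2.0, 3.0], [0.0, 0.0, 1000.0]]))
print(probs.sum(axis=-1))  # [1. 1.]
```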
7 changes: 4 additions & 3 deletions aviary/data.py
@@ -16,11 +16,12 @@ class InMemoryDataLoader:

     Source: https://discuss.pytorch.org/t/27014/6.

     Args:
-        *tensors: List of arrays or tensors. Must all have the same length in dimension 0.
+        *tensors: List of arrays or tensors. Must all have the same length in
+            dimension 0.
         collate_fn (Callable): Should accept variadic list of tensors and
             output a minibatch of data ready for model consumption.
-        batch_size (int, optional): Usually 64, 128 or 256. Can be larger for test set loaders
-            to speedup inference. Defaults to 64.
+        batch_size (int, optional): Usually 64, 128 or 256. Can be larger for test set
+            loaders to speed up inference. Defaults to 64.
         shuffle (bool, optional): If True, shuffle the data *in-place* whenever an
             iterator is created from this object. Defaults to False.
     """
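A minimal sketch of the pattern this class documents (slice pre-loaded tensors directly instead of indexing a Dataset item by item; everything beyond the documented args is an assumption):

```python
import torch

class TinyInMemoryLoader:
    """Sketch: batch pre-loaded tensors without per-item Dataset indexing."""

    def __init__(self, *tensors, collate_fn=lambda *batch: batch, batch_size=64, shuffle=False):
        if len({t.shape[0] for t in tensors}) != 1:
            raise ValueError("All tensors must have the same length in dimension 0")
        self.tensors, self.collate_fn = tensors, collate_fn
        self.batch_size, self.shuffle = batch_size, shuffle

    def __iter__(self):
        n_samples = self.tensors[0].shape[0]
        if self.shuffle:  # shuffle *in-place* with one shared permutation
            perm = torch.randperm(n_samples)
            self.tensors = tuple(t[perm] for t in self.tensors)
        for start in range(0, n_samples, self.batch_size):
            yield self.collate_fn(*(t[start : start + self.batch_size] for t in self.tensors))

loader = TinyInMemoryLoader(torch.arange(10), torch.arange(10) ** 2, batch_size=4, shuffle=True)
for inputs, targets in loader:
    print(inputs, targets)
```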
26 changes: 14 additions & 12 deletions aviary/roost/data.py
@@ -31,15 +31,16 @@ def __init__(
         Args:
             df (pd.DataFrame): Pandas dataframe holding input and target values.
-            task_dict (dict[str, "regression" | "classification"]): Map from target names to task
-                type.
-            elem_embedding (str, optional): One of "matscholar200", "cgcnn92", "megnet16",
-                "onehot112" or path to a file with custom element embeddings.
-                Defaults to "matscholar200".
+            task_dict (dict[str, "regression" | "classification"]): Map from target
+                names to task type.
+            elem_embedding (str, optional): One of "matscholar200", "cgcnn92",
+                "megnet16", "onehot112" or path to a file with custom element
+                embeddings. Defaults to "matscholar200".
             inputs (str, optional): df column name holding material compositions.
                 Defaults to "composition".
-            identifiers (list, optional): df columns for distinguishing data points. Will be
-                copied over into the model's output CSV. Defaults to ["material_id", "composition"].
+            identifiers (list, optional): df columns for distinguishing data points.
+                Will be copied over into the model's output CSV. Defaults to
+                ["material_id", "composition"].
         """
         if len(identifiers) != 2:
             raise AssertionError("Two identifiers are required")
@@ -100,11 +101,11 @@ def __getitem__(self, idx: int):
             elem_fea = np.vstack([self.elem_features[element] for element in elements])
         except AssertionError as exc:
             raise AssertionError(
-                f"{material_ids} ({composition}) contains element types not in embedding"
+                f"{material_ids} contains element types not in embedding"
             ) from exc
         except ValueError as exc:
             raise ValueError(
-                f"{material_ids} ({composition}) composition cannot be parsed into elements"
+                f"{material_ids} composition cannot be parsed into elements"
             ) from exc

         n_elems = len(elements)
@@ -150,13 +151,14 @@ def collate_batch(
         - nbr_fea (Tensor):
         - self_idx (LongTensor):
         - nbr_idx (LongTensor):
-        - target (Tensor | LongTensor): target values containing floats for regression or
-            integers as class labels for classification
+        - target (Tensor | LongTensor): target values containing floats for
+            regression or integers as class labels for classification
         - cif_id: str or int

     Returns:
         tuple[
-            tuple[Tensor, Tensor, LongTensor, LongTensor, LongTensor]: batched Roost model inputs,
+            tuple[Tensor, Tensor, LongTensor, LongTensor, LongTensor]: batched Roost
+                model inputs,
             tuple[Tensor | LongTensor]: Target values for different tasks,
             # TODO this last tuple is unpacked how to do type hint?
             *tuple[str | int]: Identifiers like material_id, composition
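The `np.vstack` call in the `__getitem__` hunk stacks one embedding row per element of the composition. A toy version with a hypothetical two-element embedding table (a plain dict raises KeyError rather than AssertionError, hence the different except clause):

```python
import numpy as np

# hypothetical stand-in for an embedding file such as matscholar200
elem_features = {"Fe": np.array([0.1, 0.9]), "O": np.array([0.7, 0.2])}
elements = ["Fe", "O"]  # e.g. parsed from composition "Fe2O3"

try:
    elem_fea = np.vstack([elem_features[element] for element in elements])
except KeyError as exc:
    raise AssertionError("composition contains element types not in embedding") from exc

print(elem_fea.shape)  # (2, 2): one embedding row per element
```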