
drop black for ruff-format
janosh committed Oct 25, 2023
1 parent 8bd9a23 commit 9eaa9ea
Showing 8 changed files with 124 additions and 106 deletions.
10 changes: 3 additions & 7 deletions .pre-commit-config.yaml
@@ -5,10 +5,11 @@ ci:

 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.292
+    rev: v0.1.2
     hooks:
       - id: ruff
         args: [--fix]
+      - id: ruff-format

   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.5.0
@@ -27,13 +28,8 @@ repos:
       - id: codespell
         exclude_types: [json]

-  - repo: https://github.com/psf/black
-    rev: 23.9.1
-    hooks:
-      - id: black-jupyter
-
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.6.0
+    rev: v1.6.1
     hooks:
       - id: mypy
         exclude: (tests|examples)/
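With black removed, the ruff-format hook above takes over formatting. A typical way to exercise the updated hooks locally, assuming pre-commit is installed (standard pre-commit CLI, not part of this diff):

    pre-commit run ruff-format --all-files  # format the whole repo once
    pre-commit autoupdate                   # bump hook revs, like the rev changes above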
37 changes: 21 additions & 16 deletions aviary/cgcnn/data.py
@@ -33,19 +33,22 @@ def __init__(
         dmin: float = 0,
         step: float = 0.2,
     ):
-        """Featurize crystal structures into neighborhood graphs with this data class for CGCNN.
+        """Featurize crystal structures into neighborhood graphs with this data class
+        for CGCNN.

         Args:
             df (pd.Dataframe): Pandas dataframe holding input and target values.
             task_dict ({target: task}): task dict for multi-task learning
-            elem_embedding (str, optional): One of "matscholar200", "cgcnn92", "megnet16",
-                "onehot112" or path to a file with custom element embeddings.
-                Defaults to "matscholar200".
-            structure_col (str, optional): df column holding pymatgen Structure objects as input.
-            identifiers (list[str], optional): df columns for distinguishing data points. Will be
-                copied over into the model's output CSV. Defaults to ().
+            elem_embedding (str, optional): One of matscholar200, cgcnn92, megnet16,
+                onehot112 or path to a file with custom element embeddings.
+                Defaults to matscholar200.
+            structure_col (str, optional): df column holding pymatgen Structure objects
+                as input.
+            identifiers (list[str], optional): df columns for distinguishing data
+                points. Will be copied over into the model's output CSV. Defaults to ().
             radius (float, optional): Cut-off radius for neighborhood. Defaults to 5.
-            max_num_nbr (int, optional): maximum number of neighbors to consider. Defaults to 12.
+            max_num_nbr (int, optional): maximum number of neighbors to consider.
+                Defaults to 12.
             dmin (float, optional): minimum distance in Gaussian basis. Defaults to 0.
             step (float, optional): increment size of Gaussian basis. Defaults to 0.2.
         """
@@ -158,7 +161,7 @@ def __getitem__(self, idx: int):
             raise ValueError(f"All atoms in {material_ids} are isolated")
         if len(nbr_idx) == 0:
             raise ValueError(
-                f"Empty nbr_idx. This should not be triggered but was for {material_ids}"
+                f"Empty nbr_idx. should not happen but did for {material_ids}"
             )
         if set(self_idx) != set(range(len(struct))):
             raise ValueError(f"At least one atom in {material_ids} is isolated")
@@ -185,7 +188,7 @@ def collate_batch(
         tuple[Tensor, Tensor, LongTensor, LongTensor],
         list[Tensor | LongTensor],
         list[str | int],
-    ]
+    ],
 ) -> tuple[Any, ...]:
     """Collate a list of data and return a batch for predicting crystal properties.
@@ -197,13 +200,14 @@
             self_idx (LongTensor): indices of atoms in the structure
             nbr_idx (LongTensor): indices of neighboring atoms
             ]
-        target (Tensor | LongTensor): target values containing floats for regression or
-            integers as class labels for classification
+        target (Tensor | LongTensor): target values containing floats for regression
+            or integers as class labels for classification
         identifiers: str or int

     Returns:
         tuple[
-            tuple[Tensor, Tensor, LongTensor, LongTensor, LongTensor]: batched CGCNN model inputs,
+            tuple[Tensor, Tensor, LongTensor, LongTensor, LongTensor]: batched CGCNN
+                model inputs,
             tuple[Tensor | LongTensor]: Target values for different tasks,
             *tuple[str | int]: identifiers like material_id, composition
         ]
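A sketch of wiring this collate function into a PyTorch DataLoader (assuming dataset is a map-style dataset such as the one defined in this file):

    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_batch)
    inputs, targets, *identifiers = next(iter(loader))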
@@ -267,11 +271,11 @@ def __init__(
             dmin (float): Minimum interatomic distance
             dmax (float): Maximum interatomic distance
             step (float): Step size for the Gaussian filter
-            var (float, optional): Variance of Gaussian basis. Defaults to step if not given.
+            var (float, optional): Variance of Gaussian basis. Defaults to step.
         """
         if dmin >= dmax:
             raise ValueError(
-                "Max radii must be larger than minimum radii for Gaussian basis expansion"
+                "Max radii must be > minimum radii for Gaussian basis expansion"
             )
         if dmax - dmin <= step:
             raise ValueError(
@@ -293,7 +297,8 @@ def expand(self, distances: np.ndarray) -> np.ndarray:
             distances (ArrayLike): A distance matrix of any shape.

         Returns:
-            np.ndarray: Expanded distance matrix with the last dimension of length len(self.filter)
+            np.ndarray: Expanded distance matrix with the last dimension of length
+                len(self.filter)
         """
         distances = np.array(distances)
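The rest of expand() is truncated below; here is a self-contained sketch of the Gaussian basis expansion it performs, assuming the standard CGCNN form with filter centers at arange(dmin, dmax + step, step):

    import numpy as np

    dmin, dmax, step = 0.0, 5.0, 0.2
    centers = np.arange(dmin, dmax + step, step)  # Gaussian filter centers
    var = step  # per the docstring above, var defaults to step

    distances = np.array([[1.0, 2.5], [0.3, 4.9]])
    # add a trailing axis so every distance is expanded against every center
    expanded = np.exp(-((distances[..., None] - centers) ** 2) / var**2)
    print(expanded.shape)  # (2, 2, 26) -> last dim == len(centers)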

20 changes: 9 additions & 11 deletions aviary/losses.py
@@ -3,32 +3,30 @@


 def robust_l1_loss(pred_mean: Tensor, pred_log_std: Tensor, target: Tensor) -> Tensor:
-    """Robust L1 loss using a Lorentzian prior. Trains the model to learn to predict aleatoric
-    (per-sample) uncertainty.
+    """Robust L1 loss using a Lorentzian prior. Trains the model to learn to predict
+    aleatoric (i.e. per-sample) uncertainty.

     Args:
         pred_mean (Tensor): Tensor of predicted means.
-        pred_log_std (Tensor): Tensor of predicted log standard deviations representing per-sample
-            model uncertainties.
+        pred_log_std (Tensor): Tensor of predicted log standard deviations representing
+            per-sample model uncertainties.
         target (Tensor): Tensor of target values.

     Returns:
         Tensor: Evaluated robust L1 loss
     """
-    loss = (
-        2**0.5 * (pred_mean - target).abs() * torch.exp(-pred_log_std) + pred_log_std
-    )
+    loss = 2**0.5 * (pred_mean - target).abs() * torch.exp(-pred_log_std) + pred_log_std
     return torch.mean(loss)
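A quick sanity check of the loss above: with pred_log_std = 0 the predicted std is 1 and the loss reduces to sqrt(2) times the mean absolute error:

    import torch

    from aviary.losses import robust_l1_loss

    pred_mean = torch.tensor([1.0, 2.0, 3.0])
    pred_log_std = torch.zeros(3)  # exp(0) = 1, i.e. unit predicted std
    target = torch.tensor([1.5, 2.0, 2.0])

    loss = robust_l1_loss(pred_mean, pred_log_std, target)
    print(loss)  # sqrt(2) * mean(|0.5|, |0|, |1|) = sqrt(2) * 0.5 ~ 0.7071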


 def robust_l2_loss(pred_mean: Tensor, pred_log_std: Tensor, target: Tensor) -> Tensor:
-    """Robust L2 loss using a Gaussian prior. Trains the model to learn to predict aleatoric
-    (per-sample) uncertainty.
+    """Robust L2 loss using a Gaussian prior. Trains the model to learn to predict
+    aleatoric (i.e. per-sample) uncertainty.

     Args:
         pred_mean (Tensor): Tensor of predicted means.
-        pred_log_std (Tensor): Tensor of predicted log standard deviations representing per-sample
-            model uncertainties.
+        pred_log_std (Tensor): Tensor of predicted log standard deviations representing
+            per-sample model uncertainties.
         target (Tensor): Tensor of target values.

     Returns:
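The L2 body is cut off above. Under a Gaussian prior the negative log-likelihood is, up to an additive constant, 0.5 * exp(-2s) * (pred_mean - target)**2 + s with s = pred_log_std, so a plausible sketch of the truncated body (not the repo's verbatim code) is:

    # sketch of the truncated body, assuming the Gaussian NLL form stated above
    loss = 0.5 * (pred_mean - target) ** 2 * torch.exp(-2 * pred_log_std) + pred_log_std
    return torch.mean(loss)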
5 changes: 2 additions & 3 deletions aviary/predict.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E501
 from __future__ import annotations

 import os
@@ -110,9 +111,7 @@ def make_ensemble_predictions(
     if df.columns.str.startswith("aleatoric_std_").any():
         aleatoric_std = df.filter(regex=r"aleatoric_std_\d").mean(axis=1)
         df[f"{target_col}_aleatoric_std_ens"] = aleatoric_std
-        df[f"{target_col}_total_std_ens"] = (
-            epistemic_std**2 + aleatoric_std**2
-        ) ** 0.5
+        df[f"{target_col}_total_std_ens"] = (epistemic_std**2 + aleatoric_std**2) ** 0.5

     if target_col:
         targets = df[target_col]
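The total ensemble std above combines epistemic (model disagreement) and aleatoric (predicted per-sample) uncertainty in quadrature, which treats the two sources as independent. A toy check with hypothetical values:

    import numpy as np

    epistemic_std = np.array([0.3, 0.4])
    aleatoric_std = np.array([0.4, 0.3])
    total_std = (epistemic_std**2 + aleatoric_std**2) ** 0.5
    print(total_std)  # [0.5 0.5]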
65 changes: 39 additions & 26 deletions aviary/train.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E501
 from __future__ import annotations

 import os
@@ -136,13 +137,13 @@ def train_model(
     )
     loss_dict = {target_col: (task_type, loss_func)}
     normalizer_dict = {target_col: Normalizer() if task_type == reg_key else None}
-    # TODO consider actually fitting the normalizer, currently just passed into model.evaluate()
-    # to match function signature
+    # TODO consider actually fitting the normalizer, currently just passed into
+    # model.evaluate() to match function signature

-    # embedding_len is the length of the embedding vector for a Wyckoff position encoding the
-    # element type (usually 200-dim Matscholar embeddings) and Wyckoff position (see
-    # 'bra-alg-off.json') + 1 for the weight of that Wyckoff position (or element) in the material
-    # embedding_len = train_loader.tensors[0][0].shape[-1]
+    # embedding_len is the length of the embedding vector for a Wyckoff position
+    # encoding the element type (usually 200-dim Matscholar embeddings) and Wyckoff
+    # position (see 'bra-alg-off.json') + 1 for the weight of that Wyckoff position (or
+    # element) in the material embedding_len = train_loader.tensors[0][0].shape[-1]
     # # Roost and Wren embedding size resp.
     # assert embedding_len in (200 + 1, 200 + 1 + 444), f"{embedding_len=}"

@@ -262,7 +263,8 @@ def train_model(
     if swa_start is not None:
         n_swa_epochs = int((1 - swa_start) * epochs)
         print(
-            f"Using SWA model with weights averaged over {n_swa_epochs} epochs ({swa_start = })"
+            f"Using SWA model with weights averaged over {n_swa_epochs} epochs "
+            f"({swa_start=})"
         )

     inference_model = swa_model if swa_start else model
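How swa_model is built is not shown in this diff; a typical PyTorch stochastic weight averaging setup consistent with the swa_start fraction above might look like this (a sketch with a hypothetical model and optimizer, not aviary's exact code):

    import torch
    from torch.optim.swa_utils import SWALR, AveragedModel

    model = torch.nn.Linear(8, 1)  # hypothetical stand-in model
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    swa_model = AveragedModel(model)
    swa_scheduler = SWALR(optimizer, swa_lr=0.05)

    epochs, swa_start = 100, 0.7  # average weights over the last 30% of epochs
    for epoch in range(epochs):
        # ... one training epoch on `model` goes here ...
        if epoch >= int(swa_start * epochs):
            swa_model.update_parameters(model)  # fold current weights into the average
            swa_scheduler.step()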
@@ -315,7 +317,8 @@
     )
     if scheduler_name == "LambdaLR":
         # exclude lr_lambda from pickled checkpoint since it causes errors when
-        # torch.load()-ing a checkpoint and the file defining lr_lambda() was renamed
+        # torch.load()-ing a checkpoint and the file defining lr_lambda() was
+        # renamed
         checkpoint_dict["run_params"]["lr_scheduler"].pop("params")
     if checkpoint == "local":
         os.makedirs(f"{ROOT}/models", exist_ok=True)
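Background for the lr_lambda comment above: pickle stores functions by module path rather than by value, so a checkpoint that references lr_lambda breaks once the defining file is renamed. A minimal demonstration (hypothetical function, not aviary code):

    import pickle

    def lr_lambda(epoch: int) -> float:
        return 0.95**epoch

    blob = pickle.dumps({"lr_lambda": lr_lambda})  # stores a reference, not the code
    # if the module defining lr_lambda is later renamed, pickle.loads(blob) raises
    # AttributeError: Can't get attribute 'lr_lambda' on <module ...>
    print(len(blob) > 0)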
@@ -370,32 +373,35 @@ def train_wrenformer(
     model_params: dict[str, Any] | None = None,
     **kwargs,
 ) -> tuple[dict[str, float], dict[str, Any], pd.DataFrame]:
-    """Train a Wrenformer model on a dataframe. This function handles the DataLoader creation,
-    then delegates to train_model().
+    """Train a Wrenformer model on a dataframe. This function handles the DataLoader
+    creation, then delegates to train_model().

     Args:
-        run_name (str): A string to describe the training run. Should usually contain model type
-            (Roost/Wren) and important params. Include 'robust' to use a robust loss function and
-            have the model learn to predict an aleatoric uncertainty.
+        run_name (str): A string to describe the training run. Should usually contain
+            model type (Roost/Wren) and important params. Include 'robust' to use a
+            robust loss function and have the model learn to predict an aleatoric
+            uncertainty.
         target_col (str): Column name in train_df and test_df containing target values.
-        task_type ('regression' | 'classification'): What type of task to train the model for.
+        task_type ('regression' | 'classification'): What type of task to train the
+            model for.
         train_df (pd.DataFrame): Training set dataframe.
         test_df (pd.DataFrame): Test set dataframe.
         batch_size (int, optional): Batch size for training. Defaults to 128.
         embedding_type ('wyckoff' | 'composition', optional): Type of embedding to use.
             Defaults to None meaning auto-detect based on 'wren'/'roost' in run_name.
-        id_col (str, optional): Column name in train_df and test_df containing unique IDs for
-            each sample. Defaults to "material_id".
-        input_col (str, optional): Column name in train_df and test_df containing input values.
-            Defaults to None meaning auto-detect based on 'wren'/'roost' in run_name which default
-            to 'wyckoff' and 'composition' respectively.
+        id_col (str, optional): Column name in train_df and test_df containing unique
+            IDs for each sample. Defaults to "material_id".
+        input_col (str, optional): Column name in train_df and test_df containing input
+            values. Defaults to None meaning auto-detect based on 'wren'/'roost' in
+            run_name which default to 'wyckoff' and 'composition' respectively.
         model_params (dict): Passed to Wrenformer class. E.g. dict(n_attn_layers=6,
             embedding_aggregation=("mean", "std")).
         **kwargs: Additional keyword arguments are passed to train_model().

     Returns:
-        tuple[dict[str, float], dict[str, Any]]: 1st dict are the model's test set metrics.
-            2nd dict are the run's hyperparameters. 3rd is a dataframe with test set predictions.
+        tuple[dict[str, float], dict[str, Any]]: 1st dict are the model's test set
+            metrics. 2nd dict are the run's hyperparameters. 3rd is a dataframe with
+            test set predictions.
     """
     robust = "robust" in run_name.lower()
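A hypothetical invocation, assuming train_df/test_df carry the "wyckoff" input and "material_id" ID columns the defaults above expect:

    test_metrics, run_params, preds_df = train_wrenformer(
        run_name="wrenformer-robust-mp-e_form",
        target_col="e_form",
        task_type="regression",
        train_df=train_df,
        test_df=test_df,
        model_params=dict(n_attn_layers=6, embedding_aggregation=("mean", "std")),
    )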

@@ -415,16 +421,23 @@
         embedding_type=embedding_type,
     )
     train_loader = df_to_in_mem_dataloader(
-        train_df, batch_size=batch_size, shuffle=True, **data_loader_kwargs  # type: ignore[arg-type]
+        train_df,
+        batch_size=batch_size,
+        shuffle=True,
+        **data_loader_kwargs,  # type: ignore[arg-type]
     )

     test_loader = df_to_in_mem_dataloader(
-        test_df, batch_size=512, shuffle=False, **data_loader_kwargs  # type: ignore[arg-type]
+        test_df,
+        batch_size=512,
+        shuffle=False,
+        **data_loader_kwargs,  # type: ignore[arg-type]
     )

-    # embedding_len is the length of the embedding vector for a Wyckoff position encoding the
-    # element type (usually 200-dim matscholar embeddings) and Wyckoff position (see
-    # 'bra-alg-off.json') + 1 for the weight of that Wyckoff position (or element) in the material
+    # embedding_len is the length of the embedding vector for a Wyckoff position
+    # encoding the element type (usually 200-dim matscholar embeddings) and Wyckoff
+    # position (see 'bra-alg-off.json') + 1 for the weight of that Wyckoff position (or
+    # element) in the material
     embedding_len = train_loader.tensors[0][0].shape[-1]
     # Roost and Wren embedding size resp.
     assert embedding_len in (200 + 1, 200 + 1 + 444), f"{embedding_len=}"