
drop black for ruff-format
janosh committed Oct 25, 2023
1 parent 8bd9a23 commit 9eaa9ea
Showing 8 changed files with 124 additions and 106 deletions.
10 changes: 3 additions & 7 deletions .pre-commit-config.yaml
@@ -5,10 +5,11 @@ ci:

 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.292
+    rev: v0.1.2
     hooks:
       - id: ruff
         args: [--fix]
+      - id: ruff-format

   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.5.0
@@ -27,13 +28,8 @@ repos:
       - id: codespell
         exclude_types: [json]

-  - repo: https://github.com/psf/black
-    rev: 23.9.1
-    hooks:
-      - id: black-jupyter
-
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.6.0
+    rev: v1.6.1
     hooks:
       - id: mypy
         exclude: (tests|examples)/
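With black removed, the ruff-format hook above takes over formatting. A typical way to exercise the updated hooks locally, assuming pre-commit is installed (standard pre-commit CLI, not part of this diff):

    pre-commit run ruff-format --all-files  # format the whole repo once
    pre-commit autoupdate                   # bump hook revs, like the rev changes above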
37 changes: 21 additions & 16 deletions aviary/cgcnn/data.py
@@ -33,19 +33,22 @@ def __init__(
         dmin: float = 0,
         step: float = 0.2,
     ):
-        """Featurize crystal structures into neighborhood graphs with this data class for CGCNN.
+        """Featurize crystal structures into neighborhood graphs with this data class
+        for CGCNN.

         Args:
             df (pd.Dataframe): Pandas dataframe holding input and target values.
             task_dict ({target: task}): task dict for multi-task learning
-            elem_embedding (str, optional): One of "matscholar200", "cgcnn92", "megnet16",
-                "onehot112" or path to a file with custom element embeddings.
-                Defaults to "matscholar200".
-            structure_col (str, optional): df column holding pymatgen Structure objects as input.
-            identifiers (list[str], optional): df columns for distinguishing data points. Will be
-                copied over into the model's output CSV. Defaults to ().
+            elem_embedding (str, optional): One of matscholar200, cgcnn92, megnet16,
+                onehot112 or path to a file with custom element embeddings.
+                Defaults to matscholar200.
+            structure_col (str, optional): df column holding pymatgen Structure objects
+                as input.
+            identifiers (list[str], optional): df columns for distinguishing data
+                points. Will be copied over into the model's output CSV. Defaults to ().
             radius (float, optional): Cut-off radius for neighborhood. Defaults to 5.
-            max_num_nbr (int, optional): maximum number of neighbors to consider. Defaults to 12.
+            max_num_nbr (int, optional): maximum number of neighbors to consider.
+                Defaults to 12.
             dmin (float, optional): minimum distance in Gaussian basis. Defaults to 0.
             step (float, optional): increment size of Gaussian basis. Defaults to 0.2.
         """
@@ -158,7 +161,7 @@ def __getitem__(self, idx: int):
             raise ValueError(f"All atoms in {material_ids} are isolated")
         if len(nbr_idx) == 0:
             raise ValueError(
-                f"Empty nbr_idx. This should not be triggered but was for {material_ids}"
+                f"Empty nbr_idx. should not happen but did for {material_ids}"
             )
         if set(self_idx) != set(range(len(struct))):
             raise ValueError(f"At least one atom in {material_ids} is isolated")
@@ -185,7 +188,7 @@ def collate_batch(
         tuple[Tensor, Tensor, LongTensor, LongTensor],
         list[Tensor | LongTensor],
         list[str | int],
-    ]
+    ],
 ) -> tuple[Any, ...]:
     """Collate a list of data and return a batch for predicting crystal properties.
@@ -197,13 +200,14 @@
             self_idx (LongTensor): indices of atoms in the structure
             nbr_idx (LongTensor): indices of neighboring atoms
             ]
-        target (Tensor | LongTensor): target values containing floats for regression or
-            integers as class labels for classification
+        target (Tensor | LongTensor): target values containing floats for regression
+            or integers as class labels for classification
         identifiers: str or int

     Returns:
         tuple[
-            tuple[Tensor, Tensor, LongTensor, LongTensor, LongTensor]: batched CGCNN model inputs,
+            tuple[Tensor, Tensor, LongTensor, LongTensor, LongTensor]: batched CGCNN
+                model inputs,
             tuple[Tensor | LongTensor]: Target values for different tasks,
             *tuple[str | int]: identifiers like material_id, composition
         ]
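A sketch of wiring this collate function into a PyTorch DataLoader (assuming dataset is a map-style dataset such as the one defined in this file):

    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_batch)
    inputs, targets, *identifiers = next(iter(loader))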
@@ -267,11 +271,11 @@ def __init__(
             dmin (float): Minimum interatomic distance
             dmax (float): Maximum interatomic distance
             step (float): Step size for the Gaussian filter
-            var (float, optional): Variance of Gaussian basis. Defaults to step if not given.
+            var (float, optional): Variance of Gaussian basis. Defaults to step.
         """
         if dmin >= dmax:
             raise ValueError(
-                "Max radii must be larger than minimum radii for Gaussian basis expansion"
+                "Max radii must be > minimum radii for Gaussian basis expansion"
             )
         if dmax - dmin <= step:
             raise ValueError(
@@ -293,7 +297,8 @@ def expand(self, distances: np.ndarray) -> np.ndarray:
             distances (ArrayLike): A distance matrix of any shape.

         Returns:
-            np.ndarray: Expanded distance matrix with the last dimension of length len(self.filter)
+            np.ndarray: Expanded distance matrix with the last dimension of length
+                len(self.filter)
         """
         distances = np.array(distances)
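The rest of expand() is truncated below; here is a self-contained sketch of the Gaussian basis expansion it performs, assuming the standard CGCNN form with filter centers at arange(dmin, dmax + step, step):

    import numpy as np

    dmin, dmax, step = 0.0, 5.0, 0.2
    centers = np.arange(dmin, dmax + step, step)  # Gaussian filter centers
    var = step  # per the docstring above, var defaults to step

    distances = np.array([[1.0, 2.5], [0.3, 4.9]])
    # add a trailing axis so every distance is expanded against every center
    expanded = np.exp(-((distances[..., None] - centers) ** 2) / var**2)
    print(expanded.shape)  # (2, 2, 26) -> last dim == len(centers)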

20 changes: 9 additions & 11 deletions aviary/losses.py
@@ -3,32 +3,30 @@


 def robust_l1_loss(pred_mean: Tensor, pred_log_std: Tensor, target: Tensor) -> Tensor:
-    """Robust L1 loss using a Lorentzian prior. Trains the model to learn to predict aleatoric
-    (per-sample) uncertainty.
+    """Robust L1 loss using a Lorentzian prior. Trains the model to learn to predict
+    aleatoric (i.e. per-sample) uncertainty.

     Args:
         pred_mean (Tensor): Tensor of predicted means.
-        pred_log_std (Tensor): Tensor of predicted log standard deviations representing per-sample
-            model uncertainties.
+        pred_log_std (Tensor): Tensor of predicted log standard deviations representing
+            per-sample model uncertainties.
         target (Tensor): Tensor of target values.

     Returns:
         Tensor: Evaluated robust L1 loss
     """
-    loss = (
-        2**0.5 * (pred_mean - target).abs() * torch.exp(-pred_log_std) + pred_log_std
-    )
+    loss = 2**0.5 * (pred_mean - target).abs() * torch.exp(-pred_log_std) + pred_log_std
     return torch.mean(loss)
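A quick sanity check of the loss above: with pred_log_std = 0 the predicted std is 1 and the loss reduces to sqrt(2) times the mean absolute error:

    import torch

    from aviary.losses import robust_l1_loss

    pred_mean = torch.tensor([1.0, 2.0, 3.0])
    pred_log_std = torch.zeros(3)  # exp(0) = 1, i.e. unit predicted std
    target = torch.tensor([1.5, 2.0, 2.0])

    loss = robust_l1_loss(pred_mean, pred_log_std, target)
    print(loss)  # sqrt(2) * mean(|0.5|, |0|, |1|) = sqrt(2) * 0.5 ~ 0.7071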


 def robust_l2_loss(pred_mean: Tensor, pred_log_std: Tensor, target: Tensor) -> Tensor:
-    """Robust L2 loss using a Gaussian prior. Trains the model to learn to predict aleatoric
-    (per-sample) uncertainty.
+    """Robust L2 loss using a Gaussian prior. Trains the model to learn to predict
+    aleatoric (i.e. per-sample) uncertainty.

     Args:
         pred_mean (Tensor): Tensor of predicted means.
-        pred_log_std (Tensor): Tensor of predicted log standard deviations representing per-sample
-            model uncertainties.
+        pred_log_std (Tensor): Tensor of predicted log standard deviations representing
+            per-sample model uncertainties.
         target (Tensor): Tensor of target values.

     Returns:
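The L2 body is cut off above. Under a Gaussian prior the negative log-likelihood is, up to an additive constant, 0.5 * exp(-2s) * (pred_mean - target)**2 + s with s = pred_log_std, so a plausible sketch of the truncated body (not the repo's verbatim code) is:

    # sketch of the truncated body, assuming the Gaussian NLL form stated above
    loss = 0.5 * (pred_mean - target) ** 2 * torch.exp(-2 * pred_log_std) + pred_log_std
    return torch.mean(loss)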
5 changes: 2 additions & 3 deletions aviary/predict.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E501
 from __future__ import annotations

 import os
@@ -110,9 +111,7 @@ def make_ensemble_predictions(
     if df.columns.str.startswith("aleatoric_std_").any():
         aleatoric_std = df.filter(regex=r"aleatoric_std_\d").mean(axis=1)
         df[f"{target_col}_aleatoric_std_ens"] = aleatoric_std
-        df[f"{target_col}_total_std_ens"] = (
-            epistemic_std**2 + aleatoric_std**2
-        ) ** 0.5
+        df[f"{target_col}_total_std_ens"] = (epistemic_std**2 + aleatoric_std**2) ** 0.5

     if target_col:
         targets = df[target_col]
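The total ensemble std above combines epistemic (model disagreement) and aleatoric (predicted per-sample) uncertainty in quadrature, which treats the two sources as independent. A toy check with hypothetical values:

    import numpy as np

    epistemic_std = np.array([0.3, 0.4])
    aleatoric_std = np.array([0.4, 0.3])
    total_std = (epistemic_std**2 + aleatoric_std**2) ** 0.5
    print(total_std)  # [0.5 0.5]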
65 changes: 39 additions & 26 deletions aviary/train.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E501
 from __future__ import annotations

 import os
@@ -136,13 +137,13 @@ def train_model(
     )
     loss_dict = {target_col: (task_type, loss_func)}
     normalizer_dict = {target_col: Normalizer() if task_type == reg_key else None}
-    # TODO consider actually fitting the normalizer, currently just passed into model.evaluate()
-    # to match function signature
+    # TODO consider actually fitting the normalizer, currently just passed into
+    # model.evaluate() to match function signature

-    # embedding_len is the length of the embedding vector for a Wyckoff position encoding the
-    # element type (usually 200-dim Matscholar embeddings) and Wyckoff position (see
-    # 'bra-alg-off.json') + 1 for the weight of that Wyckoff position (or element) in the material
-    # embedding_len = train_loader.tensors[0][0].shape[-1]
+    # embedding_len is the length of the embedding vector for a Wyckoff position
+    # encoding the element type (usually 200-dim Matscholar embeddings) and Wyckoff
+    # position (see 'bra-alg-off.json') + 1 for the weight of that Wyckoff position (or
+    # element) in the material embedding_len = train_loader.tensors[0][0].shape[-1]
     # # Roost and Wren embedding size resp.
     # assert embedding_len in (200 + 1, 200 + 1 + 444), f"{embedding_len=}"

@@ -262,7 +263,8 @@ def train_model(
     if swa_start is not None:
         n_swa_epochs = int((1 - swa_start) * epochs)
         print(
-            f"Using SWA model with weights averaged over {n_swa_epochs} epochs ({swa_start = })"
+            f"Using SWA model with weights averaged over {n_swa_epochs} epochs "
+            f"({swa_start=})"
         )

     inference_model = swa_model if swa_start else model
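How swa_model is built is not shown in this diff; a typical PyTorch stochastic weight averaging setup consistent with the swa_start fraction above might look like this (a sketch with a hypothetical model and optimizer, not aviary's exact code):

    import torch
    from torch.optim.swa_utils import SWALR, AveragedModel

    model = torch.nn.Linear(8, 1)  # hypothetical stand-in model
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    swa_model = AveragedModel(model)
    swa_scheduler = SWALR(optimizer, swa_lr=0.05)

    epochs, swa_start = 100, 0.7  # average weights over the last 30% of epochs
    for epoch in range(epochs):
        # ... one training epoch on `model` goes here ...
        if epoch >= int(swa_start * epochs):
            swa_model.update_parameters(model)  # fold current weights into the average
            swa_scheduler.step()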
@@ -315,7 +317,8 @@
     )
     if scheduler_name == "LambdaLR":
         # exclude lr_lambda from pickled checkpoint since it causes errors when
-        # torch.load()-ing a checkpoint and the file defining lr_lambda() was renamed
+        # torch.load()-ing a checkpoint and the file defining lr_lambda() was
+        # renamed
         checkpoint_dict["run_params"]["lr_scheduler"].pop("params")
     if checkpoint == "local":
         os.makedirs(f"{ROOT}/models", exist_ok=True)
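Background for the lr_lambda comment above: pickle stores functions by module path rather than by value, so a checkpoint that references lr_lambda breaks once the defining file is renamed. A minimal demonstration (hypothetical function, not aviary code):

    import pickle

    def lr_lambda(epoch: int) -> float:
        return 0.95**epoch

    blob = pickle.dumps({"lr_lambda": lr_lambda})  # stores a reference, not the code
    # if the module defining lr_lambda is later renamed, pickle.loads(blob) raises
    # AttributeError: Can't get attribute 'lr_lambda' on <module ...>
    print(len(blob) > 0)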
@@ -370,32 +373,35 @@ def train_wrenformer(
     model_params: dict[str, Any] | None = None,
     **kwargs,
 ) -> tuple[dict[str, float], dict[str, Any], pd.DataFrame]:
-    """Train a Wrenformer model on a dataframe. This function handles the DataLoader creation,
-    then delegates to train_model().
+    """Train a Wrenformer model on a dataframe. This function handles the DataLoader
+    creation, then delegates to train_model().

     Args:
-        run_name (str): A string to describe the training run. Should usually contain model type
-            (Roost/Wren) and important params. Include 'robust' to use a robust loss function and
-            have the model learn to predict an aleatoric uncertainty.
+        run_name (str): A string to describe the training run. Should usually contain
+            model type (Roost/Wren) and important params. Include 'robust' to use a
+            robust loss function and have the model learn to predict an aleatoric
+            uncertainty.
         target_col (str): Column name in train_df and test_df containing target values.
-        task_type ('regression' | 'classification'): What type of task to train the model for.
+        task_type ('regression' | 'classification'): What type of task to train the
+            model for.
         train_df (pd.DataFrame): Training set dataframe.
         test_df (pd.DataFrame): Test set dataframe.
         batch_size (int, optional): Batch size for training. Defaults to 128.
         embedding_type ('wyckoff' | 'composition', optional): Type of embedding to use.
             Defaults to None meaning auto-detect based on 'wren'/'roost' in run_name.
-        id_col (str, optional): Column name in train_df and test_df containing unique IDs for
-            each sample. Defaults to "material_id".
-        input_col (str, optional): Column name in train_df and test_df containing input values.
-            Defaults to None meaning auto-detect based on 'wren'/'roost' in run_name which default
-            to 'wyckoff' and 'composition' respectively.
+        id_col (str, optional): Column name in train_df and test_df containing unique
+            IDs for each sample. Defaults to "material_id".
+        input_col (str, optional): Column name in train_df and test_df containing input
+            values. Defaults to None meaning auto-detect based on 'wren'/'roost' in
+            run_name which default to 'wyckoff' and 'composition' respectively.
         model_params (dict): Passed to Wrenformer class. E.g. dict(n_attn_layers=6,
             embedding_aggregation=("mean", "std")).
         **kwargs: Additional keyword arguments are passed to train_model().

     Returns:
-        tuple[dict[str, float], dict[str, Any]]: 1st dict are the model's test set metrics.
-            2nd dict are the run's hyperparameters. 3rd is a dataframe with test set predictions.
+        tuple[dict[str, float], dict[str, Any]]: 1st dict are the model's test set
+            metrics. 2nd dict are the run's hyperparameters. 3rd is a dataframe with
+            test set predictions.
     """
     robust = "robust" in run_name.lower()
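A hypothetical invocation, assuming train_df/test_df carry the "wyckoff" input and "material_id" ID columns the defaults above expect:

    test_metrics, run_params, preds_df = train_wrenformer(
        run_name="wrenformer-robust-mp-e_form",
        target_col="e_form",
        task_type="regression",
        train_df=train_df,
        test_df=test_df,
        model_params=dict(n_attn_layers=6, embedding_aggregation=("mean", "std")),
    )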

@@ -415,16 +421,23 @@
         embedding_type=embedding_type,
     )
     train_loader = df_to_in_mem_dataloader(
-        train_df, batch_size=batch_size, shuffle=True, **data_loader_kwargs  # type: ignore[arg-type]
+        train_df,
+        batch_size=batch_size,
+        shuffle=True,
+        **data_loader_kwargs,  # type: ignore[arg-type]
     )

     test_loader = df_to_in_mem_dataloader(
-        test_df, batch_size=512, shuffle=False, **data_loader_kwargs  # type: ignore[arg-type]
+        test_df,
+        batch_size=512,
+        shuffle=False,
+        **data_loader_kwargs,  # type: ignore[arg-type]
     )

-    # embedding_len is the length of the embedding vector for a Wyckoff position encoding the
-    # element type (usually 200-dim matscholar embeddings) and Wyckoff position (see
-    # 'bra-alg-off.json') + 1 for the weight of that Wyckoff position (or element) in the material
+    # embedding_len is the length of the embedding vector for a Wyckoff position
+    # encoding the element type (usually 200-dim matscholar embeddings) and Wyckoff
+    # position (see 'bra-alg-off.json') + 1 for the weight of that Wyckoff position (or
+    # element) in the material
     embedding_len = train_loader.tensors[0][0].shape[-1]
     # Roost and Wren embedding size resp.
     assert embedding_len in (200 + 1, 200 + 1 + 444), f"{embedding_len=}"