
Commit

[pre-commit.ci] pre-commit autoupdate (#79)
* [pre-commit.ci] pre-commit autoupdate

updates:
- [github.com/astral-sh/ruff-pre-commit: v0.1.5 → v0.1.9](astral-sh/ruff-pre-commit@v0.1.5...v0.1.9)
- [github.com/pre-commit/mirrors-mypy: v1.6.1 → v1.8.0](pre-commit/mirrors-mypy@v1.6.1...v1.8.0)

* Update GitHub Actions versions

* fix line too long ruff errors

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Janosh Riebesell <[email protected]>
pre-commit-ci[bot] and janosh committed Jan 2, 2024
1 parent 18a5607 commit a7344c1
Showing 20 changed files with 217 additions and 174 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/link-check.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Run markdown link check
         uses: gaurav-nelson/github-action-markdown-link-check@v1
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -13,10 +13,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: 3.8
           cache: pip
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -5,7 +5,7 @@ ci:

 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.5
+    rev: v0.1.9
     hooks:
       - id: ruff
         args: [--fix]
@@ -30,7 +30,7 @@ repos:
         args: [--check-filenames]

   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.6.1
+    rev: v1.8.0
     hooks:
       - id: mypy
         exclude: (tests|examples)/
4 changes: 3 additions & 1 deletion aviary/cgcnn/data.py
@@ -327,7 +327,9 @@ def get_structure_neighbor_info(
         )

         if max_num_nbr is not None:
-            _center_indices, _neighbor_indices, _neighbor_dists = [], [], []
+            _center_indices: list[int] = []
+            _neighbor_indices: list[int] = []
+            _neighbor_dists: list[float] = []

             for _, idx_group in itertools.groupby(  # group by site index
                 zip(site_indices, neighbor_indices, neighbor_dists), key=lambda x: x[0]
             ):
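For context, this hunk feeds a loop that caps each site's neighbor list at `max_num_nbr`. A minimal sketch of that grouping pattern with made-up input lists (whether the real implementation sorts each group by distance before truncating is an assumption here):

```python
import itertools

# hypothetical flat neighbor data: (center site, neighbor site, distance),
# already sorted by center site as itertools.groupby requires
site_indices = [0, 0, 0, 1, 1]
neighbor_indices = [1, 2, 3, 0, 2]
neighbor_dists = [2.1, 2.5, 3.0, 2.1, 2.8]
max_num_nbr = 2

_center_indices: list[int] = []
_neighbor_indices: list[int] = []
_neighbor_dists: list[float] = []

for _, idx_group in itertools.groupby(  # group by site index
    zip(site_indices, neighbor_indices, neighbor_dists), key=lambda x: x[0]
):
    # keep only the max_num_nbr nearest neighbors of each site
    sites, neighbors, dists = zip(*sorted(idx_group, key=lambda x: x[2]))
    _center_indices.extend(sites[:max_num_nbr])
    _neighbor_indices.extend(neighbors[:max_num_nbr])
    _neighbor_dists.extend(dists[:max_num_nbr])

print(_center_indices, _neighbor_indices)  # [0, 0, 1, 1] [1, 2, 0, 2]
```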
23 changes: 13 additions & 10 deletions aviary/cgcnn/model.py
@@ -37,17 +37,18 @@ def __init__(
         """Initialize CrystalGraphConvNet.

         Args:
-            robust (bool): If True, the number of model outputs is doubled. 2nd output for each
-                target will be an estimate for the aleatoric uncertainty (uncertainty inherent to
-                the sample) which can be used with a robust loss function to attenuate the weighting
-                of uncertain samples.
+            robust (bool): If True, the number of model outputs is doubled. 2nd output
+                for each target will be an estimate for the aleatoric uncertainty
+                (uncertainty inherent to the sample) which can be used with a robust
+                loss function to attenuate the weighting of uncertain samples.
             n_targets (list[int]): Number of targets to train on
             elem_emb_len (int): Number of atom features in the input.
             nbr_fea_len (int): Number of bond features.
-            elem_fea_len (int, optional): Number of hidden atom features in the convolutional
-                layers. Defaults to 64.
+            elem_fea_len (int, optional): Number of hidden atom features in the
+                convolutional layers. Defaults to 64.
             n_graph (int, optional): Number of convolutional layers. Defaults to 4.
-            h_fea_len (int, optional): Number of hidden features after pooling. Defaults to 128.
+            h_fea_len (int, optional): Number of hidden features after pooling. Defaults
+                to 128.
             n_trunk (int, optional): Number of hidden layers in trunk after pooling.
                 Defaults to 1.
             n_hidden (int, optional): Number of hidden layers after trunk for each task.
@@ -120,7 +121,9 @@ def forward(


 class DescriptorNetwork(nn.Module):
-    """The Descriptor Network is the message passing section of the CrystalGraphConvNet Model."""
+    """The Descriptor Network is the message passing section of the CrystalGraphConvNet
+    Model.
+    """

     def __init__(
         self,
@@ -134,8 +137,8 @@ def __init__(
         Args:
             elem_emb_len (int): Number of atom features in the input.
             nbr_fea_len (int): Number of bond features.
-            elem_fea_len (int, optional): Number of hidden atom features in the graph convolution
-                layers. Defaults to 64.
+            elem_fea_len (int, optional): Number of hidden atom features in the graph
+                convolution layers. Defaults to 64.
             n_graph (int, optional): Number of graph convolution layers. Defaults to 4.
         """
         super().__init__()
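As background on the `robust` flag: with doubled outputs, each target's second output can feed a heteroscedastic loss. This diff doesn't show aviary's actual loss function, so the following is only an illustrative sketch of one common choice (a Laplace negative log-likelihood):

```python
import torch

def robust_l1_loss(pred: torch.Tensor, log_std: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Down-weight the absolute error of samples the model flags as uncertain
    (large log_std), at the price of an additive log_std penalty.
    """
    return ((pred - target).abs() * torch.exp(-log_std) + log_std).mean()

# with robust=True the model emits 2 numbers per regression target
output = torch.randn(8, 2, requires_grad=True)  # hypothetical batch of 8 samples
pred, log_std = output.unbind(dim=1)
loss = robust_l1_loss(pred, log_std, target=torch.randn(8))
loss.backward()
```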
85 changes: 50 additions & 35 deletions aviary/core.py
@@ -39,14 +39,16 @@ def __init__(
         """Store core model parameters.

         Args:
-            task_dict (dict[str, TaskType]): Map target names to "regression" or "classification".
-            robust (bool): If True, the number of model outputs is doubled. 2nd output for each
-                target will be an estimate for the aleatoric uncertainty (uncertainty inherent to
-                the sample) which can be used with a robust loss function to attenuate the weighting
-                of uncertain samples.
-            epoch (int, optional): Epoch model training will begin/resume from. Defaults to 0.
-            best_val_scores (dict[str, float], optional): Validation score to use for early
-                stopping. Defaults to None.
+            task_dict (dict[str, TaskType]): Map target names to "regression" or
+                "classification".
+            robust (bool): If True, the number of model outputs is doubled. 2nd output
+                for each target will be an estimate for the aleatoric uncertainty
+                (uncertainty inherent to the sample) which can be used with a robust
+                loss function to attenuate the weighting of uncertain samples.
+            epoch (int, optional): Epoch model training will begin/resume from.
+                Defaults to 0.
+            best_val_scores (dict[str, float], optional): Validation score to use for
+                early stopping. Defaults to None.
         """
         super().__init__()
         self.task_dict = task_dict
@@ -79,18 +81,22 @@ def fit(
         Args:
             train_loader (DataLoader): Dataloader containing training data.
             val_loader (DataLoader): Dataloader containing validation data.
-            optimizer (torch.optim.Optimizer): Optimizer used to carry out parameter updates.
+            optimizer (torch.optim.Optimizer): Optimizer used to carry out parameter
+                updates.
             scheduler (torch.optim.lr_scheduler._LRScheduler): Scheduler used to adjust
                 Optimizer during training.
             epochs (int): Number of epochs to train for.
-            loss_dict (dict[str, nn.Module]): Dictionary of losses to apply for each task.
+            loss_dict (dict[str, nn.Module]): Dict of losses to apply for each task.
             normalizer_dict (dict[str, Normalizer]): Dictionary of Normalizers to apply
                 to each task.
             model_name (str): String describing the model.
             run_id (int): Unique identifier of the model run.
-            checkpoint (bool, optional): Whether to save model checkpoints. Defaults to True.
-            writer (SummaryWriter, optional): TensorBoard writer for saving logs. Defaults to None.
-            verbose (bool, optional): Whether to print out intermediate results. Defaults to True.
+            checkpoint (bool, optional): Whether to save model checkpoints.
+                Defaults to True.
+            writer (SummaryWriter, optional): TensorBoard writer for saving logs.
+                Defaults to None.
+            verbose (bool, optional): Whether to print out intermediate results.
+                Defaults to True.
             patience (int, optional): Patience for early stopping. Defaults to None.
         """
         start_epoch = self.epoch
@@ -157,7 +163,8 @@ def fit(
                 self.es_patience += 1
                 if patience and self.es_patience > patience:
                     print(
-                        "Stopping early due to lack of improvement on validation set"
+                        f"No improvement on validation set for {patience} "
+                        "epochs, stopping early"
                     )
                     break
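The reworded message above comes from fit()'s early-stopping branch. A rough standalone sketch of that counter logic (names other than `patience` and the counter are assumptions, since the hunk only shows a fragment):

```python
# minimal early-stopping sketch; the score list stands in for per-epoch
# validation metrics, which this diff only shows the bookkeeping for
patience = 5
es_patience = 0
best_score = float("inf")

for val_score in [3.0, 2.5, 2.6, 2.7, 2.55, 2.8, 2.9, 3.1]:
    if val_score < best_score:  # any improvement resets the counter
        best_score, es_patience = val_score, 0
    else:
        es_patience += 1
        if patience and es_patience > patience:
            print(f"No improvement on validation set for {patience} epochs, stopping early")
            break
```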

@@ -214,20 +221,23 @@ def evaluate(
         """Evaluate the model.

         Args:
-            data_loader (DataLoader): PyTorch Dataloader with the same data format used in fit().
+            data_loader (DataLoader): PyTorch Dataloader with the same data format used
+                in fit().
             loss_dict (dict[str, tuple[TaskType, nn.Module]]): Dictionary of losses
                 to apply for each task.
             optimizer (torch.optim.Optimizer): PyTorch Optimizer
             normalizer_dict (dict[str, Normalizer]): Dictionary of Normalizers to apply
                 to each task.
-            action ("train" | "evaluate", optional): Whether to track gradients depending on
-                whether we are carrying out a training or validation pass. Defaults to "train".
-            verbose (bool, optional): Whether to print out intermediate results. Defaults to False.
+            action ("train" | "evaluate", optional): Whether to track gradients
+                depending on whether we are carrying out a training or validation pass.
+                Defaults to "train".
+            verbose (bool, optional): Whether to print out intermediate results.
+                Defaults to False.
             pbar (bool, optional): Whether to display a progress bar. Defaults to False.

         Returns:
-            dict[str, dict["Loss" | "MAE" | "RMSE" | "Accuracy" | "F1", np.ndarray]]: nested
-                dictionary for each target of metrics averaged over an epoch.
+            dict[str, dict["Loss" | "MAE" | "RMSE" | "Accuracy" | "F1", np.ndarray]]:
+                nested dictionary for each target of metrics averaged over an epoch.
         """
         if action == "evaluate":
             self.eval()
@@ -240,9 +250,9 @@ def evaluate(
             lambda: defaultdict(list)
         )

-        # *_ discards identifiers like material_id and formula which we don't need when training
-        # tqdm(disable=None) means suppress output in non-tty (e.g. CI/log files) but keep in
-        # terminal (i.e. tty mode) https://git.io/JnBOi
+        # *_ discards identifiers like material_id and formula which we don't need when
+        # training. tqdm(disable=None) means suppress output in non-tty (e.g. CI/log
+        # files) but keep in terminal (i.e. tty mode) https://git.io/JnBOi
         for inputs, targets_list, *_ in tqdm(
             data_loader, disable=None if pbar else True
         ):
@@ -298,8 +308,8 @@ def evaluate(

                 epoch_metrics[target_name]["Loss"].append(loss.cpu().item())

-                # NOTE multitasking currently just uses a direct sum of individual target losses
-                # this should be okay but is perhaps sub-optimal
+                # NOTE multitasking currently just uses a direct sum of individual
+                # target losses; this should be okay but is perhaps sub-optimal
                 mixed_loss += loss

             if action == "train":
@@ -335,10 +345,11 @@ def predict(
         """Make model predictions. Supports multi-tasking.

         Args:
-            data_loader (DataLoader): Iterator that yields mini-batches with the same data
-                format used in fit(). To speed up inference, batch size can be set much
-                larger than during training.
-            verbose (bool, optional): Whether to print out intermediate results. Defaults to False.
+            data_loader (DataLoader): Iterator that yields mini-batches with the same
+                data format used in fit(). To speed up inference, batch size can be set
+                much larger than during training.
+            verbose (bool, optional): Whether to print out intermediate results.
+                Defaults to False.

         Returns:
             3 tuples where tuple items correspond to different multitask targets.
@@ -382,7 +393,8 @@ def featurize(self, data_loader: DataLoader) -> np.ndarray:
         this runs only the message-passing part of the model without the ResNet.

         Args:
-            data_loader (DataLoader): PyTorch Dataloader with the same data format used in fit()
+            data_loader (DataLoader): PyTorch Dataloader with the same data format used
+                in fit()

         Returns:
             np.array: 2d array of features
@@ -409,7 +421,8 @@ def num_params(self) -> int:
     def __repr__(self) -> str:
         """Return model name with number of parameters and epochs trained."""
         n_params, n_epochs = self.num_params, self.epoch
-        return f"{type(self).__name__} with {n_params:,} trainable params at {n_epochs:,} epochs"
+        cls_name = type(self).__name__
+        return f"{cls_name} with {n_params:,} trainable params at {n_epochs:,} epochs"


 class Normalizer:
@@ -425,8 +438,8 @@ def fit(self, tensor: Tensor, dim: int = 0, keepdim: bool = False) -> None:

         Args:
             tensor (Tensor): Tensor to determine the mean and standard deviation over.
-            dim (int, optional): Which dimension to take mean and standard deviation over.
-                Defaults to 0.
+            dim (int, optional): Which dimension to take mean and standard deviation
+                over. Defaults to 0.
             keepdim (bool, optional): Whether to keep the reduced dimension in Tensor.
                 Defaults to False.
         """
@@ -495,7 +508,8 @@ def save_checkpoint(
     """Saves a checkpoint and overwrites the best model when is_best = True.

     Args:
-        state (dict[str, Any]): Model parameters and other stateful objects like optimizer.
+        state (dict[str, Any]): Model parameters and other stateful objects like
+            optimizer.
         is_best (bool): Whether the model is the best seen according to validation set.
         model_name (str): String describing the model.
         run_id (int): Unique identifier of the model run.
@@ -542,7 +556,8 @@ def np_softmax(arr: np.ndarray, axis: int = -1) -> np.ndarray:

     Args:
         arr (np.ndarray): Arbitrary dimensional array.
-        axis (int, optional): Dimension over which to take softmax. Defaults to -1 (last).
+        axis (int, optional): Dimension over which to take softmax. Defaults to
+            -1 (last).

     Returns:
         np.ndarray: Same dimension as input array, but specified axis reduced
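The Normalizer.fit signature in the hunk above suggests a mean/std scaler. A minimal sketch of such a class (the norm/denorm methods and attribute names are assumptions; only fit()'s signature appears in this diff):

```python
import torch
from torch import Tensor

class MinimalNormalizer:
    """Sketch of a mean/std normalizer matching the fit() signature above."""

    def fit(self, tensor: Tensor, dim: int = 0, keepdim: bool = False) -> None:
        # store mean and standard deviation along `dim` for later (de)normalization
        self.mean = tensor.mean(dim, keepdim=keepdim)
        self.std = tensor.std(dim, keepdim=keepdim)

    def norm(self, tensor: Tensor) -> Tensor:
        return (tensor - self.mean) / self.std

    def denorm(self, normed: Tensor) -> Tensor:
        return normed * self.std + self.mean

targets = torch.randn(100) * 3 + 7
normalizer = MinimalNormalizer()
normalizer.fit(targets)
assert torch.allclose(normalizer.denorm(normalizer.norm(targets)), targets, atol=1e-6)
```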
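Likewise for np_softmax, whose body this diff doesn't show, a numerically stable implementation consistent with the documented signature might look like this (a sketch, not necessarily aviary's actual code):

```python
import numpy as np

def np_softmax(arr: np.ndarray, axis: int = -1) -> np.ndarray:
    """Softmax over `axis`, shifted by the max for numerical stability."""
    shifted = arr - arr.max(axis=axis, keepdims=True)  # avoids overflow in exp
    exp = np.exp(shifted)
    return exp / exp.sum(axis=axis, keepdims=True)

probs = np_softmax(np.array([[1.0, 2.0, 3.0], [0.0, 0.0, 1000.0]]))
print(probs.sum(axis=-1))  # [1. 1.]
```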
7 changes: 4 additions & 3 deletions aviary/data.py
@@ -16,11 +16,12 @@ class InMemoryDataLoader:

     Source: https://discuss.pytorch.org/t/27014/6.

     Args:
-        *tensors: List of arrays or tensors. Must all have the same length in dimension 0.
+        *tensors: List of arrays or tensors. Must all have the same length in
+            dimension 0.
         collate_fn (Callable): Should accept variadic list of tensors and
             output a minibatch of data ready for model consumption.
-        batch_size (int, optional): Usually 64, 128 or 256. Can be larger for test set loaders
-            to speedup inference. Defaults to 64.
+        batch_size (int, optional): Usually 64, 128 or 256. Can be larger for test set
+            loaders to speed up inference. Defaults to 64.
         shuffle (bool, optional): If True, shuffle the data *in-place* whenever an
             iterator is created from this object. Defaults to False.
     """
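A minimal sketch of the pattern this class documents (slice pre-loaded tensors directly instead of indexing a Dataset item by item; everything beyond the documented args is an assumption):

```python
import torch

class TinyInMemoryLoader:
    """Sketch: batch pre-loaded tensors without per-item Dataset indexing."""

    def __init__(self, *tensors, collate_fn=lambda *batch: batch, batch_size=64, shuffle=False):
        if len({t.shape[0] for t in tensors}) != 1:
            raise ValueError("All tensors must have the same length in dimension 0")
        self.tensors, self.collate_fn = tensors, collate_fn
        self.batch_size, self.shuffle = batch_size, shuffle

    def __iter__(self):
        n_samples = self.tensors[0].shape[0]
        if self.shuffle:  # shuffle *in-place* with one shared permutation
            perm = torch.randperm(n_samples)
            self.tensors = tuple(t[perm] for t in self.tensors)
        for start in range(0, n_samples, self.batch_size):
            yield self.collate_fn(*(t[start : start + self.batch_size] for t in self.tensors))

loader = TinyInMemoryLoader(torch.arange(10), torch.arange(10) ** 2, batch_size=4, shuffle=True)
for inputs, targets in loader:
    print(inputs, targets)
```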
26 changes: 14 additions & 12 deletions aviary/roost/data.py
@@ -31,15 +31,16 @@ def __init__(
         Args:
             df (pd.DataFrame): Pandas dataframe holding input and target values.
-            task_dict (dict[str, "regression" | "classification"]): Map from target names to task
-                type.
-            elem_embedding (str, optional): One of "matscholar200", "cgcnn92", "megnet16",
-                "onehot112" or path to a file with custom element embeddings.
-                Defaults to "matscholar200".
+            task_dict (dict[str, "regression" | "classification"]): Map from target
+                names to task type.
+            elem_embedding (str, optional): One of "matscholar200", "cgcnn92",
+                "megnet16", "onehot112" or path to a file with custom element
+                embeddings. Defaults to "matscholar200".
             inputs (str, optional): df column name holding material compositions.
                 Defaults to "composition".
-            identifiers (list, optional): df columns for distinguishing data points. Will be
-                copied over into the model's output CSV. Defaults to ["material_id", "composition"].
+            identifiers (list, optional): df columns for distinguishing data points.
+                Will be copied over into the model's output CSV. Defaults to
+                ["material_id", "composition"].
         """
         if len(identifiers) != 2:
             raise AssertionError("Two identifiers are required")
@@ -100,11 +101,11 @@ def __getitem__(self, idx: int):
             elem_fea = np.vstack([self.elem_features[element] for element in elements])
         except AssertionError as exc:
             raise AssertionError(
-                f"{material_ids} ({composition}) contains element types not in embedding"
+                f"{material_ids} contains element types not in embedding"
             ) from exc
         except ValueError as exc:
             raise ValueError(
-                f"{material_ids} ({composition}) composition cannot be parsed into elements"
+                f"{material_ids} composition cannot be parsed into elements"
             ) from exc

         n_elems = len(elements)
@@ -150,13 +151,14 @@ def collate_batch(
         - nbr_fea (Tensor):
         - self_idx (LongTensor):
         - nbr_idx (LongTensor):
-        - target (Tensor | LongTensor): target values containing floats for regression or
-            integers as class labels for classification
+        - target (Tensor | LongTensor): target values containing floats for
+            regression or integers as class labels for classification
         - cif_id: str or int

     Returns:
         tuple[
-            tuple[Tensor, Tensor, LongTensor, LongTensor, LongTensor]: batched Roost model inputs,
+            tuple[Tensor, Tensor, LongTensor, LongTensor, LongTensor]: batched Roost
+                model inputs,
             tuple[Tensor | LongTensor]: Target values for different tasks,
             # TODO this last tuple is unpacked how to do type hint?
             *tuple[str | int]: Identifiers like material_id, composition
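The `np.vstack` call in the `__getitem__` hunk stacks one embedding row per element of the composition. A toy version with a hypothetical two-element embedding table (a plain dict raises KeyError rather than AssertionError, hence the different except clause):

```python
import numpy as np

# hypothetical stand-in for an embedding file such as matscholar200
elem_features = {"Fe": np.array([0.1, 0.9]), "O": np.array([0.7, 0.2])}
elements = ["Fe", "O"]  # e.g. parsed from composition "Fe2O3"

try:
    elem_fea = np.vstack([elem_features[element] for element in elements])
except KeyError as exc:
    raise AssertionError("composition contains element types not in embedding") from exc

print(elem_fea.shape)  # (2, 2): one embedding row per element
```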