-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
123 lines (107 loc) · 4.45 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import os
import tempfile
import hydra
import mlflow
from omegaconf import DictConfig
# Steps executed, in this order of declaration, when ``main.steps`` is "all".
_steps = [
    "data_download",
    "data_clean",
    "data_check",
    "data_split",
    "train_random_forest",
    # "test_model" is deliberately left out of the default list so that it is
    # not run by mistake; it only runs when named explicitly in ``main.steps``.
]
def _parse_steps(steps_par, default_steps):
    """Turn the ``main.steps`` setting into a list of step names.

    Args:
        steps_par: Comma-separated step names, or the literal ``"all"``.
        default_steps: Steps to use when ``steps_par`` is ``"all"``.

    Returns:
        The list of step names to execute. Whitespace around each name is
        ignored and empty entries are dropped, so ``"a, b"`` selects the same
        steps as ``"a,b"``.
    """
    requested = [name.strip() for name in steps_par.split(",")]
    requested = [name for name in requested if name]
    if requested == ["all"]:
        return list(default_steps)
    return requested


# This automatically reads in the configuration
@hydra.main(config_name='config', config_path=".", version_base=None)
def go(config: DictConfig):
    """Run the selected pipeline steps as MLflow projects.

    Each step lives in ``components/<step_name>`` under the directory the
    script was launched from and is started through its ``main`` entry point.
    A step runs only if its name is selected by ``config["main"]["steps"]``
    (a comma-separated list, or ``"all"`` for the default ``_steps``).

    Args:
        config: Hydra configuration. The sections read here are ``main``,
            ``etl``, ``data_check`` and ``modeling``.
    """
    # Setup wandb experiment. All runs will be grouped under this name
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # Steps to execute
    active_steps = _parse_steps(config['main']['steps'], _steps)

    # Hydra executes the script in a different directory than the root, so
    # component paths are resolved against the original working directory.
    root = hydra.utils.get_original_cwd()
    components_dir = os.path.join(root, "components")

    # A temporary directory is kept alive while the steps run. Note that
    # nothing below changes into it or writes to it.
    with tempfile.TemporaryDirectory():

        if "data_download" in active_steps:
            # Download file and load in W&B
            mlflow.run(
                os.path.join(components_dir, "data_download"),
                "main",
                parameters={
                    "sample": config["etl"]["sample"],
                    "artifact_name": "sample.csv",
                    "artifact_type": "raw_data",
                    "artifact_description": "Raw file as downloaded"
                },
            )

        if "data_clean" in active_steps:
            mlflow.run(
                os.path.join(components_dir, "data_clean"),
                "main",
                parameters={
                    "input_artifact": "sample.csv:latest",
                    "output_artifact": "clean_sample.csv",
                    "output_type": "clean_sample",
                    "output_description": "Data with outliers and null values removed",
                    "min_price": config['etl']['min_price'],
                    "max_price": config['etl']['max_price']
                },
            )

        if "data_check" in active_steps:
            mlflow.run(
                os.path.join(components_dir, "data_check"),
                "main",
                parameters={
                    "csv": "clean_sample.csv:latest",
                    "ref": "clean_sample.csv:reference",
                    "kl_threshold": config["data_check"]["kl_threshold"],
                    "min_price": config['etl']['min_price'],
                    "max_price": config['etl']['max_price'],
                },
            )

        if "data_split" in active_steps:
            mlflow.run(
                os.path.join(components_dir, "data_split"),
                "main",
                parameters={
                    "input": "clean_sample.csv:latest",
                    "test_size": config['modeling']['test_size'],
                    "random_seed": config['modeling']['random_seed'],
                    "stratify_by": config['modeling']['stratify_by']
                },
            )

        if "train_random_forest" in active_steps:
            # Serialize the random forest configuration into JSON. The file is
            # created in the current working directory and handed to the
            # training step as an absolute path.
            rf_config = os.path.abspath("rf_config.json")
            with open(rf_config, "w+") as fp:
                json.dump(dict(config["modeling"]["random_forest"].items()), fp)  # DO NOT TOUCH

            mlflow.run(
                os.path.join(components_dir, "train_random_forest"),
                "main",
                parameters={
                    "trainval_artifact": "trainval_data.csv:latest",
                    "val_size": config['modeling']['val_size'],
                    "random_seed": config['modeling']['random_seed'],
                    "stratify_by": config['modeling']['stratify_by'],
                    "rf_config": rf_config,
                    "max_tfidf_features": config['modeling']['max_tfidf_features'],
                    "output_artifact": "random_forest_export"
                },
            )

        if "test_model" in active_steps:
            # Not part of the default steps: only runs when requested by name.
            mlflow.run(
                os.path.join(components_dir, "test_model"),
                "main",
                parameters={
                    "mlflow_model": "random_forest_export:prod",
                    "test_dataset": "test_data.csv:latest"
                },
            )
# Start the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    go()