# main.py

import json
import os
import tempfile

import hydra
import mlflow
from omegaconf import DictConfig

# Names of the pipeline steps that go() treats as active when the
# "main.steps" configuration value is exactly "all".
_steps = [
    "data_download",
    "data_clean",
    "data_check",
    "data_split",
    "train_random_forest",
    # "test_model" is deliberately left out of this list so that it is not run
    # by mistake; go() only runs it when it is named explicitly in the steps
    # setting.
]


def _parse_steps(steps_par):
    """Return the step names selected by the ``main.steps`` setting.

    ``"all"`` selects the module-level ``_steps`` list. Any other value is
    treated as a comma-separated list of step names; whitespace around each
    name is ignored and empty entries are dropped, so ``"a, b,"`` selects
    ``["a", "b"]``.
    """
    if steps_par.strip() == "all":
        return _steps
    return [name.strip() for name in steps_par.split(",") if name.strip()]


def _run_component(root, name, parameters):
    """Run the ``main`` entry point of the MLflow project in ``<root>/components/<name>``."""
    mlflow.run(
        os.path.join(root, "components", name),
        "main",
        parameters=parameters,
    )


# This automatically reads in the configuration
@hydra.main(config_name='config', config_path=".", version_base=None)
def go(config: DictConfig):
    """Run the selected pipeline steps as MLflow projects.

    Each step is an MLflow project located under ``components/<step>`` relative
    to the directory the script was launched from. A step runs only when its
    name is among the active steps, and steps are always checked in the fixed
    order below regardless of the order in which they were requested.

    Args:
        config: Hydra configuration. The keys read here are
            ``main.project_name``, ``main.experiment_name``, ``main.steps``,
            ``etl.sample``, ``etl.min_price``, ``etl.max_price``,
            ``data_check.kl_threshold`` and, under ``modeling``,
            ``test_size``, ``val_size``, ``random_seed``, ``stratify_by``,
            ``max_tfidf_features`` and ``random_forest``.
    """
    # Setup wandb experiment. All runs will be grouped under this name
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # Steps to execute
    active_steps = _parse_steps(config['main']['steps'])

    # Hydra may execute the script in a different directory than the root, so
    # component paths are resolved against the original working directory.
    root = hydra.utils.get_original_cwd()

    # Scratch directory for files that are only needed while the steps run;
    # it is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmp_dir:
        if "data_download" in active_steps:
            # Download file and load in W&B
            _run_component(
                root,
                "data_download",
                {
                    "sample": config["etl"]["sample"],
                    "artifact_name": "sample.csv",
                    "artifact_type": "raw_data",
                    "artifact_description": "Raw file as downloaded"
                },
            )

        if "data_clean" in active_steps:
            _run_component(
                root,
                "data_clean",
                {
                    "input_artifact": "sample.csv:latest",
                    "output_artifact": "clean_sample.csv",
                    "output_type": "clean_sample",
                    "output_description": "Data with outliers and null values removed",
                    "min_price": config['etl']['min_price'],
                    "max_price": config['etl']['max_price']
                },
            )

        if "data_check" in active_steps:
            _run_component(
                root,
                "data_check",
                {
                    "csv": "clean_sample.csv:latest",
                    "ref": "clean_sample.csv:reference",
                    "kl_threshold": config["data_check"]["kl_threshold"],
                    "min_price": config['etl']['min_price'],
                    "max_price": config['etl']['max_price'],
                },
            )

        if "data_split" in active_steps:
            _run_component(
                root,
                "data_split",
                {
                    "input": "clean_sample.csv:latest",
                    "test_size": config['modeling']['test_size'],
                    "random_seed": config['modeling']['random_seed'],
                    "stratify_by": config['modeling']['stratify_by']
                },
            )

        if "train_random_forest" in active_steps:
            # Serialize the random forest configuration into JSON. The file
            # lives in the scratch directory so it does not linger in the
            # working directory; mlflow.run blocks until the step finishes by
            # default, so the file is still present while the step reads it.
            rf_config = os.path.join(tmp_dir, "rf_config.json")
            with open(rf_config, "w") as fp:
                json.dump(dict(config["modeling"]["random_forest"].items()), fp)  # DO NOT TOUCH

            _run_component(
                root,
                "train_random_forest",
                {
                    "trainval_artifact": "trainval_data.csv:latest",
                    "val_size": config['modeling']['val_size'],
                    "random_seed": config['modeling']['random_seed'],
                    "stratify_by": config['modeling']['stratify_by'],
                    "rf_config": rf_config,
                    "max_tfidf_features": config['modeling']['max_tfidf_features'],
                    "output_artifact": "random_forest_export"
                },
            )

        # Not part of _steps, so this only runs when requested by name.
        if "test_model" in active_steps:
            _run_component(
                root,
                "test_model",
                {
                    "mlflow_model": "random_forest_export:prod",
                    "test_dataset": "test_data.csv:latest"
                },
            )


# Script entry point: go() is called with no arguments here; the @hydra.main
# decorator applied to go() is what provides its `config` parameter.
if __name__ == "__main__":
    go()