Expose the `n_components` (number of components) and `covariance_type` (type of covariance between components) parameters of the one-class classifier.

PiperOrigin-RevId: 653301088
google-research · Jul 17, 2024 · 1d4e648 · 1d4e648
1 parent bba6181
commit 1d4e648
Show file tree

Hide file tree

Showing 9 changed files with 70 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,10 @@ To release a new version (e.g. from `1.0.0` -> `2.0.0`):
 
 ## [Unreleased]
 
+## [0.3.2] - 2024-07-16
+
+* Exposes the `n_components` and `covariance_type` parameters of the one-class classifier.
+
 ## [0.3.1] - 2024-07-13
 
 * Now writes out the pseudolabel weights and a flag that indicates whether a sample has a ground truth label (0) or a pseudolabel (1).
@@ -49,7 +53,8 @@ To release a new version (e.g. from `1.0.0` -> `2.0.0`):
 
 * Initial release
 
-[Unreleased]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.1...HEAD
+[Unreleased]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.2...HEAD
+[0.3.2]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.1...v0.3.2
 [0.3.1]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.0...v0.3.1
 [0.3.0]: https://github.com/google-research/spade_anomaly_detection/compare/v0.2.2...v0.3.0
 [0.2.2]: https://github.com/google-research/spade_anomaly_detection/compare/v0.2.1...v0.2.2

diff --git a/README.md b/README.md
@@ -103,6 +103,10 @@ one class classifier ensemble to label a point as negative. The higher this valu
 
 <span style="color:yellow;background-color:lightgrey">ensemble_count</span>: Integer representing the number of one class classifiers in the ensemble used for pseudo labeling unlabeled data points. The more models in the ensemble, the less likely it is for all the models to gain consensus, and thus will reduce the amount of labeled data points. By default, we use 5 one class classifiers.
 
+<span style="color:yellow;background-color:lightgrey">n_components</span>: Integer representing the number of components to use in the one class classifier ensemble. By default, we use 1 component.
+
+<span style="color:yellow;background-color:lightgrey">covariance_type</span>: String representing the covariance type to use in the one class classifier ensemble. By default, we use 'full' covariance. Note that when there are many components, a 'full' covariance matrix may not be suitable.
+
 <span style="color:yellow;background-color:lightgrey">verbose (boolean)</span>: The amount of console logs to display during training. Use the default value of False to show fewer messages, and True for displaying many aspects of model training and scoring. This is useful for debugging model performance.
 
 ## Training Job Arguments

diff --git a/spade_anomaly_detection/__init__.py b/spade_anomaly_detection/__init__.py
@@ -31,4 +31,4 @@
 
 # A new PyPI release will be pushed every time `__version__` is increased.
 # When changing this, also update the CHANGELOG.md.
-__version__ = '0.3.1'
+__version__ = '0.3.2'
diff --git a/spade_anomaly_detection/occ_ensemble_test.py b/spade_anomaly_detection/occ_ensemble_test.py
@@ -29,15 +29,15 @@
 
 """Tests for the one class classifier ensemble."""
 
-import numpy as np
 
+from absl.testing import parameterized
+import numpy as np
 from spade_anomaly_detection import data_loader
 from spade_anomaly_detection import occ_ensemble
-
 import tensorflow as tf
 
 
-class OccEnsembleTest(tf.test.TestCase):
+class OccEnsembleTest(tf.test.TestCase, parameterized.TestCase):
 
   def test_ensemble_initialization_no_error(self):
     gmm_ensemble = occ_ensemble.GmmEnsemble(n_components=1, ensemble_count=10)
@@ -47,13 +47,21 @@ def test_ensemble_initialization_no_error(self):
     with self.subTest(name='ObjectAttributes'):
       self.assertEqual(gmm_ensemble.ensemble_count, 10)
 
-  def test_ensemble_training_no_error(self):
+  # Params to test: n_components, ensemble_count, covariance_type.
+  @parameterized.named_parameters(
+      ('components_1_ensemble_10_full', 1, 10, 'full'),
+      ('components_3_ensemble_5_full', 1, 5, 'full'),
+      ('components_3_ensemble_5_tied', 1, 5, 'tied'),
+  )
+  def test_ensemble_training_no_error(
+      self, n_components, ensemble_count, covariance_type
+  ):
     batches_per_occ = 10
-    ensemble_count = 5
-    n_components = 1
 
     ensemble_obj = occ_ensemble.GmmEnsemble(
-        n_components=n_components, ensemble_count=ensemble_count
+        n_components=n_components,
+        ensemble_count=ensemble_count,
+        covariance_type=covariance_type,
     )
 
     tf_dataset = data_loader.load_tf_dataset_from_csv(

diff --git a/spade_anomaly_detection/parameters.py b/spade_anomaly_detection/parameters.py
@@ -146,6 +146,11 @@ class RunnerParameters:
       the less likely it is for all the models to gain consensus, and thus will
       reduce the amount of labeled data points. By default, we use 5 one class
       classifiers.
+    n_components: The number of components to use in the one class classifier
+      ensemble. By default, we use 1 component.
+    covariance_type: The covariance type to use in the one class classifier
+      ensemble. By default, we use 'full' covariance. Note that when there are
+      many components, a 'full' covariance matrix may not be suitable.
     random_seed: The random seed to use for all random number generators in the
       algorithm.
     verbose: The amount of console logs to display during training. Use False to
@@ -177,6 +182,8 @@ class RunnerParameters:
   max_occ_batch_size: int = 50000
   labeling_and_model_training_batch_size: Optional[int] = None
   ensemble_count: int = 5
+  n_components: int = 1
+  covariance_type: str = 'full'
   random_seed: int = _RANDOM_SEED
   verbose: bool = False
 

diff --git a/spade_anomaly_detection/runner.py b/spade_anomaly_detection/runner.py
@@ -276,6 +276,8 @@ def instantiate_and_fit_ensemble(
     """
 
     ensemble_object = occ_ensemble.GmmEnsemble(
+        n_components=self.runner_parameters.n_components,
+        covariance_type=self.runner_parameters.covariance_type,
         ensemble_count=self.runner_parameters.ensemble_count,
         positive_threshold=self.runner_parameters.positive_threshold,
         negative_threshold=self.runner_parameters.negative_threshold,

diff --git a/spade_anomaly_detection/runner_test.py b/spade_anomaly_detection/runner_test.py
@@ -75,6 +75,8 @@ def setUp(self):
         max_occ_batch_size=50000,
         labeling_and_model_training_batch_size=None,
         ensemble_count=5,
+        n_components=1,
+        covariance_type='full',
         verbose=False,
     )
 

diff --git a/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh b/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh
@@ -48,22 +48,24 @@ TEST_LABEL_COL_NAME=${16:-"y"}
 ALPHA=${17:-"1.0"}
 BATCHES_PER_MODEL=${18:-"1"}
 ENSEMBLE_COUNT=${19:-"5"}
-MAX_OCC_BATCH_SIZE=${20:-"50000"}
-LABELING_AND_MODEL_TRAINING_BATCH_SIZE=${21:-"100000"}
-VERBOSE=${22:-"True"}
-UPLOAD_ONLY=${23:-"False"}
+N_COMPONENTS=${20:-"1"}
+COVARIANCE_TYPE=${21:-"full"}
+MAX_OCC_BATCH_SIZE=${22:-"50000"}
+LABELING_AND_MODEL_TRAINING_BATCH_SIZE=${23:-"100000"}
+VERBOSE=${24:-"True"}
+UPLOAD_ONLY=${25:-"False"}
 
 # Give a unique name to your training job.
 TRIAL_NAME="spade_${USER}_${DATETIME}"
 
 # Image name and location
 IMAGE_NAME="spade"
-IMAGE_TAG=${24:-"latest-oss"}
+IMAGE_TAG=${26:-"latest-oss"}
 # Project image (use this for testing)
 IMAGE_URI="us-docker.pkg.dev/${PROJECT_ID}/spade/${IMAGE_NAME}:${IMAGE_TAG}"
 echo "IMAGE_URI = ${IMAGE_URI}"
 
-BUILD=${25:-"TRUE"}
+BUILD=${27:-"TRUE"}
 
 if [[ "${BUILD}" == "TRUE" ]]; then
   /bin/bash ./scripts/build_and_push_image.sh "${IMAGE_TAG}" "${IMAGE_NAME}" "${PROJECT_ID}" || exit
@@ -97,6 +99,8 @@ gcloud ai custom-jobs create \
   --args=--alpha="${ALPHA}" \
   --args=--batches_per_model="${BATCHES_PER_MODEL}" \
   --args=--ensemble_count="${ENSEMBLE_COUNT}" \
+  --args=--n_components="${N_COMPONENTS}" \
+  --args=--covariance_type="${COVARIANCE_TYPE}" \
   --args=--max_occ_batch_size="${MAX_OCC_BATCH_SIZE}" \
   --args=--labeling_and_model_training_batch_size="${LABELING_AND_MODEL_TRAINING_BATCH_SIZE}" \
   --args=--upload_only="${UPLOAD_ONLY}" \

diff --git a/spade_anomaly_detection/task.py b/spade_anomaly_detection/task.py
@@ -301,6 +301,27 @@
     ),
 )
 
+_N_COMPONENTS = flags.DEFINE_integer(
+    "n_components",
+    default=1,
+    required=False,
+    help=(
+        "The number of components to use in the one class classifier ensemble. "
+        "By default, we use 1 component."
+    ),
+)
+
+_COVARIANCE_TYPE = flags.DEFINE_string(
+    "covariance_type",
+    default="full",
+    required=False,
+    help=(
+        "The covariance type to use in the one class classifier ensemble. By "
+        "default, we use 'full' covariance. Note that when there are many "
+        "components, a 'full' covariance matrix may not be suitable."
+    ),
+)
+
 _VERBOSE = flags.DEFINE_bool(
     "verbose",
     default=False,
@@ -367,6 +388,8 @@ def main(argv: Sequence[str]) -> None:
       max_occ_batch_size=_MAX_OCC_BATCH_SIZE.value,
       labeling_and_model_training_batch_size=_BATCH_SIZE.value,
       ensemble_count=_ENSEMBLE_COUNT.value,
+      n_components=_N_COMPONENTS.value,
+      covariance_type=_COVARIANCE_TYPE.value,
       random_seed=_RANDOM_SEED,
       verbose=_VERBOSE.value,
   )