
Commit

Add kmeans clustering function
oyvindeide committed Jan 11, 2021
1 parent 8d639fe commit a199c60
Showing 5 changed files with 69 additions and 26 deletions.
16 changes: 5 additions & 11 deletions semeio/workflows/misfit_preprocessor/job.py
@@ -11,27 +11,21 @@ def run(config, measured_data, reporter):
     if config.workflow.method == SPEARMAN_CORRELATION:
         sconfig = config.workflow.spearman_correlation.clustering
         scaling_configs = spearman_job(
-            measured_data,
-            sconfig.hierarchical.t,
-            reporter,
-            criterion=sconfig.hierarchical.criterion,
-            depth=sconfig.hierarchical.depth,
-            method=sconfig.hierarchical.method,
-            metric=sconfig.hierarchical.metric,
+            measured_data, reporter, **sconfig.hierarchical._asdict()
         )
         pca_threshold = config.workflow.spearman_correlation.pca.threshold
     elif config.workflow.method == AUTO_SCALE:
         job = ObservationScaleFactor(reporter, measured_data)
         auto_scale_config = config.workflow.auto_scale
         nr_components, _ = job.perform_pca(auto_scale_config.pca.threshold)
         sconfig = auto_scale_config.clustering
+        config_dict = sconfig.hierarchical._asdict()
+        config_dict["criterion"] = "maxclust"
+        config_dict["t"] = nr_components
         scaling_configs = spearman_job(
             measured_data,
-            nr_components,
             reporter,
-            criterion="maxclust",
-            method=sconfig.hierarchical.method,
-            metric=sconfig.hierarchical.metric,
+            **config_dict,
         )
         pca_threshold = auto_scale_config.pca.threshold
     else:
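Note on the refactor: sconfig.hierarchical apparently behaves like a namedtuple (it exposes _asdict()), so its fields can be expanded into keyword arguments for spearman_job in one step. A minimal standalone sketch of the pattern follows; the namedtuple, its values, and job_stub are hypothetical stand-ins, not semeio code.

# Minimal sketch of the _asdict() expansion; the namedtuple and its values
# are hypothetical stand-ins for sconfig.hierarchical.
from collections import namedtuple

Hierarchical = namedtuple(
    "Hierarchical", ["t", "criterion", "depth", "method", "metric"]
)
settings = Hierarchical(
    t=1.0, criterion="inconsistent", depth=2, method="single", metric="euclidean"
)


def job_stub(measured_data, reporter, **cluster_config):
    # Mirrors how spearman_job now receives its clustering settings.
    return cluster_config


# _asdict() converts the namedtuple to a dict (an OrderedDict on older
# Pythons), and ** expands every field into a keyword argument:
print(job_stub("data", "reporter", **settings._asdict()))
# {'t': 1.0, 'criterion': 'inconsistent', 'depth': 2, 'method': 'single', 'metric': 'euclidean'}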
27 changes: 27 additions & 0 deletions semeio/workflows/spearman_correlation_job/cluster_analysis.py
@@ -0,0 +1,27 @@
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.cluster import KMeans


def fcluster_analysis(
    correlation_matrix,
    t=1.0,
    criterion="inconsistent",
    depth=2,
    method="single",
    metric="euclidean",
):
    a = linkage(correlation_matrix, method, metric)
    return fcluster(a, t, criterion=criterion, depth=depth)


def kmeans_analysis(
    correlation_matrix, n_clusters, n_init=10, max_iter=300, random_state=0
):
    kmeans = KMeans(
        init="random",
        n_clusters=n_clusters,
        n_init=n_init,
        max_iter=max_iter,
        random_state=random_state,
    ).fit(correlation_matrix)
    # Scikit-learn clusters are 0-indexed while scipy's are 1-indexed, so shift by one.
    return kmeans.labels_ + 1
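A quick usage sketch of the two new functions on toy data; the random matrix below is illustrative only, not semeio data.

# Hypothetical usage of the two cluster functions on a toy matrix.
import numpy as np

rng = np.random.default_rng(seed=0)
matrix = rng.random((10, 4))  # stand-in for a correlation matrix

labels_h = fcluster_analysis(matrix, t=3, criterion="maxclust")
labels_k = kmeans_analysis(matrix, n_clusters=3)

print(labels_h.min())  # 1 -- scipy's fcluster labels start at 1
print(labels_k.min())  # 1 -- the +1 shift aligns sklearn's labels with scipy's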
25 changes: 11 additions & 14 deletions semeio/workflows/spearman_correlation_job/job.py
@@ -1,17 +1,21 @@
 # -*- coding: utf-8 -*-
 import itertools
 import logging
-from scipy.cluster.hierarchy import fcluster, linkage
+
+from semeio.workflows.spearman_correlation_job.cluster_analysis import (
+    fcluster_analysis,
+    kmeans_analysis,
+)
+
+
+_cluster_map = {"hierarchical": fcluster_analysis, "kmeans": kmeans_analysis}
 
 
 def spearman_job(
     measured_data,
-    threshold,
     reporter,
-    criterion="inconsistent",
-    depth=2,
-    method="single",
-    metric="euclidean",
+    cluster_function="hierarchical",
+    **cluster_config,
 ):
     """
     Given measured_data and threshold, it returns configurations that describe
@@ -25,9 +29,7 @@ def spearman_job(
     correlation_matrix = _calculate_correlation_matrix(simulated_data)
     reporter.publish_csv("correlation_matrix", correlation_matrix)
 
-    clusters = _cluster_analysis(
-        correlation_matrix, threshold, criterion, depth, method, metric
-    )
+    clusters = _cluster_map[cluster_function](correlation_matrix, **cluster_config)
 
     columns = correlation_matrix.columns

@@ -97,8 +99,3 @@ def _calculate_correlation_matrix(data):
     # of pandas (https://github.com/pandas-dev/pandas/pull/28151), for now this is
     # equivalent:
     return data.rank().corr(method="pearson")
-
-
-def _cluster_analysis(correlation_matrix, threshold, criterion, depth, method, metric):
-    a = linkage(correlation_matrix, method, metric)
-    return fcluster(a, threshold, criterion=criterion, depth=depth)
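The new _cluster_map dispatch keeps spearman_job agnostic about clustering details: the cluster_function string selects a callable, and **cluster_config is forwarded to it unchanged. Below is a standalone toy version of the pattern; the stub functions are illustrative, not semeio code.

# Toy version of the dispatch pattern used by spearman_job.
def hierarchical_stub(matrix, t=1.0, criterion="inconsistent"):
    return "hierarchical(t={}, criterion={})".format(t, criterion)


def kmeans_stub(matrix, n_clusters=2):
    return "kmeans(n_clusters={})".format(n_clusters)


cluster_map = {"hierarchical": hierarchical_stub, "kmeans": kmeans_stub}


def cluster(matrix, cluster_function="hierarchical", **cluster_config):
    # The string selects the callable; the kwargs flow through untouched.
    return cluster_map[cluster_function](matrix, **cluster_config)


print(cluster(None, t=2.0))                                    # hierarchical(t=2.0, criterion=inconsistent)
print(cluster(None, cluster_function="kmeans", n_clusters=3))  # kmeans(n_clusters=3)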
@@ -27,7 +27,7 @@ def run(self, *args):
         parser = spearman_job_parser()
         args = parser.parse_args(args)
 
-        scaling_configs = spearman_job(measured_data, args.threshold, self.reporter)
+        scaling_configs = spearman_job(measured_data, self.reporter, t=args.threshold)
 
         if not args.dry_run:
             try:
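The keyword matters here: after the refactor, spearman_job's third positional parameter is cluster_function rather than threshold, and t=... travels through **cluster_config to fcluster_analysis. A short sketch, with names as in the diff above:

# New call: the threshold is forwarded via **cluster_config as t.
spearman_job(measured_data, self.reporter, t=args.threshold)
# The old positional form would now bind args.threshold to reporter and
# self.reporter to cluster_function, failing at the _cluster_map lookup:
# spearman_job(measured_data, args.threshold, self.reporter)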
25 changes: 25 additions & 0 deletions tests/jobs/misfit_preprocessor/unit/test_cluster_functions.py
@@ -0,0 +1,25 @@
# pylint: disable=unbalanced-tuple-unpacking
from semeio.workflows.spearman_correlation_job.cluster_analysis import (
    kmeans_analysis,
    fcluster_analysis,
)
from sklearn.datasets import make_blobs
import pytest


@pytest.mark.parametrize(
    "func, kwargs",
    ((fcluster_analysis, {"criterion": "maxclust"}), (kmeans_analysis, {})),
)
def test_same_format(func, kwargs):
    # The goal of this test is not to test the actual clustering functions,
    # but rather their format. Scipy clusters (labels) are 1-indexed while
    # sklearn's are 0-indexed. We therefore set up a very simple dataset with
    # clearly defined clusters so the result will be the same for all functions.
    features, true_labels = make_blobs(
        n_samples=200, centers=3, cluster_std=0.1, random_state=42
    )
    cluster_result = func(features, 3, **kwargs)
    # The clusters are typically the same, but the labels vary, so we perform the
    # simplest test, just checking that the desired labels are present.
    assert set(cluster_result) == {1, 2, 3}
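For reference, the indexing mismatch this test guards against can be seen directly; the snippet below is an illustration, not part of the commit.

# Raw sklearn labels start at 0, scipy's fcluster labels at 1.
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

features, _ = make_blobs(n_samples=30, centers=2, cluster_std=0.1, random_state=0)

raw_sklearn = KMeans(n_clusters=2, n_init=10, random_state=0).fit(features).labels_
scipy_labels = fcluster(linkage(features, "single"), 2, criterion="maxclust")

print(raw_sklearn.min())   # 0 -- why kmeans_analysis adds 1
print(scipy_labels.min())  # 1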
