
Commit

Add kmeans clustering function
oyvindeide committed Jan 11, 2021
1 parent 8d639fe commit a199c60
Showing 5 changed files with 69 additions and 26 deletions.
16 changes: 5 additions & 11 deletions semeio/workflows/misfit_preprocessor/job.py
@@ -11,27 +11,21 @@ def run(config, measured_data, reporter):
     if config.workflow.method == SPEARMAN_CORRELATION:
         sconfig = config.workflow.spearman_correlation.clustering
         scaling_configs = spearman_job(
-            measured_data,
-            sconfig.hierarchical.t,
-            reporter,
-            criterion=sconfig.hierarchical.criterion,
-            depth=sconfig.hierarchical.depth,
-            method=sconfig.hierarchical.method,
-            metric=sconfig.hierarchical.metric,
+            measured_data, reporter, **sconfig.hierarchical._asdict()
         )
         pca_threshold = config.workflow.spearman_correlation.pca.threshold
     elif config.workflow.method == AUTO_SCALE:
         job = ObservationScaleFactor(reporter, measured_data)
         auto_scale_config = config.workflow.auto_scale
         nr_components, _ = job.perform_pca(auto_scale_config.pca.threshold)
         sconfig = auto_scale_config.clustering
+        config_dict = sconfig.hierarchical._asdict()
+        config_dict["criterion"] = "maxclust"
+        config_dict["t"] = nr_components
         scaling_configs = spearman_job(
             measured_data,
-            nr_components,
             reporter,
-            criterion="maxclust",
-            method=sconfig.hierarchical.method,
-            metric=sconfig.hierarchical.metric,
+            **config_dict,
         )
         pca_threshold = auto_scale_config.pca.threshold
     else:
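Note on the refactor: sconfig.hierarchical apparently behaves like a namedtuple (it exposes _asdict()), so its fields can be expanded into keyword arguments for spearman_job in one step. A minimal standalone sketch of the pattern follows; the namedtuple, its values, and job_stub are hypothetical stand-ins, not semeio code.

# Minimal sketch of the _asdict() expansion; the namedtuple and its values
# are hypothetical stand-ins for sconfig.hierarchical.
from collections import namedtuple

Hierarchical = namedtuple(
    "Hierarchical", ["t", "criterion", "depth", "method", "metric"]
)
settings = Hierarchical(
    t=1.0, criterion="inconsistent", depth=2, method="single", metric="euclidean"
)


def job_stub(measured_data, reporter, **cluster_config):
    # Mirrors how spearman_job now receives its clustering settings.
    return cluster_config


# _asdict() converts the namedtuple to a dict (an OrderedDict on older
# Pythons), and ** expands every field into a keyword argument:
print(job_stub("data", "reporter", **settings._asdict()))
# {'t': 1.0, 'criterion': 'inconsistent', 'depth': 2, 'method': 'single', 'metric': 'euclidean'}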
27 changes: 27 additions & 0 deletions semeio/workflows/spearman_correlation_job/cluster_analysis.py
@@ -0,0 +1,27 @@
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.cluster import KMeans


def fcluster_analysis(
    correlation_matrix,
    t=1.0,
    criterion="inconsistent",
    depth=2,
    method="single",
    metric="euclidean",
):
    a = linkage(correlation_matrix, method, metric)
    return fcluster(a, t, criterion=criterion, depth=depth)


def kmeans_analysis(
    correlation_matrix, n_clusters, n_init=10, max_iter=300, random_state=0
):
    kmeans = KMeans(
        init="random",
        n_clusters=n_clusters,
        n_init=n_init,
        max_iter=max_iter,
        random_state=random_state,
    ).fit(correlation_matrix)
    # Scikit-learn clusters are 0-indexed while scipy's are 1-indexed, so shift by one.
    return kmeans.labels_ + 1
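A quick usage sketch of the two new functions on toy data; the random matrix below is illustrative only, not semeio data.

# Hypothetical usage of the two cluster functions on a toy matrix.
import numpy as np

rng = np.random.default_rng(seed=0)
matrix = rng.random((10, 4))  # stand-in for a correlation matrix

labels_h = fcluster_analysis(matrix, t=3, criterion="maxclust")
labels_k = kmeans_analysis(matrix, n_clusters=3)

print(labels_h.min())  # 1 -- scipy's fcluster labels start at 1
print(labels_k.min())  # 1 -- the +1 shift aligns sklearn's labels with scipy's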
25 changes: 11 additions & 14 deletions semeio/workflows/spearman_correlation_job/job.py
@@ -1,17 +1,21 @@
 # -*- coding: utf-8 -*-
 import itertools
 import logging
-from scipy.cluster.hierarchy import fcluster, linkage
+
+from semeio.workflows.spearman_correlation_job.cluster_analysis import (
+    fcluster_analysis,
+    kmeans_analysis,
+)
+
+
+_cluster_map = {"hierarchical": fcluster_analysis, "kmeans": kmeans_analysis}
 
 
 def spearman_job(
     measured_data,
-    threshold,
     reporter,
-    criterion="inconsistent",
-    depth=2,
-    method="single",
-    metric="euclidean",
+    cluster_function="hierarchical",
+    **cluster_config,
 ):
     """
     Given measured_data and threshold, it returns configurations that describe
@@ -25,9 +29,7 @@ def spearman_job(
     correlation_matrix = _calculate_correlation_matrix(simulated_data)
     reporter.publish_csv("correlation_matrix", correlation_matrix)
 
-    clusters = _cluster_analysis(
-        correlation_matrix, threshold, criterion, depth, method, metric
-    )
+    clusters = _cluster_map[cluster_function](correlation_matrix, **cluster_config)
 
     columns = correlation_matrix.columns

@@ -97,8 +99,3 @@ def _calculate_correlation_matrix(data):
     # of pandas (https://github.com/pandas-dev/pandas/pull/28151), for now this is
     # equivalent:
     return data.rank().corr(method="pearson")
-
-
-def _cluster_analysis(correlation_matrix, threshold, criterion, depth, method, metric):
-    a = linkage(correlation_matrix, method, metric)
-    return fcluster(a, threshold, criterion=criterion, depth=depth)
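The new _cluster_map dispatch keeps spearman_job agnostic about clustering details: the cluster_function string selects a callable, and **cluster_config is forwarded to it unchanged. Below is a standalone toy version of the pattern; the stub functions are illustrative, not semeio code.

# Toy version of the dispatch pattern used by spearman_job.
def hierarchical_stub(matrix, t=1.0, criterion="inconsistent"):
    return "hierarchical(t={}, criterion={})".format(t, criterion)


def kmeans_stub(matrix, n_clusters=2):
    return "kmeans(n_clusters={})".format(n_clusters)


cluster_map = {"hierarchical": hierarchical_stub, "kmeans": kmeans_stub}


def cluster(matrix, cluster_function="hierarchical", **cluster_config):
    # The string selects the callable; the kwargs flow through untouched.
    return cluster_map[cluster_function](matrix, **cluster_config)


print(cluster(None, t=2.0))                                    # hierarchical(t=2.0, criterion=inconsistent)
print(cluster(None, cluster_function="kmeans", n_clusters=3))  # kmeans(n_clusters=3)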
@@ -27,7 +27,7 @@ def run(self, *args):
         parser = spearman_job_parser()
         args = parser.parse_args(args)
 
-        scaling_configs = spearman_job(measured_data, args.threshold, self.reporter)
+        scaling_configs = spearman_job(measured_data, self.reporter, t=args.threshold)
 
         if not args.dry_run:
             try:
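The keyword matters here: after the refactor, spearman_job's third positional parameter is cluster_function rather than threshold, and t=... travels through **cluster_config to fcluster_analysis. A short sketch, with names as in the diff above:

# New call: the threshold is forwarded via **cluster_config as t.
spearman_job(measured_data, self.reporter, t=args.threshold)
# The old positional form would now bind args.threshold to reporter and
# self.reporter to cluster_function, failing at the _cluster_map lookup:
# spearman_job(measured_data, args.threshold, self.reporter)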
25 changes: 25 additions & 0 deletions tests/jobs/misfit_preprocessor/unit/test_cluster_functions.py
@@ -0,0 +1,25 @@
# pylint: disable=unbalanced-tuple-unpacking
from semeio.workflows.spearman_correlation_job.cluster_analysis import (
    kmeans_analysis,
    fcluster_analysis,
)
from sklearn.datasets import make_blobs
import pytest


@pytest.mark.parametrize(
    "func, kwargs",
    ((fcluster_analysis, {"criterion": "maxclust"}), (kmeans_analysis, {})),
)
def test_same_format(func, kwargs):
    # The goal of this test is not to test the actual clustering functions,
    # but rather their format. Scipy clusters (labels) are 1-indexed while
    # sklearn's are 0-indexed. We therefore set up a very simple dataset with
    # clearly defined clusters so the result will be the same for all functions.
    features, true_labels = make_blobs(
        n_samples=200, centers=3, cluster_std=0.1, random_state=42
    )
    cluster_result = func(features, 3, **kwargs)
    # The clusters are typically the same, but the labels vary, so we perform the
    # simplest test, just checking that the desired labels are present.
    assert set(cluster_result) == {1, 2, 3}
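For reference, the indexing mismatch this test guards against can be seen directly; the snippet below is an illustration, not part of the commit.

# Raw sklearn labels start at 0, scipy's fcluster labels at 1.
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

features, _ = make_blobs(n_samples=30, centers=2, cluster_std=0.1, random_state=0)

raw_sklearn = KMeans(n_clusters=2, n_init=10, random_state=0).fit(features).labels_
scipy_labels = fcluster(linkage(features, "single"), 2, criterion="maxclust")

print(raw_sklearn.min())   # 0 -- why kmeans_analysis adds 1
print(scipy_labels.min())  # 1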
