make figures more like before

meyer-lab · Feb 6, 2024 · 2947481 · 2947481
1 parent 9917afd
commit 2947481
Show file tree

Hide file tree

Showing 14 changed files with 137 additions and 96 deletions.
diff --git a/ddmc/clustering.py b/ddmc/clustering.py
@@ -92,7 +92,9 @@ def fit(self, p_signal: pd.DataFrame):
                 the length-11 AA sequence of each peptide, containing the
                 phosphoacceptor in the middle and five AAs flanking it.
         """
-        assert isinstance(p_signal, pd.DataFrame), "`p_signal` must be a pandas dataframe."
+        assert isinstance(
+            p_signal, pd.DataFrame
+        ), "`p_signal` must be a pandas dataframe."
         sequences = p_signal.index.values
         assert (
             isinstance(sequences[0], str) and len(sequences[0]) == 11
@@ -153,8 +155,8 @@ def impute(self) -> pd.DataFrame:
         labels = self.labels()  # cluster assignments
         centers = self.transform()  # samples x clusters
         for ii in range(p_signal.shape[0]):
-            p_signal[ii, np.isnan(p_signal[ii, :])] = centers[
-                np.isnan(p_signal[ii, :]), labels[ii] - 1
+            p_signal.iloc[ii, np.isnan(p_signal.iloc[ii, :])] = centers[
+                np.isnan(p_signal.iloc[ii, :]), labels[ii] - 1
             ]
         assert np.all(np.isfinite(p_signal))
         return p_signal
@@ -259,12 +261,15 @@ def predict_upstream_kinases(
         )
         return distances
 
+    def get_nonempty_clusters(self) -> np.ndarray[int]:
+        return np.unique(self.labels())
+
     def has_empty_clusters(self) -> bool:
         """
         Checks whether the most recent call to fit() resulted in empty clusters.
         """
         check_is_fitted(self, ["scores_"])
-        return np.unique(self.labels()).size != self.n_components
+        return self.get_nonempty_clusters().size != self.n_components
 
     def predict(self) -> np.ndarray[int]:
         """Provided the current model parameters, predict the cluster each peptide belongs to."""

diff --git a/ddmc/datasets.py b/ddmc/datasets.py
@@ -77,15 +77,15 @@ def get_sample_to_experiment(self, as_df=False):
             return sample_to_experiment
         return sample_to_experiment.iloc[:, 1].values
 
-    def get_p_signal(self) -> pd.DataFrame:
+    def get_p_signal(self, min_experiments=2) -> pd.DataFrame:
         p_signal = pd.read_csv(self.data_dir / "CPTAC-preprocessedMotifs.csv").iloc[
             :, 1:
         ]
         p_signal = p_signal.set_index("Sequence")
         p_signal = p_signal.drop(columns=["Protein", "Gene", "Position"])
         return filter_incomplete_peptides(
             p_signal,
-            min_experiments=2,
+            min_experiments=min_experiments,
             sample_to_experiment=self.get_sample_to_experiment(),
         )
 

diff --git a/ddmc/figures/common.py b/ddmc/figures/common.py
@@ -158,8 +158,12 @@ def plot_cluster_kinase_distances(
 ):
     pssm_names = distances.columns
 
-    # melt distances
+    # NOTE: centering by row means and then by column means has no clear
+    # justification; it is kept only to match the original publication version
+    distances = distances.sub(distances.mean(axis=1), axis=0)
     distances = distances.sub(distances.mean(axis=0), axis=1)
+
+    # melt distances
     distances = pd.melt(
         distances.reset_index(names="Kinase"),
         id_vars="Kinase",

diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py
@@ -16,51 +16,45 @@ def makeFigure():
     # diagram explaining reconstruction process
     ax[0].axis("off")
 
-    n_clusters = np.arange(1, 46, 45)
+    n_clusters = np.array([1, 45])
 
     # Imputation error across Cluster numbers
     dataC_W0 = run_repeated_imputation(
-        "PAM250", [0] * len(n_clusters), n_clusters=n_clusters, n_runs=1
+        "Binomial", [0] * len(n_clusters), n_clusters=n_clusters, n_runs=1
     )
     plot_imputation_errs(ax[1], dataC_W0, "Clusters")
-    ax[1].set_ylim(10.5, 12)
 
     dataC_W25 = run_repeated_imputation(
         "Binomial", [100] * len(n_clusters), n_clusters=n_clusters, n_runs=1
     )
     plot_imputation_errs(ax[2], dataC_W25, "Clusters")
-    ax[2].set_ylim(10.5, 12)
 
     dataC_W100 = run_repeated_imputation(
         "Binomial", [1000000] * len(n_clusters), n_clusters=n_clusters, n_runs=1
     )
     plot_imputation_errs(ax[3], dataC_W100, "Clusters")
-    ax[3].set_ylim(10.5, 12)
 
     # Imputation error across different weights
     weights = [0, 100]
     dataW_2C = run_repeated_imputation(
         "Binomial", weights=weights, n_clusters=[2] * len(weights), n_runs=1
     )
     plot_imputation_errs(ax[4], dataW_2C, "Weight", legend=False)
-    ax[4].set_ylim(10.5, 12)
 
     dataW_20C = run_repeated_imputation(
         "Binomial", weights=weights, n_clusters=[20] * len(weights), n_runs=1
     )
     plot_imputation_errs(ax[5], dataW_20C, "Weight", legend=False)
-    ax[5].set_ylim(10.5, 12)
 
     dataW_40C = run_repeated_imputation(
         "Binomial", weights=weights, n_clusters=[40] * len(weights), n_runs=1
     )
     plot_imputation_errs(ax[6], dataW_40C, "Weight", legend=False)
-    ax[6].set_ylim(10.5, 12)
 
     return f
 
 
-def plot_imputation_errs(ax, data, kind, legend=True):
+def plot_imputation_errs(ax, data, kind):
     """Plot artificial missingness error across different numbers of clusters or weights."""
     if kind == "Weight":
         title = "Weight Selection"
@@ -77,39 +71,54 @@ def plot_imputation_errs(ax, data, kind, legend=True):
         x=kind,
         y="DDMC",
         data=gm,
-        scatter_kws={"alpha": 0.25},
+        scatter_kws={"alpha": 0.5},
         color="darkblue",
         ax=ax,
+        ci=None,
         label="DDMC",
         lowess=True,
     )
     sns.regplot(
         x=kind,
         y="Average",
         data=gm,
+        ci=None,
         color="black",
         scatter=False,
         ax=ax,
         label="Average",
     )
     sns.regplot(
-        x=kind, y="Zero", data=gm, color="lightblue", scatter=False, ax=ax, label="Zero"
+        x=kind,
+        y="Zero",
+        data=gm,
+        color="lightblue",
+        scatter=False,
+        ax=ax,
+        label="Zero",
+        ci=None,
     )
     sns.regplot(
-        x=kind, y="PCA", data=gm, color="orange", scatter=False, ax=ax, label="PCA"
+        x=kind,
+        y="PCA",
+        data=gm,
+        color="orange",
+        scatter=False,
+        ax=ax,
+        label="PCA",
+        ci=None,
     )
     ax.set_title(title)
     ax.set_ylabel("log(MSE)—Actual vs Imputed")
     ax.legend(prop={"size": 10}, loc="upper left")
-    if not legend:
-        ax.legend().remove()
 
 
 def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1):
     """Calculate missingness error across different numbers of clusters and/or weights."""
     assert len(weights) == len(n_clusters)
     cptac = CPTAC()
-    p_signal = cptac.get_p_signal()
+    p_signal = cptac.get_p_signal(min_experiments=6)
+    print(p_signal.shape)
     sample_to_experiment = cptac.get_sample_to_experiment()
 
     df = pd.DataFrame(
@@ -126,15 +135,17 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1):
     )
 
     for ii in range(n_runs):
-        X_miss = add_missingness(p_signal, sample_to_experiment)
+        X_miss = p_signal.copy()
+        X_miss.iloc[:, :] = add_missingness(p_signal.values, sample_to_experiment)
         baseline_imputations = [
-            impute_mean(X_miss),
-            impute_zero(X_miss),
-            impute_min(X_miss),
-            impute_pca(X_miss, 5),
+            impute_mean(X_miss.values),
+            impute_zero(X_miss.values),
+            impute_min(X_miss.values),
+            impute_pca(X_miss.values, 5),
         ]
         baseline_errs = [
-            imputation_error(p_signal, X_impute) for X_impute in baseline_imputations
+            imputation_error(p_signal.values, X_impute)
+            for X_impute in baseline_imputations
         ]
 
         for jj, cluster in enumerate(n_clusters):
@@ -143,8 +154,8 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1):
                 cluster,
                 weights[jj],
                 imputation_error(
-                    p_signal,
-                    impute_ddmc(p_signal, cluster, weights[jj], distance_method),
+                    p_signal.values,
+                    impute_ddmc(X_miss, cluster, weights[jj], distance_method).values,
                 ),
                 *baseline_errs,
             ]
@@ -164,8 +175,7 @@ def add_missingness(p_signal, sample_to_experiment):
 
 def imputation_error(X, X_impute):
     # returns MSE between X and X_impute
-    mse = np.sum(np.square(X - X_impute))
-    assert mse != np.NaN
+    mse = np.nansum(np.square(X - X_impute))
     return mse
 
 
@@ -193,8 +203,4 @@ def impute_pca(X, rank):
 
 
 def impute_ddmc(p_signal, n_clusters, weight, distance_method):
-    return (
-        DDMC(n_clusters, weight, distance_method, max_iter=1)
-        .fit(p_signal)
-        .impute()
-    )
+    return DDMC(n_clusters, weight, distance_method).fit(p_signal).impute()
diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import seaborn as sns
 
-from ddmc.clustering import DDMC,compute_control_pssm, get_pspl_pssm_distances 
+from ddmc.clustering import DDMC, compute_control_pssm, get_pspl_pssm_distances
 from ddmc.binomial import AAlist
 from ddmc.figures.common import (
     getSetup,
@@ -36,6 +36,7 @@ def makeFigure():
 
     return f
 
+
 def plot_fig_3abd(ax_a, ax_b, ax_d):
     # Import signaling data
     p_signal = EBDT().get_p_signal()
@@ -46,7 +47,6 @@ def plot_fig_3abd(ax_a, ax_b, ax_d):
         seq_weight=5,
         distance_method="Binomial",
         random_state=10,
-        max_iter=1,
     ).fit(p_signal)
 
     # get cluster centers
@@ -135,10 +135,9 @@ def plot_fig_3fgh(ax_f, ax_g, ax_h):
         seq_weight=100,
         distance_method="Binomial",
         random_state=5,
-        max_iter=1,
     ).fit(p_signal)
 
-    clusters = [3, 7, 21]
+    clusters = [2, 6, 20]
     # get pssms from ddmc clusters
     pssms = model.get_pssms(PsP_background=True, clusters=clusters)
 
@@ -216,7 +215,6 @@ def plot_fig_3c():
         seq_weight=5,
         distance_method="Binomial",
         random_state=10,
-        max_iter=1,
     ).fit(p_signal)
     centers = model.transform(as_df=True)
     # the labels are structured as "MCF7.<drug>.fold"
@@ -236,4 +234,4 @@ def plot_fig_3c():
         figsize=(2, 15),
         yticklabels=True,
         xticklabels=False,
-    )
+    )
diff --git a/ddmc/figures/figureM4.py b/ddmc/figures/figureM4.py
@@ -7,51 +7,46 @@
 
 from ddmc.clustering import DDMC
 from ddmc.figures.common import getSetup
-from ddmc.logistic_regression import plot_roc
+from ddmc.logistic_regression import plot_roc, normalize_cluster_centers
 from ddmc.datasets import CPTAC, select_peptide_subset
 
 
 def makeFigure():
     axes, f = getSetup((5, 5), (2, 2), multz={0: 1})
 
     # use small numbers here so it doesn't take forever
-    regression_results = do_phenotype_regression(
-        n_runs=1, n_cv_folds=2, ratio_peptides=0.01
-    )
+    regression_results = do_phenotype_regression(n_runs=1, n_cv_folds=2)
     plot_phenotype_regression(regression_results, axes[0])
-    p_signal = select_peptide_subset(CPTAC().get_p_signal(), keep_num=500)
+    p_signal = select_peptide_subset(CPTAC().get_p_signal(), keep_num=2000)
     models = [
         DDMC(
             n_components=30,
             seq_weight=0,
             distance_method="Binomial",
             random_state=5,
-            max_iter=10,
         ).fit(p_signal),
         DDMC(
             n_components=30,
             seq_weight=250,
             distance_method="Binomial",
             random_state=5,
-            max_iter=10,
         ).fit(p_signal),
         DDMC(
             n_components=30,
             seq_weight=1e6,
             distance_method="Binomial",
             random_state=5,
-            max_iter=10,
         ).fit(p_signal),
     ]
     plot_peptide_to_cluster_p_signal_distances(p_signal, models, axes[1])
     plot_total_position_enrichment(models, axes[2])
     return f
 
 
-def do_phenotype_regression(n_runs=3, n_components=35, n_cv_folds=3, ratio_peptides=1):
+def do_phenotype_regression(n_runs=3, n_components=35, n_cv_folds=3):
     """Plot mean AUCs per phenotype across weights."""
     cptac = CPTAC()
-    p_signal = select_peptide_subset(cptac.get_p_signal(), keep_ratio=ratio_peptides)
+    p_signal = cptac.get_p_signal()
 
     mutations = cptac.get_mutations(
         ["STK11.mutation.status", "EGFR.mutation.status", "ALK.fusion"]
@@ -65,7 +60,7 @@ def do_phenotype_regression(n_runs=3, n_components=35, n_cv_folds=3, ratio_pepti
     lr = LogisticRegressionCV(
         cv=3,
         solver="saga",
-        max_iter=500,
+        max_iter=10000,
         n_jobs=-1,
         penalty="l1",
         class_weight="balanced",
@@ -79,9 +74,9 @@ def do_phenotype_regression(n_runs=3, n_components=35, n_cv_folds=3, ratio_pepti
                 n_components=n_components,
                 seq_weight=seq_weight,
                 distance_method="Binomial",
-                max_iter=10,
             ).fit(p_signal)
             centers = ddmc.transform(as_df=True)
+            centers.iloc[:, :] = normalize_cluster_centers(centers.values)
             # the available patients vary by label
             stk11_auc = plot_roc(
                 lr,