Skip to content

Commit

Permalink
make figures more like before
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Feb 6, 2024
1 parent 9917afd commit 2947481
Show file tree
Hide file tree
Showing 14 changed files with 137 additions and 96 deletions.
13 changes: 9 additions & 4 deletions ddmc/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,9 @@ def fit(self, p_signal: pd.DataFrame):
the length-11 AA sequence of each peptide, containing the
phosphoacceptor in the middle and five AAs flanking it.
"""
assert isinstance(p_signal, pd.DataFrame), "`p_signal` must be a pandas dataframe."
assert isinstance(
p_signal, pd.DataFrame
), "`p_signal` must be a pandas dataframe."
sequences = p_signal.index.values
assert (
isinstance(sequences[0], str) and len(sequences[0]) == 11
Expand Down Expand Up @@ -153,8 +155,8 @@ def impute(self) -> pd.DataFrame:
labels = self.labels() # cluster assignments
centers = self.transform() # samples x clusters
for ii in range(p_signal.shape[0]):
p_signal[ii, np.isnan(p_signal[ii, :])] = centers[
np.isnan(p_signal[ii, :]), labels[ii] - 1
p_signal.iloc[ii, np.isnan(p_signal.iloc[ii, :])] = centers[

Check warning on line 158 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L157-L158

Added lines #L157 - L158 were not covered by tests
np.isnan(p_signal.iloc[ii, :]), labels[ii] - 1
]
assert np.all(np.isfinite(p_signal))
return p_signal

Check warning on line 162 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L161-L162

Added lines #L161 - L162 were not covered by tests
Expand Down Expand Up @@ -259,12 +261,15 @@ def predict_upstream_kinases(
)
return distances

Check warning on line 262 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L262

Added line #L262 was not covered by tests

def get_nonempty_clusters(self) -> np.ndarray[int]:
return np.unique(self.labels())

Check warning on line 265 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L265

Added line #L265 was not covered by tests

def has_empty_clusters(self) -> bool:
"""
Checks whether the most recent call to fit() resulted in empty clusters.
"""
check_is_fitted(self, ["scores_"])
return np.unique(self.labels()).size != self.n_components
return self.get_nonempty_clusters().size != self.n_components

Check warning on line 272 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L271-L272

Added lines #L271 - L272 were not covered by tests

def predict(self) -> np.ndarray[int]:
"""Provided the current model parameters, predict the cluster each peptide belongs to."""
Expand Down
4 changes: 2 additions & 2 deletions ddmc/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,15 @@ def get_sample_to_experiment(self, as_df=False):
return sample_to_experiment

Check warning on line 77 in ddmc/datasets.py

View check run for this annotation

Codecov / codecov/patch

ddmc/datasets.py#L77

Added line #L77 was not covered by tests
return sample_to_experiment.iloc[:, 1].values

def get_p_signal(self) -> pd.DataFrame:
def get_p_signal(self, min_experiments=2) -> pd.DataFrame:
p_signal = pd.read_csv(self.data_dir / "CPTAC-preprocessedMotifs.csv").iloc[
:, 1:
]
p_signal = p_signal.set_index("Sequence")
p_signal = p_signal.drop(columns=["Protein", "Gene", "Position"])
return filter_incomplete_peptides(
p_signal,
min_experiments=2,
min_experiments=min_experiments,
sample_to_experiment=self.get_sample_to_experiment(),
)

Expand Down
6 changes: 5 additions & 1 deletion ddmc/figures/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,12 @@ def plot_cluster_kinase_distances(
):
pssm_names = distances.columns

# melt distances
# NOTE: the double centering below (subtract each row's mean, then each
# column's mean) has no clear justification; it is kept only because the
# original publication version of this code applied it
distances = distances.sub(distances.mean(axis=1), axis=0)
distances = distances.sub(distances.mean(axis=0), axis=1)

# melt distances
distances = pd.melt(
distances.reset_index(names="Kinase"),
id_vars="Kinase",
Expand Down
66 changes: 36 additions & 30 deletions ddmc/figures/figureM2.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,51 +16,45 @@ def makeFigure():
# diagram explaining reconstruction process
ax[0].axis("off")

n_clusters = np.arange(1, 46, 45)
n_clusters = np.array([1, 45])

# Imputation error across Cluster numbers
dataC_W0 = run_repeated_imputation(
"PAM250", [0] * len(n_clusters), n_clusters=n_clusters, n_runs=1
"Binomial", [0] * len(n_clusters), n_clusters=n_clusters, n_runs=1
)
plot_imputation_errs(ax[1], dataC_W0, "Clusters")
ax[1].set_ylim(10.5, 12)

dataC_W25 = run_repeated_imputation(
"Binomial", [100] * len(n_clusters), n_clusters=n_clusters, n_runs=1
)
plot_imputation_errs(ax[2], dataC_W25, "Clusters")
ax[2].set_ylim(10.5, 12)

dataC_W100 = run_repeated_imputation(
"Binomial", [1000000] * len(n_clusters), n_clusters=n_clusters, n_runs=1
)
plot_imputation_errs(ax[3], dataC_W100, "Clusters")
ax[3].set_ylim(10.5, 12)

# Imputation error across different weights
weights = [0, 100]
dataW_2C = run_repeated_imputation(
"Binomial", weights=weights, n_clusters=[2] * len(weights), n_runs=1
)
plot_imputation_errs(ax[4], dataW_2C, "Weight", legend=False)
ax[4].set_ylim(10.5, 12)

dataW_20C = run_repeated_imputation(
"Binomial", weights=weights, n_clusters=[20] * len(weights), n_runs=1
)
plot_imputation_errs(ax[5], dataW_20C, "Weight", legend=False)
ax[5].set_ylim(10.5, 12)

dataW_40C = run_repeated_imputation(
"Binomial", weights=weights, n_clusters=[40] * len(weights), n_runs=1
)
plot_imputation_errs(ax[6], dataW_40C, "Weight", legend=False)
ax[6].set_ylim(10.5, 12)

return f


def plot_imputation_errs(ax, data, kind, legend=True):
def plot_imputation_errs(ax, data, kind):
"""Plot artificial missingness error across different numbers of clusters or weights."""
if kind == "Weight":
title = "Weight Selection"
Expand All @@ -77,39 +71,54 @@ def plot_imputation_errs(ax, data, kind, legend=True):
x=kind,
y="DDMC",
data=gm,
scatter_kws={"alpha": 0.25},
scatter_kws={"alpha": 0.5},
color="darkblue",
ax=ax,
ci=None,
label="DDMC",
lowess=True,
)
sns.regplot(
x=kind,
y="Average",
data=gm,
ci=None,
color="black",
scatter=False,
ax=ax,
label="Average",
)
sns.regplot(
x=kind, y="Zero", data=gm, color="lightblue", scatter=False, ax=ax, label="Zero"
x=kind,
y="Zero",
data=gm,
color="lightblue",
scatter=False,
ax=ax,
label="Zero",
ci=None,
)
sns.regplot(
x=kind, y="PCA", data=gm, color="orange", scatter=False, ax=ax, label="PCA"
x=kind,
y="PCA",
data=gm,
color="orange",
scatter=False,
ax=ax,
label="PCA",
ci=None,
)
ax.set_title(title)
ax.set_ylabel("log(MSE)—Actual vs Imputed")
ax.legend(prop={"size": 10}, loc="upper left")
if not legend:
ax.legend().remove()


def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1):
"""Calculate missingness error across different numbers of clusters and/or weights."""
assert len(weights) == len(n_clusters)
cptac = CPTAC()
p_signal = cptac.get_p_signal()
p_signal = cptac.get_p_signal(min_experiments=6)
print(p_signal.shape)
sample_to_experiment = cptac.get_sample_to_experiment()

df = pd.DataFrame(
Expand All @@ -126,15 +135,17 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1):
)

for ii in range(n_runs):
X_miss = add_missingness(p_signal, sample_to_experiment)
X_miss = p_signal.copy()
X_miss.iloc[:, :] = add_missingness(p_signal.values, sample_to_experiment)
baseline_imputations = [
impute_mean(X_miss),
impute_zero(X_miss),
impute_min(X_miss),
impute_pca(X_miss, 5),
impute_mean(X_miss.values),
impute_zero(X_miss.values),
impute_min(X_miss.values),
impute_pca(X_miss.values, 5),
]
baseline_errs = [
imputation_error(p_signal, X_impute) for X_impute in baseline_imputations
imputation_error(p_signal.values, X_impute)
for X_impute in baseline_imputations
]

for jj, cluster in enumerate(n_clusters):
Expand All @@ -143,8 +154,8 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1):
cluster,
weights[jj],
imputation_error(
p_signal,
impute_ddmc(p_signal, cluster, weights[jj], distance_method),
p_signal.values,
impute_ddmc(X_miss, cluster, weights[jj], distance_method).values,
),
*baseline_errs,
]
Expand All @@ -164,8 +175,7 @@ def add_missingness(p_signal, sample_to_experiment):

def imputation_error(X, X_impute):
# returns the summed squared error between X and X_impute (a sum over all entries, not a mean)
mse = np.sum(np.square(X - X_impute))
assert mse != np.NaN
mse = np.nansum(np.square(X - X_impute))
return mse


Expand Down Expand Up @@ -193,8 +203,4 @@ def impute_pca(X, rank):


def impute_ddmc(p_signal, n_clusters, weight, distance_method):
return (
DDMC(n_clusters, weight, distance_method, max_iter=1)
.fit(p_signal)
.impute()
)
return DDMC(n_clusters, weight, distance_method).fit(p_signal).impute()
10 changes: 4 additions & 6 deletions ddmc/figures/figureM3.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
import seaborn as sns

from ddmc.clustering import DDMC,compute_control_pssm, get_pspl_pssm_distances
from ddmc.clustering import DDMC, compute_control_pssm, get_pspl_pssm_distances
from ddmc.binomial import AAlist
from ddmc.figures.common import (
getSetup,
Expand Down Expand Up @@ -36,6 +36,7 @@ def makeFigure():

return f


def plot_fig_3abd(ax_a, ax_b, ax_d):
# Import signaling data
p_signal = EBDT().get_p_signal()
Expand All @@ -46,7 +47,6 @@ def plot_fig_3abd(ax_a, ax_b, ax_d):
seq_weight=5,
distance_method="Binomial",
random_state=10,
max_iter=1,
).fit(p_signal)

# get cluster centers
Expand Down Expand Up @@ -135,10 +135,9 @@ def plot_fig_3fgh(ax_f, ax_g, ax_h):
seq_weight=100,
distance_method="Binomial",
random_state=5,
max_iter=1,
).fit(p_signal)

clusters = [3, 7, 21]
clusters = [2, 6, 20]
# get pssms from ddmc clusters
pssms = model.get_pssms(PsP_background=True, clusters=clusters)

Expand Down Expand Up @@ -216,7 +215,6 @@ def plot_fig_3c():
seq_weight=5,
distance_method="Binomial",
random_state=10,
max_iter=1,
).fit(p_signal)
centers = model.transform(as_df=True)
# the labels are structured as "MCF7.<drug>.fold"
Expand All @@ -236,4 +234,4 @@ def plot_fig_3c():
figsize=(2, 15),
yticklabels=True,
xticklabels=False,
)
)
19 changes: 7 additions & 12 deletions ddmc/figures/figureM4.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,51 +7,46 @@

from ddmc.clustering import DDMC
from ddmc.figures.common import getSetup
from ddmc.logistic_regression import plot_roc
from ddmc.logistic_regression import plot_roc, normalize_cluster_centers
from ddmc.datasets import CPTAC, select_peptide_subset


def makeFigure():
axes, f = getSetup((5, 5), (2, 2), multz={0: 1})

# use small numbers here so it doesn't take forever
regression_results = do_phenotype_regression(
n_runs=1, n_cv_folds=2, ratio_peptides=0.01
)
regression_results = do_phenotype_regression(n_runs=1, n_cv_folds=2)
plot_phenotype_regression(regression_results, axes[0])
p_signal = select_peptide_subset(CPTAC().get_p_signal(), keep_num=500)
p_signal = select_peptide_subset(CPTAC().get_p_signal(), keep_num=2000)
models = [
DDMC(
n_components=30,
seq_weight=0,
distance_method="Binomial",
random_state=5,
max_iter=10,
).fit(p_signal),
DDMC(
n_components=30,
seq_weight=250,
distance_method="Binomial",
random_state=5,
max_iter=10,
).fit(p_signal),
DDMC(
n_components=30,
seq_weight=1e6,
distance_method="Binomial",
random_state=5,
max_iter=10,
).fit(p_signal),
]
plot_peptide_to_cluster_p_signal_distances(p_signal, models, axes[1])
plot_total_position_enrichment(models, axes[2])
return f


def do_phenotype_regression(n_runs=3, n_components=35, n_cv_folds=3, ratio_peptides=1):
def do_phenotype_regression(n_runs=3, n_components=35, n_cv_folds=3):
"""Plot mean AUCs per phenotype across weights."""
cptac = CPTAC()
p_signal = select_peptide_subset(cptac.get_p_signal(), keep_ratio=ratio_peptides)
p_signal = cptac.get_p_signal()

mutations = cptac.get_mutations(
["STK11.mutation.status", "EGFR.mutation.status", "ALK.fusion"]
Expand All @@ -65,7 +60,7 @@ def do_phenotype_regression(n_runs=3, n_components=35, n_cv_folds=3, ratio_pepti
lr = LogisticRegressionCV(
cv=3,
solver="saga",
max_iter=500,
max_iter=10000,
n_jobs=-1,
penalty="l1",
class_weight="balanced",
Expand All @@ -79,9 +74,9 @@ def do_phenotype_regression(n_runs=3, n_components=35, n_cv_folds=3, ratio_pepti
n_components=n_components,
seq_weight=seq_weight,
distance_method="Binomial",
max_iter=10,
).fit(p_signal)
centers = ddmc.transform(as_df=True)
centers.iloc[:, :] = normalize_cluster_centers(centers.values)
# the available patients vary by label
stk11_auc = plot_roc(
lr,
Expand Down
Loading

0 comments on commit 2947481

Please sign in to comment.