""" Module for OmicScope Object Visualization
This module allows the user to extract and visualize information from OmicScope object.
Here, it is possible to evaluate data normalization (MA-Plot, Volcano Plot, Dynamic range plot),
individual protein abundance (barplot, boxplot), search for protein-protein interactions,
and perform Principal Component Analysis (PCA), Hierarchical clustering analysis (heatmap, pearson
correlation plot) and K-means clustering (k-trend).
Some functions below allow user to choose protein to be highlighted and/or plotted.
For that, user must write protein 'gene_name' (See examples in OmicScope Object tab).
Additionally, colors and color palettes follows the matplotlib and seaborn libraries options.
"""
import copy
import itertools
import random
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import networkx as nx
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from kneed import KneeLocator
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
[docs]def bar_ident(self, logscale=False, col='darkcyan', save=None, dpi=300,
vector=True):
"""Show the amount of entities identified and differentially regulated
in the study.
Args:
logscale (bool, optional): Y-axis log-scaled. Defaults to True.
col (str, optional): Color. Defaults to 'darkcyan'.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): Resolution to save figure. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to True.
Returns:
ax [matplotlib object]: Barplot
"""
# Define plt parameters
plt.style.use('default')
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.set_style("ticks")
plt.rcParams["figure.dpi"] = dpi
OmicScope = copy.copy(self)
df = OmicScope.quant_data
# Get number of identified proteins
identified = df.Accession.count()
# Get number of quantified proteins
quantified = df.dropna(axis=0, subset=[OmicScope.pvalue]).Accession.count()
# Get number of differentially regulated proteins
deps = OmicScope.deps['Accession'].count()
if identified != quantified:
df = ['Identified', 'Quantified', 'Differentially\nregulated']
protein_number = [identified, quantified, deps]
else:
df = ['Quantified', 'Differentially\nregulated']
protein_number = [quantified, deps]
df = pd.DataFrame(protein_number, df)
# Log scaling y-axis
if logscale is True:
ax = df.plot(kind='bar', color=col, rot=0, edgecolor="black", log=True,
figsize=(2.4, 3.5))
else:
ax = df.plot(kind='bar', color=col, rot=0, edgecolor="black",
figsize=(2.4, 3.5))
# Add label upside bar
def autolabel(rects, ax):
for rect in rects:
x = rect.get_x() + rect.get_width() / 2.
y = rect.get_height()
ax.annotate("{}".format(y), (x, y), xytext=(0, 5), textcoords="offset points",
ha='center', va='bottom')
autolabel(ax.patches, ax)
ax.margins(y=0.1)
ax.legend().remove()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.ylabel('#Proteins')
plt.title(label=OmicScope.ctrl + ' vs ' + '-'.join(OmicScope.experimental), loc='left')
plt.grid(b=False)
if save is not None:
if vector is True:
plt.savefig(save + 'barplot.svg', bbox_inches='tight')
else:
plt.savefig(save + 'barplot.png', dpi=dpi, bbox_inches='tight')
plt.show()
return ax
[docs]def normalization_boxplot(self, palette='Dark2', dpi=300,
save=None, vector=True):
"""
Generates a boxplot of log2-transformed expression data for each sample to
evaluate normalization.
Parameters:
- palette (str): The color palette to use for the boxplot. Default is 'Dark2'.
- dpi (int): resolution of the saved figure. Default is 300.
- save (str): Filepath to save the generated plot. If None, the plot is not saved. Default is None.
- vector (bool): If True, save the plot in vector format (SVG). If False, save in raster format (PNG). Default is True.
"""
plt.rcParams['figure.dpi']=dpi
df = self.expression.copy()
if self.logTransform is True:
df = np.log2(df)
df = df.replace([-np.inf,np.inf],0)
conditions = self.pdata.Condition
colors = sns.color_palette(palette=palette,
as_cmap=False, n_colors=len(conditions.drop_duplicates()))
color_dict = {i:j for i,j in zip(conditions.drop_duplicates(),colors)}
bplot = plt.boxplot(df, vert=True, # vertical box alignment
patch_artist=True, labels=df.columns,
notch=True, medianprops=dict(color='black'),
boxprops=dict(linewidth=0.8))
plt.xticks(rotation=90)
plt.title('Normalization Boxplot')
for patch, condition in zip(bplot['boxes'], conditions):
patch.set_facecolor(color_dict[condition])
# Create legend for colors
legend_labels = [mpatches.Patch(color=color_dict[i], label=i) for i in color_dict]
plt.legend(handles=legend_labels, loc='center', bbox_to_anchor=(1.1, .5))
sns.despine()
plt.tight_layout()
if save is not None:
if vector is True:
plt.savefig(save + 'boxplot_normalization.svg', bbox_inches='tight')
else:
plt.savefig(save + 'boxplot_normalization.png', dpi=dpi, bbox_inches='tight')
plt.show()
[docs]def volcano_Multicond(self, *Proteins, pvalue=0.05, palette='viridis',
bcol='#962558', non_regulated='#606060',
save=None, dpi=300, vector=True):
"""Creates a volcano plot for multiple conditions.
In general, volcano plots are designed to compare 2 conditions.
Here, we aim to see the distribution of quantified proteins' p-value and
fold changes among multiple conditions.
Args:
pvalue (float, optional): p-value threshold. Defaults to 0.05.
palette (str, optional): Color palette to differentiate dots. Defaults to 'viridis'.
bcol (str, optional): color for density plot. Defaults to '#962558'.
non_regulated (str, optional): Proteins not differentially regulated. Defaults to '#606060'.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to True.
"""
plt.style.use('default')
plt.rcParams["figure.dpi"] = dpi
OmicScope = copy.copy(self)
FoldChange_cutoff = OmicScope.FoldChange_cutoff
# Definitions for the axes
df_initial = OmicScope.quant_data
dropped_inf = copy.copy(df_initial)
dropped_inf = dropped_inf.replace([np.inf, -np.inf], np.nan)
dropped_inf = dropped_inf.dropna()
fc = df_initial['log2(fc)']
fc = fc.replace(np.inf, dropped_inf['log2(fc)'].max() + 1)
fc = fc.replace(-np.inf, dropped_inf['log2(fc)'].min() - 1)
pval = df_initial[f'-log10({OmicScope.pvalue})']
pval = pval.replace(
np.inf, dropped_inf[f'-log10({OmicScope.pvalue})'].max() + 1)
df_initial[f'-log10({OmicScope.pvalue})'] = pval
df_initial['log2(fc)'] = fc
# colors per condition
comparisons = df_initial.Comparison
comparisons = comparisons.str[-1] + '-' + comparisons.str[0]
number_of_comparison = len(comparisons.drop_duplicates())
color_per_comparison = sns.color_palette(palette=palette,
n_colors=number_of_comparison).as_hex()
color_comp_dict = dict(zip(comparisons, color_per_comparison))
print(color_comp_dict)
col = []
comparison = []
for pv, comp in zip(pval, comparisons):
if pv >= -np.log10(pvalue):
comparison.append(comp)
col.append(comp)
else:
col.append(non_regulated)
comparison.append('Non-regulated')
col = pd.Series(col)
comparison = pd.Series(comparison)
df = pd.DataFrame(data=zip(pval, fc, col, comparison))
df.columns = [OmicScope.pvalue, 'fc', 'col', 'comparison']
df[OmicScope.pvalue] = df[OmicScope.pvalue].astype(float)
df['fc'] = df['fc'].astype(float)
df['col'] = df.col.replace(color_comp_dict)
# annotation if it is on args
if len(Proteins) > 0:
ls = pd.DataFrame(Proteins)
ls.columns = ['gene_name']
ls_final = ls.merge(df_initial)
ls_final = ls_final[['gene_name', f'-log10({OmicScope.pvalue})', 'log2(fc)']]
ls_final = ls_final.sort_values('log2(fc)', ascending=False)
# dimensions
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
spacing = 0.02
rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom + height + spacing, width, 0.2]
rect_histy = [left + width + spacing, bottom, 0.2, height]
# plot
plt.style.use('bmh')
plt.figure(figsize=(8, 8))
# Scatter plot
ax_scatter = plt.axes(rect_scatter)
sns.scatterplot(x=df.fc, y=df[OmicScope.pvalue], alpha=0.7,
hue=df.comparison)
sns.despine(left=False)
plt.xlabel("log2( FC)")
plt.ylabel(f'-log10({OmicScope.pvalue})')
ax_scatter.tick_params(direction='in', top=True, right=True)
plt.grid(b=None)
ax_scatter.plot(alpha=0.5)
# Density plot on x-axis (upper)
ax_histx = plt.axes(rect_histx)
plt.ylabel("Density")
plt.title(label=OmicScope.ctrl + ' vs ' + '-'.join(OmicScope.experimental), loc='left')
ax_histx.tick_params(direction='in', labelbottom=False)
ax_histx.set_facecolor('white')
ax_histx.grid(b=None)
sns.kdeplot(fc, fill=True, color=bcol, edgecolor='black')
sns.despine(bottom=True, top=True)
# Density plot on y-axis (right)
ax_histy = plt.axes(rect_histy)
ax_histy.tick_params(direction='in', labelleft=False)
ax_histy.set_facecolor('white')
ax_histy.grid(b=None)
sns.kdeplot(y=pval, fill=True, color=bcol, edgecolor='black')
sns.despine(top=True, left=True)
plt.xlabel("Density")
ax_scatter.set_facecolor('white')
# Annotation positions
if len(Proteins) > 0:
from adjustText import adjust_text
texts = []
for a, b, c in zip(ls_final['log2(fc)'], ls_final[f'-log10({OmicScope.pvalue})'],
ls_final['gene_name']):
texts.append(ax_scatter.text(a, b, c, size=8))
adjust_text(texts, ax=ax_scatter, force_points=0.25, force_text=0.25,
expand_points=(1.5, 1.5), expand_text=(1.5, 1.5),
arrowprops=dict(arrowstyle="-", color='black', lw=0.5))
ax_scatter.axhline(y=-np.log10(0.05), color='gray', linestyle=':')
ax_scatter.legend()
if FoldChange_cutoff == 0:
ax_scatter.axvline(x=0, color='gray', linestyle=':')
else:
ax_scatter.axvline(x=FoldChange_cutoff, color='gray', linestyle=':')
limh = round(fc.max(), 2)
liml = round(fc.min(), 2)
limp = round(pval.max() + 0.5, 0)
ax_scatter.set_xlim((liml - (limh - liml) * 0.10, limh + ((limh - liml) * 0.10)))
ax_scatter.set_ylim((pval.min(), limp + limp * .1))
ax_histx.set_xlim(ax_scatter.get_xlim())
ax_histy.set_ylim(ax_scatter.get_ylim())
if save is not None:
if vector is True:
plt.savefig(save + 'volcano.svg', bbox_inches='tight')
else:
plt.savefig(save + 'volcano.png', dpi=dpi, bbox_inches='tight')
plt.show()
plt.show(block=True)
[docs]def volcano_2cond(self, *Proteins, pvalue=0.05, bcol='darkcyan',
non_regulated='#606060', up_regulated='#E4001B', down_regulated='#6194BC',
save=None, dpi=300, vector=True):
"""Plot a conventional volcano plot
Args:
pvalue (float, optional): p-value threshold. Defaults to 0.05.
bcol (str, optional): color for density plot. Defaults to 'darkcyan'.
non_regulated (str, optional): Proteins not differentially regulated.
Defaults to '#606060'.
up_regulated (str, optional): Proteins up-regulated in relation
to Control group. Defaults to '#E4001B'.
down_regulated (str, optional): Proteins down-regulated in relation
to Control group. Defaults to '#6194BC'.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
"""
plt.style.use('default')
plt.rcParams["figure.dpi"] = dpi
OmicScope = copy.copy(self)
FoldChange_cutoff = OmicScope.FoldChange_cutoff
# Definitions for the axes
df_initial = OmicScope.quant_data
dropped_inf = copy.copy(df_initial)
dropped_inf = dropped_inf.replace([np.inf, -np.inf], np.nan)
dropped_inf = dropped_inf.dropna()
fc = df_initial['log2(fc)']
fc = fc.replace(np.inf, dropped_inf['log2(fc)'].max() + 1)
fc = fc.replace(-np.inf, dropped_inf['log2(fc)'].min() - 1)
pval = df_initial[f'-log10({OmicScope.pvalue})']
pval = pval.replace(
np.inf, dropped_inf[f'-log10({OmicScope.pvalue})'].max() + 1)
df_initial[f'-log10({OmicScope.pvalue})'] = pval
df_initial['log2(fc)'] = fc
# Defining colors for dots
col = []
comparison = []
for i, j in zip(fc, pval):
if j <= -np.log10(pvalue):
col.append(non_regulated)
comparison.append('non-regulated')
else:
if i >= FoldChange_cutoff:
col.append(up_regulated)
comparison.append('up-regulated')
elif i <= -FoldChange_cutoff:
col.append(down_regulated)
comparison.append('down-regulated')
else:
col.append(non_regulated)
comparison.append('non-regulated')
col, comparison = pd.Series(col), pd.Series(comparison)
df = pd.DataFrame(zip(pval, fc, col, comparison))
df.columns = ['pval', 'fc', 'col', 'Regulation']
# annotation
if len(Proteins) > 0:
ls = pd.DataFrame(Proteins)
ls.columns = ['gene_name']
ls_final = ls.merge(df_initial)
ls_final = ls_final[['gene_name', f'-log10({OmicScope.pvalue})', 'log2(fc)']]
ls_final = ls_final.sort_values('log2(fc)', ascending=False)
# plot dimensions
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
spacing = 0.02
rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom + height + spacing, width, 0.2]
rect_histy = [left + width + spacing, bottom, 0.2, height]
# Plot
plt.style.use('bmh')
plt.figure(figsize=(8, 8))
ax_scatter = plt.axes(rect_scatter)
# Scatter plot
sns.scatterplot(x=df.fc, y=df.pval, alpha=0.5, linewidth=0.5,
hue=df.Regulation,
palette=list(df.col.drop_duplicates().dropna()))
plt.xlabel("log2( FC)")
plt.ylabel(f'-log10({OmicScope.pvalue})')
ax_scatter.tick_params(direction='in', top=True, right=True)
plt.grid(b=None)
ax_scatter.plot(alpha=0.5)
# Density Plot in x-axis (upper)
ax_histx = plt.axes(rect_histx)
plt.ylabel("Density")
plt.title(label=OmicScope.experimental[0] + ' - ' + OmicScope.ctrl, loc='left')
ax_histx.tick_params(direction='in', labelbottom=False)
ax_histx.set_facecolor('white')
ax_histx.grid(b=None)
sns.kdeplot(fc, fill=True, color=bcol, alpha=0.8, edgecolor='black')
sns.despine(bottom=True, top=True)
# Density plot in y axis (right)
ax_histy = plt.axes(rect_histy)
ax_histy.tick_params(direction='in', labelleft=False)
ax_histy.set_facecolor('white')
ax_histy.grid(b=None)
sns.kdeplot(y=pval, fill=True, color=bcol, alpha=0.8, edgecolor='black')
sns.despine(top=True, left=True)
plt.xlabel("Density")
ax_scatter.set_facecolor('white')
# Annotation positions
if len(Proteins) > 0:
from adjustText import adjust_text
texts = []
for a, b, c in zip(ls_final['log2(fc)'], ls_final[f'-log10({OmicScope.pvalue})'],
ls_final['gene_name']):
texts.append(ax_scatter.text(a, b, c, size=8))
adjust_text(texts, ax=ax_scatter,
arrowprops=dict(arrowstyle="-", color='k', lw=0.5))
ax_scatter.axhline(y=-np.log10(0.05), color='gray', linestyle=':')
# Fold change cutoff
if FoldChange_cutoff != 0:
ax_scatter.axvline(x=-FoldChange_cutoff,
color='gray', linestyle=':')
ax_scatter.axvline(x=FoldChange_cutoff,
color='gray', linestyle=':')
else:
ax_scatter.axvline(x=FoldChange_cutoff,
color='gray', linestyle=':')
# Figure limits
limh = round(fc.max(), 2)
liml = round(fc.min(), 2)
limp = round(pval.max() + 0.5, 0)
ax_scatter.set_xlim((liml - ((limh - liml) * 0.10),
limh + ((limh - liml) * 0.10)))
ax_scatter.set_ylim((pval.min(), limp))
ax_histx.set_xlim(ax_scatter.get_xlim())
ax_histy.set_ylim(ax_scatter.get_ylim())
# save figure and how to save
if save is not None:
if vector is True:
plt.savefig(save + 'volcano.svg', bbox_inches='tight')
else:
plt.savefig(save + 'volcano.png', dpi=dpi, bbox_inches='tight')
plt.show()
plt.show(block=True)
[docs]def volcano(self, *Proteins,
pvalue=0.05, bcol='#962558', palette='viridis',
non_regulated='#606060', up_regulated='#E4001B', down_regulated='#6194BC',
save=None, dpi=300, vector=True):
"""Plot volcano plot.
Args:
pvalue (float, optional): p-value threshold. Defaults to 0.05.
bcol (str, optional): Density plot color. Defaults to '#962558'.
palette (str, optional): Palette for Multiconditions volcano plot.
Defaults to 'viridis'.
non_regulated (str, optional): Color of non-differentially regulated
proteins. Defaults to '#606060'.
up_regulated (str, optional): Color of up-regulated proteins for volcano
with 2 conditions. Defaults to '#E4001B'.
down_regulated (str, optional): Color of down-regulated proteins for volcano
with 2 conditions. Defaults to '#6194BC'.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): . Resolution to save figure. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
"""
OmicScope = copy.copy(self)
if len(OmicScope.Conditions) == 2:
volcano_2cond(OmicScope, *Proteins, pvalue=pvalue, bcol=bcol,
non_regulated=non_regulated, up_regulated=up_regulated,
down_regulated=down_regulated,
save=save, dpi=dpi, vector=vector)
if len(OmicScope.Conditions) > 2:
volcano_Multicond(OmicScope, *Proteins,
pvalue=pvalue, palette=palette, bcol=bcol,
save=None, dpi=dpi, vector=vector)
[docs]def heatmap(self, *Proteins, pvalue=0.05, c_cluster=True,
clust_metric="euclidean", clust_method='average',
palette='RdYlBu_r', linewidth=0.01, color_groups='tab20',
sample_label=False,
save=None, dpi=300, vector=True):
""" Heatmap with hierarchical clustering
Args:
pvalue (float, optional): p-value threshold. Defaults to 0.05.
c_cluster (bool, optional): Applies Hierarchical clustering for
columns. Defaults to True.
clust_metric (str, optional): The distance metric to use. Optionally: braycurtis, canberra,
chebyshev, cityblock, correlation, cosine, dice, euclidean, hamming, jaccard,
jensenshannon, kulczynski1, mahalanobis, matching, minkowski, rogerstanimoto, russellrao,
seuclidean, sokalmichener, sokalsneath, sqeuclidean, yule.
clust_method (str, optional): Linkage method to use for calculating clusters.
Optionally: single, complete, average, weighted, centroid, median, or ward.
color_groups (str, optional): Palette for group colors.
Defaults to 'tab20'.
palette (str, optional): Palette for protein abundance. Defaults to 'RdYlBu_r'.
linewidth (float, optional): Line width. Defaults to 0.01.
sample_label (bool, optional): insert biological sample label and condition.
Defaults to False.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
"""
plt.style.use('default')
plt.rcParams["figure.dpi"] = dpi
OmicScope = copy.copy(self)
FoldChange_cutoff = OmicScope.FoldChange_cutoff
df = OmicScope.quant_data
pdata = OmicScope.pdata
sample_condition = dict(zip(pdata.Sample, pdata.Condition))
sort_columns = list(pdata.sort_values(['Condition'])['Sample'])
if 'TimeCourse' in pdata.columns:
sort_columns = list(pdata.sort_values(
['Condition', 'TimeCourse'])['Sample'])
times = pdata.TimeCourse.drop_duplicates()
ntimes = len(times)
pal = sns.color_palette(palette='BuPu', as_cmap=False, n_colors=ntimes)
dic = {}
for C, c in zip(times, pal):
dic.update({C: c})
replacer = dic.get
time_colors = [replacer(n, n) for n in list(
pdata.sort_values(['Condition', 'TimeCourse']).TimeCourse)]
else:
time_colors = []
Conditions = OmicScope.Conditions
# Creating Heatmap Matrix
heatmap = df[df.loc[:, OmicScope.pvalue] <= pvalue]
heatmap = heatmap.loc[(heatmap['log2(fc)'] <= -FoldChange_cutoff)
| (heatmap['log2(fc)'] >= FoldChange_cutoff)]
heatmap = heatmap.dropna()
# Filtering for specific proteins
if len(Proteins) > 0:
heatmap = heatmap[heatmap['gene_name'].isin(Proteins)]
heatmap = heatmap.set_index('gene_name')
heatmap = heatmap.loc[:, heatmap.columns.str.contains('.', regex=False)]
# Log2 transform
if self.logTransform is True:
heatmap = np.log2(heatmap)
heatmap = heatmap.replace([-np.inf], int(0))
heatmap = heatmap[sort_columns]
heatmap_original_label = heatmap.copy()
heatmap = heatmap.rename(columns=sample_condition)
Col = list(heatmap.columns)
# Creating matrix for group colors
ngroup = len(Conditions)
pal = sns.color_palette(palette=color_groups,
as_cmap=False, n_colors=ngroup)
dic = {}
for C, c in zip(Conditions, pal):
dic.update({C: c})
replacer = dic.get
colcolors = [replacer(n, n) for n in Col]
colors = [colcolors] + [time_colors]
if colors[-1] == []:
colors = colcolors
# Title
title = 'Heatmap - ' + OmicScope.ctrl + \
' vs ' + '-'.join(OmicScope.experimental)
# Plot
heatmap.columns.name = 'Samples'
heatmap.index.name = 'Gene name'
if sample_label is True:
heatmap = heatmap_original_label
sns.clustermap(heatmap,
cmap=palette, z_score=0, linewidths=linewidth, linecolor='black',
col_colors=colors, metric=clust_metric, method=clust_method,
col_cluster=c_cluster, center=0).fig.suptitle(title, y=1.02)
# Create legend for colors
if type(colors[0]) is list:
col_dict_Time = {i:j for i,j in zip(pdata.TimeCourse, colors[1])}
legend_labels_time = [mpatches.Patch(color=col_dict_Time[i], label=i) for i in col_dict_Time]
legend1 = plt.legend(handles=legend_labels_time, bbox_to_anchor=(1, 1),
bbox_transform=plt.gcf().transFigure, loc='upper right',
title='Time Point')
col_dict = {i:j for i,j in zip(pdata.Condition, colors[0])}
legend_labels = [mpatches.Patch(color=col_dict[i], label=i) for i in col_dict]
legend2 = plt.legend(handles=legend_labels, bbox_to_anchor=(1, 0.88),
bbox_transform=plt.gcf().transFigure, loc='upper right',
title='Condition')
plt.gca().add_artist(legend1)
else:
col_dict = {i:j for i,j in zip(heatmap.columns, colors)}
legend_labels = [mpatches.Patch(color=col_dict[i], label=i) for i in col_dict]
plt.legend(handles=legend_labels, bbox_to_anchor=(1, 1),
bbox_transform=plt.gcf().transFigure, loc='upper right',
title='Condition')
# Save
if save is not None:
if vector is True:
plt.savefig(save + 'heatmap.svg', bbox_inches='tight')
else:
plt.savefig(save + 'heatmap.png', dpi=dpi, bbox_inches='tight')
plt.show()
[docs]def correlation(self, *Proteins, pvalue=1.0,
sample_method='pearson',
clust_metric='euclidean',
clust_method='average', palette='RdYlBu_r', linewidth=0.005,
color_groups='tab20', sample_label=False,
save=None, dpi=300, vector=True):
"""Pairwise correlation plot for samples.
Args:
pvalue (float, optional): p-value threshold. Defaults to 1.0.
sample_method (str, optional): Method to compute pair-wise correlation.
Options: pearson, kendal, spearman. Defaults to 'pearson'.
clust_metric (str, optional): The distance metric to use. Optionally: braycurtis, canberra,
chebyshev, cityblock, correlation, cosine, dice, euclidean, hamming, jaccard,
jensenshannon, kulczynski1, mahalanobis, matching, minkowski, rogerstanimoto, russellrao,
seuclidean, sokalmichener, sokalsneath, sqeuclidean, yule.
clust_method (str, optional): Linkage method to use for calculating clusters.
Optionally: single, complete, average, weighted, centroid, median, or ward.
palette (str, optional): Palette for R-distribution. Defaults to 'RdYlBu_r'.
linewidth (float, optional): Line width. Defaults to 0.005.
sample_label (bool, optional): insert biological sample label and condition.
Defaults to False.
color_groups (str, optional): Color of each group. Defaults to 'tab20'.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
"""
plt.style.use('default')
plt.rcParams["figure.dpi"] = dpi
OmicScope = copy.copy(self)
FoldChange_cutoff = OmicScope.FoldChange_cutoff
df = OmicScope.quant_data
pdata = OmicScope.pdata
sample_condition = dict(zip(pdata.Sample, pdata.Condition))
sort_columns = list(pdata.sort_values(['Condition'])['Sample'])
if 'TimeCourse' in pdata.columns:
sort_columns = list(pdata.sort_values(
['Condition', 'TimeCourse'])['Sample'])
times = pdata.TimeCourse.drop_duplicates()
ntimes = len(times)
pal = sns.color_palette(palette='BuPu', as_cmap=False, n_colors=ntimes)
dic = {}
for C, c in zip(times, pal):
dic.update({C: c})
replacer = dic.get
time_colors = [replacer(n, n) for n in list(
pdata.sort_values(['Condition', 'TimeCourse']).TimeCourse)]
else:
time_colors = []
Conditions = OmicScope.Conditions
# Selecting Matrix for correlation
pearson = df[df.loc[:, OmicScope.pvalue] <= pvalue]
pearson = pearson.loc[(pearson['log2(fc)'] <= -FoldChange_cutoff)
| (pearson['log2(fc)'] >= FoldChange_cutoff)]
pearson = pearson.dropna()
# Filtering for specific proteins
if len(Proteins) > 0:
pearson = pearson[pearson['gene_name'].isin(Proteins)]
pearson = pearson.set_index('gene_name')
pearson = pearson.loc[:, pearson.columns.str.contains('.', regex=False)]
# log2 transform
if self.logTransform is True:
pearson = np.log2(pearson)
pearson = pearson.replace([-np.inf], int(0))
# Performing Pearson's Correlation
corr_matrix_raw = pearson.corr(method=sample_method)
# Creating matrix for group colors
corr_matrix_raw = corr_matrix_raw[sort_columns]
corr_matrix = corr_matrix_raw.rename(columns=sample_condition)
Col = list(corr_matrix.columns)
# Creating matrix for group colors
ngroup = len(Conditions)
pal = sns.color_palette(palette=color_groups, as_cmap=False, n_colors=ngroup)
dic = {}
for C, c in zip(Conditions, pal):
dic.update({C: c})
replacer = dic.get
colcolors = [replacer(n, n) for n in Col]
colors = [colcolors] + [time_colors]
if colors[-1] == []:
colors = colcolors
# Title
title = 'Heatmap - ' + OmicScope.ctrl + ' vs ' + '-'.join(OmicScope.experimental)
# Plot
corr_matrix.columns.name = 'Samples'
corr_matrix.index.name = 'Samples'
if sample_label is False:
sns.clustermap(corr_matrix, metric=clust_metric, method=clust_method,
xticklabels=corr_matrix.columns, row_colors=colors,
yticklabels=corr_matrix.columns, col_colors=colors,
cmap=palette, linewidths=linewidth, linecolor='black').fig.suptitle(title, y=1.02)
else:
sns.clustermap(corr_matrix_raw, metric=clust_metric, method=clust_method,
xticklabels=corr_matrix_raw.columns, row_colors=colors,
yticklabels=corr_matrix_raw.columns, col_colors=colors,
cmap=palette, linewidths=linewidth, linecolor='black').fig.suptitle(title, y=1.02)
# Create legend for colors
if type(colors[0]) is list:
col_dict_Time = {i:j for i,j in zip(pdata.TimeCourse, colors[1])}
legend_labels_time = [mpatches.Patch(color=col_dict_Time[i], label=i) for i in col_dict_Time]
legend1 = plt.legend(handles=legend_labels_time, bbox_to_anchor=(1, 1),
bbox_transform=plt.gcf().transFigure, loc='upper right',
title='Time Point')
col_dict = {i:j for i,j in zip(pdata.Condition, colors[0])}
legend_labels = [mpatches.Patch(color=col_dict[i], label=i) for i in col_dict]
legend2 = plt.legend(handles=legend_labels, bbox_to_anchor=(1, 0.88),
bbox_transform=plt.gcf().transFigure, loc='upper right',
title='Condition')
plt.gca().add_artist(legend1)
else:
col_dict = {i:j for i,j in zip(pdata.Condition, colors)}
legend_labels = [mpatches.Patch(color=col_dict[i], label=i) for i in col_dict]
plt.legend(handles=legend_labels, bbox_to_anchor=(1, 1),
bbox_transform=plt.gcf().transFigure, loc='upper right',
title='Condition')
if save is not None:
if vector is True:
plt.savefig(save + 'pearson.svg', bbox_inches='tight')
else:
plt.savefig(save + 'pearson.png', dpi=dpi, bbox_inches='tight')
plt.xlabel('')
plt.ylabel('')
plt.show()
plt.show(block=True)
[docs]def DynamicRange(self, *Proteins, color='#565059',
protein_color='orange', max_min=False,
min_color='#18ab75', max_color='#ab4e18', dpi=300, save=None,
vector=True):
"""Dynamic range plot
Args:
mean_color (str, optional): default color for dots (mean abundance).
Defaults to '#565059'.
protein_color (str, optional): Color for specific proteins (Args).
Defaults to 'orange'.
max_min (bool, optional): Plot the maximum and minimum abundance
value for each protein. Defaults to False.
min_color (str, optional): Color of minimum values. Defaults to '#1daec7'.
max_color (str, optional): Color of maximum values. Defaults to '#f7463d'.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
"""
plt.style.use('default')
plt.rcParams["figure.dpi"] = dpi
OmicScope = copy.copy(self)
df = OmicScope.quant_data
# Dictionary for Accessions
accession = dict(zip(df.Accession, df.gene_name))
df = df.set_index('Accession')
df = df.loc[:, df.columns.str.contains('.', regex=False)]
df = np.log10(df).transpose()
# Getting Mean, Max and Min values for each protein
df = df.describe().transpose()
df = df.dropna()
df = df.reset_index()
# Applying Dictionary to get gene name
df['gene_name'] = df.Accession.replace(accession)
# Ranking entities
df['rank'] = df['mean'].rank(method='first')
# Return variation for each protein
ax_scatter = plt.axes()
if max_min is True:
plt.scatter(y=df['rank'], x=df['min'],
alpha=0.7, s=2, c=min_color)
plt.scatter(y=df['rank'], x=df['max'],
alpha=0.7, s=2, c=max_color)
# Plot mean
plt.scatter(y=df['rank'], x=df['mean'],
c=color, s=10, alpha=0.5,
linewidths=0.5)
# Highlight specific proteins
if len(Proteins) > 0:
df = df[df.gene_name.isin(Proteins)]
plt.scatter(y=df['rank'], x=df['mean'],
c=protein_color, s=10, alpha=1, edgecolors='black',
linewidths=0.5)
ls_final = df
ls_final = ls_final[['gene_name', 'rank', 'mean']]
from adjustText import adjust_text
texts = []
for a, b, c in zip(ls_final['mean'], ls_final['rank'],
ls_final['gene_name']):
texts.append(ax_scatter.text(a, b, c, size=8))
adjust_text(texts, ax=ax_scatter, force_points=0.25, force_text=0.25,
expand_points=(1.5, 1.5), expand_text=(1.5, 1.5),
arrowprops=dict(arrowstyle="-", color='black', lw=0.5))
plt.grid(b=False)
plt.xlabel('log10(Abundance)')
plt.ylabel('Rank')
plt.title('Dynamic Range')
if save is not None:
if vector is True:
plt.savefig(save + 'DynamicRange.svg', bbox_inches='tight')
else:
plt.savefig(save + 'DynamicRange.png', dpi=dpi, bbox_inches='tight')
sns.despine()
plt.show()
[docs]def pca(self, pvalue=1.00, scree_color='#900C3F',
sample_label=False, palette='tab20', FoldChange_cutoff=0,
save=None, dpi=300, vector=True):
""" Perform Principal Component Analysis.
Args:
pvalue (float, optional): p-value threshold. Defaults to 1.00.
scree_color (str, optional): Color of Scree plot. Defaults to '#900C3F'.
sample_label (bool, optional): Insert biological sample label. Defaults to False.
palette (str, optional): Palette for groups. Defaults to 'tab20'.
FoldChange_cutoff (int, optional): Fold change threshold. Defaults to 0.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
"""
plt.style.use('default')
plt.rcParams["figure.dpi"] = dpi
OmicScope = copy.copy(self)
df = OmicScope.quant_data
FoldChange_cutoff = OmicScope.FoldChange_cutoff
# Filtering P-value and Fold Change
df = df[df[OmicScope.pvalue] <= pvalue]
if len(OmicScope.Conditions) == 2:
df = df.loc[(df['log2(fc)'] <= -FoldChange_cutoff) | (df['log2(fc)'] >= FoldChange_cutoff)]
df = df.loc[:, df.columns.str.contains('.', regex=False)]
samples = df.columns
# Getting Conditions
Conditions = OmicScope.Conditions
group = []
for i in Conditions:
ncond = df.columns.str.contains(i).sum()
nlist = list(itertools.repeat(i, ncond))
group.extend(nlist)
# Center and Scale data
scaled_data = preprocessing.scale(df.T)
pca = PCA() # PCA object
pca.fit(scaled_data)
# get PCA coordinates for scaled_data
pca_data = pca.transform(scaled_data)
# Scree plot
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
labels = ['PC' + str(x) for x in range(1, len(per_var) + 1)]
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_figwidth(16)
fig.set_figheight(7)
ax1.bar(x=range(1, len(per_var) + 1), height=per_var, tick_label=labels,
color=scree_color, edgecolor="black", linewidth=1)
ax1.set(ylabel='Percentage of Explained Variance',
xlabel='Principal Component', title='Scree plot')
ax1.tick_params(labelrotation=45)
sns.despine()
ax1.grid(b=False)
# PC1 vs PC2 plot
pca_df = pd.DataFrame(pca_data, index=samples, columns=labels)
pdata = OmicScope.pdata
conditions = pdata.set_index('Sample')[['Condition']]
if 'TimeCourse' in pdata.columns:
conditions = pdata.set_index('Sample')[['Condition', 'TimeCourse']]
pca_df = pca_df.merge(conditions, right_index=True, left_index=True)
pca_df = pca_df.set_index(['Condition', 'TimeCourse'])
else:
pca_df = pca_df.merge(conditions, right_index=True, left_index=True)
pca_df = pca_df.set_index(['Condition'])
ax_scatter = sns.scatterplot(x="PC1", y="PC2", data=pca_df,
hue=list(pca_df.index), palette=palette,
edgecolor='black', linewidth=1, s=100, alpha=0.7)
plt.title('Principal Component Analysis ' + '(pvalue=' + str(pvalue) + ')')
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))
plt.grid(b=False)
sns.despine()
# Annotate samples in PCA plot
if sample_label is True:
from adjustText import adjust_text
texts = []
for a, b, c in zip(pca_df['PC1'], pca_df['PC2'],
conditions.index):
texts.append(ax_scatter.text(a, b, c.split('.')[0], size=8))
adjust_text(texts, ax=ax_scatter, force_points=0.25, force_text=0.25,
expand_points=(1.5, 1.5), expand_text=(1.5, 1.5),
arrowprops=dict(arrowstyle="-", color='black', lw=0.5))
if save is not None:
if vector is True:
plt.savefig(save + 'pca.svg', bbox_inches='tight')
else:
plt.savefig(save + 'pca.png', dpi=dpi, bbox_inches='tight')
plt.show()
[docs]def color_scheme(df, palette):
"""Generate colors to barplot and
Args:
df (DataFrame): dataframe
palette (str): palette
Returns:
color (dict): dictionary assign each condition with respective color.
"""
color = df.index.to_frame().reset_index(drop=True)
color['variable'] = color.astype(str).apply(lambda x: '_'.join(x), axis=1)
color = color.drop_duplicates().sort_values('variable')
colors_scheme = ['red', 'blue', 'orange', 'green', 'darkcyan', 'magenta']
if 'TimeCourse' in color.columns:
timecourse = len(color.TimeCourse.drop_duplicates())
else:
timecourse = 1
random.seed(10)
colors_random = random.sample(colors_scheme, color.Condition.nunique())
palettes = []
if timecourse > 1:
for i in colors_random:
palette = sns.light_palette(i, n_colors=timecourse).as_hex()
palettes.append(palette)
else:
ncolors = color.Condition.nunique()
palettes = sns.color_palette(palette=palette, n_colors=ncolors).as_hex()
color['colors'] = list(pd.Series(palettes).explode())
color = dict(zip(color.variable, color.colors))
return color
[docs]def bar_protein(self, *Proteins, logscale=True,
palette='Spectral', save=None, dpi=300,
vector=True):
"""Bar plot to show protein abundance in each condition
Args:
logscale (bool, optional): Apply log-transformed abundance.
Defaults to True.
palette (str, optional): Palette for groups. Defaults to 'Spectral'.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
"""
plt.rcParams['figure.dpi'] = dpi
OmicScope = copy.copy(self)
df = copy.copy(OmicScope.whole_data)
# Proteins to plot
df = df[df['gene_name'].isin(Proteins)]
# Get conditions
ctrl = [copy.copy(OmicScope.ctrl)]
conditions = copy.copy(OmicScope.Conditions)
conditions.remove(ctrl[0])
# Get protein abundance for each condition
df = df.set_index('gene_name')
df = df.iloc[:, df.columns.str.contains('.', regex=False)]
df = df.melt(var_name='variable', ignore_index=False)
df = df.reset_index()
pdata = copy.copy(OmicScope.pdata)
pdata = pdata.set_index('Sample')
df = df.set_index('variable')
df = pdata.merge(df, left_index=True, right_index=True)
if 'TimeCourse' in df.columns:
df = df[['Condition', 'TimeCourse', 'gene_name', 'value']]
df = df.set_index(['Condition', 'TimeCourse'])
else:
df = df[['Condition', 'gene_name', 'value']]
df = df.set_index('Condition')
# Apply log transformation
if logscale is True:
df['value'] = np.log2(df['value'])
df['value'] = df['value'].replace(-np.inf, np.nan)
color = color_scheme(df, palette=palette)
df_index = df.index.to_frame().reset_index(drop=True)
df_index = df_index.astype(str).apply(lambda x: '_'.join(x), axis=1)
df.index = df_index
# Size of figure
sns.set(rc={'figure.figsize': (1.5*len(Proteins), 5)})
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)
# Plot
if len(Proteins) == 1:
sns.barplot(x=df.index, y='value',
data=df, errwidth=1, capsize=0.07,
palette=color,
errorbar='se',
edgecolor='black', linewidth=1, dodge=False)
else:
sns.barplot(x='gene_name', y='value', hue=list(df.index),
data=df, errwidth=1,
capsize=0.07, edgecolor='black',
errorbar='se',
palette=color,
linewidth=1)
plt.legend(bbox_to_anchor=(1.0, 1), loc=2, borderaxespad=0.)
sns.despine()
plt.title('Abundance - ' + ' and '.join(Proteins))
plt.xlabel('')
plt.xticks(rotation=70)
plt.ylabel('log2(Abundance)')
if save is not None:
if vector is True:
plt.savefig(save + 'barplot_' + '_'.join(Proteins) + '.svg', bbox_inches='tight')
else:
plt.savefig(save + 'barplot_' + '_'.join(Proteins) + '.png', dpi=dpi, bbox_inches='tight')
plt.show()
[docs]def boxplot_protein(self, *Proteins, logscale=True,
palette='Spectral', save=None, dpi=300,
vector=True):
"""Boxplot to show protein abundance in each condition
Args:
logscale (bool, optional): Apply abundance log-transformed.
Defaults to True.
palette (str, optional): Palette for groups. Defaults to 'Spectral'.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
"""
plt.rcParams['figure.dpi'] = dpi
OmicScope = copy.copy(self)
df = copy.copy(OmicScope.whole_data)
# Proteins to plot
df = df[df['gene_name'].isin(Proteins)]
# Get conditions
ctrl = [copy.copy(OmicScope.ctrl)]
conditions = copy.copy(OmicScope.Conditions)
conditions.remove(ctrl[0])
# Get protein abundance for each condition
df = df.set_index('gene_name')
df = df.iloc[:, df.columns.str.contains('.', regex=False)]
df = df.melt(var_name='variable', ignore_index=False)
df = df.reset_index()
pdata = copy.copy(OmicScope.pdata)
pdata = pdata.set_index('Sample')
df = df.set_index('variable')
df = pdata.merge(df, left_index=True, right_index=True)
if 'TimeCourse' in df.columns:
df = df[['Condition', 'TimeCourse', 'gene_name', 'value']]
df = df.set_index(['Condition', 'TimeCourse'])
else:
df = df[['Condition', 'gene_name', 'value']]
df = df.set_index('Condition')
# Apply log transformation
if logscale is True:
df['value'] = np.log2(df['value'])
df['value'] = df['value'].replace(-np.inf, np.nan)
color = color_scheme(df, palette=palette)
df_index = df.index.to_frame().reset_index(drop=True)
df_index = df_index.astype(str).apply(lambda x: '_'.join(x), axis=1)
df.index = df_index
# Size of figure
sns.set(rc={'figure.figsize': (1.5*len(Proteins), 5)})
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)
# Plot
if len(Proteins) == 1:
sns.boxplot(x=df.index, y='value',
data=df,
palette=color,
linewidth=1, dodge=False)
else:
sns.boxplot(x='gene_name', y='value', hue=list(df.index),
data=df,
palette=color,
linewidth=1)
plt.legend(bbox_to_anchor=(1.0, 1), loc=2, borderaxespad=0.)
sns.despine()
plt.title('Abundance - ' + ' and '.join(Proteins))
plt.xlabel('')
plt.xticks(rotation=70)
plt.ylabel('log2(Abundance)')
if save is not None:
if vector is True:
plt.savefig(save + 'barplot_' + '_'.join(Proteins) + '.svg', bbox_inches='tight')
else:
plt.savefig(save + 'barplot_' + '_'.join(Proteins) + '.png', dpi=dpi, bbox_inches='tight')
plt.show()
[docs]def MAplot(self, *Proteins,
pvalue=0.05, non_regulated='#606060', up_regulated='#E4001B',
down_regulated='#6194BC', FoldChange_cutoff=0,
save=None, dpi=300, vector=True):
"""MA plot
Args:
pvalue (float, optional): p-value threshold. Defaults to 0.05.
non_regulated (str, optional): color for non-regulated proteins.
Defaults to '#606060'.
up_regulated (str, optional): color for up-regulated proteins.
Defaults to '#E4001B'.
down_regulated (str, optional): color for down-regulated proteins.
Defaults to '#6194BC'.
FoldChange_cutoff (int, optional): Foldchange threshold. Defaults to 0.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
"""
plt.rcParams['figure.dpi'] = dpi
OmicScope = copy.copy(self)
df = copy.copy(OmicScope.quant_data)
if self.logTransform is True:
df['TotalMean'] = np.log2(df['TotalMean'])
df['TotalMean'] = df['TotalMean'].replace(-np.inf, 0.01)
# Defining axis
y = df[f'-log10({OmicScope.pvalue})']
x = df['log2(fc)']
# Defining colors
col = np.where(y >= -np.log10(pvalue), np.where(x >= -FoldChange_cutoff,
np.where(x <= FoldChange_cutoff, non_regulated, up_regulated),
down_regulated), non_regulated)
regulation = np.where(y >= -np.log10(pvalue), np.where(x >= -FoldChange_cutoff,
np.where(x <= FoldChange_cutoff, 'non-regulated',
'up-regulated'), 'down-regulated'), 'non-regulated')
df['col'] = col
df['Regulation'] = regulation
# Plot
sns.set_style("whitegrid")
ax_scatter = sns.scatterplot(x='TotalMean', y='log2(fc)', data=df, hue='Regulation',
palette=list(df.col.drop_duplicates()))
if len(Proteins) > 0:
df = df[df.gene_name.isin(Proteins)]
ls_final = df
ls_final = ls_final[['gene_name', 'log2(fc)', 'TotalMean']]
from adjustText import adjust_text
texts = []
for a, b, c in zip(ls_final['TotalMean'], ls_final['log2(fc)'],
ls_final['gene_name']):
texts.append(ax_scatter.text(a, b, c, size=8))
adjust_text(texts, ax=ax_scatter, force_points=0.25, force_text=0.25,
expand_points=(1.5, 1.5), expand_text=(1.5, 1.5),
arrowprops=dict(arrowstyle="-", color='black', lw=0.5))
sns.despine()
plt.axhline(y=0, color='gray', linestyle=':')
plt.grid(b=False)
plt.xlabel('log2(Mean)')
plt.title('MA plot')
if save is not None:
if vector is True:
plt.savefig(save + 'MAPlot.svg', bbox_inches='tight')
else:
plt.savefig(save + 'MAPlot.png', dpi=dpi, bbox_inches='tight')
plt.show(block=True)
[docs]def find_k(df):
"""Find the optimum k clusters"""
df_norm = df
sse = {}
for k in range(1, 21):
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(df_norm)
sse[k] = kmeans.inertia_
kn = KneeLocator(x=list(sse.keys()),
y=list(sse.values()),
curve='convex',
direction='decreasing')
k = kn.knee
print('KneeLocator identifies: ' + str(k) + ' clusters')
return k
[docs]def k_trend(self, pvalue=0.05, k_cluster=None,
save=None, dpi=300, vector=True):
"""Perform a K-mean algorithm
BigTrend apply k-mean algorithm to identify co-expressed
proteins/genes. For longitudinal analysis, k-means can help users to
visualize the trends of proteins in the evaluated timecourse.
Optionally, the user can define the number of clusters that k-means will cluster proteins and samples.
By default, OmicScope run KneeLocator algorithm to suggest an optimal number of clusters.
Args:
OmicScope (_type_): OmicScope object.
pvalue (float, optional): _description_. Defaults to 0.05.
k_cluster (int, optional): Number of cluster to perform k-means.
If None, OmicScope defines k-clusters based on KneeLocator algorithm.
Defaults to None.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
Returns:
_type_: _description_
"""
plt.rcParams['figure.dpi'] = dpi
omics = copy.copy(self)
pdata = omics.pdata
try:
pdata['sample'] = pdata[['Condition', 'TimeCourse']].astype(str).apply(lambda x: '-'.join(x), axis=1)
except KeyError:
pdata['sample'] = pdata[['Condition', 'Biological']].astype(str).apply(lambda x: '-'.join(x), axis=1)
data = omics.quant_data
protein_dictionary = dict(zip(data['Accession'], data['gene_name']))
data = data[data[omics.pvalue] <= pvalue]
data = data.set_index('Accession')
data = data.iloc[:, data.columns.str.contains('.', regex=False)]
data = data.replace(0, 0.01)
if self.logTransform is True:
data = np.log2(data)
# Scale protein abundance according to their mean (z-score)
zscored_data = zscore(data, axis=1)
if k_cluster is None:
n_clusters = find_k(zscored_data)
else:
n_clusters = k_cluster
kmeans = KMeans(n_clusters=n_clusters, init="k-means++",
max_iter=500)
protein_cluster_assig = kmeans.fit(zscored_data)
k_data = zscored_data
k_data.columns = pdata['sample']
k_data['cluster'] = protein_cluster_assig.labels_
k_data_protein = k_data.reset_index().melt(id_vars=['Accession', 'cluster'],
var_name='sample')
dictionary = dict(zip(pdata['sample'], pdata['Condition']))
k_data_protein['Condition'] = k_data_protein[['sample']].replace(dictionary)
k_data_protein['gene_name'] = k_data_protein[['Accession']].replace(protein_dictionary)
g = sns.catplot(
data=k_data_protein, x="sample", y="value", hue="Condition", col="cluster",
capsize=.2, palette="viridis",
kind="point", col_wrap=2
)
g.despine()
(g.map(plt.axhline, y=0, color="black", dashes=(2, 1), zorder=0)
.set_ylabels('z-score', clear_inner=True)
.set_xlabels('Conditions', clear_inner=True)
.set(xticklabels=[]))
if save is not None:
if vector is True:
plt.savefig(save + 'MAPlot.svg', bbox_inches='tight')
else:
plt.savefig(save + 'MAPlot.png', dpi=dpi, bbox_inches='tight')
plt.show()
k_data_protein = k_data_protein.groupby('cluster')['gene_name'].apply(set).reset_index()
return k_data_protein
[docs]def PPInteractions(self, *Proteins,
network_k=None,
network_iterations=50,
score_threshold=0.6, labels=False, modules=False,
module_palette='Paired', species=9606,
pvalue=0.05, network_type='functional', save=None, dpi=300, vector=True):
"""Protein-Protein interaction
Using String API, OmicScope search or protein-protein interactions among
the proteins from users's dataset.
Args:
network_k (int, optional): Optimal distance between nodes. If None the distance is set to
1/sqrt(n) where n is the number of nodes. Defauts to None.
network_iterations (int, optional): Maximum number of iterations taken. Defaults to 50.
score_threshold (float, optional): String-score threshold. Defaults to 0.6.
labels (bool, optional): Show protein names. Defaults to False.
modules (bool, optional): Perform Louvain Method to find communities/modules.
Defaults to False.
module_palette (str, optional): Color palette to assign modules. Defaults to 'Paired'.
species (int, optional): String requires the definition of organism to search
for proteins name, according to NCBI identifier. Defaults to 9606 (Human).
Mus musculus = 10090; Rattus norvegicus = 10116.
pvalue (float, optional): p-value threshold to proteins to be accepted.
Defaults to 0.05 (differentially regulated).
network_type (str, optional): Interactions can be defined as 'functional' or
'physical'. Defaults to 'functional'.
save (str, optional): Path to save figure. Defaults to None.
dpi (int, optional): figure resolution. Defaults to 300.
vector (bool, optional): Save figure in as vector (.svg). Defaults to
True.
Returns:
Networkx object
"""
# Set parameters to import data
plt.rcParams['figure.dpi'] = dpi
string_api_url = "https://version12.string-db.org/api"
output_format = "json"
method = "network"
data = copy.copy(self)
conds = data.Conditions
data = data.quant_data
data = data[data[self.pvalue] <= pvalue]
data = data.sort_values(self.pvalue)
data = data.reset_index(drop=True)
if len(data) > 2000:
data = data.sort_values(self.pvalue)
data = data.iloc[0:1999, :]
data['TopTerms'] = data.index+1
foldchange_range = data['log2(fc)']
genes = data['gene_name'].dropna().drop_duplicates()
genes = list(genes.astype(str))
request_url = "/".join([string_api_url, output_format, method])
params = {
"identifiers": "%0d".join(genes),
"species": species, # species NCBI identifier
"caller_identity": "Omicscope",
'show_query_node_labels': 1,
'network_type': network_type
}
response = requests.post(request_url, data=params)
string = pd.read_json(response.text)
# Filter string results based on score
string = string[string['score'] >= score_threshold]
if len(conds)==2:
norm = mpl.colors.TwoSlopeNorm(vmin=min(foldchange_range),
vmax=max(foldchange_range),
vcenter=0)
else:
norm = mpl.colors.TwoSlopeNorm(vmin=min(foldchange_range),
vmax=max(foldchange_range),
vcenter=np.mean(foldchange_range))
cmap = cm.RdBu_r
m = cm.ScalarMappable(norm=norm, cmap=cmap)
color_hex = [mcolors.to_hex(m.to_rgba(x)) for x in data['log2(fc)']]
source = pd.DataFrame({'ID': data['gene_name'],
'Size': 10,
'type': 'protein',
'color': color_hex,
'regulation': data['log2(fc)']})
source = source.drop_duplicates()
source = source.dropna()
string = string.rename(columns={'score': 'weight'})
G = nx.from_pandas_edgelist(string,
source='preferredName_A',
target='preferredName_B',
edge_attr='weight',
create_using=nx.Graph)
carac = source.set_index('ID')
carac['color_edge'] = 'black'
linewidths = 0.8
nx.set_node_attributes(G, dict(zip(carac.index, carac.color)), name="color")
nx.set_node_attributes(G, dict(zip(carac.index, carac.Size)), name="Size")
# widths = nx.get_edge_attributes(G, 'score')
G.edges(data=True)
# Find Communities
if modules is True:
import networkx.algorithms.community as nx_comm
# Find communities based on label propagation
communities = nx_comm.louvain_communities(G)
module = []
terms = []
color = []
degree = []
# Linking Terms, module, colors
cmap = sns.color_palette(module_palette, len(communities), desat=.9)
for i, g in enumerate(communities):
subgraph = G.subgraph(g)
degree_subgraph = subgraph.degree
degree.extend(degree_subgraph)
module.extend([i]*len(g))
terms.extend(list(g))
if len(g) > len(source)*0.05:
color.extend([mcolors.to_hex(cmap[i])]*len(g))
else:
color.extend(['#666666']*len(g))
# DataFrame
modularity = pd.DataFrame(zip(module, terms, color, degree), columns=[
'Module', 'ID', 'color_edge', 'Degree'])
# Merging carac with modularity data
carac = carac.merge(modularity, on='ID', suffixes=('_x', ''))
carac = carac.set_index('ID')
nx.set_node_attributes(G, dict(zip(carac.index, carac.Module)), name="Module")
carac = carac.reindex(G.nodes())
linewidths = 1.5
pos = nx.spring_layout(G,
k=network_k,
iterations=network_iterations,
threshold=0.0001, weight='weight',
center=None,)
carac = carac.reindex(G.nodes())
weights = nx.get_edge_attributes(G, 'weight')
nx.draw(G,
pos=pos,
node_color=carac['color'],
node_size=carac['Size']*10,
edgecolors=carac['color_edge'],
linewidths=linewidths,
alpha=0.9,
width=[i**4 for i in list(weights.values())],
edge_color='#666666')
if labels is True:
nx.draw_networkx_labels(G, pos, font_size=6)
if save is not None:
nx.write_graphml(G, save + 'PPinteraction.graphml', named_key_ids=True)
if vector is True:
plt.savefig(save + 'PPinteractions.svg', bbox_inches='tight')
else:
plt.savefig(save + 'PPinteractions.dpi', dpi=dpi, bbox_inches='tight')
plt.show(block=True)
return G