Source code for omicscope.MultipleData.MultipleVisualization

from copy import copy

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from pycirclize import Circos


def barplot(self, save=None, vector=True, dpi=300):
    """Barplot

    Bar plot with the number of proteins/genes identified (light gray bars)
    and differentially regulated (coloured bars) in each group, followed by a
    'Total' bar counting the unique ``gene_name`` entries across all groups.
    Two vertically stacked axes sharing the x axis are used so that the
    'Total' bar is drawn on a separate y range from the per-group bars.

    Args:
        save (str, optional): Path to save image. Defaults to None.
        vector (bool, optional): If image should be export as .svg.
            Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    plt.rcParams['figure.dpi'] = dpi
    data = copy(self)
    all_frames = data.original
    regulated_frames = data.group_data

    # Per-group counts; the frames are paired, so both lists stop at the
    # shorter of the two inputs.
    paired = list(zip(all_frames, regulated_frames))
    identified_counts = [len(total) for total, _ in paired]
    regulated_counts = [len(regulated) for _, regulated in paired]

    # 'Total' bar: unique gene names pooled over every group.
    unique_identified = pd.concat(
        [frame['gene_name'] for frame in all_frames]).drop_duplicates()
    unique_regulated = pd.concat(
        [frame['gene_name'] for frame in regulated_frames]).drop_duplicates()
    identified_counts.append(len(unique_identified))
    regulated_counts.append(len(unique_regulated))

    tick_labels = copy(data.groups)
    tick_labels.append('Total')
    positions = list(range(len(tick_labels)))
    bar_colors = copy(self.colors)
    bar_colors.append('gray')

    bar_style = dict(edgecolor='black', width=0.5, linewidth=1)
    fig, (upper, lower) = plt.subplots(2, 1, sharex=True)
    upper.bar(positions, identified_counts, color='lightgray', **bar_style)
    upper.bar(positions, regulated_counts, color=bar_colors, **bar_style)
    plt.xticks(positions, fontweight=None, rotation=45, ha='right')
    lower.bar(positions, identified_counts, color='lightgray', **bar_style)
    lower.bar(positions, regulated_counts, color=bar_colors, **bar_style)

    tallest_group = max(identified_counts[:-1])
    # Upper axis: range above the tallest single group (the 'Total' bar).
    upper.set_ylim(tallest_group*1.1, max(identified_counts)*1.01)
    # Lower axis: range covering the individual groups.
    lower.set_ylim(0, tallest_group*1.005)

    sns.despine()
    upper.spines['bottom'].set_visible(False)
    upper.spines['top'].set_visible(False)
    upper.tick_params(bottom=False)
    plt.xticks(positions, tick_labels, fontweight=None,
               rotation=45, ha='right')

    # Write the number of regulated entities above each group bar ...
    per_group = zip(regulated_counts[:-1], positions[:-1],
                    identified_counts[:-1], bar_colors[:-1])
    for count, x_pos, height, colour in per_group:
        lower.annotate(count, xy=[x_pos, height*1.005],
                       ha='center', color=colour, weight='bold')
    # ... and above the 'Total' bar on the upper axis.
    upper.annotate(regulated_counts[-1],
                   [positions[-1], identified_counts[-1]*1.001],
                   ha='center', color=bar_colors[-1], weight='bold')

    if save is not None:
        if vector is True:
            plt.savefig(save + 'barplot.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'barplot.png', dpi=dpi, bbox_inches='tight')
    plt.show()
def diff_reg(self, save=None, vector=True, dpi=300):
    """Dotplot

    Dotplot for number of proteins up- and down-regulated in each group.
    Dot area encodes the number of proteins; red dots are up-regulated
    counts and blue dots are down-regulated counts.

    Args:
        save (str, optional): Path to save image. Defaults to None.
        vector (bool, optional): If image should be export as .svg.
            Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    plt.rcParams['figure.dpi'] = dpi
    data = copy(self)
    groups = data.groups
    regulated_frames = data.group_data

    # Down-regulated counts are stored as negatives so that the sign can be
    # used to pick the dot colour before taking the absolute value.
    up = [len(frame[frame['log2(fc)'] > 0]) for frame in regulated_frames]
    down = [-len(frame[frame['log2(fc)'] < 0]) for frame in regulated_frames]

    counts = pd.DataFrame(columns=groups, data=[up, down],
                          index=['Up-regulated', 'Down-regulated']).transpose()
    long_form = counts.reset_index().melt('index')
    long_form['color'] = np.where(long_form['value'] > 0,
                                  '#e3432d', '#167a9c')
    long_form['value'] = abs(long_form['value'])

    n_directions = 2
    n_groups = len(groups)
    fig, ax = plt.subplots()
    fig.set_figwidth(2)
    scatter = ax.scatter(x=long_form['variable'], y=long_form['index'],
                         s=long_form['value'], c=long_form['color'],
                         ec='black', lw=0.5)
    ax.set_xticks(np.arange(n_directions+1)-0.5, minor=True)
    ax.set_yticks(np.arange(n_groups+1)-0.5, minor=True)
    sns.despine()
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=45)

    # Size legend built from the scatter's marker sizes.
    legend_kw = dict(prop="sizes", num=3, color='black', alpha=.6)
    handles_and_labels = scatter.legend_elements(**legend_kw)
    ax.legend(*handles_and_labels, title="# Proteins", handleheight=2,
              bbox_to_anchor=(1, 1), loc="upper left", markerscale=1,
              edgecolor='white')
    plt.margins()

    if save is not None:
        if vector is True:
            plt.savefig(save + 'diff_reg.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'diff_reg.png', dpi=dpi, bbox_inches='tight')
    plt.show()
def whole_network(self, labels=False, save=None, vector=True, dpi=300):
    """Network of entities differentially regulated for each group analyzed.

    Each group is a node linked to every entity (``gene_name``) whose
    p-value in that group is <= 0.05. Group nodes use the group colour and
    are sized by their number of significant entities; entity nodes are
    coloured by ``log2(fc)`` on a diverging colormap centred at zero.

    Args:
        labels (bool, optional): Show graph labels. Defaults to False.
        save (str, optional): Path to save image. When given, the graph is
            also exported as GraphML. Defaults to None.
        vector (bool, optional): If image should be export as .svg
            (otherwise .png). Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.

    Returns:
        networkx.Graph: The group/entity graph that was drawn.
    """
    import matplotlib.cm as cm
    import matplotlib.colors as mcolors

    plt.rcParams['figure.dpi'] = dpi
    data = copy(self)

    network_frame = []
    for group, df, color in zip(data.groups, data.original, data.colors):
        # `assign` returns a new frame, so the frames stored on the object
        # are left untouched (copy(self) is shallow and shares them).
        significant = df[df[self.pvalue] <= 0.05]
        significant = significant.assign(Experiment=group,
                                         Size=len(significant),
                                         color=color)
        network_frame.append(significant)
    network_frame = pd.concat(network_frame)

    # Node table for the groups.
    source = pd.DataFrame({'ID': network_frame['Experiment'],
                           'Size': network_frame['Size'],
                           'type': 'Experiment',
                           'color': network_frame['color']})
    source = source.drop_duplicates()

    # Node table for the entities, coloured by fold change.
    norm = mcolors.TwoSlopeNorm(vmin=min(network_frame['log2(fc)']),
                                vmax=max(network_frame['log2(fc)']),
                                vcenter=0)
    mapper = cm.ScalarMappable(norm=norm, cmap=cm.RdYlBu_r)
    color_hex = [mcolors.to_hex(mapper.to_rgba(x))
                 for x in network_frame['log2(fc)']]
    target = pd.DataFrame({'ID': network_frame['gene_name'],
                           'Size': int(min(network_frame['Size'])*0.5),
                           'type': 'Protein',
                           'color': color_hex})
    target = target.drop_duplicates()

    edgelist = network_frame[['Experiment', 'gene_name']]
    G = nx.from_pandas_edgelist(edgelist, source='Experiment',
                                target='gene_name', create_using=nx.Graph)

    # One row of attributes per node; an entity found in several groups
    # keeps the attributes of its first occurrence.
    carac = pd.concat([source, target]).reset_index(drop=True)
    carac = carac.drop_duplicates('ID')
    carac = carac.set_index('ID')
    nx.set_node_attributes(G, dict(zip(carac.index, carac.color)),
                           name="Color")
    nx.set_node_attributes(G, dict(zip(carac.index, carac.Size)),
                           name="Size")

    pos = nx.kamada_kawai_layout(G)
    carac = carac.reindex(G.nodes())
    nx.draw(G, pos=pos, node_color=carac['color'],
            node_size=carac['Size']/20, edgecolors='black',
            linewidths=0.4, alpha=0.9, width=0.2, edge_color='gray')
    if labels is True:
        nx.draw_networkx_labels(G, pos, font_size=6)

    if save is not None:
        nx.write_graphml(G, save + 'PPNetwork.graphml', named_key_ids=True)
        if vector is True:
            plt.savefig(save + 'PPNetwork.svg', bbox_inches='tight')
        else:
            # '.png' replaces the former '.dpi' suffix, which is not an
            # image format matplotlib can write.
            plt.savefig(save + 'PPNetwork.png', dpi=dpi, bbox_inches='tight')
    plt.show()
    return G
def dotplot_enrichment(self, *Terms, top=5, fig_height=None, palette='PuBu',
                       save=None, vector=True, dpi=300):
    """Dotplot Enrichment

    Dotplot to visualize together the enrichment data for each group.
    One figure is produced per ``Gene_set`` found in the enrichment tables;
    dot size and colour both encode ``-log10(Adjusted P-value)``.

    Args:
        *Terms (str): Optional term names. When given, only these terms are
            plotted instead of the top-ranked ones.
        top (int, optional): Top N pathway to considered in each group.
            Defaults to 5.
        fig_height (int, optional): User optionally can define figure
            height. Defaults to None.
        palette (str, optional): color palette. Defaults to 'PuBu'.
        save (str, optional): Path to save image. Defaults to None.
        vector (bool, optional): If image should be export as .svg
            (otherwise .png). Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    plt.rcParams['figure.dpi'] = dpi

    # Keep only the groups that actually carry an enrichment table.
    enrichments = []
    groups = []
    for group, enrichment in zip(copy(self.groups), copy(self.enrichment)):
        if enrichment is not None:
            enrichments.append(enrichment)
            groups.append(group)

    genesets = pd.Series(
        [name for frame in enrichments
         for name in frame.Gene_set.drop_duplicates()]).drop_duplicates()

    for geneset in genesets:
        subset = [frame[frame['Gene_set'] == geneset]
                  for frame in enrichments]
        if len(Terms) > 0:
            terms = Terms
        else:
            # First `top` rows of every group, pooled without repetition.
            terms = pd.Series(
                [term for frame in subset
                 for term in frame.iloc[:top, :]['Term']]).drop_duplicates()

        table = pd.concat(
            [frame[frame['Term'].isin(terms)].assign(Group=group)
             for frame, group in zip(subset, groups)])
        # Explicit copy so the column assignments below do not act on a
        # slice of the concatenated frame.
        table = table[['Term', 'Overlap', 'Adjusted P-value', 'Group']].copy()
        # 'Overlap' looks like "hits/total"; keep the number of hits.
        table['Overlap'] = (table['Overlap']
                            .str.split('/', regex=False).str[0].astype(int))
        table['-log10(p)'] = -np.log10(table['Adjusted P-value'])

        fig, ax = plt.subplots()
        if fig_height is not None:
            fig.set_figheight(fig_height)
        sns.set_style('white')
        sns.scatterplot(data=table, x='Group', y='Term', size='-log10(p)',
                        hue='-log10(p)', palette=palette, sizes=(40, 280),
                        linewidth=0.5, edgecolor='black')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left',
                   borderaxespad=0)
        sns.despine()
        plt.margins()
        plt.ylabel('')

        if save is not None:
            prefix = save + '_' + geneset + '_'
            if vector is True:
                plt.savefig(prefix + 'dotplot_enrichment.svg',
                            bbox_inches='tight')
            else:
                # Previously 'dotplot_enrichmnet.dpi': misspelled and with a
                # suffix that is not a writable image format.
                plt.savefig(prefix + 'dotplot_enrichment.png', dpi=dpi,
                            bbox_inches='tight')
        plt.show()
def protein_overlap(self, min_subset=10, face_color='darkcyan',
                    shad_color="#f0f0f0", edge_color='black', linewidth=1,
                    save=None, vector=True, dpi=300):
    """Upset plot

    Upset plot to evaluate protein overlap among groups, based on the unique
    ``gene_name`` entries of each group's differentially regulated table.

    Args:
        min_subset (int, optional): Minimum overlap size to consider for
            upset plot. Defaults to 10.
        face_color (str, optional): Bar and dot colors.
            Defaults to 'darkcyan'.
        shad_color (str, optional): Shad color in the dot part of the graph.
            Defaults to "#f0f0f0".
        edge_color (str, optional): edge colors. Defaults to 'black'.
        linewidth (int, optional): line widths. Defaults to 1.
        save (str, optional): Path to save image. Defaults to None.
        vector (bool, optional): If image should be export as .svg.
            Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    plt.rcParams['figure.dpi'] = dpi
    from upsetplot import UpSet
    from upsetplot import from_contents

    # NOTE(review): the style context is created but never entered with
    # `with`, so it presumably has no effect - confirm before removing.
    plt.style.context('classic')
    plt.rcParams.update({
        'grid.alpha': 0,
        'figure.dpi': dpi,
        'patch.linewidth': linewidth,
        'patch.edgecolor': 'black',
        'patch.force_edgecolor': True,
    })

    data = copy(self)
    unique_genes = [frame['gene_name'].drop_duplicates()
                    for frame in data.group_data]
    membership = from_contents(dict(zip(data.groups, unique_genes)))

    figure = UpSet(membership, facecolor=face_color,
                   shading_color=shad_color, min_subset_size=min_subset,
                   show_counts=True, with_lines=True)
    for group in data.groups:
        figure.style_subsets(present=group, edgecolor=edge_color,
                             linewidth=linewidth)
    figure.plot()

    if save is not None:
        if vector is True:
            plt.savefig(save + 'upset_proteins.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'upset_proteins.png', dpi=dpi,
                        bbox_inches='tight')
    plt.show()
def enrichment_overlap(self, min_subset=1, face_color='darkcyan',
                       shad_color="#f0f0f0", edge_color='black', linewidth=1,
                       save=None, vector=True, dpi=300):
    """Upset plot

    Upset plot to evaluate enrichment terms overlap among groups. Groups
    without an enrichment table are left out of the plot.

    Args:
        min_subset (int, optional): minimum number of overlap size to
            consider for upset plot. Defaults to 1.
        face_color (str, optional): Bar and dot colors.
            Defaults to 'darkcyan'.
        shad_color (str, optional): Shad color in the dot part of the graph.
            Defaults to "#f0f0f0".
        edge_color (str, optional): edge colors. Defaults to 'black'.
        linewidth (int, optional): line widths. Defaults to 1.
        save (str, optional): Path to save image. Defaults to None.
        vector (bool, optional): If image should be export as .svg.
            Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.

    Raises:
        IndexError: If there is no Enrichment data on .omics file.
    """
    from upsetplot import UpSet
    from upsetplot import from_contents

    # NOTE(review): the style context is created but never entered with
    # `with`, so it presumably has no effect - confirm before removing.
    plt.style.context('classic')
    plt.rcParams.update({
        'grid.alpha': 0,
        'figure.dpi': dpi,
        'patch.linewidth': linewidth,
        'patch.edgecolor': 'black',
        'patch.force_edgecolor': True,
    })

    data = copy(self)
    if all(enr is None for enr in self.enrichment):
        raise IndexError('There is not Enrichment result in data!')

    # Pair every available enrichment table with its group name.
    available = [(enr, group)
                 for enr, group in zip(data.enrichment, data.groups)
                 if enr is not None]
    groups = [group for _, group in available]

    terms = []
    for enr, _ in available:
        try:
            terms.append(enr['Term'].drop_duplicates())
        except KeyError:
            # Table without a 'Term' column: contribute an empty term set.
            placeholder = pd.DataFrame(columns=['Gene_set', 'Term', 'Overlap',
                                                'Adjusted P-value', 'Genes'])
            terms.append(placeholder['Term'].drop_duplicates())

    membership = from_contents(dict(zip(groups, terms)))
    figure = UpSet(membership, facecolor=face_color,
                   shading_color=shad_color, min_subset_size=min_subset,
                   show_counts=True, with_lines=True)
    for group in groups:
        figure.style_subsets(present=group, edgecolor=edge_color,
                             linewidth=linewidth)
    figure.plot()

    if save is not None:
        if vector is True:
            plt.savefig(save + 'upset_pathways.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'upset_pathways.png', dpi=dpi,
                        bbox_inches='tight')
    plt.show()
def similarity_network(self, pvalue=1, comparison_param='log2(fc)',
                       metric='jaccard', absolute_similarity_cutoff=0.2,
                       save=None, vector=True, dpi=300):
    """Similarity Network plot

    Perform a pairwise similarity analysis and create a graph where groups
    are depicted as nodes, and pairwise similarity indices serve as edges.
    Edges whose similarity falls inside the closed interval
    [-absolute_similarity_cutoff, absolute_similarity_cutoff] are dropped
    (e.g. -0.2 to 0.2 when the cutoff is 0.2).

    With the Jaccard index the function compares which 'gene_name' entries
    are shared between groups. For every other metric it compares the values
    of ``comparison_param`` over the entries present in both groups.
    Similarity is reported as ``1 - distance``.

    Args:
        pvalue (int, optional): P-value threshold to proteins that OmicScope
            must consider for analysis. Defaults to 1.
        comparison_param (str, optional): Parameter/column to take into
            account in pairwise comparison. Defaults to 'log2(fc)'.
            Optionally 'TotalMean'.
        metric (str, optional): algorithm to perform pairwise comparison.
            Defaults to 'jaccard'. Optionally, user can test other
            algorithm described in scipy.spatial.distance.
        absolute_similarity_cutoff (float, optional): Cutoff to consider the
            links between groups, applied to the absolute similarity.
            Defaults to 0.2.
        save (str, optional): Path to save image. When given, the graph is
            also exported as GraphML. Defaults to None.
        vector (bool, optional): If image should be export as .svg
            (otherwise .png). Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    from sklearn.metrics import pairwise_distances

    plt.rcParams['figure.dpi'] = dpi
    palette = self.colors
    conditions = self.groups
    pval = self.pvalue
    data = copy(self)

    # One column per group, indexed by gene_name (outer join).
    per_group = []
    for frame in data.original:
        # numeric_only=True: text columns cannot be averaged and make
        # pandas >= 2.0 raise instead of silently skipping them.
        averaged = frame.groupby('gene_name').mean(numeric_only=True)
        averaged = averaged[averaged[pval] <= pvalue]
        per_group.append(averaged[[comparison_param]])
    values = pd.concat(per_group, axis=1, join='outer')
    values.columns = data.groups

    # Replace -inf with the lowest non-inf value in data.
    # NOTE(review): dropna() restricts that minimum to rows present in every
    # group; if no such row exists -inf becomes NaN - confirm this is
    # intended.
    values = values.replace(
        -np.inf, values.replace(-np.inf, np.nan).dropna().min().min())

    # Pairwise distance between groups; the diagonal stays NaN.
    group_names = values.columns
    df_dist = pd.DataFrame(index=group_names, columns=group_names,
                           dtype=float)
    for col1 in group_names:
        for col2 in group_names:
            if col1 == col2:
                continue
            if metric == 'jaccard':
                # Jaccard uses the index (gene names) for the computation.
                set1 = set(values[col1].dropna().index)
                set2 = set(values[col2].dropna().index)
                shared = len(set1 & set2) / len(set1 | set2)
                df_dist.at[col1, col2] = 1 - shared
            else:
                # Other metrics compare the values of entries quantified in
                # both groups. NaNs are already dropped, so the removed
                # `force_all_finite` argument is not needed.
                pair = values[[col1, col2]].dropna(axis=0)
                distance = pairwise_distances(pair.T.to_numpy(),
                                              metric=metric)
                df_dist.at[col1, col2] = distance[0][1]

    similarity = 1 - df_dist
    # Zero out weak links; NaN (the diagonal) compares False and therefore
    # also becomes 0, i.e. "no edge".
    similarity = similarity.where(
        similarity.abs() > absolute_similarity_cutoff, 0)
    similarity.columns, similarity.index = data.groups, data.groups

    # Plotting graph
    G = nx.from_pandas_adjacency(similarity)
    size = [len(set(x[x[pval] <= pvalue].gene_name)) for x in self.original]
    carac = pd.DataFrame(zip(conditions, palette, size),
                         columns=['ID', 'color', 'Size'])
    carac = carac.set_index('ID')
    nx.set_node_attributes(G, dict(zip(carac.index, carac.color)),
                           name="Color")
    nx.set_node_attributes(G, dict(zip(carac.index, carac.Size)),
                           name="Size")
    carac = carac.reindex(G.nodes())
    pos = nx.kamada_kawai_layout(G, weight=None)

    G.remove_edges_from(nx.selfloop_edges(G))
    weights = [round(G[u][v]['weight'], 2) for u, v in G.edges]
    # Edge widths proportional to the similarity, scaled by the mean.
    mean_weight = np.mean(weights) if weights else 1
    widths = [float(w)*5/mean_weight for w in weights]

    nx.draw(G, pos=pos, node_color=carac['color'], node_size=carac['Size'],
            edgecolors='black', linewidths=0.6, alpha=0.9, width=widths,
            edge_color='gray')
    nx.draw_networkx_labels(G, pos, font_size=6)
    labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(
        G, pos, edge_labels=dict(zip(list(labels.keys()), weights)))

    if save is not None:
        nx.write_graphml(G, save + 'Similarity_network.graphml',
                         named_key_ids=True)
        if vector is True:
            plt.savefig(save + 'Similarity_network.svg', bbox_inches='tight')
        else:
            # '.png' replaces the former '.dpi' suffix, which is not an
            # image format matplotlib can write.
            plt.savefig(save + 'Similarity_network.png', dpi=dpi,
                        bbox_inches='tight')
    plt.show()
def similarity_heatmap(self, pvalue=1, comparison_param='log2(fc)',
                       metric='correlation', center=0, palette='RdYlBu_r',
                       annotation=True, save=None, vector=True, dpi=300):
    """Similarity heatmap plot

    Perform a pair-wise similarity analysis and plot a clustered heatmap.
    With the Jaccard index the function compares which 'gene_name' entries
    are shared between groups. For every other metric it compares the values
    of ``comparison_param`` over the entries present in both groups.
    Similarity is reported as ``1 - distance``.

    Args:
        pvalue (int, optional): P-value threshold to proteins that OmicScope
            must consider for analysis. Defaults to 1.
        comparison_param (str, optional): Parameter to take into account in
            pairwise comparison. Defaults to 'log2(fc)'. Optionally
            'TotalMean'.
        metric (str, optional): algorithm to perform pairwise comparison.
            Defaults to 'correlation'. Optionally, user can test other
            algorithm described in scipy.spatial.distance.
        center (float, optional): number to center the heatmap color
            gradient. Defaults to 0.
        palette (str, optional): color palette to plot heatmap.
            Defaults to 'RdYlBu_r'.
        annotation (bool, optional): Write the similarity value inside each
            cell. Defaults to True.
        save (str, optional): Path to save image. Defaults to None.
        vector (bool, optional): If image should be export as .svg.
            Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    from sklearn.metrics import pairwise_distances

    plt.rcParams['figure.dpi'] = dpi
    data = copy(self)
    colors = data.colors

    # One column per group, indexed by gene_name (outer join).
    per_group = []
    for frame in data.original:
        # numeric_only=True: text columns cannot be averaged and make
        # pandas >= 2.0 raise instead of silently skipping them.
        averaged = frame.groupby('gene_name').mean(numeric_only=True)
        averaged = averaged[averaged[self.pvalue] <= pvalue]
        per_group.append(averaged[[comparison_param]])
    values = pd.concat(per_group, axis=1, join='outer')
    values.columns = data.groups

    # Replace -inf with the lowest non-inf value in data.
    # NOTE(review): dropna() restricts that minimum to rows present in every
    # group; if no such row exists -inf becomes NaN - confirm this is
    # intended.
    values = values.replace(
        -np.inf, values.replace(-np.inf, np.nan).dropna().min().min())

    # Pairwise distance between groups; the diagonal stays NaN.
    group_names = values.columns
    df_dist = pd.DataFrame(index=group_names, columns=group_names,
                           dtype=float)
    for col1 in group_names:
        for col2 in group_names:
            if col1 == col2:
                continue
            if metric == 'jaccard':
                # Jaccard uses the index (gene names) for the computation.
                set1 = set(values[col1].dropna().index)
                set2 = set(values[col2].dropna().index)
                shared = len(set1 & set2) / len(set1 | set2)
                df_dist.at[col1, col2] = 1 - shared
            else:
                # Other metrics compare the values of entries quantified in
                # both groups. NaNs are already dropped, so the removed
                # `force_all_finite` argument is not needed.
                pair = values[[col1, col2]].dropna(axis=0)
                distance = pairwise_distances(pair.T.to_numpy(),
                                              metric=metric)
                df_dist.at[col1, col2] = distance[0][1]

    similarity = 1 - df_dist
    similarity.columns, similarity.index = data.groups, data.groups
    similarity = similarity.fillna(0)

    # Cells with similarity exactly 1 are hidden, as before. The mask no
    # longer depends on `annotation`: previously annotation=False produced
    # an all-True mask, which hid every cell of the heatmap.
    hidden = similarity == 1
    if annotation is False:
        annot = False
    else:
        annot = similarity.mask(hidden)
    sns.clustermap(similarity, cmap=palette, center=center, annot=annot,
                   mask=hidden, col_colors=colors, row_colors=colors)

    if save is not None:
        if vector is True:
            plt.savefig(save + 'similarity_heatmap.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'similarity_heatmap.png', dpi=dpi,
                        bbox_inches='tight')
    plt.show()
def overlap_fisher(group1, group2, union):
    """Perform a pair-wise comparison based on hypergeometric distribution.

    Computes the probability of observing an overlap at least as large as
    the one found between the two groups, when ``len(set(group2))`` entities
    are drawn from a population of ``union`` entities of which
    ``len(set(group1))`` belong to the first group.

    Args:
        group1 (Series, pandas): Column condition 1
        group2 (Series, pandas): Column condition 2
        union (int): number of whole entities evaluated in the study among
            all conditions

    Returns:
        Pvalue (float): P-value
    """
    from scipy.stats import hypergeom

    deps1 = set(group1)
    deps2 = set(group2)
    intersection = len(deps1.intersection(deps2))

    # Upper tail P(X >= intersection). The survival function is P(X > k),
    # hence the "- 1". Summing the pmf over arange(intersection, min_size)
    # left out the largest possible overlap and gave 0 for complete overlap.
    rv = hypergeom(union, len(deps1), len(deps2))
    return float(rv.sf(intersection - 1))
def distribution_test(self, protein_pvalue, method):
    """Pair-wise statistical comparison of ``log2(fc)`` between groups.

    For every pair of groups the function keeps the entries (``Accession``)
    that pass the p-value cut-off in both groups and compares their
    ``log2(fc)`` values with the chosen test:

    1. t-test (ttest): compares the means of two groups assuming normally
       distributed data.
    2. Wilcoxon signed-rank test (wilcoxon): non-parametric test for two
       related samples.
    3. Kolmogorov-Smirnov test (ks): compares the distributions of two
       samples.

    Missing and ``-inf`` fold changes are replaced by the lowest finite
    value of the group minus one.

    Args:
        protein_pvalue (float): The cut-off value for protein p-values.
        method (str): The statistical method to be used for comparison.
            Valid options are "ttest", "wilcoxon" and "ks".

    Returns:
        matrix (DataFrame, pandas): Symmetric matrix of p-values with the
        groups as index and columns (zeros on the diagonal).

    Raises:
        ValueError: If ``method`` is not one of the valid options.
    """
    import itertools
    from scipy.spatial.distance import squareform
    from scipy.stats import ttest_ind, wilcoxon, kstest

    if method not in ('ttest', 'wilcoxon', 'ks'):
        # Previously an unknown method ended in a NameError further down.
        raise ValueError(
            'Please, select a valid method ("ttest", "wilcoxon", or "ks").')

    conditions = self.groups

    fold_changes = []
    for frame in self.original:
        significant = frame[frame[self.pvalue] <= protein_pvalue]
        values = significant.set_index('Accession')['log2(fc)']
        values = values.replace(-np.inf, np.nan)
        # Series.min() skips NaN; the builtin min() returns NaN whenever the
        # first element is NaN, which left the gaps unfilled.
        values = values.replace(np.nan, values.min() - 1)
        fold_changes.append(values)

    pvalues = []
    for first, second in itertools.combinations(fold_changes, 2):
        shared = set(first.index) & set(second.index)
        first = first[first.index.isin(shared)]
        second = second[second.index.isin(shared)]
        if first.index.is_unique and second.index.is_unique:
            # Put both samples in the same entry order so that paired tests
            # (wilcoxon) compare like with like.
            second = second.loc[first.index]

        if method == 'ttest':
            pvalues.append(ttest_ind(first, second)[1])
        elif method == 'wilcoxon':
            pvalues.append(wilcoxon(first, second)[1])
        else:
            try:
                pvalues.append(kstest(first, second)[1])
            except ValueError:
                # Kept from the original best-effort behaviour: a pair that
                # cannot be tested is reported as 0.
                pvalues.append(0)

    matrix = squareform(pvalues)
    return pd.DataFrame(matrix, columns=conditions, index=conditions)
def fisher_test(self, protein_pvalue, background_lenght=None):
    """Pair-wise overlap test between groups (hypergeometric).

    For every pair of groups, the sets of ``gene_name`` entries passing the
    p-value cut-off are compared with ``overlap_fisher``.

    Args:
        protein_pvalue (float): The cut-off value for protein p-values.
        background_lenght (int, optional): The total number of entities in
            the background set. If None, the number of distinct
            ``gene_name`` values over all groups is used (Recommended).
            Defaults to None.

    Returns:
        matrix (DataFrame, pandas): Symmetric matrix of p-values with the
        groups as index and columns (zeros on the diagonal).
    """
    from itertools import combinations
    from scipy.spatial.distance import squareform

    conditions = self.groups
    pval = self.pvalue

    if background_lenght is None:
        union = len(set(pd.concat(self.original).gene_name))
    else:
        union = background_lenght

    sets = [set(x[x[pval] <= protein_pvalue].gene_name)
            for x in self.original]

    # Compare the sets directly. Packing them into a DataFrame for `pdist`
    # padded the shorter rows with missing values, which then counted as
    # members of the sets being compared. `combinations` yields the pairs in
    # the condensed order that `squareform` expects.
    pvalues = [overlap_fisher(first, second, union=union)
               for first, second in combinations(sets, 2)]
    matrix = squareform(pvalues)
    return pd.DataFrame(matrix, columns=conditions, index=conditions)
def stat_matrix(self, method, protein_pvalue, background_lenght):
    """Pair-wise statistical comparison between groups.

    Dispatches to ``fisher_test`` or ``distribution_test`` according to
    ``method`` and returns their result unchanged.

    Args:
        self: Reference to the class instance where this function is called.
        method (str): The statistical method to be used for the comparison.
            Valid options include:

            * "fisher": handled by ``fisher_test``.
            * "ttest", "wilcoxon", "ks": handled by ``distribution_test``.
        protein_pvalue (float): The cut-off value for protein p-values.
        background_lenght (int): The total number of entities in the
            background set. Only forwarded for the "fisher" method.

    Returns:
        pandas.DataFrame: A DataFrame containing the p-values for each
        pair-wise comparison between groups.

    Raises:
        ValueError: If ``method`` is not one of the valid options.
    """
    if method == 'fisher':
        return fisher_test(self, protein_pvalue=protein_pvalue,
                           background_lenght=background_lenght)
    if method in ['ttest', 'wilcoxon', 'ks']:
        return distribution_test(self, protein_pvalue=protein_pvalue,
                                 method=method)
    raise ValueError('''Please, verify if it was selected a valid method ("fisher", "ttest", "wilcoxon", or "ks").''')
def stat_network(self, method='fisher', protein_pvalue=0.05,
                 background_lenght=None, dpi=300, graph_pvalue=0.1,
                 save=None, vector=True):
    """Network of groups linked by pair-wise statistical comparisons.

    Groups are nodes (sized by their number of entities passing
    ``protein_pvalue``) and the p-values returned by ``stat_matrix`` define
    the edges. For "fisher", edge labels and widths use ``-log10(p)``.

    Args:
        method (str, optional): The statistical method used for comparison
            between groups: "fisher", "ttest", "wilcoxon" or "ks".
            Defaults to "fisher".
        protein_pvalue (float, optional): The cut-off value for protein
            p-values. Defaults to 0.05.
        background_lenght (int, optional): The total number of entities in
            the background set, forwarded to ``stat_matrix``.
            Defaults to None (Recommended).
        dpi (int, optional): The resolution (dots per inch) for the
            generated plot. Defaults to 300.
        graph_pvalue (float, optional): Threshold for the links between
            nodes. For "fisher", pairs with p-value >= threshold are
            dropped; for the other methods, pairs with p-value <= threshold
            are dropped. Defaults to 0.1.
        save (str, optional): Filename prefix. If provided, the function
            saves both the GraphML representation of the network and the
            plot image. Defaults to None.
        vector (bool, optional): If True (default), saves the image as an
            SVG file. If False, saves the image as a PNG file.

    Returns:
        None.
    """
    plt.rcParams['figure.dpi'] = dpi
    palette = self.colors
    conditions = self.groups
    pval = self.pvalue

    matrix = stat_matrix(self, method=method, protein_pvalue=protein_pvalue,
                         background_lenght=background_lenght)
    # A zero in the adjacency matrix means "no edge".
    if method == 'fisher':
        matrix[matrix >= graph_pvalue] = 0
    elif method in ['ttest', 'wilcoxon', 'ks']:
        matrix[matrix <= graph_pvalue] = 0

    G = nx.from_pandas_adjacency(matrix)
    size = [len(set(x[x[pval] <= protein_pvalue].gene_name))
            for x in self.original]
    carac = pd.DataFrame(zip(conditions, palette, size),
                         columns=['ID', 'color', 'Size'])
    carac = carac.set_index('ID')
    nx.set_node_attributes(G, dict(zip(carac.index, carac.color)),
                           name="Color")
    nx.set_node_attributes(G, dict(zip(carac.index, carac.Size)),
                           name="Size")
    carac = carac.reindex(G.nodes())

    # Single, unweighted layout (a weighted layout used to be computed first
    # and immediately overwritten).
    pos = nx.kamada_kawai_layout(G, weight=None)

    G.remove_edges_from(nx.selfloop_edges(G))
    weights = [G[u][v]['weight'] for u, v in G.edges]
    if method == 'fisher':
        weights = -np.log10(weights)
    weights = [round(x, 2) for x in weights]
    # Edge widths proportional to the weight, scaled by the mean weight.
    mean_weight = np.mean(weights) if weights else 1
    widths = [float(w)*5/mean_weight for w in weights]

    nx.draw(G, pos=pos, node_color=carac['color'], node_size=carac['Size'],
            edgecolors='black', linewidths=0.6, alpha=0.9, width=widths,
            edge_color='gray')
    nx.draw_networkx_labels(G, pos, font_size=6)
    labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(
        G, pos, edge_labels=dict(zip(list(labels.keys()), weights)))

    if save is not None:
        # NOTE(review): same GraphML file name as whole_network uses, so one
        # export overwrites the other - confirm whether that is intended.
        nx.write_graphml(G, save + 'PPNetwork.graphml', named_key_ids=True)
        if vector is True:
            plt.savefig(save + 'groupNetwork.svg', bbox_inches='tight')
        else:
            # '.png' replaces the former '.dpi' suffix, which is not an
            # image format matplotlib can write.
            plt.savefig(save + 'groupNetwork.png', dpi=dpi,
                        bbox_inches='tight')
    plt.show()
def stat_heatmap(self, palette='Spectral', method='fisher', pvalue=0.05,
                 background_lenght=None, annotation=True, save=None,
                 vector=True, dpi=300):
    """Clustered heatmap of pair-wise p-values between groups.

    The p-values come from ``stat_matrix``. Cells equal to 0 (which includes
    the diagonal) are hidden.

    Args:
        palette (str, optional): The color palette to use for the heatmap.
            Defaults to "Spectral".
        method (str, optional): The statistical method used for comparison
            between groups: "fisher", "ttest", "wilcoxon" or "ks".
            Defaults to "fisher".
        pvalue (float, optional): The cut-off value for protein p-values.
            Defaults to 0.05.
        background_lenght (int, optional): The total number of entities in
            the background set, forwarded to ``stat_matrix``.
            Defaults to None.
        annotation (bool, optional): If True (default), displays the
            p-values within each heatmap cell. If False, hides the p-value
            annotations.
        save (str, optional): Filename prefix to save the heatmap image.
            Defaults to None.
        vector (bool, optional): If True (default), saves the image as an
            SVG file. If False, saves the image as a PNG file.
        dpi (int, optional): The resolution (dots per inch) for the
            generated plot. Defaults to 300.

    Returns:
        None.
    """
    plt.rcParams['figure.dpi'] = dpi
    colors = self.colors
    matrix = stat_matrix(self, method=method, protein_pvalue=pvalue,
                         background_lenght=background_lenght)

    # Hide the zero cells, as before. The mask no longer depends on
    # `annotation`: previously annotation=False produced an all-True mask,
    # which hid every cell of the heatmap instead of only the numbers.
    hidden = matrix == 0
    if annotation is False:
        annot = False
    else:
        annot = matrix.mask(hidden)
    sns.clustermap(matrix, cmap=palette, annot=annot, mask=hidden,
                   col_colors=colors, row_colors=colors)
    plt.title('Pvalue')

    if save is not None:
        if vector is True:
            plt.savefig(save + 'overlap_stat.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'overlap_stat.png', dpi=dpi,
                        bbox_inches='tight')
    plt.show()
def _pairwise_links(frames, key, labels):
    """Build the link table joining values of `key` shared between frames.

    For every pair of frames (i < j) and every non-null `key` value present
    in both, one row is emitted holding the row label of the first
    occurrence in each frame. Rows are collected in a list and turned into a
    DataFrame once (`DataFrame.append` no longer exists in pandas >= 2.0).

    Args:
        frames (list[pd.DataFrame]): Frames containing a `key` column.
        key (str): Column used to match rows between frames.
        labels (list): Sector name for each frame, by position. A frame
            without a matching label keeps its integer position.

    Returns:
        pd.DataFrame: Columns `key`, 'query_chr', 'query_start', 'ref_chr',
        'ref_start', 'query_end', 'ref_end'.
    """
    link_columns = [key, 'query_chr', 'query_start', 'ref_chr', 'ref_start']
    label_of = dict(enumerate(labels))

    # One lookup table per frame: value -> row label of its first occurrence.
    first_seen = []
    for frame in frames:
        positions = {}
        for row_label, value in zip(frame.index, frame[key]):
            if pd.notna(value):
                positions.setdefault(value, row_label)
        first_seen.append(positions)

    records = []
    for i, query in enumerate(first_seen):
        for j in range(i + 1, len(first_seen)):
            ref = first_seen[j]
            for value, query_start in query.items():
                if value in ref:
                    records.append({key: value,
                                    'query_chr': label_of.get(i, i),
                                    'query_start': query_start,
                                    'ref_chr': label_of.get(j, j),
                                    'ref_start': ref[value]})

    result = pd.DataFrame(records, columns=link_columns)
    # Each link spans a single position inside its sector.
    result['query_end'] = result['query_start'] + 1
    result['ref_end'] = result['ref_start'] + 1
    return result


def linkproteins(deps, groups):
    """Compute links between groups for proteins shared among them.

    Each frame is reordered so that proteins found in more than one group
    come first, then by decreasing 'log2(fc)', and is re-indexed from 0.
    The resulting row positions are used as link coordinates.

    Args:
        deps (list[pd.DataFrame]): One frame per group with at least the
            columns 'gene_name' and 'log2(fc)'.
        groups (list): Group names, in the same order as `deps`.

    Returns:
        tuple: (links, matrixes) where `links` is the link table (columns
        'gene_name', 'query_chr', 'query_start', 'ref_chr', 'ref_start',
        'query_end', 'ref_end') and `matrixes` is the list of reordered
        frames, each with an extra boolean 'duplicated' column.
    """
    # Gene names occurring more than once across all groups.
    pooled = pd.concat(deps)
    shared = pooled.duplicated(subset=['gene_name'], keep=False)
    overlapped_proteins = list(pooled[shared]['gene_name'].dropna())

    matrixes = [x.assign(duplicated=x['gene_name'].isin(overlapped_proteins))
                for x in deps]
    matrixes = [x.sort_values(['duplicated', 'log2(fc)'], ascending=False)
                for x in matrixes]
    matrixes = [x.reset_index(drop=True) for x in matrixes]

    result = _pairwise_links(matrixes, 'gene_name', groups)
    return result, matrixes


def linkenrichment(enrichment, groups, number_deps):
    """Compute links between groups for enrichment terms shared among them.

    Groups whose enrichment entry is None are skipped. Terms are identified
    by '<Gene_set>-<Term>'. Every term is placed at a random position
    inside its group's sector (drawn with `np.random.randint` from
    [0, number of proteins in the group)), so the coordinates differ
    between calls unless NumPy's global random state is seeded.

    Args:
        enrichment (list): One enrichment frame (columns 'Gene_set' and
            'Term') or None per group.
        groups (list): Group names, in the same order as `enrichment`.
        number_deps (list[int]): Sector size of each group. Must be
            positive for every group with enrichment data, as required by
            `np.random.randint`.

    Returns:
        pd.DataFrame: Link table with columns 'db_term', 'query_chr',
        'query_start', 'ref_chr', 'ref_start', 'query_end', 'ref_end'.
    """
    kept = [(e, g, n) for e, g, n in zip(enrichment, groups, number_deps)
            if e is not None]
    group = [g for _, g, _ in kept]

    matrixes = []
    for frame, _, size in kept:
        frame = frame.assign(db_term=frame['Gene_set'] + '-' + frame['Term'])
        # Random coordinates inside the sector; labels may repeat.
        positions = np.random.randint(0, size, size=len(frame))
        matrixes.append(frame.set_index(pd.Index(list(positions))))

    return _pairwise_links(matrixes, 'db_term', group)
def circos_plot(self, vmax=1, vmin=-1, colormap='RdYlBu_r', colorproteins='darkcyan',
                colorenrichment='black', linewidth_heatmap=0.1,
                save=None, vector=True, dpi=300):
    """Circos plot

    Overview of the differentially regulated proteins of each group as a
    circular plot. Every group is a sector carrying a fold-change heatmap;
    links connect proteins (and, when more than one enrichment entry is
    available, enrichment terms) shared between groups.

    Args:
        vmax (int, optional): Maximum value for foldchange; larger values
            are clipped. Defaults to 1.
        vmin (int, optional): Minimum value for foldchange; smaller values
            are clipped. Defaults to -1.
        colormap (str, optional): Colormap for heatmap.
            Defaults to 'RdYlBu_r'.
        colorproteins (str, optional): Color for protein links.
            Defaults to 'darkcyan'.
        colorenrichment (str, optional): Color for enrichment links.
            Defaults to 'black'.
        linewidth_heatmap (float, optional): Line width of the heatmap cell
            borders. Defaults to 0.1.
        save (str, optional): Prefix prepended to 'circos.svg' or
            'circos.png'. Defaults to None.
        vector (bool, optional): Save as svg extension, if False, save as
            png. Defaults to True.
        dpi (int, optional): Resolution used when saving as png.
            Defaults to 300.
    """
    # Data
    enrichment = copy(self.enrichment)
    groups = copy(self.groups)
    deps = [x.dropna().reset_index(drop=True) for x in copy(self.group_data)]
    colors = self.colors
    grouplen = [len(x) for x in deps]
    higher_group = max(grouplen)
    # Roughly 20 ticks on the largest sector; never below 1, because an
    # interval of 0 is not a valid tick spacing for small groups.
    tick_interval = max(1, higher_group // 20)

    # Links between shared proteins and the reordered per-group frames
    links, matrixes = linkproteins(deps, groups)

    # One (1, n) array of fold changes per group, clipped to [vmin, vmax]
    matrixes = [m[['log2(fc)']].clip(lower=vmin, upper=vmax).T.to_numpy()
                for m in matrixes]

    # Config circos
    sectors = dict(zip(groups, grouplen))
    sector_colors = dict(zip(groups, colors))
    circos = Circos(sectors, space=5)
    for sector, matrix in zip(circos.sectors, matrixes):
        # Outermost track: group name
        name_track = sector.add_track((95, 110))
        name_track.text(sector.name, color="Black")
        # Thin track filled with the group color, carrying the position ticks
        color_track = sector.add_track((88, 90))
        color_track.axis(fc=sector_colors[sector.name])
        color_track.xticks_by_interval(interval=tick_interval,
                                       label_orientation="vertical")
        # Fold-change track; vmin/vmax pin the color scale so that every
        # sector matches the shared colorbar drawn below.
        rect_track = sector.add_track((80, 85))
        rect_track.heatmap(matrix, vmin=vmin, vmax=vmax, cmap=colormap,
                           rect_kws=dict(ec="black", lw=linewidth_heatmap))

    # Enrichment links, if applicable
    if len(enrichment) > 1:
        linkenr = linkenrichment(enrichment, groups, grouplen)
        for i in linkenr.to_dict('records'):
            region1 = (i['query_chr'], i['query_start'], i['query_end'])
            region2 = (i['ref_chr'], i['ref_start'], i['ref_end'])
            circos.link(region1, region2, color=colorenrichment)

    # Protein links
    for i in links.to_dict('records'):
        region1 = (i['query_chr'], i['query_start'], i['query_end'])
        region2 = (i['ref_chr'], i['ref_start'], i['ref_end'])
        circos.link(region1, region2, color=colorproteins)

    circos.colorbar(vmin=vmin, vmax=vmax, cmap=colormap,
                    colorbar_kws=dict(label="log2(FoldChange)"))
    fig = circos.plotfig()
    if save is not None:
        if vector:
            fig.savefig(save + 'circos.svg')
        else:
            fig.savefig(save + 'circos.png', dpi=dpi)
def circular_term(self, *Terms, pvalue=0.05, vmin=-1, vmax=1, colormap='RdBu_r',
                  label_size=12, save=None, vector=True, dpi=300):
    """Circular term

    Allows the visualization of all proteins related to pre-specified
    terms. The terms are looked up in the enrichment data, and the proteins
    annotated to them are linked to the groups in which they pass the
    p-value cut-off.

    Args:
        *Terms (str): Patterns searched in the 'Term' column of the
            enrichment data with `Series.str.contains`, which interprets
            them as regular expressions by default.
        pvalue (float, optional): Pvalue to consider differentially
            regulated proteins. Defaults to 0.05.
        vmin (int, optional): minimum value for foldchange. Defaults to -1.
        vmax (int, optional): maximum value for foldchange. Defaults to 1.
        colormap (str, optional): Colormap for heatmap and colorbar.
            Defaults to 'RdBu_r'.
        label_size (int, optional): Font size of the sector labels.
            Defaults to 12.
        save (str, optional): Prefix prepended to 'Term_circular_plot.svg'
            or 'Term_circular_plot.png'. Defaults to None.
        vector (bool, optional): Save as svg extension, if False, save as
            png. Defaults to True.
        dpi (int, optional): Resolution used when saving as png.
            Defaults to 300.

    Raises:
        TypeError: Term/Terms was/were not found in dataset.
    """
    not_found = 'Matrix length is zero. Term/Terms was/were not found in dataset.'
    enrichment = [x for x in self.enrichment if x is not None]
    deps = [x[x[self.pvalue] <= pvalue] for x in self.original]
    groups = self.groups
    colors = dict(zip(groups, self.colors))

    # Select genes from enriched terms.
    enrichTerms = [x[x['Term'].str.contains(term)]
                   for term in Terms for x in enrichment]
    if not enrichTerms:
        # No term given or no enrichment data: nothing can be matched.
        raise TypeError(not_found)
    enrichTerms = pd.concat(enrichTerms)
    # 'Genes' holds one collection of gene names per term; flatten and dedupe.
    enrichGenes = list({gene for genes in enrichTerms['Genes'] for gene in genes})

    # Filtering based on enrichgenes: one row per group, one column per gene,
    # holding the mean log2(fc) (NaN where the gene is absent from the group).
    data = [x[x['gene_name'].isin(enrichGenes)] for x in deps]
    data = [x[['gene_name', "log2(fc)"]] for x in data]
    data = [x.rename(columns={'log2(fc)': y}) for x, y in zip(data, groups)]
    data = [x.groupby('gene_name').mean() for x in data]
    data = pd.concat(data, axis=1).T
    data = data.sort_index(axis=1)
    if len(data.columns) == 0:
        raise TypeError(not_found)

    # Presence/absence matrix (group x gene); groups without any gene are dropped.
    matrix = data.notnull().astype(int)
    row_sums = matrix.sum(axis=1)
    matrix = matrix.drop(index=matrix.index[row_sums == 0])

    # Per-gene fold changes clipped to [vmin, vmax]; `clip` returns a new
    # frame, so `data` itself is left untouched.
    clipped = data.clip(lower=vmin, upper=vmax)
    heatmaps = [np.array([clipped[gene].dropna()]) for gene in clipped]
    # Placeholders for the group sectors, which precede the gene sectors
    # and receive no heatmap.
    heatmaps = [np.array([[np.nan]])] * len(matrix) + heatmaps
    heatmaps = [x[:, ::-1] for x in heatmaps]

    circos = Circos.initialize_from_matrix(
        matrix,
        start=-265,
        end=95,
        space=0.3,
        r_lim=(93, 100),
        cmap=colors,
        label_kws=dict(r=101, orientation="vertical", size=label_size),
    )
    for sector, heatmap in zip(circos.sectors, heatmaps):
        # foldchange track, only on gene sectors
        if sector.name not in data.index:
            rect_track = sector.add_track((93, 100))
            rect_track.heatmap(heatmap, cmap=colormap, vmin=vmin, vmax=vmax)

    # The colorbar follows the same colormap as the heatmap tracks.
    circos.colorbar(bounds=(1.1, 0.3, 0.02, 0.4), vmin=vmin, vmax=vmax,
                    cmap=colormap, colorbar_kws=dict(label="log2(FoldChange)"))
    fig = circos.plotfig()
    if save is not None:
        if vector:
            fig.savefig(save + 'Term_circular_plot.svg')
        else:
            fig.savefig(save + 'Term_circular_plot.png', dpi=dpi)