Source code for omicscope.MultipleData.MultipleVisualization

from copy import copy

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from pycirclize import Circos


def barplot(self, save=None, vector=True, dpi=300):
    """Barplot of identified versus differentially regulated entities.

    For every group, and for a pooled 'Total' column, a light-gray bar shows
    the size of the group's full table and a colored bar, drawn over it,
    shows the size of the group's differentially regulated table. The
    'Total' column counts unique ``gene_name`` values across all groups.
    The y axis is split in two stacked panels: the lower one is scaled to
    the per-group bars, the upper one to the 'Total' bar.

    Args:
        save (str, optional): Path prefix used to save the image.
            Defaults to None.
        vector (bool, optional): If the image should be exported as .svg
            (otherwise .png). Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    plt.rcParams['figure.dpi'] = dpi
    experiment = copy(self)
    tick_labels = copy(experiment.groups)
    full_tables = experiment.original
    regulated_tables = experiment.group_data

    # Per-group row counts; zip() stops at the shorter of the two lists.
    paired_tables = list(zip(full_tables, regulated_tables))
    identified_counts = [len(full) for full, _ in paired_tables]
    regulated_counts = [len(regulated) for _, regulated in paired_tables]
    tick_labels.extend(['Total'])

    # Pooled column: unique gene names over all groups.
    unique_identified = pd.concat(
        [table['gene_name'] for table in full_tables]).drop_duplicates()
    unique_regulated = pd.concat(
        [table['gene_name'] for table in regulated_tables]).drop_duplicates()
    identified_counts.extend([len(unique_identified)])
    regulated_counts.extend([len(unique_regulated)])
    positions = list(range(0, len(tick_labels)))

    bar_style = dict(edgecolor='black', width=0.5, linewidth=1)
    fig, (upper, lower) = plt.subplots(2, 1, sharex=True)
    upper.bar(positions, identified_counts, color='lightgray', **bar_style)
    bar_colors = copy(self.colors)
    bar_colors.extend(['gray'])
    upper.bar(positions, regulated_counts, color=bar_colors, **bar_style)
    plt.xticks(positions,
               fontweight=None, rotation=45, ha='right')

    lower.bar(positions, identified_counts, color='lightgray', **bar_style)
    lower.bar(positions, regulated_counts, color=bar_colors, **bar_style)

    # Broken axis: upper panel shows only what exceeds the tallest group bar.
    tallest_group = max(identified_counts[:-1])
    upper.set_ylim(tallest_group * 1.1, max(identified_counts) * 1.01)
    lower.set_ylim(0, tallest_group * 1.005)
    sns.despine()
    upper.spines['bottom'].set_visible(False)
    upper.spines['top'].set_visible(False)
    upper.tick_params(bottom=False)
    plt.xticks(positions, tick_labels,
               fontweight=None, rotation=45, ha='right')

    # Write the number of regulated entities just above each gray bar.
    per_group = zip(regulated_counts[:-1], positions[:-1],
                    identified_counts[:-1], bar_colors[:-1])
    for n_regulated, x_pos, bar_top, text_color in per_group:
        lower.annotate(n_regulated, xy=[x_pos, bar_top * 1.005], ha='center',
                       color=text_color, weight='bold')
    upper.annotate(regulated_counts[-1],
                   [positions[-1], identified_counts[-1] * 1.001],
                   ha='center', color=bar_colors[-1], weight='bold')

    if save is not None:
        if vector is True:
            plt.savefig(save + 'barplot.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'barplot.png', dpi=dpi, bbox_inches='tight')
    plt.show()


def diff_reg(self,
             save=None, vector=True, dpi=300):
    """Dotplot of up- and down-regulated counts.

    For each group, counts the rows of its differentially regulated table
    with positive and with negative ``log2(fc)`` and draws one dot per
    (direction, group) pair whose size is that count. Up-regulated dots are
    colored '#e3432d' and down-regulated dots '#167a9c'.

    Args:
        save (str, optional): Path prefix used to save the image.
            Defaults to None.
        vector (bool, optional): If the image should be exported as .svg
            (otherwise .png). Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    plt.rcParams['figure.dpi'] = dpi
    experiment = copy(self)
    group_names = experiment.groups
    regulated_tables = experiment.group_data

    n_up = [len(table[table['log2(fc)'] > 0]) for table in regulated_tables]
    # Down counts are stored negative so the sign can drive the color below.
    n_down = [-len(table[table['log2(fc)'] < 0]) for table in regulated_tables]

    counts = pd.DataFrame(data=[n_up, n_down],
                          columns=group_names,
                          index=['Up-regulated', 'Down-regulated']).transpose()
    long_form = counts.reset_index().melt('index')
    long_form['color'] = np.where(long_form['value'] > 0, '#e3432d', '#167a9c')
    long_form['value'] = abs(long_form['value'])

    n_directions = 2
    n_groups = len(group_names)
    fig, ax = plt.subplots()
    fig.set_figwidth(2)
    dots = ax.scatter(x=long_form['variable'], y=long_form['index'],
                      s=long_form['value'], c=long_form['color'],
                      ec='black', lw=0.5)
    ax.set_xticks(np.arange(n_directions + 1) - 0.5, minor=True)
    ax.set_yticks(np.arange(n_groups + 1) - 0.5, minor=True)
    sns.despine()
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=45)

    # Size legend built from the scatter itself.
    legend_spec = dict(prop="sizes", num=3, color='black', alpha=.6)
    handles_and_labels = dots.legend_elements(**legend_spec)
    ax.legend(*handles_and_labels, title="# Proteins", handleheight=2,
              bbox_to_anchor=(1, 1), loc="upper left", markerscale=1,
              edgecolor='white')
    plt.margins()

    if save is not None:
        if vector is True:
            plt.savefig(save + 'diff_reg.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'diff_reg.png', dpi=dpi, bbox_inches='tight')
    plt.show()


def whole_network(self, labels=False, save=None, vector=True, dpi=300):
    """Network of entities differentially regulated for each group analyzed.

    Every group is linked to the entries of its table whose ``self.pvalue``
    column is <= 0.05. Group nodes use the group color; entity nodes are
    colored by ``log2(fc)`` through the 'RdYlBu_r' colormap centered at 0.
    An entity that appears in several groups keeps the attributes of its
    first occurrence.

    Args:
        labels (bool, optional): Show graph labels. Defaults to False.
        save (str, optional): Path prefix used to save the GraphML file and
            the image. Defaults to None.
        vector (bool, optional): If the image should be exported as .svg
            (otherwise .png). Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.

    Returns:
        networkx.Graph: The group-entity graph that was drawn.
    """
    import matplotlib.cm as cm
    import matplotlib.colors as mcolors
    plt.rcParams['figure.dpi'] = dpi
    data = copy(self)
    network_frame = []
    for group, df, color in zip(data.groups, data.original, data.colors):
        # Work on a private copy: copy(self) is shallow, so writing columns
        # into `df` directly would modify the caller's original tables.
        significant = df[df[self.pvalue] <= 0.05].copy()
        significant['Experiment'] = group
        significant['Size'] = len(significant)
        significant['color'] = color
        network_frame.append(significant)
    network_frame = pd.concat(network_frame)

    # One node per group, sized by its number of significant entries.
    source = pd.DataFrame({'ID': network_frame['Experiment'],
                           'Size': network_frame['Size'],
                           'type': 'Experiment',
                           'color': network_frame['color']})
    source = source.drop_duplicates()

    # Map fold changes to hex colors on a diverging scale centered at 0.
    # Series.min()/max() skip NaN, unlike the built-in min()/max().
    fold_change = network_frame['log2(fc)']
    norm = mcolors.TwoSlopeNorm(vmin=fold_change.min(),
                                vmax=fold_change.max(),
                                vcenter=0)
    mapper = cm.ScalarMappable(norm=norm, cmap=cm.RdYlBu_r)
    color_hex = [mcolors.to_hex(mapper.to_rgba(x)) for x in fold_change]

    # Entity nodes all share one size: half of the smallest group size.
    target = pd.DataFrame({'ID': network_frame['gene_name'],
                           'Size': int(network_frame['Size'].min() * 0.5),
                           'type': 'Protein',
                           'color': color_hex})
    target = target.drop_duplicates()

    edgelist = network_frame[['Experiment', 'gene_name']]
    G = nx.from_pandas_edgelist(edgelist,
                                source='Experiment',
                                target='gene_name',
                                create_using=nx.Graph)
    carac = pd.concat([source, target]).reset_index(drop=True)
    carac = carac.drop_duplicates('ID')
    carac = carac.set_index('ID')
    nx.set_node_attributes(G, dict(zip(carac.index, carac.color)), name="Color")
    nx.set_node_attributes(G, dict(zip(carac.index, carac.Size)), name="Size")
    pos = nx.kamada_kawai_layout(G)
    # Align the attribute table with the node order used by nx.draw.
    carac = carac.reindex(G.nodes())
    nx.draw(G,
            pos=pos,
            node_color=carac['color'],
            node_size=carac['Size']/20,
            edgecolors='black',
            linewidths=0.4,
            alpha=0.9,
            width=0.2,
            edge_color='gray')
    if labels is True:
        nx.draw_networkx_labels(G, pos, font_size=6)
    if save is not None:
        nx.write_graphml(G, save + 'PPNetwork.graphml', named_key_ids=True)
        if vector is True:
            plt.savefig(save + 'PPNetwork.svg', bbox_inches='tight')
        else:
            # '.png', not '.dpi': matplotlib derives the output format from
            # the file extension and rejects unknown ones.
            plt.savefig(save + 'PPNetwork.png', dpi=dpi, bbox_inches='tight')
    plt.show()
    return G


def dotplot_enrichment(self, *Terms, top=5,  fig_height=None, palette='PuBu', save=None, vector=True,
                       dpi=300):
    """Dotplot Enrichment

    Dotplot to visualize together the enrichment data for each group. One
    figure is produced per distinct ``Gene_set`` value found in the
    enrichment tables; groups whose enrichment is None are ignored. Dot
    size and color both encode -log10 of the adjusted p-value.

    Args:
        *Terms (str): Terms to plot. When given, they replace the automatic
            selection of the first `top` terms of each group.
        top (int, optional): Top N pathway to considered in each group.
            Defaults to 5.
        fig_height (int, optional): User optionally can define figure
            height. Defaults to None.
        palette (str, optional): color palette. Defaults to 'PuBu'.
        save (str, optional): Path prefix used to save the images.
            Defaults to None.
        vector (bool, optional): If image should be export as .svg
            (otherwise .png). Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    plt.rcParams['figure.dpi'] = dpi
    # Set the style before any figure exists so the first figure is styled
    # like the following ones.
    sns.set_style('white')

    groups = []
    enrichments = []
    for group, enrichment in zip(self.groups, self.enrichment):
        if enrichment is not None:
            groups.append(group)
            enrichments.append(enrichment)

    # Distinct gene-set names, in order of first appearance.
    genesets = list(dict.fromkeys(
        name for table in enrichments for name in table['Gene_set']))

    for geneset in genesets:
        subsets = [table[table['Gene_set'] == geneset] for table in enrichments]
        if Terms:
            selected = Terms
        else:
            selected = list(dict.fromkeys(
                term for subset in subsets for term in subset['Term'].iloc[:top]))
        frames = [subset[subset['Term'].isin(selected)].assign(Group=group)
                  for subset, group in zip(subsets, groups)]
        # .copy() so the column assignments below act on an independent frame.
        data = pd.concat(frames)[['Term', 'Overlap', 'Adjusted P-value', 'Group']].copy()
        # 'Overlap' is a "hits/size" string; keep the number of hits.
        data['Overlap'] = data.Overlap.str.split('/', regex=False).str[0]
        data['Overlap'] = data.Overlap.astype(int)
        data['-log10(p)'] = -np.log10(data['Adjusted P-value'])

        fig, ax = plt.subplots()
        if fig_height is not None:
            fig.set_figheight(fig_height)
        sns.scatterplot(data=data, x='Group', y='Term', size='-log10(p)',
                        hue='-log10(p)', palette=palette, sizes=(40, 280),
                        linewidth=0.5, edgecolor='black'
                        )
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
        sns.despine()
        plt.margins()
        plt.ylabel('')
        if save is not None:
            if vector is True:
                plt.savefig(save + '_' + geneset + '_' + 'dotplot_enrichment.svg',
                            bbox_inches='tight')
            else:
                # Same base name as the .svg branch, with a '.png' extension
                # that matplotlib can map to an output format.
                plt.savefig(save + '_' + geneset + '_' + 'dotplot_enrichment.png',
                            dpi=dpi, bbox_inches='tight')
        plt.show()


def protein_overlap(self, min_subset=10, face_color='darkcyan', shad_color="#f0f0f0",
                    edge_color='black', linewidth=1, save=None, vector=True, dpi=300):
    """Upset plot
    Upset plot to evaluate protein overlap among groups, based on the unique
    ``gene_name`` values of each group's differentially regulated table.

    Args:
        min_subset (int, optional): Minimum overlap size to consider for upset plot.
         Defaults to 10.
        face_color (str, optional): Bar and dot colors. Defaults to 'darkcyan'.
        shad_color (str, optional): Shad color in the dot part of the graph.
         Defaults to "#f0f0f0".
        edge_color (str, optional): edge colors. Defaults to 'black'.
        linewidth (int, optional): line widths. Defaults to 1.
        save (str, optional): Path to save image. Defaults to None.
        vector (bool, optional): If image should be export as .svg. Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    from upsetplot import UpSet
    from upsetplot import from_contents
    plt.rcParams['figure.dpi'] = dpi
    # Patch/grid settings are only wanted for this figure; scoping them with
    # rc_context keeps them from leaking into every later plot.
    scoped_rc = {'grid.alpha': 0,
                 'patch.linewidth': linewidth,
                 'patch.edgecolor': 'black',
                 'patch.force_edgecolor': True}

    gene_sets = {group: table['gene_name'].drop_duplicates()
                 for group, table in zip(self.groups, self.group_data)}
    membership = from_contents(gene_sets)

    with plt.rc_context(scoped_rc):
        figure = UpSet(membership, facecolor=face_color, shading_color=shad_color,
                       min_subset_size=min_subset, show_counts=True,
                       with_lines=True)
        for group in self.groups:
            figure.style_subsets(present=group, edgecolor=edge_color,
                                 linewidth=linewidth)
        figure.plot()
        if save is not None:
            if vector is True:
                plt.savefig(save + 'upset_proteins.svg', bbox_inches='tight')
            else:
                plt.savefig(save + 'upset_proteins.png', dpi=dpi, bbox_inches='tight')
        plt.show()


def enrichment_overlap(self,  min_subset=1, face_color='darkcyan', shad_color="#f0f0f0",
                       edge_color='black', linewidth=1, save=None, vector=True, dpi=300):
    """Upset plot
    Upset plot to evaluate enrichment terms overlap among groups. Groups
    whose enrichment is None are left out of the plot.

    Args:
        min_subset (int, optional): minimum number of overlap size to consider
         for upset plot. Defaults to 1.
        face_color (str, optional): Bar and dot colors. Defaults to 'darkcyan'.
        shad_color (str, optional): Shad color in the dot part of the graph.
         Defaults to "#f0f0f0".
        edge_color (str, optional): edge colors. Defaults to 'black'.
        linewidth (int, optional): line widths. Defaults to 1.
        save (str, optional): Path to save image. Defaults to None.
        vector (bool, optional): If image should be export as .svg. Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    Raises:
        IndexError: If there is no Enrichment data on .omics file.
    """
    from upsetplot import UpSet
    from upsetplot import from_contents
    plt.rcParams['figure.dpi'] = dpi
    # Patch/grid settings are only wanted for this figure; scoping them with
    # rc_context keeps them from leaking into every later plot.
    scoped_rc = {'grid.alpha': 0,
                 'patch.linewidth': linewidth,
                 'patch.edgecolor': 'black',
                 'patch.force_edgecolor': True}

    if all(enr is None for enr in self.enrichment):
        raise IndexError('There is not Enrichment result in data!')

    groups = []
    term_sets = []
    for enrichment, group in zip(self.enrichment, self.groups):
        if enrichment is None:
            continue
        try:
            terms = enrichment['Term'].drop_duplicates()
        except KeyError:
            # Enrichment table without a 'Term' column: contributes no terms.
            terms = pd.Series([], dtype=object, name='Term')
        groups.append(group)
        term_sets.append(terms)
    membership = from_contents(dict(zip(groups, term_sets)))

    with plt.rc_context(scoped_rc):
        figure = UpSet(membership, facecolor=face_color, shading_color=shad_color,
                       min_subset_size=min_subset, show_counts=True,
                       with_lines=True)
        for group in groups:
            figure.style_subsets(present=group, edgecolor=edge_color,
                                 linewidth=linewidth)
        figure.plot()
        if save is not None:
            if vector is True:
                plt.savefig(save + 'upset_pathways.svg', bbox_inches='tight')
            else:
                plt.savefig(save + 'upset_pathways.png', dpi=dpi, bbox_inches='tight')
        plt.show()


def _pairwise_similarity(self, pvalue, comparison_param, metric):
    """Return the group-by-group similarity matrix (1 - distance).

    Each group's table is averaged per ``gene_name``, filtered on
    ``self.pvalue <= pvalue`` and reduced to `comparison_param`. With
    ``metric='jaccard'`` the similarity is the shared fraction of gene
    names; otherwise it is ``1 - pairwise_distances(..., metric=metric)``
    computed on the genes quantified in both groups. The diagonal and any
    pair that cannot be compared are left as NaN.
    """
    from sklearn.metrics import pairwise_distances

    per_group = []
    for table in self.original:
        # numeric_only=True: text columns cannot be averaged and make
        # recent pandas versions raise instead of silently dropping them.
        averaged = table.groupby('gene_name').mean(numeric_only=True)
        averaged = averaged[averaged[self.pvalue] <= pvalue]
        per_group.append(averaged[[comparison_param]])
    values = pd.concat(per_group, axis=1, join='outer')
    values.columns = self.groups

    # Replace -inf with the lowest finite value found anywhere in the data.
    # The minimum is taken without dropping incomplete rows, otherwise it is
    # NaN whenever no gene is quantified in every group.
    lowest_finite = values.replace(-np.inf, np.nan).min().min()
    values = values.replace(-np.inf, lowest_finite)

    groups = values.columns
    distances = pd.DataFrame(np.nan, index=groups, columns=groups, dtype=float)
    for first in groups:
        for second in groups:
            if first == second:
                continue
            if metric == 'jaccard':
                # Jaccard compares which genes are present, not their values.
                members_a = set(values[first].dropna().index)
                members_b = set(values[second].dropna().index)
                pooled = members_a | members_b
                if pooled:
                    shared_fraction = len(members_a & members_b) / len(pooled)
                    distances.at[first, second] = 1 - shared_fraction
            else:
                # Other metrics use the values of genes present in both groups.
                # No NaN is left after dropna, so the default finiteness
                # check of pairwise_distances is sufficient.
                shared = values[[first, second]].dropna(axis=0)
                if not shared.empty:
                    pair = pairwise_distances(shared.T.to_numpy(), metric=metric)
                    distances.at[first, second] = pair[0][1]
    return 1 - distances


def similarity_network(self, pvalue=1, comparison_param='log2(fc)',
                       metric='jaccard',
                       absolute_similarity_cutoff=0.2,
                       save=None, vector=True,
                       dpi=300):
    """Similarity Network plot

        Perform a pairwise similarity analysis and create a graph where groups
        are depicted as nodes, and pairwise similarity indices serve as edges.
        In order to establish a connection between two groups, the function filters
        edges based on an absolute similarity cutoff, excluding edges that fall within
        a specified interval range, for instance, -0.2 to 0.2, when the
        absolute_similarity_cutoff is set to 0.2.

        Furthermore, when utilizing the Jaccard similarity index,
        this function takes into account the shared 'gene_name' between groups.
        In contrast, for the other available options, the function considers either
        'TotalMean' or 'log2(fc)' columns

    Args:
        pvalue (float, optional): P-value threshold to proteins that
         OmicScope must consider for analysis. Defaults to 1.
        comparison_param (str, optional): Parameter/column to take into account in pairwise comparison.
         Defaults to 'log2(fc)'. Optionally 'TotalMean'.
        metric (str, optional): algorithm to perform pairwise comparison. Defaults to 'jaccard'.
         Optionally, user can test other algorithm described in scipy.spatial.distance.
        absolute_similarity_cutoff (float, optional): Cutoff to consider the links between groups. Since major
         similarity indexes have positive and negative values, the function expects an absolute value to
         perform the cutoff. Defaults to 0.2 (similarities between -0.2 and 0.2 are removed).
        save (str, optional): Path prefix used to save the GraphML file and the image. Defaults to None.
        vector (bool, optional): If image should be export as .svg (otherwise .png). Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    plt.rcParams['figure.dpi'] = dpi
    palette = self.colors
    conditions = self.groups
    pval = self.pvalue

    similarity = _pairwise_similarity(self, pvalue, comparison_param, metric)
    # Keep only similarities outside [-cutoff, cutoff]; weak links, the
    # diagonal and non-comparable pairs (NaN) all become 0, i.e. no edge.
    strong = similarity.abs() > absolute_similarity_cutoff
    similarity = similarity.where(strong, 0.0).fillna(0)

    # Plotting graph
    G = nx.from_pandas_adjacency(similarity)
    G.remove_edges_from(nx.selfloop_edges(G))

    # Node size: number of unique genes passing the p-value threshold.
    size = [len(set(x[x[pval] <= pvalue].gene_name)) for x in self.original]
    carac = pd.DataFrame(list(zip(conditions, palette, size)),
                         columns=['ID', 'color', 'Size'])
    carac = carac.set_index('ID')
    nx.set_node_attributes(G, dict(zip(carac.index, carac.color)), name="Color")
    nx.set_node_attributes(G, dict(zip(carac.index, carac.Size)), name="Size")
    carac = carac.reindex(G.nodes())
    pos = nx.kamada_kawai_layout(G, weight=None)

    # Edge widths are the rounded weights rescaled so the mean width is 5.
    weights = [round(G[u][v]['weight'], 2) for u, v in G.edges]
    mean_weight = np.mean(weights)
    widths = [float(w) * 5 / mean_weight for w in weights]
    nx.draw(G,
            pos=pos,
            node_color=carac['color'],
            node_size=carac['Size'],
            edgecolors='black',
            linewidths=0.6,
            alpha=0.9,
            width=widths,
            edge_color='gray')
    nx.draw_networkx_labels(G, pos, font_size=6)
    labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=dict(zip(list(labels.keys()), weights)))
    if save is not None:
        nx.write_graphml(G, save + 'Similarity_network.graphml', named_key_ids=True)
        if vector is True:
            plt.savefig(save + 'Similarity_network.svg', bbox_inches='tight')
        else:
            # '.png', not '.dpi': matplotlib derives the output format from
            # the file extension and rejects unknown ones.
            plt.savefig(save + 'Similarity_network.png', dpi=dpi, bbox_inches='tight')
    plt.show()


def similarity_heatmap(self, pvalue=1, comparison_param='log2(fc)',
                       metric='correlation',
                       center=0, palette='RdYlBu_r',
                       annotation=True, save=None, vector=True,
                       dpi=300):
    """Similarity heatmap plot
        Perform a pair-wise similarity analysis and plot a clustered heatmap.
        The diagonal and pairs that cannot be compared are shown as 0; cells
        whose similarity is exactly 1 are masked.

        When utilizing the Jaccard similarity index,
        this function takes into account the shared 'gene_name' between groups.
        In contrast, for the other available options, the function considers either
        'TotalMean' or 'log2(fc)' columns

    Args:
        pvalue (float, optional): P-value threshold to proteins that
         OmicScope must consider for analysis. Defaults to 1.
        comparison_param (str, optional): Parameter to take into account in pairwise comparison.
         Defaults to 'log2(fc)'. Optionally 'TotalMean'.
        metric (str, optional): algorithm to perform pairwise comparison. Defaults to 'correlation'.
         Optionally, user can test other algorithm described in scipy.spatial.distance.
        center (float, optional): number to center the heatmap color gradient. Defaults to 0.
        palette (str, optional): color palette to plot heatmap.
         Defaults to 'RdYlBu_r'.
        annotation (bool, optional): Write the similarity value inside each cell. Defaults to True.
        save (str, optional): Path to save image. Defaults to None.
        vector (bool, optional): If image should be export as .svg. Defaults to True.
        dpi (int, optional): Image resolution. Defaults to 300.
    """
    plt.rcParams['figure.dpi'] = dpi
    colors = self.colors

    similarity = _pairwise_similarity(self, pvalue, comparison_param, metric)
    similarity = similarity.fillna(0)

    # The mask hides cells from the heatmap itself, so it must not depend on
    # `annotation`: tying it to the annotation table blanked the whole plot
    # when annotation was disabled.
    hidden = similarity == 1
    sns.clustermap(similarity, cmap=palette, center=center,
                   annot=bool(annotation),
                   mask=hidden,
                   col_colors=colors, row_colors=colors)
    if save is not None:
        if vector is True:
            plt.savefig(save + 'similarity_heatmap.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'similarity_heatmap.png', dpi=dpi, bbox_inches='tight')
    plt.show()


def overlap_fisher(group1, group2, union):
    """Perform a pair-wise comparison based on hypergeometric
    distribution.

    Computes the probability of observing an overlap at least as large as
    the one found between the two groups, when `len(set(group2))` entities
    are drawn from a population of `union` entities of which
    `len(set(group1))` belong to group 1. Missing entries (None or NaN) are
    ignored, so padded rows can be passed directly.

    Args:
        group1 (Series, pandas): Column condition 1
        group2 (Series, pandas): Column condition 2
        union (int): number of whole entities evaluated in the study
        among all conditions

    Returns:
        Pvalue (float): P-value
    """
    from scipy.stats import hypergeom

    def _members(values):
        # Drop None/NaN placeholders so padding is never counted as overlap.
        return {v for v in values
                if not (v is None or (isinstance(v, float) and v != v))}

    deps1 = _members(group1)
    deps2 = _members(group2)
    intersection = len(deps1 & deps2)
    rv = hypergeom(union, len(deps1), len(deps2))
    # sf(k - 1) = P(X >= k). It includes the largest possible overlap, which
    # a sum over arange(intersection, min(n, N)) leaves out.
    return float(rv.sf(intersection - 1))

def distribution_test(self, protein_pvalue,
                   method):
    """Pair-wise comparison of the log2(fc) distributions of the groups.

    For every pair of groups, the entries passing the p-value cut-off are
    restricted to the accessions present in both groups and compared with:
        1. t-test (ttest): compares the means of two groups assuming
           normally distributed data.
        2. Wilcoxon signed-rank test (wilcoxon): non-parametric test for
           paired samples; values are paired by accession.
        3. Kolmogorov-Smirnov test (ks): compares the distributions of two
           samples. A pair for which the test fails gets a p-value of 0.

    Before testing, -inf and missing fold changes are replaced by the
    group's lowest finite fold change minus 1.

    Args:
        protein_pvalue (float): The cut-off value for protein p-values.
        method (str): The statistical method to be used for comparison.
            Valid options are "ttest", "wilcoxon" and "ks".

    Returns:
        matrix (DataFrame, pandas): Symmetric groups-by-groups matrix of
        p-values, with zeros on the diagonal.

    Raises:
        ValueError: If `method` is not one of the valid options.
    """
    from scipy.stats import ttest_ind, wilcoxon, kstest
    from scipy.spatial.distance import squareform
    import itertools

    if method not in ('ttest', 'wilcoxon', 'ks'):
        raise ValueError('Please, verify if it was selected a valid method '
                         '("ttest", "wilcoxon", or "ks").')

    conditions = self.groups

    fold_changes = []
    for table in self.original:
        kept = table[table[self.pvalue] <= protein_pvalue]
        series = kept.set_index('Accession')['log2(fc)']
        series = series.replace(-np.inf, np.nan)
        # Series.min() skips NaN; the built-in min() can return NaN depending
        # on element order, which would leave the gaps unfilled.
        fold_changes.append(series.fillna(series.min() - 1))

    pvalues = []
    for first, second in itertools.combinations(fold_changes, 2):
        shared = set(first.index) & set(second.index)
        first = first[first.index.isin(shared)]
        second = second[second.index.isin(shared)]
        if method == 'ttest':
            pvalues.append(ttest_ind(first, second)[1])
        elif method == 'wilcoxon':
            # The signed-rank test pairs observations by position, so both
            # samples are put in the same accession order first.
            pvalues.append(wilcoxon(first.sort_index(), second.sort_index())[1])
        else:
            try:
                pvalues.append(kstest(first, second)[1])
            except Exception:
                # Best effort: a pair that cannot be tested is reported as 0.
                pvalues.append(0)

    matrix = squareform(pvalues)
    matrix = pd.DataFrame(matrix, columns=conditions, index=conditions)
    return matrix

def fisher_test(self, protein_pvalue,
                   background_lenght=None):
    """This function performs a pair-wise statistical analysis of the overlap
    between groups, delegating each pair to `overlap_fisher`.

    For every group, the set of ``gene_name`` values whose p-value is at or
    below `protein_pvalue` is built; each pair of sets is then compared
    against the background size.

    Args:
        protein_pvalue (float): The cut-off value for protein p-values.
        background_lenght (int, optional): The total number of entities in the
        background set. If not provided, the number of unique genes in the
        original data of all groups is used (Recommended). Defaults to None.
    Returns:
        matrix (DataFrame, pandas): Symmetric groups-by-groups matrix of
        p-values, with zeros on the diagonal.
    """
    from itertools import combinations
    from scipy.spatial.distance import squareform
    conditions = self.groups
    pval = self.pvalue
    if background_lenght is None:
        union = len(set(pd.concat(self.original).gene_name))
    else:
        union = background_lenght
    sets = [set(x[x[pval] <= protein_pvalue].gene_name) for x in self.original]
    # Compare the sets directly. combinations() yields pairs in the condensed
    # order expected by squareform, and no padding is introduced as happens
    # when sets of different sizes are stacked into a DataFrame.
    pvalues = [overlap_fisher(first, second, union=union)
               for first, second in combinations(sets, 2)]
    matrix = squareform(pvalues)
    matrix = pd.DataFrame(matrix, columns=conditions, index=conditions)
    return matrix

def stat_matrix(self, method, protein_pvalue, background_lenght=None):
    """
    Performs a pair-wise statistical comparison between groups based on the
    chosen method and a protein p-value cut-off.

    Args:
        self: Reference to the class instance where this function is called.
        method (str): The statistical method to be used for the comparison.
            Valid options include:
                * "fisher": dispatched to `fisher_test`, which compares the
                    overlap of significant proteins between groups.
                * "ttest": Performs a t-test, assuming normally distributed data.
                * "wilcoxon": Performs a Wilcoxon signed-rank test, a non-parametric
                    alternative for comparing related groups when normality cannot
                    be assumed.
                * "ks": Performs a Kolmogorov-Smirnov test, used to compare the
                    probability distributions of two samples.
            "ttest", "wilcoxon" and "ks" are dispatched to `distribution_test`.
        protein_pvalue (float): The cut-off value for protein p-values. This value
            determines which proteins are considered significant based on a
            previous analysis.
        background_lenght (int, optional): The total number of entities in the
            background set. This argument is only used for the "fisher" method
            to define the population size. If not provided, all genes from the
            original data are used as the background. Defaults to None.

    Returns:
        pandas.DataFrame: A DataFrame containing the p-values for each pair-wise
            comparison between groups.

    Raises:
        ValueError: If `method` is not one of the valid options.
    """
    if method == 'fisher':
        return fisher_test(self, protein_pvalue=protein_pvalue,
                           background_lenght=background_lenght)
    if method in ('ttest', 'wilcoxon', 'ks'):
        return distribution_test(self, protein_pvalue=protein_pvalue,
                                 method=method)
    raise ValueError('''Please, verify if it was selected a valid method ("fisher", "ttest", "wilcoxon", or "ks").''')

def stat_network(self, method='fisher', protein_pvalue=0.05, background_lenght=None, dpi=300,
                 graph_pvalue=0.1,
                 save=None, vector=True):
    """Draw a network of groups linked by pair-wise statistical comparisons.

    The pair-wise matrix returned by ``stat_matrix`` is used as a weighted
    adjacency matrix: each group is a node and each retained comparison is an
    edge labelled with its (rounded) weight.

    Args:
        method (str, optional): The statistical method used for comparison
            between groups. Valid options include:
                * "fisher": Performs Fisher's exact test, suitable for comparing
                    proportions of significant proteins.
                * "ttest": Performs a t-test, assuming normally distributed data.
                * "wilcoxon": Performs a Wilcoxon signed-rank test, a non-parametric
                    alternative for related groups when normality cannot be assumed.
                * "ks": Performs a Kolmogorov-Smirnov test, used to compare the
                    probability distributions of two samples.
            Defaults to "fisher".
        protein_pvalue (float, optional): The cut-off value for protein p-values.
            This value determines which proteins are considered significant
            based on a previous analysis. It is also used to compute node
            sizes. Defaults to 0.05.
        background_lenght (int, optional): The total number of entities in the
            background set, forwarded to ``stat_matrix``. Only used for the
            "fisher" method. Defaults to None (Recommended).
        dpi (int, optional): The resolution (dots per inch) for the generated
            plot. Defaults to 300.
        graph_pvalue (float, optional): The threshold for p-values to consider
            the links between network nodes. For "fisher", comparisons with a
            p-value greater than or equal to this value are excluded; for the
            other methods, comparisons with a p-value less than or equal to
            this value are excluded. Defaults to 0.1.
        save (str, optional): Path prefix used for the output files. If
            provided, ``save + 'PPNetwork.graphml'`` and
            ``save + 'groupNetwork.svg'`` (or ``.png``) are written.
        vector (bool, optional): If True (default), saves the image as an SVG
            file. If False, saves the image as a PNG file.

    Returns:
        None. This function generates a network visualization and potentially
            saves files, but it doesn't return any data.
    """
    plt.rcParams['figure.dpi'] = dpi
    palette = self.colors
    conditions = self.groups
    pval = self.pvalue
    matrix = stat_matrix(self, method=method,
                         protein_pvalue=protein_pvalue,
                         background_lenght=background_lenght)
    # A zero in the adjacency matrix means "no edge" once converted to a graph.
    if method == 'fisher':
        matrix[matrix >= graph_pvalue] = 0
    elif method in ['ttest', 'wilcoxon', 'ks']:
        matrix[matrix <= graph_pvalue] = 0
    G = nx.from_pandas_adjacency(matrix)
    # Self-loops must go BEFORE weights are collected; otherwise the width and
    # label lists would be longer than (and misaligned with) the drawn edges.
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    # Node size: number of distinct genes passing the protein p-value cut-off.
    size = [len(set(x[x[pval] <= protein_pvalue].gene_name)) for x in self.original]
    carac = pd.DataFrame(zip(conditions, palette, size),
                         columns=['ID', 'color', 'Size'])
    carac = carac.set_index('ID')
    nx.set_node_attributes(G, dict(zip(carac.index, carac.color)), name="Color")
    nx.set_node_attributes(G, dict(zip(carac.index, carac.Size)), name="Size")
    # Align the attribute table with the node order used by the drawing calls.
    carac = carac.reindex(list(G.nodes()))
    pos = nx.kamada_kawai_layout(G, weight=None)

    edges = list(G.edges)
    weights = [G[u][v]['weight'] for u, v in edges]
    if method == 'fisher':
        # Show Fisher p-values on a -log10 scale so stronger overlaps are larger.
        weights = -np.log10(weights)
    weights = [round(x, 2) for x in weights]
    # Edge widths are scaled relative to the mean weight; with no edges there
    # is nothing to scale, so avoid taking the mean of an empty list.
    mean_weight = np.mean(weights) if weights else 1.0
    norm = [float(w) * 5 / mean_weight for w in weights]
    nx.draw(G,
            pos=pos,
            node_color=carac['color'],
            node_size=carac['Size'],
            edgecolors='black',
            linewidths=0.6,
            alpha=0.9,
            width=norm,
            edge_color='gray')
    nx.draw_networkx_labels(G, pos, font_size=6)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=dict(zip(edges, weights)))
    if save is not None:
        nx.write_graphml(G, save + 'PPNetwork.graphml', named_key_ids=True)
        if vector is True:
            plt.savefig(save + 'groupNetwork.svg', bbox_inches='tight')
        else:
            # '.png' (not '.dpi'): matplotlib infers the format from the extension.
            plt.savefig(save + 'groupNetwork.png', dpi=dpi, bbox_inches='tight')
    plt.show()

def stat_heatmap(self, palette='Spectral', method='fisher', pvalue=0.05,
                 background_lenght=None, annotation=True,
                 save=None, vector=True, dpi=300):
    """
    Generates a clustered heatmap of the p-values from pair-wise statistical
    comparisons between groups.

    Cells whose value is 0 or missing are masked. The mask is independent of
    the ``annotation`` flag, so disabling annotations only removes the numbers
    and never hides the heatmap itself.

    Args:
        palette (str, optional): The color palette to use for the heatmap.
            Defaults to "Spectral".
        method (str, optional): The statistical method used for comparison
            between groups. Valid options include:
                * "fisher": Performs Fisher's exact test, suitable for comparing
                    proportions of significant proteins.
                * "ttest": Performs a t-test, assuming normally distributed data.
                * "wilcoxon": Performs a Wilcoxon signed-rank test, a non-parametric
                    alternative for related groups when normality cannot be assumed.
                * "ks": Performs a Kolmogorov-Smirnov test, used to compare the
                    probability distributions of two samples.
            Defaults to "fisher".
        pvalue (float, optional): The cut-off value for protein p-values,
            forwarded to ``stat_matrix`` as ``protein_pvalue``. Defaults to 0.05.
        background_lenght (int, optional): The total number of entities in the
            background set, forwarded to ``stat_matrix``. Only used for the
            "fisher" method. Defaults to None.
        annotation (bool, optional): If True (default), displays the p-values
            within each heatmap cell. If False, hides the p-value annotations.
        save (str, optional): Path prefix for the image; the plot is written to
            ``save + 'overlap_stat.svg'`` (or ``.png``).
        vector (bool, optional): If True (default), saves the image as an SVG
            file. If False, saves the image as a PNG file.
        dpi (int, optional): The resolution (dots per inch) for the generated
            plot. Defaults to 300.

    Returns:
        None. This function generates a heatmap visualization and potentially
            saves an image file, but it doesn't return any data.
    """
    plt.rcParams['figure.dpi'] = dpi
    colors = self.colors
    matrix = stat_matrix(self, method=method,
                         protein_pvalue=pvalue,
                         background_lenght=background_lenght)
    # Cells to hide: zero or missing comparisons. Computed from the data, not
    # from the annotation table, so annotation=False cannot blank the plot.
    hidden = matrix.isnull() | (matrix == 0)
    cluster_kws = dict(cmap=palette, mask=hidden,
                       col_colors=colors, row_colors=colors)
    if annotation:
        # Same values as the heatmap, with hidden cells left un-annotated.
        cluster_kws['annot'] = matrix.mask(hidden)
    sns.clustermap(matrix, **cluster_kws)
    plt.title('Pvalue')
    if save is not None:
        if vector is True:
            plt.savefig(save + 'overlap_stat.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'overlap_stat.png', dpi=dpi, bbox_inches='tight')
    plt.show()



def _first_gene_rows(frame):
    """Map each non-null ``gene_name`` to the index label of its first row."""
    first = {}
    for label, name in zip(frame.index, frame['gene_name']):
        if pd.notna(name):
            first.setdefault(name, label)
    return first


def linkproteins(deps, groups):
    """Build pair-wise links between groups for the proteins they share.

    Args:
        deps (list[pandas.DataFrame]): One table per group; each must contain
            the columns ``gene_name`` and ``log2(fc)``.
        groups (list): Group names, positionally matching ``deps``.

    Returns:
        tuple: ``(result, matrixes)`` where

            * ``result`` is a DataFrame with one row per gene shared by a pair
              of groups and the columns ``gene_name``, ``query_chr``,
              ``query_start``, ``ref_chr``, ``ref_start``, ``query_end`` and
              ``ref_end``. ``*_chr`` hold group names, ``*_start`` the row of
              the gene's first occurrence in the ordered table of that group,
              and ``*_end`` is ``*_start + 1``.
            * ``matrixes`` is the list of input tables with an extra boolean
              ``duplicated`` column, sorted by ``duplicated`` and ``log2(fc)``
              (both descending) and re-indexed from 0.
    """
    # Gene names that occur more than once in the pooled tables.
    pooled = pd.concat(deps)
    overlapped_proteins = list(pooled[pooled.duplicated(
        subset=['gene_name'], keep=False)]['gene_name'].dropna())
    # Shared genes first, then by fold change, so linked rows sit together.
    matrixes = [
        x.assign(duplicated=x['gene_name'].isin(overlapped_proteins))
        .sort_values(['duplicated', 'log2(fc)'], ascending=False)
        .reset_index(drop=True)
        for x in deps
    ]

    # One lookup per group replaces a boolean scan per gene per pair.
    first_rows = [_first_gene_rows(m) for m in matrixes]
    group_dict = dict(enumerate(groups))
    records = []
    for i, query in enumerate(first_rows):
        for j in range(i + 1, len(first_rows)):
            ref = first_rows[j]
            for name, index1 in query.items():
                if name not in ref:
                    continue
                records.append({'gene_name': name,
                                # Fall back to the position if no name exists.
                                'query_chr': group_dict.get(i, i),
                                'query_start': index1,
                                'ref_chr': group_dict.get(j, j),
                                'ref_start': ref[name]})
    # Rows are collected and the frame is built once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    result = pd.DataFrame(
        records,
        columns=['gene_name', 'query_chr', 'query_start', 'ref_chr', 'ref_start'])
    result['query_end'] = result['query_start'] + 1
    result['ref_end'] = result['ref_start'] + 1
    return result, matrixes


def _first_term_positions(terms, positions):
    """Map each non-null term to the position paired with its first occurrence."""
    first = {}
    for term, position in zip(terms, positions):
        if pd.notna(term):
            first.setdefault(term, int(position))
    return first


def linkenrichment(enrichment, groups, number_deps):
    """Build pair-wise links between groups for the enriched terms they share.

    Groups whose enrichment entry is ``None`` are ignored. A term is
    identified by ``Gene_set + '-' + Term``. Because a term has no natural
    coordinate inside a group, every term is placed at a random integer
    position in ``[0, number_deps)`` of its group (drawn with
    ``numpy.random.randint``), so positions differ between calls unless the
    NumPy global seed is fixed.

    Args:
        enrichment (list): One enrichment table (with ``Gene_set`` and ``Term``
            columns) or ``None`` per group.
        groups (list): Group names, positionally matching ``enrichment``.
        number_deps (list[int]): Upper bound of the coordinate range of each
            group, positionally matching ``enrichment``.

    Returns:
        pandas.DataFrame: One row per term shared by a pair of groups, with the
            columns ``db_term``, ``query_chr``, ``query_start``, ``ref_chr``,
            ``ref_start``, ``query_end`` and ``ref_end`` (``*_end`` is
            ``*_start + 1``). Empty if fewer than two groups share a term.
    """
    columns = ['db_term', 'query_chr', 'query_start', 'ref_chr', 'ref_start']
    available = [(e, g, n) for e, g, n in zip(enrichment, groups, number_deps)
                 if e is not None]

    group_names = []
    term_positions = []
    for table, name, ndeps in available:
        group_names.append(name)
        db_term = table['Gene_set'] + '-' + table['Term']
        if ndeps > 0:
            positions = np.random.randint(0, ndeps, size=len(table))
            term_positions.append(_first_term_positions(db_term, positions))
        else:
            # randint needs high > low; a group without room hosts no links.
            term_positions.append({})

    records = []
    for i, query in enumerate(term_positions):
        for j in range(i + 1, len(term_positions)):
            ref = term_positions[j]
            for term, index1 in query.items():
                if term not in ref:
                    continue
                records.append({'db_term': term,
                                'query_chr': group_names[i],
                                'query_start': index1,
                                'ref_chr': group_names[j],
                                'ref_start': ref[term]})
    # Rows are collected and the frame is built once: DataFrame.append was
    # removed in pandas 2.0.
    result = pd.DataFrame(records, columns=columns)
    result['query_end'] = result['query_start'] + 1
    result['ref_end'] = result['ref_start'] + 1
    return result


def circos_plot(self, vmax=1, vmin=-1, colormap='RdYlBu_r', colorproteins='darkcyan',
                colorenrichment='black',
                linewidth_heatmap=0.1, save=None, vector=True, dpi=300):
    """Circos plot
     This plot offers an overview of proteins differentially regulated
     between groups using circular plots. Each group is a sector carrying a
     fold-change heatmap; proteins (and, when available, enriched terms)
     shared by two groups are drawn as links between their sectors.

    Args:
        vmax (int, optional): maximum value for foldchange; larger values are
         clipped to it. Defaults to 1.
        vmin (int, optional): minimum value for foldchange; smaller values are
         clipped to it. Defaults to -1.
        colormap (str, optional): Colormap for heatmap. Defaults to 'RdYlBu_r'.
        colorproteins (str, optional): Color for protein links. Defaults to 'darkcyan'.
        colorenrichment (str, optional): Color for enrichment links. Defaults to 'black'.
        linewidth_heatmap (float, optional): Line width of the heatmap cell
         borders. Defaults to 0.1.
        save (str, optional): Path to save file. Defaults to None.
        vector (bool, optional): Save as svg extension, if False, save as png. Defaults to True.
        dpi (int, optional): Figure resolution. Defaults to 300.
    """
    # Data
    enrichment = copy(self.enrichment)
    groups = copy(self.groups)
    deps = [x.dropna().reset_index(drop=True) for x in self.group_data]
    colors = self.colors
    grouplen = [len(x) for x in deps]
    higher_group = max(grouplen)

    # Retrieving links and the ordered per-group tables
    links, matrixes = linkproteins(deps, groups)

    # Heatmap data: one row of fold changes per group, clipped to [vmin, vmax]
    matrixes = [y[['log2(fc)']].clip(lower=vmin, upper=vmax).T.to_numpy()
                for y in matrixes]
    # Config circos
    sectors = dict(zip(groups, grouplen))
    sector_colors = dict(zip(groups, colors))
    circos = Circos(sectors, space=5)
    # Roughly 20 ticks on the largest sector; never 0, which is not a valid
    # interval when the largest group has fewer than 20 proteins.
    tick_interval = max(1, int(higher_group / 20))

    for sector, matrix in zip(circos.sectors, matrixes):
        # Outermost track: only carries the group name
        outer_track = sector.add_track((95, 110))
        outer_track.text(sector.name, color="Black")
        # Thin track filled with the group color, with position ticks
        outer_track = sector.add_track((88, 90))
        outer_track.axis(fc=sector_colors[sector.name])
        outer_track.xticks_by_interval(interval=tick_interval,
                                       label_orientation="vertical")
        # Fold-change track; vmin/vmax keep the colors aligned with the colorbar
        rect_track = sector.add_track((80, 85))
        rect_track.heatmap(matrix, vmin=vmin, vmax=vmax, cmap=colormap,
                           rect_kws=dict(ec="black", lw=linewidth_heatmap))

    # Drawing enrichment links; needs at least two groups with enrichment data
    if sum(e is not None for e in enrichment) > 1:
        linkenr = linkenrichment(enrichment, groups, grouplen)
        for i in linkenr.to_dict('records'):
            region1 = (i['query_chr'], i['query_start'], i['query_end'])
            region2 = (i['ref_chr'], i['ref_start'], i['ref_end'])
            circos.link(region1, region2, color=colorenrichment)
    # Drawing protein links
    for i in links.to_dict('records'):
        region1 = (i['query_chr'], i['query_start'], i['query_end'])
        region2 = (i['ref_chr'], i['ref_start'], i['ref_end'])
        circos.link(region1, region2, color=colorproteins)
    circos.colorbar(vmin=vmin, vmax=vmax, cmap=colormap,
                    colorbar_kws=dict(label="log2(FoldChange)"))
    fig = circos.plotfig()
    if save is not None:
        if vector is True:
            fig.savefig(save + 'circos.svg')
        else:
            fig.savefig(save + 'circos.png', dpi=dpi)


def circular_term(self, *Terms, pvalue=0.05, vmin=-1, vmax=1, colormap='RdBu_r',
                  label_size=12, save=None, vector=True, dpi=300):
    """Circular term
        Allows the visualization of all proteins related to a pre-specified term.
        This term is extracted from enrichment data.

    Args:
        *Terms (str): One or more patterns matched against the ``Term`` column
         of the enrichment tables with ``Series.str.contains``.
        pvalue (float, optional): Pvalue to consider differentially regulated proteins
         . Defaults to 0.05.
        vmin (int, optional): minimum value for foldchange; smaller values are
         clipped to it. Defaults to -1.
        vmax (int, optional): maximum value for foldchange; larger values are
         clipped to it. Defaults to 1.
        colormap (str, optional): Colormap for heatmap and colorbar. Defaults to 'RdBu_r'.
        label_size (int, optional): Font size of the sector labels. Defaults to 12.
        save (str, optional): Path to save file. Defaults to None.
        vector (bool, optional): Save as svg extension, if False, save as png. Defaults to True.
        dpi (int, optional): Figure resolution. Defaults to 300.

    Raises:
        TypeError: Term/Terms was/were not found in dataset.
    """
    not_found = 'Matrix length is zero. Term/Terms was/were not found in dataset.'
    enrichment = [x for x in self.enrichment if x is not None]
    deps = [x[x[self.pvalue] <= pvalue] for x in self.original]
    groups = self.groups
    colors = dict(zip(groups, self.colors))
    # Select genes from enriched term.
    enrichTerms = [x[x['Term'].str.contains(term)]
                   for term in Terms for x in enrichment]
    if not enrichTerms:
        # No terms requested or no enrichment data: nothing can match.
        raise TypeError(not_found)
    enrichTerms = pd.concat(enrichTerms)
    # 'Genes' holds one list of gene names per enriched term.
    enrichGenes = list({gene for genes in enrichTerms['Genes'] for gene in genes})
    # Mean fold change per gene and group, restricted to the selected genes.
    data = [
        x.loc[x['gene_name'].isin(enrichGenes), ['gene_name', 'log2(fc)']]
        .rename(columns={'log2(fc)': y})
        .groupby('gene_name').mean()
        for x, y in zip(deps, groups)
    ]
    # Rows are groups, columns are genes (sorted by name).
    data = pd.concat(data, axis=1).T.sort_index(axis=1)
    if len(data.columns) == 0:
        raise TypeError(not_found)
    # Binary group x gene membership matrix; groups without any gene are dropped.
    matrix = data.notnull().astype(int)
    row_sums = matrix.sum(axis=1)
    matrix = matrix.drop(index=matrix.index[row_sums == 0])
    # One 1 x k heatmap per gene (k = groups reporting it), clipped to
    # [vmin, vmax] and reversed; group sectors get a placeholder so the list
    # lines up with circos.sectors (groups first, then genes).
    clipped = data.clip(lower=vmin, upper=vmax)
    gene_heatmaps = [np.array([clipped[gene].dropna()])[:, ::-1] for gene in clipped]
    heatmaps = [np.array([[np.nan]])] * len(matrix) + gene_heatmaps
    circos = Circos.initialize_from_matrix(
        matrix,
        start=-265,
        end=95,
        space=0.3,
        r_lim=(93, 100),
        cmap=colors,
        label_kws=dict(r=101, orientation="vertical", size=label_size),
    )
    for sector, heatmap in zip(circos.sectors, heatmaps):
        # Only gene sectors (i.e. not a group name) carry a foldchange track.
        if sector.name not in data.index:
            rect_track = sector.add_track((93, 100))
            rect_track.heatmap(heatmap, cmap=colormap, vmin=vmin, vmax=vmax)

    # The colorbar must use the same colormap as the heatmap tracks.
    circos.colorbar(bounds=(1.1, 0.3, 0.02, 0.4), vmin=vmin, vmax=vmax, cmap=colormap,
                    colorbar_kws=dict(label="log2(FoldChange)"))
    fig = circos.plotfig()
    if save is not None:
        if vector is True:
            fig.savefig(save+'Term_circular_plot.svg')
        else:
            fig.savefig(save+'Term_circular_plot.png', dpi=dpi)