Source code for omicscope.EnrichmentAnalysis.EnrichmentVisualization

""" Module for EnrichmentScope object visualization

This module allows the user to extract and visualize information from EnrichmentScope object.

Here, it is possible to evaluate enrichment results with 1) dotplots, 2)heatmaps and 3) network analysis.
Dotplots allows the visualization of statistical significance and size of dataset enriched (*number_deps*), and
the number of up- and down-regulated proteins (*number_deps*). Heatmaps can be plotted according statistical
significance and also protein foldchange; for GSEA analysis, the function gsea_heatmap plots the normalization
enrichment score (NES) as color pattern. Finally, the network plots can be used to visualize shared proteins
between pathways (*enrichment_network()*), or perform an enrichment map (*enrichment_map()*).

Additionally, some functions optionally show a specific Term while add the name as Args.

"""

import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


[docs]def dotplot(self, top=10, palette='BuPu', alpha=1, s=10,
            x_size=5, y_size=6, label_wrap=50,
            save=None, dpi=300, vector=True):
    """Dotplot for enriched terms.

    Args:
        top (int, optional): top-N enriched terms to be visualized.
         Defaults to 10.
        palette (str, optional): color map to visualization,
         for more information https://matplotlib.org/stable/tutorials/colors/colormaps.html.
         Defaults to 'BuPu'. For GSEA enrichment, we advise a divergent palette.
        alpha (int, optional): dots transparency. Defaults to 1.
        s (int, optional): dotsize. Defaults to 10.
        x_size (int, optional): Size of horizontal axis. Defaults to 5.
        y_size (int, optional): Size of vertical axis. Defaults to 6.
        label_wrap (int, optional): Label wrap. Defaults to 50.
        save (str, optional): Path to save figure. Defaults to None.
        dpi (int, optional): Resolution to save figure. Defaults to 300.
        vector (bool, optional): Save figure in as vector (.svg). Defaults to
         True.
    """
    plt.rcParams["figure.dpi"] = dpi
    dbs = self.dbs
    df = copy.copy(self.results)
    for i in list(dbs):
        df_db = df[df['Gene_set'] == i]
        df_db = df_db.iloc[:top, :]
        xaxis = df_db[['-log10(pAdj)']]
        size = df_db['N_Proteins']
        if self.Analysis == 'GSEA':
            color = df_db['NES']
            xaxis = df_db[['NES']]
        else:
            color = df_db['N_Proteins']
        yaxis = df_db['Term']
        plt.style.use('default')
        plt.figure(figsize=(x_size, y_size))
        from textwrap import wrap
        labels = ['\n'.join(wrap(label, label_wrap)) for label in yaxis]
        plt.scatter(x=xaxis, y=labels, s=size*s, c=color, edgecolors='black',
                    linewidths=0.7, cmap=palette, alpha=alpha)
        plt.gca().invert_yaxis()
        plt.title(i)
        sns.despine()
        if self.Analysis == 'GSEA':
            plt.colorbar().set_label('NES')
            plt.axvline(0, c='black', ls='--')
        else:
            plt.colorbar().set_label('# Proteins')
        plt.xlabel(xaxis.columns[0])
        plt.margins(x=.1, y=0.1)
        if save is not None:
            if vector is True:
                plt.savefig(save + 'dotplot_'+i+'.svg', bbox_inches='tight')
            else:
                plt.savefig(save + 'dotplot_'+i+'.png', dpi=dpi, bbox_inches='tight')
        plt.show(block=True)


[docs]def heatmap(self, *Terms, top=5, foldchange=False,
            x_size=5, linewidths=0.01,
            foldchange_range=[-0.5, 0.5], save=None, dpi=300,
            vector=True):
    """Heatmap for enriched terms and proteins

        Args:
            top (int, optional): top N enriched terms to be visualized.
             Defaults to 5.
            foldchange (bool, optional): show the protein fold change.
             Defaults to False.
            foldchange_range (list, optional): Fold change range to plot protein colors,
             such as a heatmap. Defaults to [-0.5, 0.5].
            save (str, optional): Path to save figure. Defaults to None.
            dpi (int, optional): Resolution to save figure. Defaults to 300.
            vector (bool, optional): Save figure in as vector (.svg).
             Defaults to True.
    """
    plt.rcParams["figure.dpi"] = dpi
    omics = copy.copy(self.results)
    foldchange_range = foldchange_range
    if len(Terms) > 0:
        top = len(Terms)
        omics = omics[omics['Term'].isin(Terms)]
    deps = copy.copy(self.OmicScope.deps)
    deps['gene_name'] = deps['gene_name'].str.upper()
    deps.columns = ['Genes', 'Accession', 'pvalue', '-log10(p)', 'log2(fc)']
    for i in omics.Gene_set.drop_duplicates():
        heatmap_path = omics[omics['Gene_set'] == i]
        heatmap_path = heatmap_path.iloc[0:top, :]
        heatmap_path = heatmap_path.explode(['Genes', 'regulation'])
        ordering = list(heatmap_path.Term.drop_duplicates())
        if foldchange is True:
            heatmap = heatmap_path.pivot(values='regulation', index='Genes', columns='Term')
        else:
            heatmap = heatmap_path.pivot(values='-log10(pAdj)', index='Genes', columns='Term')
            heatmap = heatmap[ordering]
            heatmap = heatmap.sort_values(ordering)
        sns.reset_orig()
        px = 1/plt.rcParams['figure.dpi']
        f, ax = plt.subplots(figsize=(x_size,
                                      len(heatmap)*px*20))
        if foldchange is True:
            sns.heatmap(heatmap.astype(float),  cmap='RdYlBu_r',
                        vmin=min(foldchange_range),
                        vmax=max(foldchange_range),
                        yticklabels=True, xticklabels=True,
                        cbar_kws={'label': 'log2(fc)', "shrink": .5},
                        linecolor='black', linewidths=linewidths)
        else:
            sns.heatmap(heatmap,  cmap='Spectral',
                        yticklabels=True, xticklabels=True,
                        cbar_kws={'label': '-log10(p)', "shrink": .5},
                        linecolor='black', linewidths=linewidths)
        plt.xlabel('')
        plt.xticks(rotation=45, ha='right')
        if save is not None:
            if vector is True:
                plt.savefig(save + 'heatmap_'+i+'.svg', bbox_inches='tight')
            else:
                plt.savefig(save + 'heatmap_'+i+'.png', dpi=dpi, bbox_inches='tight')
        plt.show(block=True)


[docs]def number_deps(self, *Terms, top=20, palette='RdBu',
                save=None, dpi=300, vector=True):
    """Number of DEPs

    Return number of down- and up-regulated proteins
    in top-N or specified pathways.

    Args:
        top (int, optional): top-N enriched terms to be visualized.
        Defaults to 10.
        palette (str, optional): color map for up- and down-regulated
        representations. Defaults to 'RdBu'.
        save (str, optional): Path to save figure. Defaults to None.
        dpi (int, optional): Resolution to save figure. Defaults to 300.
        vector (bool, optional): Save figure in as vector (.svg). Defaults to
         True.
    """
    plt.rcParams["figure.dpi"] = dpi
    df = copy.copy(self.results)
    if len(Terms) > 0:
        df = df[df['Term'].isin(Terms)]
    for i in df.Gene_set.drop_duplicates():
        df_db = df[df['Gene_set'] == i]
        df_db = df_db.sort_values(by=['Adjusted P-value', 'N_Proteins'])
        df_db = df_db.iloc[:top, ]
        df_db = df_db.set_index('Term')
        ysize = len(df_db)
        df_db = df_db[['up-regulated', 'down-regulated']]
        df_db = df_db.melt(ignore_index=False)
        df_db = df_db.reset_index()
        df_db = df_db.rename(columns={'variable': 'Regulation',
                                      'value': 'Size'})
        if min(df_db['Size']) == 0:
            sizes = (0, 500)
        else:
            sizes = (10, 500)
        plt.figure(figsize=(1, ysize*0.4))
        sns.scatterplot(data=df_db,
                        x='Regulation',
                        y='Term',
                        size='Size',
                        palette=palette,
                        edgecolor='black',
                        linewidth=0.5,
                        alpha=1,
                        hue='Regulation',
                        sizes=sizes)
        leg = plt.legend(bbox_to_anchor=(1, 1, 0, 0))
        leg.get_frame().set_edgecolor('white')
        plt.xticks(rotation=45, ha='right')
        plt.ylabel('')
        plt.xlabel('')
        plt.margins(x=1, y=1/ysize)
        plt.tight_layout()
        sns.despine()
        if save is not None:
            if vector is True:
                plt.savefig(save + 'number_deps_'+i+'.svg', bbox_inches='tight')
            else:
                plt.savefig(save + 'number_deps_'+i+'.png', dpi=dpi, bbox_inches='tight')
        plt.show(block=True)


[docs]def enrichment_network(self, *Terms, top=5, labels=False,
                       term_color='#a1a1a1', foldchange_range=[-0.5, 0.5],
                       save=None, vector=True, dpi=300):
    """EnrichmentTerm-protein network.

    Network visualization to find proteins that are shared by different enriched Terms.

    Args:
        top (int, optional): top N enriched terms to be visualized.
         Defaults to 5.
        labels (bool, optional): Show node labels. Defaults to False.
        term_color (str, optional): Color to plot pathways. Defaults to '#a1a1a1'.
        foldchange_range (list, optional): Fold change range to plot protein colors,
         such as a heatmap. Defaults to [-0.5, 0.5].
        save (str, optional): OmicScope saves networks as figure and Graphml files, which
         can be used in other network software (recommended). Defaults to None.
        vector (bool, optional): Save image as vector (.svg) Defaults to True.
        dpi (int, optional): Figure resolution. Defaults to 300.


    Returns:
        Graph (NetworkX output): Graph
    """
    import matplotlib as mpl
    import matplotlib.cm as cm
    import matplotlib.colors as mcolors
    import networkx as nx

    plt.rcParams["figure.dpi"] = dpi
    omics = copy.copy(self.results)
    if len(Terms) > 0:
        top = len(Terms)
        omics = omics[omics['Term'].isin(Terms)]
    G_objects = []
    for i in omics.Gene_set.drop_duplicates():
        df = omics[omics['Gene_set'] == i]
        df = df.iloc[0:top,]
        df = df.explode(['Genes', 'regulation'])
        if self.Analysis == 'GSEA':
            norm = mpl.colors.TwoSlopeNorm(vmin=omics[omics['Gene_set'] == i]['NES'].min(),
                                           vcenter=0,
                                           vmax=omics[omics['Gene_set'] == i]['NES'].max(),
                                           )
            cmap = cm.RdYlBu_r
            m = cm.ScalarMappable(norm=norm, cmap=cmap)
            term_color = [mcolors.to_hex(m.to_rgba(x)) for x in df['NES']]
        source = pd.DataFrame({'ID': df['Term'],
                               'Size': df['-log10(pAdj)']*20,
                               'type': 'pathway',
                               'color': term_color})
        source = source.drop_duplicates()

        norm = mpl.colors.TwoSlopeNorm(vmin=min(foldchange_range),
                                       vmax=max(foldchange_range),
                                       vcenter=0)
        cmap = cm.RdYlBu_r
        m = cm.ScalarMappable(norm=norm, cmap=cmap)
        color_hex = [mcolors.to_hex(m.to_rgba(x)) for x in df['regulation']]

        target = pd.DataFrame({'ID': df['Genes'],
                               'Size':  min(df['-log10(pAdj)'])*4,
                               'type': 'Protein',
                               'color': color_hex})
        target = target.drop_duplicates()
        edgelist = df[['Term', 'Genes']]
        G = nx.from_pandas_edgelist(edgelist,
                                    source='Term',
                                    target='Genes',
                                    create_using=nx.Graph)

        carac = pd.concat([source, target]).reset_index(drop=True)
        carac = carac.drop_duplicates('ID')
        carac = carac.set_index('ID')
        nx.set_node_attributes(G, dict(zip(carac.index, carac.color)), name="Color")
        nx.set_node_attributes(G, dict(zip(carac.index, carac.Size)), name="Size")
        pos = nx.kamada_kawai_layout(G)
        carac = carac.reindex(G.nodes())
        nx.draw(G,
                pos=pos,
                node_color=carac['color'],
                node_size=carac['Size'],
                edgecolors='black',
                linewidths=0.4,
                alpha=0.9,
                width=0.2,
                edge_color='#a1a1a1')
        if labels is True:
            nx.draw_networkx_labels(G, pos, font_size=6)
        if save is not None:
            nx.write_graphml(G, save + 'PPNetwork_'+i+'.graphml', named_key_ids=True)
            if vector is True:
                plt.savefig(save + 'PPNetwork_'+i+'.svg', bbox_inches='tight')
            else:
                plt.savefig(save + 'PPNetwork_'+i+'.dpi', dpi=300, bbox_inches='tight')
        plt.show()
        G_objects.append(G)
    return G_objects


[docs]def enrichment_map(self, *Terms, top=1000, modules=True, labels=False,
                   similarity_cutoff=0.25, metric='jaccard',
                   save=None, vector=True, dpi=300):
    """Enrichment map.

    Since several proteins are presented in more than one pathway, enrichment map
    shows pathway as nodes and the edge thickness is proportional to the amount
    of proteins shared between two terms (similarity score)

    Args:
        top (int, optional): Top terms used to construct network. Defaults to 1000.
        modules (bool, optional): Returns modularity analysis of Terms. Defaults to True.
        labels (bool, optional): Add Term labels to graph. Defaults to False.
        similarity_cutoff (float, optional): similarity score cutoff based on statistical analysis performed.
         Defaults to 0.25.
        metric (str, optional): statistical algorithm to perform pairwise comparison. Defaults to 'jaccard'.
         Optionally, user can test other algorithm described in scipy.spatial.distance.
        save (str, optional): OmicScope saves networks as figure and Graphml files, which
         can be used in other network software (recommended). Defaults to None.
        vector (bool, optional): Save image as vector (.svg) Defaults to True.
        dpi (int, optional): Figure resolution. Defaults to 300.

    Returns:
        Graph (NetworkX output): Graph
    """
    import matplotlib as mpl
    import matplotlib.cm as cm
    import matplotlib.colors as mcolors
    import networkx as nx
    from sklearn.metrics import pairwise_distances

    plt.rcParams["figure.dpi"] = dpi
    omics = copy.copy(self.results)
    # Select terms of interest
    if len(Terms) > 0:
        top = len(Terms)
        omics = omics[omics['Term'].isin(Terms)]
    Analysis = self.Analysis
    G_objects = []
    for i in omics.Gene_set.drop_duplicates():
        original = omics[omics['Gene_set'] == i]
        # Select top enriched terms by sorting values and droping duplicates with < pvalue
        original = original.sort_values('-log10(pAdj)', ascending=False)
        original = original.drop_duplicates('Term')
        original = original.iloc[0:top,]
        # Subset ORA-results in Term, Genes, and regulation columns
        df = original[['Term', 'Genes', 'regulation']]
        df = df.explode(['Genes', 'regulation'])
        # CrossTab and perform Pearson correlation among Terms
        df = pd.crosstab(df.Genes, df.Term)
        colnames = df.columns
        df = pairwise_distances(df.T.to_numpy(), metric=metric)
        df = 1 - pd.DataFrame(df, index=colnames, columns=colnames)
        # Filter correlation (R) to define Adjacency Matrix
        # Filter: R < pearson_cutoff
        df[df < similarity_cutoff] = 0
        # Construct graph
        G = nx.from_pandas_adjacency(df)
        G.edges(data=True)
        # removing self loops
        G.remove_edges_from(nx.selfloop_edges(G))

        if Analysis == 'ORA':
            # Node attributes
            carac = original[['Term', '-log10(pAdj)', 'Combined Score']]
            carac['Label'] = carac.Term
            # Produce color palette according to p-value of each term
            norm = mpl.colors.Normalize(vmin=int(carac['-log10(pAdj)'].drop_duplicates().min()),
                                        vmax=int(carac['-log10(pAdj)'].drop_duplicates().max()))
            cmap = cm.PuBu
            m = cm.ScalarMappable(norm=norm, cmap=cmap)
            color_hex = [mcolors.to_hex(m.to_rgba(x)) for x in carac['-log10(pAdj)']]
            carac['color'] = color_hex
        elif Analysis == 'GSEA':
            carac = original[['Term', '-log10(pAdj)', 'NES']]
            carac['Label'] = carac.Term
            norm = mpl.colors.TwoSlopeNorm(vmin=int(carac['NES'].drop_duplicates().min()),
                                           vmax=int(
                carac['NES'].drop_duplicates().max()),
                vcenter=0)
            cmap = cm.RdYlBu_r
            m = cm.ScalarMappable(norm=norm, cmap=cmap)
            carac['color'] = [mcolors.to_hex(m.to_rgba(x)) for x in carac['NES']]
        carac = carac.set_index('Term')
        # Set node attributes
        nx.set_node_attributes(G, dict(zip(carac.index, carac.color)), name="Color")
        nx.set_node_attributes(G, dict(zip(carac.index, carac['-log10(pAdj)'])), name="Size")
        # Find Communities
        if modules is True:
            import networkx.algorithms.community as nx_comm
            if Analysis == 'ORA':
                carac = carac[['-log10(pAdj)', 'Combined Score']]
            elif Analysis == 'GSEA':
                carac = carac[['-log10(pAdj)', 'NES']]
            # Find communities based on label propagation
            communities = nx_comm.louvain_communities(G)
            module = []
            color = []
            degree = []
            # Linking Terms, module, colors
            norm = mpl.colors.Normalize(vmin=0,
                                        vmax=len(communities)+1)
            cmap = sns.color_palette('turbo', len(communities)+1, desat=.9)
            for i, g in enumerate(communities):
                subgraph = G.subgraph(g)
                degree_subgraph = subgraph.degree
                degree.extend(degree_subgraph)
                module.extend([i]*len(g))
                if len(g) > 1:
                    color.extend([mcolors.to_hex(cmap[i])]*len(g))
                else:
                    color.extend(['#a6a6a6'])
                # DataFrame
            modularity = pd.DataFrame(zip(module, [x[0] for x in degree], color,
                                          [x[1] for x in degree]), columns=[
                'Module', 'Term', 'color', 'Degree'])
            carac = carac.merge(modularity, on='Term')
            carac = carac.sort_values('-log10(pAdj)', ascending=False)
            Label = carac[['Term', 'Module', 'Degree',
                           '-log10(pAdj)']].sort_values(['Module', 'Degree', '-log10(pAdj)'],
                                                        ascending=False)
            Label['mark'] = Label[['Module']].duplicated()
            carac = carac.merge(Label, how='left', on=['Term',
                                'Module', 'Degree', '-log10(pAdj)'])

            carac['Label'] = np.where(
                carac['mark'] == False,
                carac['Term'], "")
            carac = carac.set_index('Term')
            # Set node attributes to export
            nx.set_node_attributes(G, dict(zip(carac.index, carac.Degree)), name="IntraModuleDegree")
            nx.set_node_attributes(G, dict(zip(carac.index, carac.color)), name="Color")
            nx.set_node_attributes(G, dict(zip(carac.index, carac.Label)), name="Annotation")
            nx.set_node_attributes(G, dict(zip(carac.index, carac.Module)), name="Module")
            carac = carac.reindex(G.nodes())
            # Assign position of each mode
        edges = G.edges
        weights = [G[u][v]['weight'] for u, v in edges]
        weights = [round(x, 2) for x in weights]
        norm = [float(i)/np.mean(weights) for i in weights]
        pos = nx.spring_layout(G, k=1/len(G.nodes)**0.3)
        carac = carac.reindex(G.nodes())
        G.remove_edges_from(nx.selfloop_edges(G))
        # Draw network
        nx.draw(G, pos=pos,
                node_color=carac.color,
                node_size=carac['-log10(pAdj)']*10,
                edgecolors='black',
                linewidths=0.4,
                alpha=0.95,
                width=norm,
                edge_color='gray')
        if labels is True:
            nx.draw_networkx_labels(G, pos, carac.Label, font_size=6)
        if save is not None:
            nx.write_graphml(G, save + 'PathMap.graphml', named_key_ids=True)
            if vector is True:
                plt.savefig(save + 'PathMap.svg', bbox_inches='tight')
            else:
                plt.savefig(save + 'PathMap.dpi', dpi=300, bbox_inches='tight')
        plt.show(block=True)
        G_objects.append(G)
    return G_objects


[docs]def gsea_heatmap(self, *Terms, top=5, linewidths=0.01, save=None, dpi=300, vector=True):
    """GSEA Heatmap
    Plot a heatmap with colors based on Normalized Enrichment Score (NES)
    reported by GSEA Analysis.

    Args:
        save (str, optional): Path to save figure. Defaults to None.
        dpi (int, optional): Resolution to save figure. Defaults to 300.
        vector (bool, optional): Save figure in as vector (.svg). Defaults to
         True.
    """
    plt.rcParams['figure.dpi'] = dpi
    omics = copy.copy(self.results)
    omics = omics.sort_values('-log10(pAdj)')
    if len(Terms) > 0:
        top = len(Terms)
        omics = omics[omics['Term'].isin(Terms)]
    for i in omics.Gene_set.drop_duplicates():
        df = omics[omics['Gene_set'] == i]
        df = df.iloc[0:top, :]
    df = df.set_index('Term')
    df = df[['NES']].astype(float)
    df = df.sort_values('NES')
    px = 1/plt.rcParams['figure.dpi']
    f, ax = plt.subplots(figsize=(0.5,
                                  len(df)*px*100))
    sns.heatmap(df, center=0, cmap='RdYlBu_r', vmin=-2, vmax=2,
                linewidths=linewidths, linecolor='black')
    plt.ylabel('')
    plt.xticks(rotation=45, ha='right')
    if save is not None:
        if vector is True:
            plt.savefig(save + 'gsea_heatmap'+i+'.svg', bbox_inches='tight')
        else:
            plt.savefig(save + 'gsea_heatmap'+i+'.png', dpi=dpi, bbox_inches='tight')
    plt.show(block=True)