Source code for pathway_forte.pathway_enrichment.functional_class

# -*- coding: utf-8 -*-

"""This module contain the functional class methods implemented in PathwayForte.

Currently this includes GSEA and ssGSEA.
"""

import itertools as itt
import logging
import os
from collections import defaultdict
from typing import Optional

import bio2bel_kegg
import bio2bel_reactome
import bio2bel_wikipathways
import gseapy
import pandas as pd
from gseapy.gsea import SingleSampleGSEA
from numpy import sign

from pathway_forte.constants import (
    CLASSES, GSEA, KEGG, MPATH, PHENOTYPE_CLASSES, REACTOME, SSGSEA, WIKIPATHWAYS,
)
from pathway_forte.mappings import get_equivalent_mappings_dict, get_mapping_dict, load_compath_mapping_dfs
from pathway_forte.utils import get_num_samples

logger = logging.getLogger(__name__)


[docs]def create_cls_file(gene_expression_file, normal_sample_file, tumor_sample_file, data):
    """Create categorical (e.g. tumor vs sample) class file format (i.e., .cls) for input into GSEA.

    :param str gene_expression_file: Text file containing expression values for each gene from each sample.
    :param str normal_sample_file:
    :param str tumor_sample_file:
    :param data:
    """
    num_normal_samples = get_num_samples(normal_sample_file)
    num_tumor_samples = get_num_samples(tumor_sample_file)

    total_num_samples = len(gene_expression_file.columns)
    class_number = 2
    normal_class = 'Normal'
    tumor_class = 'Tumor'

    cls_indices = ['Normal'] * num_normal_samples + ['Tumor'] * num_tumor_samples

    with open(CLASSES.format(data), "w+") as classes_file:
        classes_file.write(
            "{} {} {} \n{} {} {}\n".format(total_num_samples, class_number, 1, "#", normal_class, tumor_class))
        for item in cls_indices:
            classes_file.write("{} ".format(item))

    output = PHENOTYPE_CLASSES.format(data)

    with open(CLASSES.format(data), 'r') as f, open(output, 'w') as w:
        data = f.read()
        w.write(data[:-1])


[docs]def run_gsea(gene_exp: str, gene_set: str, phenotype_class: str, permutations: int = 500, output_dir: str = GSEA):
    """Run GSEA on a given dataset with a given gene set.

    :param gene_exp: file with gene expression data
    :param gene_set: gmt files containing pathway gene sets
    :param phenotype_class: cls file containing information on class labels
    :param permutations: number of permutations
    :param output_dir: output directory
    :return:
    """
    return gseapy.gsea(
        data=gene_exp,
        gene_sets=gene_set,
        cls=phenotype_class,  # cls=class_vector
        max_size=3000,
        # set permutation_type to phenotype if samples >=15
        permutation_type='phenotype',
        permutation_num=permutations,  # reduce number to speed up test
        outdir=output_dir,  # do not write output to disk
        no_plot=True,  # Skip plotting
        processes=4,
        format='png',
    )


[docs]def filter_gsea_results(
        gsea_results_path: str,
        source,
        kegg_manager: Optional[bio2bel_kegg.Manager] = None,
        reactome_manager: Optional[bio2bel_reactome.Manager] = None,
        wikipathways_manager: Optional[bio2bel_wikipathways.Manager] = None,
        p_value: Optional[float] = None,
        absolute_nes_filter: Optional[float] = None,
        geneset_set_filter_minimum_size: Optional[int] = None,
        geneset_set_filter_maximum_size: Optional[int] = None,
) -> pd.DataFrame:
    """Get top and bottom rankings from GSEA results.

    :param gsea_results_path: path to GSEA results in .tsv file format
    :param source:
    :param kegg_manager: KEGG manager
    :param reactome_manager: Reactome manager
    :param wikipathways_manager: WikiPathways manager
    :param p_value: maximum p value allowed
    :param absolute_nes_filter: filter by magnitude of normalized enrichment scores
    :param geneset_set_filter_minimum_size: filter to include a minimum number of genes in a gene set
    :param geneset_set_filter_maximum_size: filter to include a maximum number of genes in a gene set
    :return: list of pathways ranked as having the highest and lowest significant enrichment scores
    """
    # Read GSEA results to pandas dataFrame
    gsea_results_df = pd.read_csv(gsea_results_path, sep='\t')

    # Filter dataFrame to include only those pathways with a p-value less than X
    if p_value is not None:
        gsea_results_df = gsea_results_df.loc[gsea_results_df['pval'] < p_value]

    #  Filter dataFrame by magnitude of normalized enrichment scores
    if absolute_nes_filter:
        gsea_results_df = gsea_results_df.loc[
            abs(gsea_results_df['nes']) > absolute_nes_filter]

    #  Filter dataFrame by minimun pathway geneset sizes
    if geneset_set_filter_minimum_size:
        gsea_results_df = gsea_results_df.loc[
            gsea_results_df['geneset_size'] > geneset_set_filter_minimum_size]

    #  Filter dataFrame by maximum pathway geneset sizes
    if geneset_set_filter_maximum_size:
        gsea_results_df = gsea_results_df.loc[
            gsea_results_df['geneset_size'] < geneset_set_filter_maximum_size]

    # Sort normalized enrichment scores in descending order
    gsea_results_df = gsea_results_df.sort_values('nes', ascending=False)

    # Rename column
    gsea_results_df = gsea_results_df.rename(columns={'Term': 'pathway_id'})

    # Reset index
    gsea_results_df.reset_index(inplace=True)

    # Remove redundant index column
    del gsea_results_df['index']

    # Get list of pathways after filtering dataFrame
    all_pathway_ids = gsea_results_df['pathway_id'].tolist()

    return pathway_names_to_df(
        gsea_results_df,
        all_pathway_ids,
        source,
        kegg_manager=kegg_manager,
        reactome_manager=reactome_manager,
        wikipathways_manager=wikipathways_manager,
    )


[docs]def merge_statistics(merged_pathways_df: pd.DataFrame, dataset: str):
    """Get statistics for pathways included in the merged gene sets dataFrame.

    These include the proportion of pathways from each of the other databases and the proportion of pathways
    deriving from 2 or more primary resources

    :param merged_pathways_df: dataFrame containing pathways from multiple databases
    :return: statistics of contents in merged dataset
    """
    num_of_pathways = len(merged_pathways_df.index)

    # Get the proportion that each database contributes to the merged pathways
    num_of_kegg_in_merged = 0
    num_of_reactome_in_merged = 0
    num_of_wikipathways_in_merged = 0
    merged_genesets = 0

    for pathway_id in merged_pathways_df['pathway_id']:

        if '|' in pathway_id:
            pathway_id.split('|')
            merged_genesets += 1

        if pathway_id.startswith('hsa'):
            num_of_kegg_in_merged += 1
        elif pathway_id.startswith('R-HSA'):
            num_of_reactome_in_merged += 1
        elif pathway_id.startswith('WP'):
            num_of_wikipathways_in_merged += 1
        else:
            raise ValueError(f'Invalid pathway ID {pathway_id}.')

    kegg_contributions_to_merged = num_of_kegg_in_merged / num_of_pathways * 100
    reactome_contributions_to_merged = num_of_reactome_in_merged / num_of_pathways * 100
    wikipathways_contributions_to_merged = num_of_wikipathways_in_merged / num_of_pathways * 100
    proportion_of_merged_genesets = merged_genesets / num_of_pathways * 100

    print('For the {} pathways in the merged dataset results for {}:'.format(num_of_pathways, dataset))
    print('{0:.2f}% are from KEGG'.format(kegg_contributions_to_merged))
    print('{0:.2f}% are from Reactome'.format(reactome_contributions_to_merged))
    print('{0:.2f}% are from WikiPathways'.format(wikipathways_contributions_to_merged))
    print('{0:.2f}% are a combination of 2 or more databases'.format(proportion_of_merged_genesets))


[docs]def rearrange_df_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rearrange order of columns."""
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    return df


[docs]def get_pathway_names(
        database: str,
        pathway_df: pd.DataFrame,
        kegg_manager: Optional[bio2bel_kegg.Manager] = None,
        reactome_manager: Optional[bio2bel_reactome.Manager] = None,
        wikipathways_manager: Optional[bio2bel_wikipathways.Manager] = None
):
    """Get pathway names from database specific pathway IDs.

    :param database:
    :param pathway_df:
    :param kegg_manager:
    :param reactome_manager:
    :param wikipathways_manager:
    :return:
    """
    if database == KEGG:
        pathway_df['pathway_name'] = [
            kegg_manager.get_pathway_by_id('path:' + pathway_id)
            for pathway_id in list(pathway_df['pathway_id'])
        ]
        return pathway_df

    elif database == REACTOME:
        pathway_df['pathway_name'] = [
            reactome_manager.get_pathway_by_id(pathway_id)
            for pathway_id in list(pathway_df['pathway_id'])
        ]
        return pathway_df

    elif database == WIKIPATHWAYS:
        pathway_df['pathway_name'] = [
            wikipathways_manager.get_pathway_by_id(pathway_id)
            for pathway_id in list(pathway_df['pathway_id'])
        ]
        return pathway_df


[docs]def pathway_names_to_df(
        filtered_gsea_results_df,
        all_pathway_ids,
        source,
        kegg_manager: Optional[bio2bel_kegg.Manager] = None,
        reactome_manager: Optional[bio2bel_reactome.Manager] = None,
        wikipathways_manager: Optional[bio2bel_wikipathways.Manager] = None,
) -> pd.DataFrame:
    """Get pathway names.

    :param filtered_gsea_results_df:
    :param all_pathway_ids: list of pathway IDs
    :param source: pathway source (i.e., database name or 'MPath')
    :param kegg_manager: KEGG manager
    :param reactome_manager: Reactome manager
    :param wikipathways_manager: WikiPathways manager
    """
    if source == KEGG:
        filtered_gsea_results_df['pathway_name'] = [
            kegg_manager.get_pathway_by_id('path:' + pathway_id)
            for pathway_id in all_pathway_ids
        ]
        return rearrange_df_columns(filtered_gsea_results_df)

    elif source == REACTOME:
        filtered_gsea_results_df['pathway_name'] = [
            reactome_manager.get_pathway_by_id(pathway_id)
            for pathway_id in all_pathway_ids
        ]
        return rearrange_df_columns(filtered_gsea_results_df)

    elif source == WIKIPATHWAYS:
        filtered_gsea_results_df['pathway_name'] = [
            wikipathways_manager.get_pathway_by_id(pathway_id)
            for pathway_id in all_pathway_ids
        ]
        return rearrange_df_columns(filtered_gsea_results_df)

    merged_rankings = []

    for pathway_ids in all_pathway_ids:
        # A pathway might come from the union of multiple pathways via an equivalent mapping
        equivalent_pathways = []

        for pathway_id in pathway_ids.split('|'):
            if pathway_id.startswith('hsa'):
                kegg_pathway_name = str(kegg_manager.get_pathway_by_id('path:' + pathway_id))
                equivalent_pathways.append(kegg_pathway_name)

            elif pathway_id.startswith('R-HSA'):
                reactome_pathway_name = str(reactome_manager.get_pathway_by_id(pathway_id))
                equivalent_pathways.append(reactome_pathway_name)

            elif pathway_id.startswith('WP'):
                wikipathways_pathway_name = str(wikipathways_manager.get_pathway_by_id(pathway_id))
                equivalent_pathways.append(wikipathways_pathway_name)

        merged_pathways = '|'.join(equivalent_pathways)
        merged_rankings.append(merged_pathways)

    filtered_gsea_results_df['pathway_name'] = merged_rankings

    return rearrange_df_columns(filtered_gsea_results_df)


[docs]def gsea_results_to_filtered_df(
        dataset,
        kegg_manager: Optional[bio2bel_kegg.Manager] = None,
        reactome_manager: Optional[bio2bel_reactome.Manager] = None,
        wikipathways_manager: Optional[bio2bel_wikipathways.Manager] = None,
        p_value: Optional[float] = None,
        absolute_nes_filter=None,
        geneset_set_filter_minimum_size=None,
        geneset_set_filter_maximum_size=None
):
    """Get filtered GSEA results dataFrames."""
    kegg_gsea_path = os.path.join(GSEA, KEGG, f'kegg_{dataset}.tsv')
    reactome_gsea_path = os.path.join(GSEA, REACTOME, f'reactome_{dataset}.tsv')
    wikipathways_gsea_path = os.path.join(GSEA, WIKIPATHWAYS, f'wikipathways_{dataset}.tsv')
    merge_gsea_path = os.path.join(GSEA, MPATH, f'merge_{dataset}.tsv')

    # Load GSEA results and filter dataFrames
    kegg_pathway_df = filter_gsea_results(
        kegg_gsea_path,
        KEGG,
        kegg_manager=kegg_manager,
        reactome_manager=reactome_manager,
        wikipathways_manager=wikipathways_manager,
        p_value=p_value,
        absolute_nes_filter=absolute_nes_filter,
        geneset_set_filter_minimum_size=geneset_set_filter_minimum_size,
        geneset_set_filter_maximum_size=geneset_set_filter_maximum_size,
    )
    reactome_pathway_df = filter_gsea_results(
        reactome_gsea_path,
        REACTOME,
        kegg_manager=kegg_manager,
        reactome_manager=reactome_manager,
        wikipathways_manager=wikipathways_manager,
        p_value=p_value,
        absolute_nes_filter=absolute_nes_filter,
        geneset_set_filter_minimum_size=geneset_set_filter_minimum_size,
        geneset_set_filter_maximum_size=geneset_set_filter_maximum_size,

    )
    wikipathways_pathway_df = filter_gsea_results(
        wikipathways_gsea_path,
        WIKIPATHWAYS,
        kegg_manager=kegg_manager,
        reactome_manager=reactome_manager,
        wikipathways_manager=wikipathways_manager,
        p_value=p_value,
        absolute_nes_filter=absolute_nes_filter,
        geneset_set_filter_minimum_size=geneset_set_filter_minimum_size,
        geneset_set_filter_maximum_size=geneset_set_filter_maximum_size,
    )
    merged_pathway_df = filter_gsea_results(
        merge_gsea_path,
        MPATH,
        kegg_manager=kegg_manager,
        reactome_manager=reactome_manager,
        wikipathways_manager=wikipathways_manager,
        p_value=p_value,
        absolute_nes_filter=absolute_nes_filter,
        geneset_set_filter_minimum_size=geneset_set_filter_minimum_size,
        geneset_set_filter_maximum_size=geneset_set_filter_maximum_size,
    )

    # Merge pathway dataframe without applying filters
    merged_total_df = filter_gsea_results(
        merge_gsea_path,
        MPATH,
        kegg_manager=kegg_manager,
        reactome_manager=reactome_manager,
        wikipathways_manager=wikipathways_manager,
        # TODO why not give arguments for other parts?
    )

    return (
        kegg_pathway_df,
        reactome_pathway_df,
        wikipathways_pathway_df,
        merged_pathway_df,
        merged_total_df,
    )


[docs]def get_pathways_by_resource(pathways: iter, resource: str) -> list:
    """Return pathways by resource."""
    if resource == KEGG:
        return [
            pathway
            for pathway in pathways
            if pathway.startswith('hsa')
        ]

    elif resource == REACTOME:
        return [
            pathway
            for pathway in pathways
            if pathway.startswith('R-HSA-')
        ]

    elif resource == WIKIPATHWAYS:
        return [
            pathway
            for pathway in pathways
            if pathway.startswith('WP')
        ]

    raise ValueError(f'Pathway database {resource} not found')


def _pairwise_helper(pathways, mappings, source_resource, target_resource):
    """Compare pairwise pathways."""
    counter = 0

    source_pathways = get_pathways_by_resource(pathways, source_resource)
    target_pathways = get_pathways_by_resource(pathways, target_resource)

    for source_pathway in source_pathways:

        if (source_resource, source_pathway) not in mappings:
            continue

        if any(mapping in target_pathways
               for _, mapping in mappings[(source_resource, source_pathway)]):
            counter += 1

    return counter


[docs]def get_analogs_comparison_numbers(
        kegg_reactome_pathway_df,
        reactome_wikipathways_pathway_df,
        wikipathways_kegg_pathway_df,
        *,
        pathway_column="pathway_id"
):
    """Get number of existing versus expected pairwise mappings."""
    # Load mappings
    equivalent_mappings_dict = get_equivalent_mappings_dict()

    actual_num_dict = {}

    # IMPORTANT! These two dictionaries should be keep in the same order as the next one (TO PLOT IN THE RIGHT ORDER)

    actual_num_dict[(KEGG, REACTOME)] = _pairwise_helper(
        kegg_reactome_pathway_df[pathway_column], equivalent_mappings_dict, KEGG, REACTOME
    )

    actual_num_dict[(KEGG, WIKIPATHWAYS)] = _pairwise_helper(
        wikipathways_kegg_pathway_df[pathway_column], equivalent_mappings_dict, KEGG, WIKIPATHWAYS
    )

    actual_num_dict[(REACTOME, KEGG)] = _pairwise_helper(
        kegg_reactome_pathway_df[pathway_column], equivalent_mappings_dict, REACTOME, KEGG
    )

    actual_num_dict[(REACTOME, WIKIPATHWAYS)] = _pairwise_helper(
        reactome_wikipathways_pathway_df[pathway_column], equivalent_mappings_dict, REACTOME, WIKIPATHWAYS
    )

    actual_num_dict[(WIKIPATHWAYS, KEGG)] = _pairwise_helper(
        wikipathways_kegg_pathway_df[pathway_column], equivalent_mappings_dict, WIKIPATHWAYS, KEGG
    )

    actual_num_dict[(WIKIPATHWAYS, REACTOME)] = _pairwise_helper(
        reactome_wikipathways_pathway_df[pathway_column], equivalent_mappings_dict, WIKIPATHWAYS, REACTOME
    )

    # Ensure the number of equivalent pathways are the same
    # assert actual_num_dict[(KEGG, REACTOME)] == actual_num_dict[(REACTOME, KEGG)], \
    #     'Error with KEGG, Reactome'
    #
    # assert actual_num_dict[(KEGG, WIKIPATHWAYS)] == actual_num_dict[(WIKIPATHWAYS, KEGG)], \
    #     'Error with KEGG, WikiPathways'
    #
    # assert actual_num_dict[(WIKIPATHWAYS, REACTOME)] == actual_num_dict[(REACTOME, WIKIPATHWAYS)], \
    #     'Error with Reactome, Wikipathways'

    expected_num_dict = {
        (KEGG, REACTOME): len(
            get_pathways_by_resource(kegg_reactome_pathway_df[pathway_column], KEGG)
        ),
        (KEGG, WIKIPATHWAYS): len(
            get_pathways_by_resource(wikipathways_kegg_pathway_df[pathway_column], KEGG)
        ),
        (REACTOME, KEGG): len(
            get_pathways_by_resource(kegg_reactome_pathway_df[pathway_column], REACTOME)
        ),
        (REACTOME, WIKIPATHWAYS): len(
            get_pathways_by_resource(reactome_wikipathways_pathway_df[pathway_column], REACTOME)
        ),
        (WIKIPATHWAYS, KEGG): len(
            get_pathways_by_resource(wikipathways_kegg_pathway_df[pathway_column], WIKIPATHWAYS)
        ),
        (WIKIPATHWAYS, REACTOME): len(
            get_pathways_by_resource(reactome_wikipathways_pathway_df[pathway_column], WIKIPATHWAYS)
        ),
    }

    return actual_num_dict, expected_num_dict


[docs]def get_pairwise_mapping_numbers(
        kegg_pathway_df,
        reactome_pathway_df,
        wikipathways_pathway_df,
):
    """Get number of existing versus expected pairwise mappings."""
    pairwise_comparison = [
        (kegg_pathway_df, KEGG),
        (reactome_pathway_df, REACTOME),
        (wikipathways_pathway_df, WIKIPATHWAYS),
    ]
    # Load mappings
    dfs = load_compath_mapping_dfs()
    final_df = pd.concat([dfs[0], dfs[1], dfs[2]])

    equivalent_mappings_dict = get_mapping_dict(final_df, 'equivalentTo')
    # Dictionary with hierarchical mappings. This dictionary is not used to build the gene set for now, but it could be
    # used in the future for other applications
    # part_of_mappings_dict = get_mapping_dict(final_df, 'isPartOf')

    actual_mappings = {}
    expected_mappings = {}

    for (df1, resource1), (df2, resource2) in itt.permutations(pairwise_comparison, 2):
        matching_mappings, pathways_with_mappings = compare_database_results(
            df1, resource1, df2, resource2, equivalent_mappings_dict
        )

        actual_mappings[(resource1, resource2)] = matching_mappings
        expected_mappings[(resource1, resource2)] = pathways_with_mappings

    actual_num_dict = {}
    expected_num_dict = {}

    # Get number of actual mappings between 2 resources
    for resources, mappings in actual_mappings.items():
        actual_num_dict[resources] = len(mappings)

    # Get number of expected mappings between 2 resources
    for resources, mappings in expected_mappings.items():
        expected_num_dict[resources] = len(mappings)

    return actual_num_dict, expected_num_dict


[docs]def get_pairwise_mappings(
        kegg_pathway_df,
        reactome_pathway_df,
        wikipathways_pathway_df,
):
    """Get pairwise mappings."""
    pairwise_comparison = [
        (kegg_pathway_df, KEGG),
        (reactome_pathway_df, REACTOME),
        (wikipathways_pathway_df, WIKIPATHWAYS),
    ]
    # Load mappings
    dfs = load_compath_mapping_dfs()
    # Get KEGG-Reactome, KEGG-WikiPathways and WikiPathways-Reactome mappings
    final_df = pd.concat([dfs[0], dfs[1], dfs[2]])

    equivalent_mappings_dict = get_mapping_dict(final_df, 'equivalentTo')

    actual_mappings = {}

    for (df1, resource1), (df2, resource2) in itt.permutations(pairwise_comparison, 2):
        matching_mappings = get_matching_pairs(
            df1, resource1, df2, resource2, equivalent_mappings_dict
        )

        actual_mappings[(resource1, resource2)] = matching_mappings
    return actual_mappings


[docs]def compare_database_results(df_1, resource_1, df_2, resource_2, mapping_dict, check_contradiction=False):
    """Compare pathways in the dataframe from enrichment results to evaluate the concordance in similar pathways."""
    # Ensure index is set to pathway id column (not in place)
    df_1 = df_1.set_index('pathway_id')
    df_2 = df_2.set_index('pathway_id')

    # Get pathway ids as lists
    df_1_ids = df_1.index.values
    df_2_ids = df_2.index.values

    pathways_with_mappings = []
    matching_mappings = []

    contradictory_pathways = []

    for pathway_resource_1 in df_1_ids:

        # Pathway does not have mappings
        if (resource_1, pathway_resource_1) not in mapping_dict:
            continue

        # Get all mappings for a pathway
        mappings_for_pathway = mapping_dict[(resource_1, pathway_resource_1)]

        for resource, mapping_pathway_id in mappings_for_pathway:

            if resource != resource_2:
                continue

            # Add all expected pathways to list
            pathways_with_mappings.append(mapping_pathway_id)

            # Add all actual mappings to list
            if mapping_pathway_id in df_2_ids:
                matching_mappings.append(mapping_pathway_id)

                # Check for contradictory results
                if check_contradiction and \
                        sign(df_1.loc[pathway_resource_1]['nes']) != sign(df_2.loc[mapping_pathway_id]['nes']):
                    contradictory_pathways.append((df_1.loc[pathway_resource_1], df_2.loc[mapping_pathway_id]))

    if check_contradiction:
        print(f"Total of #{len(contradictory_pathways)} contradictory pathways: {contradictory_pathways}")

    return matching_mappings, pathways_with_mappings


[docs]def get_matching_pairs(df_1, resource_1, df_2, resource_2, equivalent_mappings_dict):
    """Get equivalent pathways and their direction of change."""
    df1_subset = df_1[['pathway_id', 'status']]
    df1_tuples = [tuple(x) for x in df1_subset.values]

    df2_subset = df_2[['pathway_id', 'status']]
    df2_tuples = [tuple(x) for x in df2_subset.values]

    matching_mappings = defaultdict(list)

    for pathway_resource_1, direction_1 in df1_tuples:

        # Pathway does not have mappings
        if (resource_1, pathway_resource_1) not in equivalent_mappings_dict:
            continue

        # Get all mappings for the pathway
        mappings_for_pathway = equivalent_mappings_dict[(resource_1, pathway_resource_1)]

        for resource, mapping_pathway_id in mappings_for_pathway:

            if resource != resource_2:
                continue

            for pathway_resource_2, direction_2 in df2_tuples:

                if pathway_resource_2 == mapping_pathway_id:
                    matching_mappings[resource_1, pathway_resource_1, direction_1].append(
                        (resource_2, pathway_resource_2, direction_2))

    return matching_mappings


[docs]def run_ssgsea(
        filtered_expression_data: pd.DataFrame,
        gene_set: str,
        output_dir: str = SSGSEA,
        processes: int = 1,
        max_size: int = 3000,
        min_size: int = 15,
) -> SingleSampleGSEA:
    """Run single sample GSEA (ssGSEA) on filtered gene expression data set.

    :param filtered_expression_data: filtered gene expression values for samples
    :param gene_set: .gmt file containing gene sets
    :param output_dir: output directory
    :return: ssGSEA results in respective directory
    """
    single_sample_gsea = gseapy.ssgsea(
        data=filtered_expression_data,
        gene_sets=gene_set,
        outdir=output_dir,  # do not write output to disk
        max_size=max_size,
        min_size=min_size,
        sample_norm_method='rank',  # choose 'custom' for your own rank list
        permutation_num=0,  # skip permutation procedure, because you don't need it
        no_plot=True,  # skip plotting to speed up
        processes=processes,
        format='png',
    )
    logger.info('Done with ssGSEA')
    return single_sample_gsea


[docs]def filter_gene_exp_data(expression_data: pd.DataFrame, gmt_file: str):
    """Filter gene expression data file to include only gene names which are found in the gene set files.

    :param expression_data: gene expression values for samples
    :param gmt_file: .gmt file containing gene sets
    :return: Filtered gene expression data with genes with no correspondences in gene sets removed
    :rtype: pandas.core.frame.DataFrame kegg_xml_parser.py
    """
    filtered_expression_data = expression_data.copy()

    # Gene universe from gene set
    gene_sets = gseapy.parser.gsea_gmt_parser(gmt_file, max_size=40000)

    # All the genes in gene set files
    gene_universe = set(itt.chain(*gene_sets.values()))

    genes_to_remove = [
        gene
        for gene in filtered_expression_data.index.values
        if gene not in gene_universe
    ]
    # Genes to be removed because they are not present in the gene sets
    counter = len(genes_to_remove)

    logger.info(f'Expression data has {len(filtered_expression_data.index.values)}')
    logger.info(f'Gene universe has {len(gene_universe)}')
    logger.info(f'{counter} were removed in expression data')
    logger.info(
        f'{(len(filtered_expression_data.index.values) - counter) / len(gene_universe) * 100:.4f}% '
        f'of the pathway gene sets is mapped to the gene expression'
    )

    # Remove non HGNC genes and return dataframe
    return filtered_expression_data.drop(genes_to_remove)