# Source code for pathway_forte.constants

# -*- coding: utf-8 -*-

"""This module contains all the constants used in the PathwayForte repo."""

import logging
import os
import time

from bio2bel import get_data_dir

# Logger named after this module
logger = logging.getLogger(__name__)

# Location of this file with symbolic links resolved
_module_file = os.path.realpath(__file__)
# Folder that contains this file
dir_path = os.path.dirname(_module_file)
# Absolute path of the parent folder of ``dir_path``
SOURCE = os.path.abspath(os.path.join(dir_path, os.pardir))
# Data folder where gene sets files are ("data" folder next to SOURCE)
DATA = os.path.abspath(os.path.join(SOURCE, os.pardir, 'data'))

# Data directory returned by bio2bel for the 'pathwayforte' module name; the downloaded
# ComPath mapping files are stored in it (presumably created on demand by bio2bel -- TODO confirm)
BIO2BEL_DATA_DIR = get_data_dir('pathwayforte')

"""Cancer Data Sets"""

# Codes of the cancer data sets
CANCER_DATA_SETS = {'brca', 'lihc', 'kirc', 'prad', 'ov'}

# Folder with one sub-folder per data set
TCGA_DATASETS = os.path.join(DATA, 'tcga_datasets')
# Template of a data set folder; the '{}' placeholder is presumably filled with
# one of the codes in CANCER_DATA_SETS (verify against the callers)
_TCGA_DATASET_DIR = os.path.join(TCGA_DATASETS, '{}')

# Raw expression matrix from TCGA
EXPRESSION_MATRIX = os.path.join(_TCGA_DATASET_DIR, 'expression_matrix_full.txt')
# File with phenotype classes (e.g., tumor vs normal)
PHENOTYPE_CLASSES = os.path.join(_TCGA_DATASET_DIR, 'phenotype_classes.cls')
CLASSES = os.path.join(_TCGA_DATASET_DIR, 'class.cls')

# Clinical data from TCGA (necessary for survival analysis); note the second
# '{}' placeholder inside the file name
CLINICAL_DATA = os.path.join(_TCGA_DATASET_DIR, '{}_tcga_clinical_data.tsv')
TUMOR_EXPRESSION_MATRIX = os.path.join(_TCGA_DATASET_DIR, 'tumor_expression_matrix.txt')

NORMAL_EXPRESSION_SAMPLES = os.path.join(_TCGA_DATASET_DIR, 'normal_expression_dimension.txt')
TUMOR_EXPRESSION_SAMPLES = os.path.join(_TCGA_DATASET_DIR, 'tumor_expression_dimension.txt')

# Root folder of every exported result and the folder of the classifier results
RESULTS = os.path.join(DATA, 'results')
CLASSIFIER_RESULTS = os.path.join(DATA, 'results', 'classifier')


def make_classifier_results_directory():
    """Ensure that the classifier results folder exists.

    Creates ``CLASSIFIER_RESULTS`` together with any missing parent folder and
    does nothing when the folder is already there.
    """
    os.makedirs(CLASSIFIER_RESULTS, exist_ok=True)


"""GSEA"""

# Root folder of the GSEA results
GSEA = os.path.join(RESULTS, 'gsea')

# One sub-folder per gene set collection
KEGG_GSEA = os.path.join(GSEA, 'kegg')
REACTOME_GSEA = os.path.join(GSEA, 'reactome')
WIKIPATHWAYS_GSEA = os.path.join(GSEA, 'wikipathways')
MERGE_GSEA = os.path.join(GSEA, 'merge')
MSIG_GSEA = os.path.join(GSEA, 'msig')

"""Output files with results for GSEA"""

# File name templates; each one has two '{}' placeholders to be filled by the caller
KEGG_GSEA_TSV = os.path.join(KEGG_GSEA, 'kegg_{}_{}.tsv')
REACTOME_GSEA_TSV = os.path.join(REACTOME_GSEA, 'reactome_{}_{}.tsv')
WIKIPATHWAYS_GSEA_TSV = os.path.join(WIKIPATHWAYS_GSEA, 'wikipathways_{}_{}.tsv')
MERGE_GSEA_TSV = os.path.join(MERGE_GSEA, 'merge_{}_{}.tsv')
CONCATENATED_MERGE_GSEA_TSV = os.path.join(MERGE_GSEA, 'concatenated_merge_{}_{}.tsv')
KEGG_MSIG_GSEA_TSV = os.path.join(MSIG_GSEA, 'msig_kegg_{}_{}.tsv')
REACTOME_MSIG_GSEA_TSV = os.path.join(MSIG_GSEA, 'msig_reactome_{}_{}.tsv')


def make_gsea_export_directories():
    """Ensure that the GSEA export directories exist.

    Creates the results folder, the GSEA folder and one sub-folder per gene set
    collection (KEGG, Reactome, WikiPathways, merge and MSig). Folders that
    already exist are left untouched.
    """
    gsea_directories = (
        RESULTS,
        GSEA,
        KEGG_GSEA,
        REACTOME_GSEA,
        WIKIPATHWAYS_GSEA,
        MERGE_GSEA,
        MSIG_GSEA,
    )
    for directory in gsea_directories:
        os.makedirs(directory, exist_ok=True)


"""ssGSEA"""

# Root folder of the ssGSEA results
SSGSEA = os.path.join(DATA, 'results', 'ssgsea')

# One sub-folder per gene set collection
KEGG_SSGSEA = os.path.join(SSGSEA, 'kegg')
REACTOME_SSGSEA = os.path.join(SSGSEA, 'reactome')
WIKIPATHWAYS_SSGSEA = os.path.join(SSGSEA, 'wikipathways')
MERGE_SSGSEA = os.path.join(SSGSEA, 'merge')
MSIG_SSGSEA = os.path.join(SSGSEA, 'msig')

"""Output files with results for ssGSEA"""

# File name templates (TSV files, not pickles); each one has two '{}'
# placeholders to be filled by the caller.
# NOTE(review): the MSig templates are named '<db>_msig_...' here while the GSEA
# counterparts use 'msig_<db>_...'; kept as is so existing result files stay valid.
KEGG_SSGSEA_TSV = os.path.join(KEGG_SSGSEA, 'kegg_{}_{}.tsv')
REACTOME_SSGSEA_TSV = os.path.join(REACTOME_SSGSEA, 'reactome_{}_{}.tsv')
WIKIPATHWAYS_SSGSEA_TSV = os.path.join(WIKIPATHWAYS_SSGSEA, 'wikipathways_{}_{}.tsv')
MERGE_SSGSEA_TSV = os.path.join(MERGE_SSGSEA, 'merge_{}_{}.tsv')
CONCATENATED_MERGE_SSGSEA_TSV = os.path.join(MERGE_SSGSEA, 'concatenated_merge_{}_{}.tsv')
KEGG_MSIG_SSGSEA_TSV = os.path.join(MSIG_SSGSEA, 'kegg_msig_{}_{}.tsv')
REACTOME_MSIG_SSGSEA_TSV = os.path.join(MSIG_SSGSEA, 'reactome_msig_{}_{}.tsv')


def make_ssgsea_export_directories():
    """Ensure that the ssGSEA export directories exist.

    Creates the results folder, the ssGSEA folder and one sub-folder per gene
    set collection (KEGG, Reactome, WikiPathways, merge and MSig). Folders that
    already exist are left untouched.
    """
    ssgsea_directories = (
        RESULTS,
        SSGSEA,
        KEGG_SSGSEA,
        REACTOME_SSGSEA,
        WIKIPATHWAYS_SSGSEA,
        MERGE_SSGSEA,
        MSIG_SSGSEA,
    )
    for directory in ssgsea_directories:
        os.makedirs(directory, exist_ok=True)


"""GMT Files"""

# Folder inside DATA that is scanned for the existing GMT files and that the
# gene set file paths of this module are built on
GMT_FOLDER = os.path.join(DATA, 'gmt_files')


def check_gmt_files():
    """Look up the GMT files located in ``GMT_FOLDER``.

    Files are recognised by the prefix of their name: ``kegg``, ``reactome``,
    ``wikipathways`` and ``mpath`` (the merged gene set). Files starting with
    ``msigdb``, ``README.rst`` and ``concatenated_merge.gmt`` are skipped
    silently; any other file is reported with a warning. A warning is also
    logged when the folder does not exist, holds no files, or lacks at least
    one of the four expected files. Nothing is raised for missing files.

    :return: paths of the KEGG, Reactome, WikiPathways and merged GMT files, in
     that order; ``None`` takes the place of every file that was not found
    :rtype: tuple
    """
    missing_message = 'GMT files missing, please create them by running the "export_gene_sets" command.'

    # This function is called while the module is imported, so a missing GMT
    # folder has to be reported instead of breaking the import
    try:
        folder_entries = os.listdir(GMT_FOLDER)
    except FileNotFoundError:
        logger.warning(missing_message)
        return None, None, None, None

    # Keep regular files only. Sorting makes the outcome independent of the
    # listing order of the file system when several files share a prefix.
    gmt_file_names = sorted(
        name
        for name in folder_entries
        if os.path.isfile(os.path.join(GMT_FOLDER, name))
    )

    # Warn (do not raise) if no file is found
    if not gmt_file_names:
        logger.warning(missing_message)
        return None, None, None, None

    # Prefixes are listed in the order of the returned tuple
    prefixes = ('kegg', 'reactome', 'wikipathways', 'mpath')
    ignored_names = {'README.rst', 'concatenated_merge.gmt'}
    gmt_paths = dict.fromkeys(prefixes)

    # Get gmt files using the prefix for each database
    for name in gmt_file_names:
        path = os.path.join(GMT_FOLDER, name)
        prefix = next((p for p in prefixes if name.startswith(p)), None)

        if prefix is not None:
            # With several candidates the last one in sorted order is kept
            gmt_paths[prefix] = path
        elif not (name.startswith('msigdb') or name in ignored_names):
            logger.warning('Unknown file {} in gmt folder'.format(path))

    # If any of the GMT files is missing print warning
    if not all(gmt_paths.values()):
        logger.warning(missing_message)

    return tuple(gmt_paths[prefix] for prefix in prefixes)


# Resolved once, when the module is imported, by scanning GMT_FOLDER; each name is
# the path to the matching GMT file or None when no such file was found
KEGG_GENE_SETS, REACTOME_GENE_SETS, WIKIPATHWAYS_GENE_SETS, MERGED_GENE_SETS = check_gmt_files()

# PathBank bulk downloads (zipped CSV files)
_PATHBANK_DOWNLOADS = 'https://pathbank.org/downloads'
PATHBANK_PATHWAYS_PATH = f'{_PATHBANK_DOWNLOADS}/pathbank_all_pathways.csv.zip'
PATHBANK_PROTEINS_PATH = f'{_PATHBANK_DOWNLOADS}/pathbank_all_proteins.csv.zip'
# CSV file names that go with the downloads above (presumably the members of
# the archives -- verify against the code that reads them)
PATHBANK_PATHWAYS_FILE = 'pathbank_pathways.csv'
PATHBANK_PROTEINS_FILE = 'pathbank_all_proteins.csv'

# File names of the PathBank mappings to the other pathway databases
PATHBANK_KEGG_FILE = 'pathbank_kegg.csv'
PATHBANK_REACTOME_FILE = 'pathbank_reactome.csv'
PATHBANK_WIKIPATHWAYS_FILE = 'pathbank_wikipathways.csv'

# URLs of those mapping files in the ComPath resources repository
_PATHBANK_MAPPINGS_BASE = 'https://raw.githubusercontent.com/ComPath/compath-resources/master/mappings'
PATHBANK_KEGG_MAPPINGS = f'{_PATHBANK_MAPPINGS_BASE}/{PATHBANK_KEGG_FILE}'
PATHBANK_REACTOME_MAPPINGS = f'{_PATHBANK_MAPPINGS_BASE}/{PATHBANK_REACTOME_FILE}'
PATHBANK_WIKIPATHWAYS_MAPPINGS = f'{_PATHBANK_MAPPINGS_BASE}/{PATHBANK_WIKIPATHWAYS_FILE}'

# Export the gene set with a time stamp (day_month_year, fixed when the module is imported)
TODAY = time.strftime("%d_%m_%Y")
_DATED_GENESET_SUFFIX = f'_geneset_{TODAY}.gmt'
NEW_KEGG_GENE_SETS = os.path.join(GMT_FOLDER, 'kegg' + _DATED_GENESET_SUFFIX)
NEW_REACTOME_GENE_SETS = os.path.join(GMT_FOLDER, 'reactome' + _DATED_GENESET_SUFFIX)
NEW_WIKIPATHWAYS_GENE_SETS = os.path.join(GMT_FOLDER, 'wikipathways' + _DATED_GENESET_SUFFIX)
NEW_MERGED_GENE_SETS = os.path.join(GMT_FOLDER, 'mpath' + _DATED_GENESET_SUFFIX)

# CSV files with the pathway gene sets (temporary, going by the constant names)
_PATHWAY_GENESET_SUFFIX = '_pathway_geneset.csv'
TEMP_KEGG_PATHWAY_GENESET_CSV = os.path.join(GMT_FOLDER, 'kegg' + _PATHWAY_GENESET_SUFFIX)
TEMP_REACTOME_PATHWAY_GENESET_CSV = os.path.join(GMT_FOLDER, 'reactome' + _PATHWAY_GENESET_SUFFIX)
TEMP_WIKIPATHWAYS_PATHWAY_GENESET_CSV = os.path.join(GMT_FOLDER, 'wikipathways' + _PATHWAY_GENESET_SUFFIX)
TEMP_MERGED_PATHWAY_GENESET_CSV = os.path.join(GMT_FOLDER, 'merged' + _PATHWAY_GENESET_SUFFIX)

# GMT files with fixed names
MSIGDB_KEGG_GENE_SETS = os.path.join(GMT_FOLDER, 'msigdb_kegg.gmt')
MSIGDB_REACTOME_GENE_SETS = os.path.join(GMT_FOLDER, 'msigdb_reactome.gmt')
CONCATENATED_MERGE_GENE_SETS = os.path.join(GMT_FOLDER, 'concatenated_merge.gmt')

# Get csv pairwise mapping files: each URL comes with the local path (inside
# BIO2BEL_DATA_DIR) named after the last segment of the URL
_COMPATH_MAPPINGS_BASE = "https://raw.githubusercontent.com/ComPath/resources/master/mappings"

KEGG_REACTOME_URL = f"{_COMPATH_MAPPINGS_BASE}/kegg_reactome.csv"
KEGG_REACTOME_PATH = os.path.join(BIO2BEL_DATA_DIR, KEGG_REACTOME_URL.rsplit('/', 1)[-1])

KEGG_WP_URL = f"{_COMPATH_MAPPINGS_BASE}/kegg_wikipathways.csv"
KEGG_WP_PATH = os.path.join(BIO2BEL_DATA_DIR, KEGG_WP_URL.rsplit('/', 1)[-1])

WP_REACTOME_URL = f"{_COMPATH_MAPPINGS_BASE}/wikipathways_reactome.csv"
WP_REACTOME_PATH = os.path.join(BIO2BEL_DATA_DIR, WP_REACTOME_URL.rsplit('/', 1)[-1])

SPECIAL_MAPPINGS_URL = f"{_COMPATH_MAPPINGS_BASE}/special_mappings.csv"
SPECIAL_MAPPINGS_PATH = os.path.join(BIO2BEL_DATA_DIR, SPECIAL_MAPPINGS_URL.rsplit('/', 1)[-1])

# Columns of the ComPath mapping data frame
RESOURCE = 'Resource'
PATHWAY_ID = 'Pathway ID'
# Source side of a mapping
SOURCE_RESOURCE = 'Source Resource'
SOURCE_ID = 'Source ID'
# Kind of mapping, and the 'isPartOf' mapping label
MAPPING_TYPE = 'Mapping Type'
IS_PART_OF = 'isPartOf'
# Target side of a mapping
TARGET_RESOURCE = 'Target Resource'
TARGET_ID = 'Target ID'

# Pathway databases' codes
KEGG = 'kegg'
REACTOME = 'reactome'
WIKIPATHWAYS = 'wikipathways'
MPATH = 'mpath'
MSIG = 'msig'
CONCATENATED_MERGE = 'concatenated_merge'

# List with all pathway resources
PATHWAY_RESOURCES = [KEGG, REACTOME, WIKIPATHWAYS, MPATH, MSIG, CONCATENATED_MERGE]

# Gene set column name of each of the three primary databases
GENESET_COLUMN_NAMES = {
    code: f'{label} Geneset'
    for code, label in (
        (KEGG, 'KEGG'),
        (REACTOME, 'Reactome'),
        (WIKIPATHWAYS, 'WikiPathways'),
    )
}

"""Columns to read to perform ORA analysis."""

# Expected columns to do ORA analysis
GENE_SYMBOL = 'gene_symbol'
FOLD_CHANGE = 'log2FoldChange'
# NOTE(review): 'padj' looks like an adjusted p-value column -- confirm against the input files
P_VALUE = 'padj'

# The three column names above gathered in one set
FC_COLUMNS = {GENE_SYMBOL, FOLD_CHANGE, P_VALUE}