# -*- coding: utf-8 -*-
"""This module contains all the constants used in the PathwayForte repo."""
import logging
import os
import time
from bio2bel import get_data_dir
logger = logging.getLogger(__name__)

# Absolute, symlink-resolved folder that contains this module
dir_path = os.path.dirname(os.path.realpath(__file__))

# Parent folder of ``dir_path``
SOURCE = os.path.abspath(os.path.join(dir_path, os.pardir))

# Data folder where gene sets files are: the ``data`` folder located next to SOURCE
DATA = os.path.abspath(os.path.join(SOURCE, os.pardir, 'data'))
# Data folder that Bio2BEL returns for the 'pathwayforte' module name; the ComPath
# mapping file paths further down in this file are built on top of it.
# NOTE(review): get_data_dir presumably creates the folder on first use - confirm in bio2bel.
BIO2BEL_DATA_DIR = get_data_dir('pathwayforte')

# Cancer Data Sets
# Lower-case data set codes; stored as a set, so only membership is meaningful
CANCER_DATA_SETS = {'brca', 'lihc', 'kirc', 'prad', 'ov'}
# Folder holding the TCGA data. Every '{}' in the paths below is a str.format
# placeholder left for the caller (presumably a data set code - confirm at call sites).
TCGA_DATASETS = os.path.join(DATA, 'tcga_datasets')
_TCGA_DATASET_FOLDER = os.path.join(TCGA_DATASETS, '{}')

# Raw expression matrix from TCGA
EXPRESSION_MATRIX = os.path.join(_TCGA_DATASET_FOLDER, 'expression_matrix_full.txt')

# File with phenotype classes (e.g., tumor vs normal)
PHENOTYPE_CLASSES = os.path.join(_TCGA_DATASET_FOLDER, 'phenotype_classes.cls')
CLASSES = os.path.join(_TCGA_DATASET_FOLDER, 'class.cls')

# Clinical data from TCGA (necessary for survival analysis); this template carries a
# second placeholder inside the file name
CLINICAL_DATA = os.path.join(_TCGA_DATASET_FOLDER, '{}_tcga_clinical_data.tsv')

# Tumor-only expression matrix and the per-group "dimension" files
TUMOR_EXPRESSION_MATRIX = os.path.join(_TCGA_DATASET_FOLDER, 'tumor_expression_matrix.txt')
NORMAL_EXPRESSION_SAMPLES = os.path.join(_TCGA_DATASET_FOLDER, 'normal_expression_dimension.txt')
TUMOR_EXPRESSION_SAMPLES = os.path.join(_TCGA_DATASET_FOLDER, 'tumor_expression_dimension.txt')

# Output folders
RESULTS = os.path.join(DATA, 'results')
CLASSIFIER_RESULTS = os.path.join(RESULTS, 'classifier')
def make_classifier_results_directory():
    """Ensure that the classifier results folder exists.

    Creates ``CLASSIFIER_RESULTS`` together with any missing parent folders and
    does nothing when the folder is already present.
    """
    os.makedirs(CLASSIFIER_RESULTS, exist_ok=True)


# GSEA
# Result folders: one GSEA root with a sub-folder per gene set source
GSEA = os.path.join(DATA, 'results', 'gsea')
KEGG_GSEA = os.path.join(GSEA, 'kegg')
REACTOME_GSEA = os.path.join(GSEA, 'reactome')
WIKIPATHWAYS_GSEA = os.path.join(GSEA, 'wikipathways')
MERGE_GSEA = os.path.join(GSEA, 'merge')
MSIG_GSEA = os.path.join(GSEA, 'msig')

# Output files with results for GSEA. Each file name keeps two '{}' str.format
# placeholders that the caller fills in.
KEGG_GSEA_TSV = os.path.join(KEGG_GSEA, 'kegg_{}_{}.tsv')
REACTOME_GSEA_TSV = os.path.join(REACTOME_GSEA, 'reactome_{}_{}.tsv')
WIKIPATHWAYS_GSEA_TSV = os.path.join(WIKIPATHWAYS_GSEA, 'wikipathways_{}_{}.tsv')
MERGE_GSEA_TSV = os.path.join(MERGE_GSEA, 'merge_{}_{}.tsv')
CONCATENATED_MERGE_GSEA_TSV = os.path.join(MERGE_GSEA, 'concatenated_merge_{}_{}.tsv')
KEGG_MSIG_GSEA_TSV = os.path.join(MSIG_GSEA, 'msig_kegg_{}_{}.tsv')
REACTOME_MSIG_GSEA_TSV = os.path.join(MSIG_GSEA, 'msig_reactome_{}_{}.tsv')
def make_gsea_export_directories():
    """Ensure that the GSEA export directories exist.

    Creates the results folder, the GSEA folder and the KEGG, Reactome,
    WikiPathways, merge and MSig sub-folders. Folders that already exist are
    left untouched.
    """
    gsea_directories = (
        RESULTS,
        GSEA,
        KEGG_GSEA,
        REACTOME_GSEA,
        WIKIPATHWAYS_GSEA,
        MERGE_GSEA,
        MSIG_GSEA,
    )
    for directory in gsea_directories:
        os.makedirs(directory, exist_ok=True)


# ssGSEA
# Result folders: one ssGSEA root with a sub-folder per gene set source
SSGSEA = os.path.join(DATA, 'results', 'ssgsea')
KEGG_SSGSEA = os.path.join(SSGSEA, 'kegg')
REACTOME_SSGSEA = os.path.join(SSGSEA, 'reactome')
WIKIPATHWAYS_SSGSEA = os.path.join(SSGSEA, 'wikipathways')
MERGE_SSGSEA = os.path.join(SSGSEA, 'merge')
MSIG_SSGSEA = os.path.join(SSGSEA, 'msig')

# Output files (TSV) with results for ssGSEA. Each file name keeps two '{}'
# str.format placeholders that the caller fills in.
KEGG_SSGSEA_TSV = os.path.join(KEGG_SSGSEA, 'kegg_{}_{}.tsv')
REACTOME_SSGSEA_TSV = os.path.join(REACTOME_SSGSEA, 'reactome_{}_{}.tsv')
WIKIPATHWAYS_SSGSEA_TSV = os.path.join(WIKIPATHWAYS_SSGSEA, 'wikipathways_{}_{}.tsv')
MERGE_SSGSEA_TSV = os.path.join(MERGE_SSGSEA, 'merge_{}_{}.tsv')
CONCATENATED_MERGE_SSGSEA_TSV = os.path.join(MERGE_SSGSEA, 'concatenated_merge_{}_{}.tsv')
# NOTE(review): these two are named '<resource>_msig_*', while the GSEA counterparts in
# this file use 'msig_<resource>_*' - confirm the difference is intended.
KEGG_MSIG_SSGSEA_TSV = os.path.join(MSIG_SSGSEA, 'kegg_msig_{}_{}.tsv')
REACTOME_MSIG_SSGSEA_TSV = os.path.join(MSIG_SSGSEA, 'reactome_msig_{}_{}.tsv')
def make_ssgsea_export_directories():
    """Ensure that the ssGSEA export directories exist.

    Creates the results folder, the ssGSEA folder and the KEGG, Reactome,
    WikiPathways, merge and MSig sub-folders. Folders that already exist are
    left untouched.
    """
    ssgsea_directories = (
        RESULTS,
        SSGSEA,
        KEGG_SSGSEA,
        REACTOME_SSGSEA,
        WIKIPATHWAYS_SSGSEA,
        MERGE_SSGSEA,
        MSIG_SSGSEA,
    )
    for directory in ssgsea_directories:
        os.makedirs(directory, exist_ok=True)


# GMT Files
# Folder scanned by check_gmt_files; the gene set export paths defined further down
# in this file are also located in it
GMT_FOLDER = os.path.join(DATA, 'gmt_files')
def check_gmt_files(gmt_folder=None):
    """Look up the exported GMT files and return their paths.

    Files are recognised by file-name prefix (``kegg``, ``reactome``, ``wikipathways``
    and ``mpath``). Sub-folders are ignored; files starting with ``msigdb`` as well as
    ``README.rst`` and ``concatenated_merge.gmt`` are skipped silently; every other file
    is reported with a warning.

    :param gmt_folder: folder to scan; ``None`` (the default) means ``GMT_FOLDER``
    :return: tuple ``(kegg, reactome, wikipathways, merge)`` with the path of the matching
        file, or ``None`` for each resource without one. A missing or empty folder yields
        four ``None`` values.
    """
    folder = GMT_FOLDER if gmt_folder is None else gmt_folder

    # This function is called while the module is being imported, so a folder that has
    # not been created yet must not raise: it is handled exactly like an empty folder.
    try:
        entries = os.listdir(folder)
    except FileNotFoundError:
        entries = []

    # Keep regular files only
    gmt_file_names = [
        name for name in entries
        if os.path.isfile(os.path.join(folder, name))
    ]

    # Nothing to classify: warn (no exception is raised) and report all four as missing
    if not gmt_file_names:
        logger.warning('GMT files missing, please create them by running the "export_gene_sets" command.')
        return None, None, None, None

    kegg_gmt_file = reactome_gmt_file = wikipathways_gmt_file = merge_gmt_file = None

    # Get gmt files using the prefix for each database.
    # NOTE(review): when several files share a prefix the last one listed wins, and
    # os.listdir returns names in arbitrary order - confirm that only one export per
    # resource is expected in this folder.
    for file_name in gmt_file_names:
        file_path = os.path.join(folder, file_name)

        if file_name.startswith('kegg'):
            kegg_gmt_file = file_path
        elif file_name.startswith('reactome'):
            reactome_gmt_file = file_path
        elif file_name.startswith('wikipathways'):
            wikipathways_gmt_file = file_path
        elif file_name.startswith('mpath'):
            merge_gmt_file = file_path
        elif file_name.startswith('msigdb') or file_name in ('README.rst', 'concatenated_merge.gmt'):
            continue
        else:
            logger.warning('Unknown file %s in gmt folder', file_path)

    # If any of the GMT files is missing print warning
    if not all((kegg_gmt_file, reactome_gmt_file, wikipathways_gmt_file, merge_gmt_file)):
        logger.warning('GMT files missing, please create them by running the "export_gene_sets" command.')

    return kegg_gmt_file, reactome_gmt_file, wikipathways_gmt_file, merge_gmt_file
# Paths of the GMT files found in GMT_FOLDER, looked up once when this module is imported;
# a name is None when check_gmt_files found no file for that resource
KEGG_GENE_SETS, REACTOME_GENE_SETS, WIKIPATHWAYS_GENE_SETS, MERGED_GENE_SETS = check_gmt_files()
# PathBank download URLs (zipped CSV files)
PATHBANK_PATHWAYS_PATH = 'https://pathbank.org/downloads/pathbank_all_pathways.csv.zip'
PATHBANK_PROTEINS_PATH = 'https://pathbank.org/downloads/pathbank_all_proteins.csv.zip'

# File names used for the PathBank tables
# (presumably the CSV files handled after download - confirm at call sites)
PATHBANK_PATHWAYS_FILE = 'pathbank_pathways.csv'
PATHBANK_PROTEINS_FILE = 'pathbank_all_proteins.csv'

# ComPath mapping files between PathBank and the other pathway databases
PATHBANK_KEGG_MAPPINGS = (
    'https://raw.githubusercontent.com/ComPath/compath-resources/master/mappings/pathbank_kegg.csv'
)
PATHBANK_REACTOME_MAPPINGS = (
    'https://raw.githubusercontent.com/ComPath/compath-resources/master/mappings/pathbank_reactome.csv'
)
PATHBANK_WIKIPATHWAYS_MAPPINGS = (
    'https://raw.githubusercontent.com/ComPath/compath-resources/master/mappings/pathbank_wikipathways.csv'
)

# Mapping file names: the last path segment of the matching URL
PATHBANK_KEGG_FILE = PATHBANK_KEGG_MAPPINGS.rsplit('/', 1)[-1]
PATHBANK_REACTOME_FILE = PATHBANK_REACTOME_MAPPINGS.rsplit('/', 1)[-1]
PATHBANK_WIKIPATHWAYS_FILE = PATHBANK_WIKIPATHWAYS_MAPPINGS.rsplit('/', 1)[-1]
# GMT files with a fixed name inside the GMT folder
MSIGDB_KEGG_GENE_SETS = os.path.join(GMT_FOLDER, 'msigdb_kegg.gmt')
MSIGDB_REACTOME_GENE_SETS = os.path.join(GMT_FOLDER, 'msigdb_reactome.gmt')
CONCATENATED_MERGE_GENE_SETS = os.path.join(GMT_FOLDER, 'concatenated_merge.gmt')

# Pathway / gene set CSV files
# (named TEMP_*, presumably intermediate products of the export - confirm at call sites)
TEMP_KEGG_PATHWAY_GENESET_CSV = os.path.join(GMT_FOLDER, 'kegg_pathway_geneset.csv')
TEMP_REACTOME_PATHWAY_GENESET_CSV = os.path.join(GMT_FOLDER, 'reactome_pathway_geneset.csv')
TEMP_WIKIPATHWAYS_PATHWAY_GENESET_CSV = os.path.join(GMT_FOLDER, 'wikipathways_pathway_geneset.csv')
TEMP_MERGED_PATHWAY_GENESET_CSV = os.path.join(GMT_FOLDER, 'merged_pathway_geneset.csv')

# Export the gene set with a time stamp (day_month_year). The stamp is computed once,
# when this module is imported, so it does not change during a long-running process.
TODAY = time.strftime('%d_%m_%Y')
NEW_KEGG_GENE_SETS = os.path.join(GMT_FOLDER, f'kegg_geneset_{TODAY}.gmt')
NEW_REACTOME_GENE_SETS = os.path.join(GMT_FOLDER, f'reactome_geneset_{TODAY}.gmt')
NEW_WIKIPATHWAYS_GENE_SETS = os.path.join(GMT_FOLDER, f'wikipathways_geneset_{TODAY}.gmt')
NEW_MERGED_GENE_SETS = os.path.join(GMT_FOLDER, f'mpath_geneset_{TODAY}.gmt')
# Get csv pairwise mapping files: download URLs first, local copies below.
# NOTE(review): these URLs point at 'ComPath/resources', while the PathBank mapping URLs
# in this file point at 'ComPath/compath-resources' - confirm both still resolve.
KEGG_REACTOME_URL = 'https://raw.githubusercontent.com/ComPath/resources/master/mappings/kegg_reactome.csv'
KEGG_WP_URL = 'https://raw.githubusercontent.com/ComPath/resources/master/mappings/kegg_wikipathways.csv'
WP_REACTOME_URL = 'https://raw.githubusercontent.com/ComPath/resources/master/mappings/wikipathways_reactome.csv'
SPECIAL_MAPPINGS_URL = 'https://raw.githubusercontent.com/ComPath/resources/master/mappings/special_mappings.csv'

# Local path of each mapping file: the Bio2BEL data folder plus the last URL segment
KEGG_REACTOME_PATH = os.path.join(BIO2BEL_DATA_DIR, KEGG_REACTOME_URL.rsplit('/', 1)[-1])
KEGG_WP_PATH = os.path.join(BIO2BEL_DATA_DIR, KEGG_WP_URL.rsplit('/', 1)[-1])
WP_REACTOME_PATH = os.path.join(BIO2BEL_DATA_DIR, WP_REACTOME_URL.rsplit('/', 1)[-1])
SPECIAL_MAPPINGS_PATH = os.path.join(BIO2BEL_DATA_DIR, SPECIAL_MAPPINGS_URL.rsplit('/', 1)[-1])
# Columns of the ComPath mapping data frame
RESOURCE = 'Resource'
PATHWAY_ID = 'Pathway ID'
MAPPING_TYPE = 'Mapping Type'
SOURCE_RESOURCE = 'Source Resource'
SOURCE_ID = 'Source ID'
TARGET_RESOURCE = 'Target Resource'
TARGET_ID = 'Target ID'

# Mapping label (presumably a value of the mapping type column - confirm against the data)
IS_PART_OF = 'isPartOf'
# Pathway databases' codes
KEGG = 'kegg'
REACTOME = 'reactome'
WIKIPATHWAYS = 'wikipathways'
MPATH = 'mpath'
MSIG = 'msig'
CONCATENATED_MERGE = 'concatenated_merge'

# List with all pathway resources, in the order the codes are declared above
PATHWAY_RESOURCES = [KEGG, REACTOME, WIKIPATHWAYS, MPATH, MSIG, CONCATENATED_MERGE]

# Gene set column title for each of the three primary databases
GENESET_COLUMN_NAMES = {
    KEGG: 'KEGG Geneset',
    REACTOME: 'Reactome Geneset',
    WIKIPATHWAYS: 'WikiPathways Geneset',
}

# Columns to read to perform ORA analysis.
# Expected columns to do ORA analysis
GENE_SYMBOL = 'gene_symbol'
FOLD_CHANGE = 'log2FoldChange'
P_VALUE = 'padj'

# The three column names above as a set, for membership / subset checks
FC_COLUMNS = {GENE_SYMBOL, FOLD_CHANGE, P_VALUE}