diff --git a/MetaCycPWanalysis/check_coverage_and_rank_pathways.py b/MetaCycPWanalysis/check_coverage_and_rank_pathways.py new file mode 100755 index 0000000..e897a0f --- /dev/null +++ b/MetaCycPWanalysis/check_coverage_and_rank_pathways.py @@ -0,0 +1,139 @@ +# LCSB EPFL 2019, Jasmin Hafner + +import networkx as nx +import time +import pandas as pd +import os +import multiprocessing # pathway search is parallelized + +# Files +input_precursor_target_csv = 'data/MetacycExtractedLinearPathways.csv' +db_scopes = ['MetaCyc', 'MetaCyc_hp', 'chemATLAS', 'chemATLAS_hp'] + +distance_type = 'dist' # simple distance calculate as 1/CAR +CAR_cutoff = 0.34 # standard CAR cutoff as described in NICEpath + +max_num_pathways = 10000 # increasing this threshold increases the time for pathway search + +#################### Defaults ##################### +header = ['id', 'length', 'intermediates', 'pairs_id', 'pairs_weight', 'if_native'] +NetworkFilesFolder = '../NetworkAnalysis/Data/' + +##### Graph truncation #### +def getSimpleG(G): + # Removing the edges that are below the defined CAR + simple_G = nx.Graph() + for (u, v, c) in G.edges.data('car'): + if c >= CAR_cutoff: + simple_G.add_edge(u, v) + simple_G[u][v]['car'] = G[u][v]['car'] + simple_G[u][v]['dist'] = G[u][v]['dist'] + return simple_G + +######################################################## +# Check the right pathway as edges of Network # +######################################################## +def addEdgesCoverage(goldenDF): + # Check for each edge is it is included into the network + for NT in db_scopes: + graph_name = 'rpairs_'+NT + G = nx.read_gpickle(NetworkFilesFolder + graph_name + ".gpickle") + Gs = getSimpleG(G) + # getting simpler version of edges for faster comparison + G_edges = [] + for edge in Gs.edges: + e = list(edge) + e.sort() + G_edges.append(str(e[0])+'_'+str(e[1])) + G_edges = set(G_edges) + print('edges for', NT, 'calculated') + goldenDF[NT+'_coveredByNetwork']= goldenDF.apply(check_all_edges_in_network, args = (G_edges), axis=1) + # Check if any pathway exists between the two compounds in the network + goldenDF[NT+'_hasPath']=goldenDF.apply(check_if_has_path, args=(G,), axis=1) + + goldenDF.to_csv('output/networkEdgesCoverage.csv', index = False) + +def check_all_edges_in_network(row, G_edges): + # loading the native pathways + native_pathway = load_pathway(row) + # checking if all the edges of the native pathway are in the network + edges_native = [] + for edge in native_pathway: + e = list(edge) + e.sort() + edges_native.append(str(e[0])+'_'+str(e[1])) + edges_covered = G_edges.intersection(set(edges_native)) + if len(edges_covered)==len(native_pathway): + return 1 + else: + return 0 + +def load_pathway(row): + comps = [int(i) for i in row['Pathway_original_in_refV'].split('|')] + edges = [(comps[i-1], comps[i]) for i in range(1, len(comps))] + return edges + +def check_if_has_path(row, G): + if row['Precursor LCSB ID'] in G.nodes and row['Target LCSB ID'] in G.nodes: + return nx.has_path(G, row['Precursor LCSB ID'], row['Target LCSB ID']) + return False + +#################### PATHWAY SEARCH FUNCTIONS ###################### + +def shortest_paths(Graph, source, target, weight): + try: + return nx.shortest_simple_paths(Graph, source, target, weight=weight) + except Exception as e: + print(source, target, e) + return list() + +def find_pathway_rank(param_list): + #param_list: 0:prec; 1:target; 2:pwid; 3:ref_seq_of_intermediates + entry = 1 + + reference_path = [int(i) for i in param_list[3].split('|')] + + for path in shortest_paths(G, param_list[0], param_list[1], distance_type): + if entry > max_num_pathways: + return 0 + pathway_match = [i for i, j in zip(reference_path, path) if i == j] + if len(pathway_match) == len(reference_path): + return entry + entry += 1 + +##################### MAIN ###################### + +if not os.path.exists('output/networkEdgesCoverage.csv'): + metaCycPwDataset = pd.read_csv(input_precursor_target_csv) + addEdgesCoverage(metaCycPwDataset) + +metaCycPwDataset = pd.read_csv('output/networkEdgesCoverage.csv') + +# Pathway search +local_time = time.ctime(time.time()) +print("Time execution starts:", local_time) + +for db_scope in db_scopes: + column_network = db_scope+"_coveredByNetwork" + column_path = db_scope+"_hasPath" + df_scope = metaCycPwDataset.query('{}==1&{}==1'.format(column_network, column_path)) + print(db_scope) + G = nx.read_gpickle(NetworkFilesFolder + "rpairs_" + db_scope + ".gpickle") + G = getSimpleG(G) + + print("Find paths...", db_scope) + + param_list = list(zip(df_scope['Precursor LCSB ID'].to_list(), df_scope['Target LCSB ID'].to_list(), df_scope['MetaCyc Pathway ID'].to_list(), df_scope['Pathway_original_in_refV'].to_list())) + + # pathway search is parallelized + pool = multiprocessing.Pool() + pw_ranks = pool.map(find_pathway_rank, param_list) + # pool has to be closed to avoid OSError: [Errno 24] Too many open files + pool.close() + + df_scope['pw_rank'] = pw_ranks + + df_scope.to_csv('output/pw_ranking_'+db_scope+'.csv', index = False) + +local_time = time.ctime(time.time()) +print("Time execution ends:", local_time) \ No newline at end of file