diff --git a/Makefile b/Makefile
index dac6fb4..98385ff 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,7 @@
 init:
 	git lfs install
+	git lfs pull
 	conda install git pip
 	pip install -r requirements.txt
-	pip install -e git+https://github.com/lorsbach/enviPath-python@develop#egg=enviPath-python
\ No newline at end of file
+	pip install -e git+https://github.com/lorsbach/enviPath-python@develop#egg=enviPath-python
+	if [ ! -d TP_prediction/output ]; then mkdir TP_prediction/output; fi
\ No newline at end of file
diff --git a/TP_prediction/find_best_TPs.py b/TP_prediction/find_best_TPs.py
index d8c322c..4ed03ee 100644
--- a/TP_prediction/find_best_TPs.py
+++ b/TP_prediction/find_best_TPs.py
@@ -1,228 +1,231 @@
+import sys
+sys.path.insert(0, '../src/envipath-python/enviPath_python/')
+sys.path.insert(0, '../src/envipath-python/')
 from enviPath_python.enviPath import *
 from enviPath_python.enviPath import *
 from util import *
 import getpass

 #---------------------------#
 #     enviPath SETTINGS     #
 #---------------------------#

 # Define the instance to use
 INSTANCE_HOST = 'https://envipath.org'
-USERNAME = 'jasmin'  # TODO USER: enter your username
+USERNAME = ''  # TODO USER: enter your username

 # MODEL
 # TODO USER: Select model for relative reasoning. Default: Standard model BBD - ML - ECC - 2022
 # New enviPath - default relative reasoning model on envipath: BBD - ML - ECC - 2022
 EP_MODEL_ID = 'https://envipath.org/package/de0cdca1-c3ff-44ed-8ffd-f29c269bfa55/relative-reasoning/646afb6c-6cfc-4d4b-8d22-e196d849ec73'
 # New enviPath - BBD+SOIL relative reasoning model on envipath: BBD+SOIL - ML - ECC - 2022
 # rr_id = 'https://envipath.org/package/de0cdca1-c3ff-44ed-8ffd-f29c269bfa55/relative-reasoning/0aec0115-941d-4e33-a844-4431d3ec598d'
 # New enviPath - BBD+SLUDGE relative reasoning model on envipath: BBD+SLUDGE - ML - ECC - 2022
 # rr_id = 'https://envipath.org/package/de0cdca1-c3ff-44ed-8ffd-f29c269bfa55/relative-reasoning/2a41e599-f962-4268-b7d1-5f1cb252b937'
 # New enviPath - BBD+SOIL+SLUDGE relative reasoning model on envipath: BBD+SOIL+SLUDGE - ML - ECC - 2022
 # rr_id = 'https://envipath.org/package/de0cdca1-c3ff-44ed-8ffd-f29c269bfa55/relative-reasoning/76bd8654-e02f-4fbd-98df-bf61411f9b92'

 # PACKAGE
 # TODO USER: prepare a new package (manually) and add its URI here - make sure it is empty when running the script!
 EP_PACKAGE_ID = 'https://envipath.org/package/de0cdca1-c3ff-44ed-8ffd-f29c269bfa55'  # Test package

 # List of output packages used for the Sludge TP paper
 # pkg_id_1 = 'https://envipath.org/package/0915fad3-b889-4aa8-ac98-0707b717be57'  # Package for results using the BBD - ML - ECC - 2022 model
 # pkg_id_2 = 'https://envipath.org/package/80cf58b1-21e2-4c28-9cc6-dc69c6445bdf'  # Package for results using the BBD+SOIL - ML - ECC - 2022 model
 # pkg_id_3 = 'https://envipath.org/package/7d64aa85-2e3c-413f-a538-4d5f2bfd4662'  # Package for results using the BBD+SLUDGE - ML - ECC - 2022 model
 # pkg_id_4 = 'https://envipath.org/package/11f2acd5-5209-4d49-ad77-93f6f6965886'  # Package for results using the BBD+SOIL+SLUDGE - ML - ECC - 2022 model

 #---------------------------#
 #  PATHWAY SEARCH SETTINGS  #
 #---------------------------#
 # These are the default settings used for the Sludge TP paper.
 # They can be modified to direct the pathway search towards a specific objective.
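 # For illustration only (hypothetical values, not the published defaults): a search
 # targeting trifluoromethyl-containing TPs could set MOIETY = "C(F)(F)F" and
 # SORT_TPS_BY_SIZE = True, so that only matching compounds are expanded, smallest first.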
 # Maximum number of TPs to predict
-MAX_TP = 1
+MAX_TP = 50

 # Lower probability threshold
 PROBABILITY_THRESHOLD = -1  # any value equal to or lower than the threshold will be excluded

 # Set probabilities of 0 to 0.01 to continue having a weighting scheme downstream of the pathway
 INCLUDE_0_PROBABILITIES = False

 # Follow moiety - only compounds containing the moiety in their SMILES will be expanded
 MOIETY = ""  # e.g., "C(F)(F)F"

 # To prioritize small compounds in the queue
 SORT_TPS_BY_SIZE = False

 # Follow labeled atoms
 FOLLOW_LABELED_ATOM = False
 ATOM_LABEL = '14'

 #---------------------------#
 #    FILE PATH SETTINGS     #
 #---------------------------#
 # Input/output files
 INPUT_FILE_PATH = 'input/input_structures.tsv'
 OUTPUT_DIRECTORY = 'output/'
 OUTPUT_FILE_TAG = 'TEST'

 #---------------------------#
 #    CONNECT TO ENVIPATH    #
 #---------------------------#
 eP = enviPath(INSTANCE_HOST)
 password = getpass.getpass()
 eP.login(USERNAME, password)

 #---------------------------#
 #         FUNCTIONS         #
 #---------------------------#
 def __main__(rr_id, pkg_id, tag):
     """
     Main function, predicts pathways for a list of input smiles.
     Output: pathways are saved to the specified enviPath package, the TP list as a .tsv file in the output folder

     :param rr_id: URI of the enviPath relative reasoning model to be used
     :param pkg_id: URI of the enviPath package to store resulting pathways
     :param tag: string tag to attach to output files for identification
     """
     rr = RelativeReasoning(eP.requester, id=rr_id)
     pkg = Package(eP.requester, id=pkg_id)
     input_list = load_input(INPUT_FILE_PATH)
     output_file_path = '{}TP_prediction_{}_top_{}.tsv'.format(OUTPUT_DIRECTORY, tag, MAX_TP)
     outfile = open(output_file_path, 'w')
     for compound_input in input_list:
         result = predict_TPs(compound_input['smiles'], compound_input['name'], rr)
         result_list = clean_result(result)  # sort and name TPs
         pathway_info = upload_envipath_pathway(eP, result_list, pkg)
         print_result(outfile, result_list, pathway_info)  # continuous writing of result file


 def print_result(open_file, result, pathway=None):
     """
     Prints output to an open file

     :param open_file: writable file object
     :param result: clean result dictionary from the predict_TPs() function
     :param pathway: pathway URI from the upload_envipath_pathway() function
     """
     open_file.write('///')  # signifies a new pathway entry
     if pathway:
         open_file.write(' Pathway name: {}, Pathway id: {}'.format(pathway['name'], pathway['id']))
     open_file.write('\n')
     header = 'SMILES\tname\tcombined_probability\trules\tgeneration\tprobability\tparent\n'
     open_file.write(header)
     for TP in result:
         open_file.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(TP['smiles'], TP['name'],
                                                               TP['combined_probability'], TP['rules'],
                                                               TP['generation'], TP['probability'],
                                                               TP['parent']))


 def predict_TPs(input_smiles, input_name, rr):
     """
     Pathway prediction for a single compound

     :param input_smiles: input smiles of the parent compound
     :param input_name: name of the parent compound
     :param rr: relative reasoning object
     :return: dictionary of resulting TPs
     """
     print('\n### PREDICT TPs FOR COMPOUND {} ###\n'.format(input_name))
     num_TP = -1  # counter starts at -1, because the source compound is also in the TP list
     validated_TPs = {}  # container for resulting predictions
     queued_items = [{'probability': 1, 'combined_probability': 1, 'smiles': input_smiles, 'generation': 0,
                      'parent': '', 'rules': '', 'name': input_name, 'size': len(input_smiles)}]
     queue = [input_smiles]  # queue is updated after each cycle to have the top TP first, list of smiles
     while num_TP < MAX_TP:
         if len(queue) == 0:
             print('\nEmpty queue - the exploration has converged at {} predicted TPs'.format(num_TP))
             return validated_TPs  # stop TP prediction
         smiles = queue.pop(0)  # get top item in queue
         data = queued_items.pop(0)  # remove data from queued items
         result_list = expand_smiles(smiles, rr)  # create children
         TP_dict = result_to_compound_dict(result_list)
         queue, queued_items, validated_TPs = update_queue(queue, queued_items, validated_TPs, TP_dict, data)
         validated_TPs[smiles] = data
         num_TP += 1
     return validated_TPs


 def update_queue(_queue, _queued_items, _validated_TPs, _TPs, _parent_data):
     """
     Update the queue with TPs predicted in the current iteration

     :param _queue: ordered list of smiles to explore
     :param _queued_items: ordered list of compound dictionaries, same order as _queue
     :param _validated_TPs: list of already validated TPs for the resulting pathway
     :param _TPs: predicted TPs from the current iteration, to be evaluated and added to the queue
     :param _parent_data: compound dictionary of the parent compound of _TPs
     :return: new_queue: new ordered list of smiles to explore
     :return: _queued_items: new ordered list of compound dictionaries
     :return: _validated_TPs: updated list of already validated TPs
     """
     parent_probability = _parent_data['combined_probability']
     parent_generation = _parent_data['generation']
     parent_smiles = _parent_data['smiles']
     queue_before = len(_queue)
     for smiles in _TPs:
         data = _TPs[smiles]
         # If the probability is at or below the threshold, we don't consider the TP further
         this_probability = data['probability']
         if this_probability <= PROBABILITY_THRESHOLD:
             continue
         # If a moiety is given and it is not in the SMILES, we don't follow the TP further
         if MOIETY not in smiles:
             continue
         if FOLLOW_LABELED_ATOM and ATOM_LABEL not in smiles:
             continue
         if INCLUDE_0_PROBABILITIES and this_probability == 0:
             this_probability = 0.01
         # compute the combined probability
         this_combined_probability = parent_probability * this_probability
         this_generation = parent_generation + 1
         rules = data['rules']
         # first, check if the compound is already validated. If yes, update
         if smiles in _validated_TPs.keys():
             _validated_TPs[smiles] = update_compound_entry(_validated_TPs[smiles], this_combined_probability,
                                                            rules, this_generation, parent_smiles,
                                                            size_metric='size', size_value=len(smiles))
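         # (If the same TP is reached again via a different parent, update_compound_entry()
         # keeps the version with the higher combined probability; on a tie it concatenates
         # the rules and parent SMILES - this is how TPs with multiple parents arise.)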
         # next, check if the compound is already in the queue. If yes, update
         elif smiles in _queue:
             index = _queue.index(smiles)
             assert smiles == _queued_items[index]['smiles'], \
                 'smiles {} does not match smiles in {}'.format(smiles, _queued_items[index])
             _queued_items[index] = update_compound_entry(_queued_items[index], this_combined_probability,
                                                          rules, this_generation, parent_smiles,
                                                          size_metric='size', size_value=len(smiles))
         # else, add a new item to the queue
         else:
             data['combined_probability'] = this_combined_probability
             data['generation'] = this_generation
             data['parent'] = parent_smiles
             data['carbon_count'] = smiles.upper().count('C')
             _queued_items.append(data)
             _queue.append(smiles)
     assert len(_queued_items) == len(_queue)
     # First sort by size
     if SORT_TPS_BY_SIZE:
         _queued_items.sort(reverse=False, key=lambda x: x['size'])
     # then order by combined probability
     _queued_items.sort(reverse=True, key=lambda x: x['combined_probability'])
     queue_after = len(_queue)
     print('Added {} smiles to queue'.format(queue_after - queue_before))
     new_queue = [x['smiles'] for x in _queued_items]  # resetting the queue
     print('New queue for compound', parent_smiles)
     for smiles, item in zip(new_queue, _queued_items):
         print(smiles, item['combined_probability'])
     return new_queue, _queued_items, _validated_TPs


 #---------------------------#
 #           MAIN            #
 #---------------------------#
 __main__(rr_id=EP_MODEL_ID, pkg_id=EP_PACKAGE_ID, tag=OUTPUT_FILE_TAG)
diff --git a/TP_prediction/util.py b/TP_prediction/util.py
index db1fc81..c759157 100644
--- a/TP_prediction/util.py
+++ b/TP_prediction/util.py
@@ -1,115 +1,118 @@
+import sys
+sys.path.insert(0, '../src/envipath-python/enviPath_python/')
+sys.path.insert(0, '../src/envipath-python/')
 from enviPath_python.enviPath import *
 from enviPath_python.enviPath import *


 def load_input(input_path):
     """
     Load input smiles and names

     :param input_path: path to the input file
     :return: list of dictionaries containing the smiles and name of each input compound
     """
     f = open(input_path)
     input_list = []
     for line in f:
         if line not in ['', '\n']:
             line_split = line.rstrip().split('\t')
             input_list.append({'smiles': line_split[0], 'name': line_split[1]})
     return input_list


 def upload_envipath_pathway(eP, result, pkg):
     """
     Upload the resulting pathway dictionary to enviPath

     :param eP: enviPath object
     :param result: result list of dictionaries from pathway prediction
     :param pkg: package object where results should be uploaded
     :return: dictionary {'name': pathway name, 'id': URI of pathway}
     """
     assert 'anonymous' not in str(eP.who_am_i()), 'Upload not possible when not logged in'
     source = result[0]
     pkg.add_compound(smiles=source['smiles'], name=source['name'])
     pathway = Pathway.create(pkg, smiles=source['smiles'], name=source['name'], root_node_only=True)
     # Add the predicted degradation products as further nodes
     for TP in result[1:]:
         # print('adding to pw:', TP['name'], TP['smiles'], TP['generation'], TP['parent'])
         pathway.add_node(smiles=TP['smiles'], node_name=TP['name'], node_depth=TP['generation'])
         # check for the case of multiple parents:
         for parent in TP['parent'].split(','):
             pathway.add_node(smiles=parent)
             pathway.add_edge(smirks='{}>>{}'.format(parent, TP['smiles']))
     print('New pathway created for {}: {}'.format(source['name'], pathway.id))
     return {'name': source['name'], 'id': pathway.id}


 def expand_smiles(smiles, rr):
     """
     Get all potential TPs by applying enviPath biotransformation and relative reasoning rules

     :param smiles: input smiles
     :param rr: relative reasoning object
     :return: list of dictionaries for each predicted TP: {'smiles': smiles, 'name':
              rule name, 'probability': relative reasoning probability}
     """
     res = rr.classify_smiles(smiles)
     # sort by probability
     res.sort(reverse=True, key=lambda x: x['probability'])
     return res


 def clean_result(result_dict):
     """
     Sorts TP list for output

     :param result_dict: result dictionary
     :return: sorted and named list of TPs
     """
     result_list = list(result_dict.values())
     result_list.sort(key=lambda x: x['generation'])  # make sure that source compound is first
     result_list.sort(reverse=True, key=lambda x: x['combined_probability'])
     # get name of source compound
     source_name = result_list[0]['name']
     TP_count = 0
     for res in result_list[1:]:
         TP_count += 1
         res['name'] = 'TP_{}_{}'.format(source_name, TP_count)
     return result_list


 def result_to_compound_dict(result):
     """
     Translates result from enviPath node expansion into a compound dictionary

     :param result: list of dictionaries with predicted TP information
     :return: dictionary of TPs
     """
     compound_dict = {}
     for r in result:
         probability = float(r['probability'])
         for product_smiles in r['products']:
             if product_smiles not in compound_dict.keys():
                 compound_dict[product_smiles] = {'rules': r['name'], 'probability': probability,
                                                  'smiles': product_smiles}
             else:
                 # check if there's a rule with better probability
                 if probability > compound_dict[product_smiles]['probability']:
                     # update probability and rules associated to this probability
                     compound_dict[product_smiles]['probability'] = probability
                     compound_dict[product_smiles]['rules'] = r['name']
     return compound_dict


 def update_compound_entry(compound_entry, this_combined_probability, rules, this_generation, parent_smiles,
                           size_metric, size_value):
     """
     Update the compound entry with new information

     :param compound_entry: dictionary of compound information
     :param this_combined_probability: new combined probability
     :param rules: new rules
     :param this_generation: new generation
     :param parent_smiles: new parent compound
     :param size_metric: size metric
     :param size_value: new size value
     :return: updated compound entry
     """
     if compound_entry['combined_probability'] < this_combined_probability:
         compound_entry['combined_probability'] = this_combined_probability
         compound_entry['rules'] = rules
         compound_entry['generation'] = this_generation
         compound_entry['parent'] = parent_smiles
         compound_entry[size_metric] = size_value
     elif compound_entry['combined_probability'] == this_combined_probability:
         compound_entry['rules'] += ',{}'.format(rules)
         compound_entry['parent'] += ',{}'.format(parent_smiles)
     return compound_entry
diff --git a/readme.md b/readme.md
index f9f460b..ee467c2 100644
--- a/readme.md
+++ b/readme.md
@@ -1,51 +1,49 @@
 # TP_predict - Predict TPs and create suspect lists

 This collection of scripts allows the user to reproduce the TP prediction and data analyses presented in the following publication:

 Trostel, L. & Coll, C., Fenner, K., Hafner, J. Synergy of predictive and analytical methods advances elucidating biotransformation processes in activated sludge, 2023. [insert DOI]

 The tools can further be used to perform the same predictions and analyses on a different set of compounds.

 ## Content
 * **TP_prediction**: Script to predict TPs and corresponding biodegradation pathways
 * **File_conversion**: Conversion of prediction output to input for suspect screening tools
   * Prediction_output_to_mass_list
   * SMILES_to_mass_and_inclusion_list
 * **Additional_analyses**
   * Compare_methods
   * Analyse_cutoff_thresholds

 Specific user guidance can be found in the README.md files of the content folders.
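 For illustration, the TP_prediction script expects a two-column, tab-separated input file (SMILES, then compound name). The example compounds below are hypothetical placeholders:
 ```
 OCCN1CCOCC1	4-(2-hydroxyethyl)morpholine
 c1ccccc1	benzene
 ```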
 ## How to

 To fetch the code from the git repository, open a terminal and run:
 ```
 $ git clone [insert link]
 ```
-Optional: To get the input/output data presented in the manuscript, additionally get the lfs files
+Go to the newly created directory:
 ```
-$ git lfs install
-$ git lfs pull
+$ cd TP_predict
 ```
-To install the dependencies, go to the nicepath directory and run:
+To set up TP_predict and install the dependencies, run:
 ```
-$ cd TP_predict
 $ make
 ```

 ## Installation and requirements

 The scripts require rdkit for python, which is most easily installed in a conda environment.
 All scripts have been developed and tested in Python version 3.6 on Mac, Linux and Windows operating systems.

 ### Anaconda step-by-step guide for non-python users:
 1. [Download Anaconda](https://docs.anaconda.com/anaconda/install/index.html) and install it, then run `Anaconda Navigator`
 2. Create a new environment under the `Environments` tab, select python version 3.6.13
 3. Go to environments, click the `play button` on the newly created environment, open a Terminal
 4. Run the following lines individually (confirm by typing `y` and pressing `enter`; this might take a while): `conda install -c rdkit rdkit` and `pip install pubchempy`
 5. Check that pandas is installed and active according to [this Tutorial](https://docs.anaconda.com/anaconda/navigator/tutorials/pandas/)
 6. Open `Anaconda Navigator`, go to the `Home` tab, check that `Applications on` is set to the new environment
 7. Click the `gear icon` on `Spyder` > install specific version > 5.0.5 and wait for the installation to finish
 8. Click the `launch button` below `Spyder`
diff --git a/requirements.txt b/requirements.txt
index 8e46a2b..5d1b87c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
 networkx
 numpy
 pandas
 rdkit
 matplotlib
 seaborn
 pubchempy
-matplotlib
\ No newline at end of file
+matplotlib
+requests
\ No newline at end of file
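To make the search strategy in `find_best_TPs.py` easy to follow without an enviPath account, here is a minimal, self-contained sketch (an editorial illustration, not repository code): each candidate TP carries a combined probability, the product of the single-step relative reasoning probabilities along its pathway, and the queue is re-sorted after every expansion so that the most probable candidate is expanded next. `predict_children()` and its toy values are hypothetical stand-ins for `rr.classify_smiles()`.

```python
def predict_children(smiles):
    """Toy predictions: list of (child_smiles, single-step probability)."""
    toy = {'A': [('B', 0.8), ('C', 0.4)], 'B': [('D', 0.5)]}
    return toy.get(smiles, [])

def best_first_search(source, max_tp=3):
    # the source compound enters the queue with combined probability 1
    queue = [{'smiles': source, 'combined_probability': 1.0}]
    validated = []
    # source counts as one list entry, like the num_TP = -1 counter in the real script
    while queue and len(validated) <= max_tp:
        item = queue.pop(0)  # most probable candidate so far
        validated.append(item)
        for child, p in predict_children(item['smiles']):
            # children inherit the parent's combined probability times the step probability
            queue.append({'smiles': child,
                          'combined_probability': item['combined_probability'] * p})
        # keep the queue sorted: highest combined probability first
        queue.sort(reverse=True, key=lambda x: x['combined_probability'])
    return validated

if __name__ == '__main__':
    for tp in best_first_search('A'):
        print(tp['smiles'], tp['combined_probability'])
    # expected output: A 1.0, B 0.8, C 0.4, D 0.4
```

The real script adds the bookkeeping omitted here: probability and moiety filters, merging of duplicate TPs across parents, and upload of the finished pathway to an enviPath package.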