# Script by Jasmin, Jan 2022, Eawag
# Expand compound entries with all half-lives and associated data
# Goal: create full data set
import pandas as pd
import numpy as np
import sys
import re
import getpass  # FIX: getpass.getpass() is called below but the module was never imported

sys.path.insert(0, 'C:\\Users\\leetseng\\enviPath-python\\')  # /Users/jasmin/enviPath-python/enviPath_python/
from enviPath_python.enviPath import *
from enviPath_python.objects import *

file_location = '/Halflife_modeling/'

# Define the enviPath instance to use; password is prompted interactively
# so no credential is stored in the script.
INSTANCE_HOST = 'https://envipath.org'
username = 'leetseng'
password = getpass.getpass()
eP = enviPath(INSTANCE_HOST)
eP.login(username, password)

# input / output files
input_file_path = file_location + 'input/soil_compounds_final.txt'
output_file_path_full = file_location + 'output/full_dataset_half-lives.txt'


# todo :
# 1. create file similar to tomu's data set - can we improve model just by re-obtaining all the data ?
# 2.
# try to improve model
# try to switch to the sludge dataset
# try to collect the information and fill into the dictionary
# transfer the function to extract all compounds


def __main__():
    """Read the compound list, fetch every half-life and its scenario
    metadata from enviPath, and write the expanded table to a TSV file."""
    data = pd.read_csv(input_file_path, sep='\t')
    # available headers: Index Name ID Inchikey_full Inchikey_first_14 Full_SMILES Canonical_SMIES Is_composite
    # Cropped_canonical_SMILES Cropped_canonical_SMILES_no_stereo SMILES_pH_7 Number_halflives Comment Additional_HL_source
    # Mol_weigth pKa pKa_5to8 charge_pH_7 log(Koc) Koc_source

    # One list per output column; every half-life appends exactly one entry
    # to each list so the dict stays rectangular.
    D = {'index': [], 'compound_id': [], 'smiles': [], 'reduced_smiles': [], 'halflife': [], 'scenario_id': [], 'study_name': [],
         'halflife_model': [], 'halflife_comment': [], 'spike_compound': [], 'acidity': [], 'CEC': [], 'OC': [],
         'biomass_start': [], 'biomass_end': [], 'biomass': [], 'temperature': [], 'wst_value': [],
         'wst_type': [], 'humidity': [], 'humidity_conditions': [], 'soil_texture': [], 'sand': [], 'silt': [], 'clay': [], 'log(Koc)': [], 'Koc_source': []}

    for index, row in data.iterrows():
        compound_id = row['ID']
        print("COMPOUND: {}\n".format(compound_id))
        D = add_halflives(D, compound_id, row)
        # Some rows name a second enviPath compound as an extra half-life
        # source; the cell is NaN (a float) when absent.
        if isinstance(row['Additional_HL_source'], str):
            D = add_halflives(D, row['Additional_HL_source'], row)

    hl_data = pd.DataFrame.from_dict(D)  # convert dict into DF
    hl_data.to_csv(output_file_path_full, sep='\t')


def fetch_acidity(info):
    """Return the scenario pH rounded to 1 decimal, or NaN if unavailable.

    The raw value may be a single number, a 'lo - hi' range (averaged),
    or several ';'-separated entries (only the first is used).
    """
    try:
        raw_pH = info.get_acidity().get_value()
    except Exception:  # acidity not recorded for this scenario
        return np.NaN
    if ';' in raw_pH:
        first = raw_pH.split(';')[0]
        pH = range_to_average(first) if '-' in first else float(first)
    elif '-' in raw_pH:  # if range, get mean value
        pH = range_to_average(raw_pH)
    else:
        pH = float(raw_pH)
    return np.round(pH, 1)


def range_to_average(input_string):
    """Average a 'lo - hi' range string (separator is ' - ' with spaces)."""
    parts = input_string.split(' - ')
    lo = float(parts[0])
    hi = float(parts[1])
    return np.average([lo, hi])


def fetch_cec(info):
    """Return the cation exchange capacity value, or NaN if unavailable."""
    try:
        return info.get_cec().get_value()
    except Exception:
        return np.NaN


def fetch_organic_content(info):
    """Return organic content expressed as organic carbon (OC), NaN if unavailable.

    The raw field is ';'-separated, e.g. '1.2;OM'.  A numeric token must
    precede its unit token, because the 'OC'/'OM' branches reuse the most
    recently parsed number.  OM is converted with OC = OM / 1.7
    (source: Schwarzenbach).
    """
    try:
        raw = info.get_omcontent().get_value()
    except Exception:
        return np.NaN
    oc = np.NaN
    val = np.NaN  # FIX: avoid NameError if a unit token appears before any number
    for token in raw.split(';'):
        if token == 'OC':
            oc = val
        elif token == 'OM':
            oc = val / 1.7  # OC = OM / 1.7, source: Schwarzenbach
        elif '<' in token:
            val = float(token[1:])  # '<x' is taken at face value x
            print("Warning: {} was converted to {}".format(token, val))
        elif token == '' or token == '-':
            val = np.NaN
        else:
            val = float(token)
    return oc


def fetch_biomass(info):
    """Return (start, end) biomass as floats, or (NaN, NaN) if unavailable."""
    try:
        raw = info.get_biomass().get_value()
    except Exception:
        return np.NaN, np.NaN
    parts = raw.split(' - ')
    return float(parts[0]), float(parts[1])


def fetch_temperature(info):
    """Return the mean of the 'min;max' temperature, rounded to an integer;
    NaN if unavailable."""
    try:
        raw = info.get_temperature().get_value()
    except Exception:
        return np.NaN
    lo = float(raw.split(';')[0])
    hi = float(raw.split(';')[1])
    return np.round(np.average([lo, hi]), 0)


def fetch_wst(info):
    """Return (value, type) of the water storage capacity; (NaN, '') if unavailable."""
    try:
        raw = info.get_waterstoragecapacity().get_value()
    except Exception:
        return np.NaN, ''
    parts = raw.replace(" ", "").split('-')
    if len(parts) < 4:
        return float(parts[0]), parts[1]
    # Longer entries carry no usable numeric value; keep only the type token.
    return np.NaN, parts[2]


def fetch_humidity(info):
    """Return (humidity, conditions); (NaN, '') if unavailable."""
    try:
        raw = info.get_humidity().get_value()
    except Exception:
        return np.NaN, ''
    if isinstance(raw, float):  # already numeric, no conditions attached
        return raw, ''
    parts = raw.split(' - ')
    return float(parts[0]), parts[1]


def fetch_soiltexture1(info):
    """Return the textual soil-texture classification, '' if unavailable."""
    try:
        return info.get_soiltexture1().get_value()
    except Exception:
        return ''


def fetch_spikecompound(info):
    """Return the SMILES of the spike compound, '' if unavailable."""
    try:
        link = info.get_spikecompound().get_compoundLink()
        spike_cpd = CompoundStructure(eP.requester, id=link)
        return spike_cpd.get_smiles()
    except Exception:
        return ''


def fetch_soiltexture2(info):
    """Return (sand, silt, clay) percentages; NaNs if unavailable or unparseable."""
    try:
        raw = info.get_soiltexture2().get_value()
    except Exception:
        return np.NaN, np.NaN, np.NaN
    values = re.findall(r'\s([\d.]+)%', raw)
    if not values:
        return np.NaN, np.NaN, np.NaN
    # order in the raw string is sand, silt, clay
    return get_float_or_nan(values[0]), get_float_or_nan(values[1]), get_float_or_nan(values[2])


def fetch_halflife_model(info):
    """Return the half-life model name (first ';' field), '' if unavailable."""
    try:
        raw = info.get_halflife().get_value()
    except Exception:
        return ''
    return raw.split(';')[0]


def fetch_halflife_comment(info):
    """Return the half-life comment (third ';' field), '' if unavailable."""
    try:
        raw = info.get_halflife().get_value()
    except Exception:
        return ''
    fields = raw.split(';')
    # FIX: degrade gracefully (like the other fetchers) instead of raising
    # IndexError when fewer than three fields are present.
    return fields[2] if len(fields) > 2 else ''


def get_float_or_nan(x):
    """float(x), or NaN when x is not a parsable number."""
    try:
        return float(x)
    except (TypeError, ValueError):
        return np.NaN


def add_halflives(D, compound_id, row):
    """Append one record per half-life of `compound_id` to the column dict D.

    Every list in D receives exactly one entry per half-life so the dict
    stays rectangular.  Returns the (mutated) dict.
    """
    compound_index = row['Index']
    reduced_smiles = row['Cropped_canonical_SMILES_no_stereo']
    compound_structure = CompoundStructure(eP.requester, id=compound_id)
    compound = Compound(eP.requester, id=compound_id)
    print(compound_id)
    halflives = compound_structure.get_halflifes()
    smiles = compound.get_smiles()
    for hl in halflives:
        # compound info  (FIX: this and the three labels below had lost their
        # '#' and were bare expressions, i.e. syntax errors)
        D['index'].append(compound_index)
        D['compound_id'].append(compound_id)
        D['smiles'].append(smiles)
        D['reduced_smiles'].append(reduced_smiles)  # cropped_canonical_smiles_no_stereo
        D['halflife'].append(float(hl.hl))
        D['scenario_id'].append(hl.scenarioId)
        D['log(Koc)'].append(row['log(Koc)'])
        D['Koc_source'].append(row['Koc_source'])
        print(hl.scenarioId)
        # fetch data structures
        scenario = Scenario(eP.requester, id=hl.scenarioId)
        add_info = scenario.get_additional_information()
        # add halflife details
        D['halflife_model'].append(fetch_halflife_model(add_info))
        D['halflife_comment'].append(fetch_halflife_comment(add_info))
        D['study_name'].append(scenario.get_name().split(' - ')[0])
        D['spike_compound'].append(fetch_spikecompound(add_info))
        # fetch data points
        D['acidity'].append(fetch_acidity(add_info))
        D['CEC'].append(fetch_cec(add_info))  # cation exchange capacity
        D['OC'].append(fetch_organic_content(add_info))  # organic content as organic carbon (oc)
        start, end = fetch_biomass(add_info)
        D['biomass_start'].append(start)
        D['biomass_end'].append(end)
        D['biomass'].append(np.round(np.average([start, end]), 2))
        D['temperature'].append(fetch_temperature(add_info))
        wst_value, wst_type = fetch_wst(add_info)  # water storage capacity
        D['wst_value'].append(wst_value)
        D['wst_type'].append(wst_type)
        hum, hum_cond = fetch_humidity(add_info)
        D['humidity'].append(hum)
        D['humidity_conditions'].append(hum_cond)
        D['soil_texture'].append(fetch_soiltexture1(add_info))
        _sand, _silt, _clay = fetch_soiltexture2(add_info)
        D['sand'].append(_sand)
        D['silt'].append(_silt)
        D['clay'].append(_clay)
    return D


# FIX: guard the entry point so importing this module does not trigger the
# full enviPath download; running it as a script behaves exactly as before.
if __name__ == '__main__':
    __main__()