diff --git a/helper_scripts/csv2mit_with_dir_out.py b/helper_scripts/csv2mit_with_dir_out.py new file mode 100644 index 0000000..f05b321 --- /dev/null +++ b/helper_scripts/csv2mit_with_dir_out.py @@ -0,0 +1,467 @@ +import datetime +import multiprocessing +import posixpath +import re + +import numpy as np +import os +import pandas as pd +import requests +import math +import functools +import struct +import pdb + +from wfdb.io import _header +from wfdb.io import _signal +from wfdb.io import download +from wfdb.io import annotation +import wfdb + +def csv2mit_with_dir_out(file_name, fs, units, fmt=None, adc_gain=None, baseline=None, + samps_per_frame=None, counter_freq=None, base_counter=None, + base_time=None, base_date=None, comments=None, sig_name=None, + dat_file_name=None, skew=None, byte_offset=None, adc_res=None, + adc_zero=None, init_value=None, checksum=None, block_size=None, + record_only=False, header=True, delimiter=',', verbose=False, write_dir = ''): + """ + Read a WFDB header file and return either a `Record` object with the + record descriptors as attributes or write a record and header file. + + Parameters + ---------- + file_name : str + The name of the WFDB record to be read, without any file + extensions. If the argument contains any path delimiter + characters, the argument will be interpreted as PATH/BASE_RECORD. + Both relative and absolute paths are accepted. If the `pn_dir` + parameter is set, this parameter should contain just the base + record name, and the files fill be searched for remotely. + Otherwise, the data files will be searched for in the local path. + fs : float + This number can be expressed in any format legal for a Python input of + floating point numbers (thus '360', '360.', '360.0', and '3.6e2' are + all legal and equivalent). The sampling frequency must be greater than 0; + if it is missing, a value of 250 is assumed. + units : list, str + This will be applied as the passed list unless a single str is passed + instead - in which case the str will be assigned for all channels. + This field can be present only if the ADC gain is also present. It + follows the baseline field if that field is present, or the gain field + if the baseline field is absent. The units field is a list of character + strings that specifies the type of physical unit. If the units field is + absent, the physical unit may be assumed to be 1 mV. + fmt : list, str, optional + This will be applied as the passed list unless a single str is passed + instead - in which case the str will be assigned for all + channels. A list of strings giving the WFDB format of each file used to + store each channel. Accepted formats are: '80','212','16','24', and + '32'. There are other WFDB formats as specified by: + https://www.physionet.org/physiotools/wag/signal-5.htm + but this library will not write (though it will read) those file types. + Each field is an integer that specifies the storage format of the signal. + All signals in a given group are stored in the same format. The most + common format is format `16` (sixteen-bit amplitudes). The parameters + `samps_per_frame`, `skew`, and `byte_offset` are optional fields, and + if present, are bound to the format field. In other words, they may be + considered as format modifiers, since they further describe the encoding + of samples within the signal file. + adc_gain : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. 
+ This field is a list of numbers that specifies the difference in + sample values that would be observed if a step of one physical unit + occurred in the original analog signal. For ECGs, the gain is usually + roughly equal to the R-wave amplitude in a lead that is roughly parallel + to the mean cardiac electrical axis. If the gain is zero or missing, this + indicates that the signal amplitude is uncalibrated; in such cases, a + value of 200 ADC units per physical unit may be assumed. + baseline : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. This + field can be present only if the ADC gain is also present. It is not + separated by whitespace from the ADC gain field; rather, it is + surrounded by parentheses, which delimit it. The baseline is an integer + that specifies the sample value corresponding to 0 physical units. If + absent, the baseline is taken to be equal to the ADC zero. Note that + the baseline need not be a value within the ADC range; for example, + if the ADC input range corresponds to 200-300 degrees Kelvin, the + baseline is the (extended precision) value that would map to 0 degrees + Kelvin. + samps_per_frame : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + Normally, all signals in a given record are sampled at the (base) + sampling frequency as specified by `fs`; in this case, the number of + samples per frame is 1 for all signals, and this field is conventionally + omitted. If the signal was sampled at some integer multiple, n, of the + base sampling frequency, however, each frame contains n samples of the + signal, and the value specified in this field is also n. (Note that + non-integer multiples of the base sampling frequency are not supported). + counter_freq : float, optional + This field (a floating-point number, in the same format as `fs`) can be + present only if `fs` is also present. Typically, the counter frequency + may be derived from an analog tape counter, or from page numbers in a + chart recording. If the counter frequency is absent or not positive, + it is assumed to be equal to `fs`. + base_counter : float, optional + This field can be present only if the counter frequency is also present. + The base counter value is a floating-point number that specifies the counter + value corresponding to sample 0. If absent, the base counter value is + taken to be 0. + base_time : str, optional + This field can be present only if the number of samples is also present. + It gives the time of day that corresponds to the beginning of the + record, in 'HH:MM:SS' format (using a 24-hour clock; thus '13:05:00', or + '13:5:0', represent 1:05 pm). If this field is absent, the time-conversion + functions assume a value of '0:0:0', corresponding to midnight. + base_date : str, optional + This field can be present only if the base time is also present. It contains + the date that corresponds to the beginning of the record, in 'DD/MM/YYYY' + format (e.g., '25/4/1989' is '25 April 1989'). + comments : list, optional + A list of string comments to be written to the header file. Each string + entry represents a new line to be appended to the bottom of the header + file ('.hea'). + sig_name : list, optional + A list of strings giving the signal name of each signal channel. 
This + will be used for plotting the signal both in this package and + LightWave. Note, this value will be used in preference to the CSV + header, if applicable, to define custom signal names. + dat_file_name : str, optional + The name of the file in which samples of the signal are kept. Although the + record name is usually part of the signal file name, this convention is + not a requirement. Note that several signals can share the same file + (i.e., they can belong to the same signal group); all entries for signals + that share a given file must be consecutive, however. Note, the default + behavior is to save the files in the current working directory, not the + directory of the file being read. + skew : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + Ideally, within a given record, samples of different signals with the + same sample number are simultaneous (within one sampling interval). + If this is not the case (as, for example, when a multitrack analog + tape recording is digitized and the azimuth of the playback head does + not match that of the recording head), the skew between signals can + sometimes be determined (for example, by locating recorded waveform + features with known time relationships, such as calibration signals). + If this has been done, the skew field may be inserted into the header + file to indicate the (positive) number of samples of the signal that + are considered to precede sample 0. These samples, if any, are included + in the checksum. (Note the checksum need not be changed if the skew field + is inserted or modified). + byte_offset : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + Normally, signal files include only sample data. If a signal file + includes a preamble, however, this field specifies the offset in bytes + from the beginning of the signal file to sample 0 (i.e., the length + of the preamble). Data within the preamble is not included in the signal + checksum. Note that the byte offset must be the same for all signals + within a given group (use the skew field to correct for intersignal + skew). This feature is provided only to simplify the task of reading + signal files not generated using the WFDB library; the WFDB library + does not support any means of writing such files, and byte offsets must + be inserted into header files manually. + adc_res: list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + This field can be present only if the ADC gain is also present. It + specifies the resolution of the analog-to-digital converter used to + digitize the signal. Typical ADCs have resolutions between 8 and 16 + bits. If this field is missing or zero, the default value is 12 bits + for amplitude-format signals, or 10 bits for difference-format signals + (unless a lower value is specified by the format field). + adc_zero: list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + This field can be present only if the ADC resolution is also present. 
+ It is an integer that represents the amplitude (sample value) that + would be observed if the analog signal present at the ADC inputs had + a level that fell exactly in the middle of the input range of the ADC. + For a bipolar ADC, this value is usually zero, but a unipolar (offset + binary) ADC usually produces a non-zero value in the middle of its + range. Together with the ADC resolution, the contents of this field + can be used to determine the range of possible sample values. If this + field is missing, a value of 0 is assumed. + init_value : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + This field can be present only if the ADC zero is also present. It + specifies the value of sample 0 in the signal, but is used only if the + signal is stored in difference format. If this field is missing, a + value equal to the ADC zero is assumed. + checksum : list, optional + This field can be present only if the initial value is also present. It + is a 16-bit signed checksum of all samples in the signal. (Thus the + checksum is independent of the storage format.) If the entire record + is read without skipping samples, and the header’s record line specifies + the correct number of samples per signal, this field is compared against + a computed checksum to verify that the signal file has not been corrupted. + A value of zero may be used as a field placeholder if the number of + samples is unspecified. + block_size : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + This field can be present only if the checksum is present. This field + is an integer and is usually 0. If the signal is stored in a file + that must be read in blocks of a specific size, however, this field + specifies the block size in bytes. (On UNIX systems, this is the case + only for character special files, corresponding to certain tape and + raw disk files. If necessary, the block size may be given as a negative + number to indicate that the associated file lacks I/O driver support for + some operations.) All signals belonging to the same signal group have + the same block size. + record_only : bool, optional + Whether to only return the record information (True) or not (False). + If false, this function will generate both a .dat and .hea file. + header : bool, optional + Whether to assume the CSV has a first line header (True) or not (False) + which defines the signal names. If false, this function will generate + either the signal names provided by `sig_name` or set `[ch_1, ch_2, ...]` + as the default. + delimiter : str, optional + What to use as the delimiter for the file to separate data. The default + if a comma (','). Other common delimiters are tabs ('\t'), spaces (' '), + pipes ('|'), and colons (':'). + verbose : bool, optional + Whether to print all the information read about the file (True) or + not (False). + write_dir : str, optional + the target directory to write the data to + + Returns + ------- + record : Record or MultiRecord, optional + The WFDB Record or MultiRecord object representing the contents + of the CSV file read. + + Notes + ----- + CSVs should be in the following format: + + sig_1_name,sig_2_name,... + sig_1_val_1,sig_2_val_1,... + sig_1_val_2,sig_2_val_2,... + ...,...,... + + Or this format if `header=False` is defined: + + sig_1_val_1,sig_2_val_1,... 
+ sig_1_val_2,sig_2_val_2,... + ...,...,... + + The signal will be saved defaultly as a `p_signal` so both floats and + ints are acceptable. + + Examples + -------- + Create the header ('.hea') and record ('.dat') files, specifies both + units to be 'mV' + >>> wfdb.csv2mit('sample-data/100.csv', fs=360, units='mV') + + Create the header ('.hea') and record ('.dat') files, change units for + each signal + >>> wfdb.csv2mit('sample-data/100.csv', fs=360, units=['mV','kV']) + + Return just the record, note the use of lists to specify which values should + be applied to each signal + >>> csv_record = wfdb.csv2mit('sample-data/100.csv', fs=360, units=['mV','mV'], + fmt=['80',212'], adc_gain=[100,200], + baseline=[1024,512], record_only=True) + + Return just the record, note the use of single strings and ints to specify + when fields can be applied to all signals + >>> csv_record = wfdb.csv2mit('sample-data/100.csv', fs=360, units='mV', + fmt=['80','212'], adc_gain=200, baseline=1024, + record_only=True) + + """ + # NOTE: No need to write input checks here since the Record class should + # handle them (except verifying the CSV input format which is for Pandas) + if header: + df_CSV = pd.read_csv(file_name, delimiter=delimiter) + else: + df_CSV = pd.read_csv(file_name, delimiter=delimiter, header=None) + if verbose: + print('Successfully read CSV') + # Extract the entire signal from the dataframe + p_signal = df_CSV.values + # The dataframe should be in (`sig_len`, `n_sig`) dimensions + sig_len = p_signal.shape[0] + if verbose: + print('Signal length: {}'.format(sig_len)) + n_sig = p_signal.shape[1] + if verbose: + print('Number of signals: {}'.format(n_sig)) + # Check if signal names are valid and set defaults + if not sig_name: + if header: + sig_name = df_CSV.columns.to_list() + if any(map(str.isdigit, sig_name)): + print("WARNING: One or more of your signal names are numbers, this "\ + "is not recommended:\n- Does your CSV have a header line "\ + "which defines the signal names?\n- If not, please set the "\ + "parameter 'header' to False.\nSignal names: {}".format(sig_name)) + else: + sig_name = ['ch_'+str(i) for i in range(n_sig)] + if verbose: + print('Signal names: {}'.format(sig_name)) + + # Set the output header file name to be the same, remove path + if write_dir == '': + write_dir = os.path.join(*file_name.split(os.sep)[:-1]) + if os.sep in file_name: + file_name = file_name.split(os.sep)[-1] + record_name = file_name.replace('.csv','') + if verbose: + print('Output header: {}.hea'.format(record_name)) + + # Replace the CSV file tag with DAT + dat_file_name = file_name.replace('.csv','.dat') + dat_file_name = [dat_file_name] * n_sig + if verbose: + print('Output record: {}'.format(dat_file_name[0])) + + # Convert `units` from string to list if necessary + units = [units]*n_sig if type(units) is str else units + + # Set the default `fmt` if none exists + if not fmt: + fmt = ['16'] * n_sig + fmt = [fmt]*n_sig if type(fmt) is str else fmt + if verbose: + print('Signal format: {}'.format(fmt)) + + # Set the default `adc_gain` if none exists + if not adc_gain: + adc_gain = [200] * n_sig + adc_gain = [adc_gain]*n_sig if type(adc_gain) is int else adc_gain + if verbose: + print('Signal ADC gain: {}'.format(adc_gain)) + + # Set the default `baseline` if none exists + if not baseline: + if adc_zero: + baseline = [adc_zero] * n_sig + else: + baseline = [0] * n_sig + baseline = [baseline]*n_sig if type(baseline) is int else baseline + if verbose: + print('Signal baseline: {}'.format(baseline)) + + 
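+    # NOTE: every scalar-or-list field in this function follows the same
+    # per-channel broadcast rule used above for `units`, `fmt`, `adc_gain`
+    # and `baseline`. A minimal sketch, assuming a 2-channel CSV (values
+    # below are made up for illustration):
+    #     adc_gain = 200          ->  [200, 200]
+    #     adc_gain = [100, 200]   ->  [100, 200]   (used as-is)
+    #     units    = 'mV'         ->  ['mV', 'mV']
+    # The same `x = [x]*n_sig if type(x) is ... else x` idiom is applied to
+    # `samps_per_frame`, `skew`, `byte_offset`, `adc_res`, `adc_zero` and
+    # `block_size` below.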
# Convert `samps_per_frame` from int to list if necessary + samps_per_frame = [samps_per_frame]*n_sig if type(samps_per_frame) is int else samps_per_frame + + # Convert `skew` from int to list if necessary + skew = [skew]*n_sig if type(skew) is int else skew + + # Convert `byte_offset` from int to list if necessary + byte_offset = [byte_offset]*n_sig if type(byte_offset) is int else byte_offset + + # Set the default `adc_res` if none exists + if not adc_res: + adc_res = [12] * n_sig + adc_res = [adc_res]*n_sig if type(adc_res) is int else adc_res + if verbose: + print('Signal ADC resolution: {}'.format(adc_res)) + + # Set the default `adc_zero` if none exists + if not adc_zero: + adc_zero = [0] * n_sig + adc_zero = [adc_zero]*n_sig if type(adc_zero) is int else adc_zero + if verbose: + print('Signal ADC zero: {}'.format(adc_zero)) + + # Set the default `init_value` + # NOTE: Initial value (and subsequently the digital signal) won't be correct + # unless the correct `baseline` and `adc_gain` are provided... this is just + # the best approximation + if not init_value: + init_value = p_signal[0,:] + init_value = baseline + (np.array(adc_gain) * init_value) + init_value = [int(i) for i in init_value.tolist()] + if verbose: + print('Signal initial value: {}'.format(init_value)) + + # Set the default `checksum` + if not checksum: + checksum = [int(np.sum(v) % 65536) for v in np.transpose(p_signal)] + if verbose: + print('Signal checksum: {}'.format(checksum)) + + # Set the default `block_size` + if not block_size: + block_size = [0] * n_sig + block_size = [block_size]*n_sig if type(block_size) is int else block_size + if verbose: + print('Signal block size: {}'.format(block_size)) + + # Change the dates and times into `datetime` objects + if base_time: + base_time = _header.wfdb_strptime(base_time) + if base_date: + base_date = datetime.datetime.strptime(base_date, '%d/%m/%Y').date() + + # Convert array to floating point + p_signal = p_signal.astype('float64') + + # Either return the record or generate the record and header files + # if requested + if record_only: + # Create the record from the input and generated values + record = wfdb.Record( + record_name = record_name, + n_sig = n_sig, + fs = fs, + samps_per_frame = samps_per_frame, + counter_freq = counter_freq, + base_counter = base_counter, + sig_len = sig_len, + base_time = base_time, + base_date = base_date, + comments = comments, + sig_name = sig_name, + p_signal = p_signal, + d_signal = None, + e_p_signal = None, + e_d_signal = None, + file_name = dat_file_name, + fmt = fmt, + skew = skew, + byte_offset = byte_offset, + adc_gain = adc_gain, + baseline = baseline, + units = units, + adc_res = adc_res, + adc_zero = adc_zero, + init_value = init_value, + checksum = checksum, + block_size = block_size + ) + if verbose: + print('Record generated successfully') + return record + + else: + # Write the information to a record and header file + wfdb.wrsamp( + record_name = record_name, + fs = fs, + units = units, + sig_name = sig_name, + p_signal = p_signal, + fmt = fmt, + adc_gain = adc_gain, + baseline = baseline, + comments = comments, + base_time = base_time, + base_date = base_date, + write_dir = write_dir + ) + if verbose: + print('File generated successfully') + +if __name__ == "__main__": + pass \ No newline at end of file diff --git a/src/precise_multiple_templates_prog.py b/src/precise_multiple_templates_prog.py index dfe1e9b..03b465e 100644 --- a/src/precise_multiple_templates_prog.py +++ b/src/precise_multiple_templates_prog.py @@ 
-1,1081 +1,1082 @@ import multiprocessing import os import pickle import shutil import sys from multiprocessing import Pool from time import time import matplotlib import matplotlib.pyplot as plt from numpy.core.numeric import base_repr # configure backend here matplotlib.use('SVG') import numpy as np from dtaidistance import dtw from numpy.core.fromnumeric import argmin scripts = '../helper_scripts' if scripts not in sys.path: sys.path.insert(0,scripts) import warnings from copy import copy from dtw.dtw import dtw_std +from csv2mit_with_dir_out import csv2mit_with_dir_out from scipy import stats from scipy.interpolate import CubicSpline, interp1d from scipy.signal.signaltools import fftconvolve import wfdb from build_template import build_template, multi_template data_beats_dir = "../data/beats/" log_dir = "../results/beat_recon_logs_multi_prog_" # Signal freqeuncy variables FREQ = 128 MULTIPLIER_FREQ = 20 #Final freq: 128*20 = 2.56 KHz ADC_GAIN = 200 # Parameters fixed for result evaluation PERC_BINS = 15 # Variable fixed by arguments (eg. arguments) CLUSTER_PERCENTAGE = 3 NUM_BEAT_ANALYZED = 50 INTERPOLATION_TYPE = 'flat' FILES_SELECTED = ["17052.pickle"] PARALLELIZE_ALONG = 'files' LEVELS = [3,4,5,6,7,8,9,10,11] SEC_FOR_INITIAL_TEMPLATES = 3*60 #5*60 LEN_DISTANCE_VECTOR = 60 #80 LEN_DISTANCE_VECTOR_REF = 400 #500 SEC_FOR_NEW_TEMPLATES = 40 #2*60 TIME_MODE = 'short' def add_beat_to_csv(base_path_name,beat): ''' base_path_name is the path to use to save the data in the form: 'results/$log_dir/$sample_name/$lvl/$resampling_type/$signal_type/ ''' mode = '' data_path = os.path.join(base_path_name,"data.csv") first_beat = True if os.path.exists(data_path): mode = 'a' else: mode = 'w' with open(data_path,mode) as f: for pt in beat: if mode == 'w' and first_beat: f.write(str(pt/ADC_GAIN)) first_beat = False else: f.write('\n'+str(pt/ADC_GAIN)) def decimate(v,factor): return v[::factor] def revert_normaliztion(data,params): un_norm_data = [] saturate = lambda x: x if abs(x)<2**11 else (2**11-1)*np.sign(x) for pt in data: un_norm_pt = saturate((pt + params['avg'])*(params['max']-params['min']) + params['min']) un_norm_data.append(un_norm_pt) return un_norm_data def un_norma_and_save_back_beat(beat,log_dir,signal_type,norm_params): dir_to_save_to = os.path.join(log_dir,signal_type) os.makedirs(dir_to_save_to, exist_ok=True) data_v = decimate(beat['v'],MULTIPLIER_FREQ) data_v_un_norm = revert_normaliztion(data_v,norm_params) add_beat_to_csv(dir_to_save_to,data_v_un_norm) def z_score_filter(vector,threshold = 3): out = None z = stats.zscore(vector) out = [vector[idx] for idx in range(len(vector)) if z[idx] Beat_annot: | --> "t": [] | --> "v": [] ''' file_name_full = os.path.join(data_beats_dir, os.path.basename(file_name)) data = {} with open(file_name_full,"rb") as f: data = pickle.load(f) for k in data.keys(): if k <= FREQ*start_after: data.pop(k) return data def upsample_uniform_beat(beat,t_QRS,multiplier): out_beat = {} upsampled = [] v = beat['v'] new_QRS_pos = beat['t'].index(t_QRS)*multiplier last_new_t = (len(v)-1)*multiplier t = np.arange(0,last_new_t+1,multiplier,dtype = np.int64) new_base = np.arange(0,last_new_t,1,dtype = np.int64) f = interp1d(t, v) upsampled = f(new_base) out_beat = {'t':new_base,'v':upsampled,'QRS_pos':new_QRS_pos} return out_beat def upsample_uniform_beats_in_beats_dict(beats,multiplier): up_samp_beats = {} i = 0 for beat in beats: i += 1 up_samp_beats[beat] = upsample_uniform_beat(beats[beat], beat, multiplier) return up_samp_beats def get_beats_in_time_span(data, 
lvl = 0 ,t_start_seconds = 0, t_stop_seconds = SEC_FOR_INITIAL_TEMPLATES): beats = {} for annot in data.keys(): if annot >= int(t_start_seconds*FREQ) and annot <= int(t_stop_seconds*FREQ): beats[annot] = data[annot] elif annot > int(t_stop_seconds*FREQ): break return beats def min_max_normalization_one_beat(beat,param = None): params_out = {} if 'QRS_pos' in list(beat.keys()): normalized_beat = {'t':beat['t'],'v':[], 'QRS_pos': beat['QRS_pos']} else: normalized_beat = {'t':beat['t'],'v':[]} vector = beat['v'] if param is not None: mi_v = param['min'] ma_v = param['max'] avg = param['avg'] norm = (np.array(vector)-mi_v)/(ma_v-mi_v) norm -= avg else: mi_v = min(vector) ma_v = max(vector) norm = (np.array(vector)-mi_v)/(ma_v-mi_v) avg = np.average(norm) norm -= avg normalized_beat['v'] = norm.tolist() params_out['min'] = mi_v params_out['max'] = ma_v params_out['avg'] = avg return normalized_beat,params_out def min_max_normalization_beats_chunk(beats, params = None): params_out = {} normalized_beats = {} for beat in beats: normalized_beats[beat],params_out[beat] = min_max_normalization_one_beat(beats[beat],params) return normalized_beats,params_out def resamp_one_signal(t,v,resample_type = INTERPOLATION_TYPE, min_t = None, max_t = None): if min_t is None: min_t = t[0] if max_t is None: max_t = t[-1] t_extended = copy(t) v_extended = copy(v) if max_t not in t: t_extended.insert(len(t_extended), max_t) v_extended.insert(len(v_extended), 0) if min_t not in t: # This is not needed in this implementation as the first sample is always an events for each beat t_extended.insert(0, min_t) # Still, we write it for clearity and consistency v_extended.insert(0, 0) if resample_type == "linear": f = interp1d(t_extended,v_extended, bounds_error = False, fill_value = (v[0],v[-1])) elif resample_type == "flat": f = interp1d(t_extended,v_extended, kind = 'previous', bounds_error = False, fill_value = (v[0],v[-1])) elif resample_type == "spline": f = CubicSpline(t_extended,v_extended, bc_type="natural") t_new = list(range(min_t,max_t+1)) v_new = f(t_new) return t_new,v_new def percentile_idx(vector,perc): pcen=np.percentile(np.array(vector),perc,interpolation='nearest') i_near=abs(np.array(vector)-pcen).argmin() return i_near def correlate(v1,v2): v1_n = np.array(v1) v2_n = np.array(v2) cov = np.cov(v1_n,v2_n)[0,1] cor = cov/(np.std(v1_n)*np.std(v2_n)) return cor def avg_std_prd(prd_descriptor): dict_out = {'warp':{'avg':None,'std':None},'resamp':{'avg':None,'std':None}} wr = [] rs = [] dict_out['warp']['avg'] = np.average(prd_descriptor['prd_warp']) dict_out['warp']['std'] = np.std(prd_descriptor['prd_warp']) dict_out['resamp']['avg'] = np.average(prd_descriptor['prd_resamp']) dict_out['resamp']['std'] = np.std(prd_descriptor['prd_resamp']) return dict_out def percentile_prd_beat_rel(prd,perc): vec = [] for beat in prd.keys(): vec.append(np.array(prd['prd_warp']) - np.array(prd['prd_resamp'])) idx = percentile_idx(vec,perc) return prd['beats'][idx],prd['ids'][idx] def percentile_prd_beat_abs(prd,perc): vec = [] for beat in prd.keys(): vec.append(prd['prd_warp']) idx = percentile_idx(vec,perc) return prd['beats'][idx],prd['ids'][idx] def find_all_connceted_template(id_temp,templates_info): id_collected = [id_temp] for id_connected in id_collected: for id_to_add in templates_info[id_connected]["connected_template"]: if id_to_add not in id_collected: id_collected.append(id_to_add) return id_collected def plot_perc(orig,warp,resamp,template,title,out_file_name): fig, (ax1, ax2) = plt.subplots(2) 
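    # Top panel (ax1): original beat vs. warped template vs. resampled signal;
    # bottom panel (ax2): the template that was used for the reconstruction.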
fig.suptitle(title) ax1.plot(orig[0],orig[1]) ax1.plot(warp[0],warp[1]) ax1.plot(resamp[0],resamp[1]) ax1.legend(['original','warped template','resampled']) ax2.plot(template) ax2.legend(['template used']) fig.savefig(out_file_name) plt.close(fig) def plot_beat_prd_percentile(orig, prd_id_coupling, templates_collection, lvl, perc, log_dir_this_lvl, file, interpolation_type = INTERPOLATION_TYPE): # prd_id_coupling = {"prd_resamp":[],"prd_right_resamp":[],"prd_left_resamp":[], "prd_warp": [],"prd_warp_right": [],"prd_warp_left": [], "ids":[], "beats": []} #Relative template_sel = {"template":None} beat_id, template_id = percentile_prd_beat_rel(prd_id_coupling, perc) file_name_to_save_fig_rel = os.path.join(log_dir_this_lvl,file.split(".")[0]+"_"+str(perc)+"_perc"+str(lvl)+"_relative.svg") template_sel['template'] = templates_collection[template_id] up_samp_beat = upsample_uniform_beat(orig[beat_id], beat_id, MULTIPLIER_FREQ) #TODO: keep track of QRS events = ADC(up_samp_beat,lvl) up_samp_beat,params = min_max_normalization_one_beat(up_samp_beat) events,_ = min_max_normalization_one_beat(events,params) resampled = resample(events, resample_type = interpolation_type, min_t = up_samp_beat['t'][0], max_t = up_samp_beat['t'][-1]) _, reconstructed, _, _ = warp(resampled,template_sel,events) orig_rel = up_samp_beat['t'],up_samp_beat['v'] recon_rel = reconstructed['t'],reconstructed['v'] resamp_rel = resampled['t'],resampled['v'] title_rel = f'File: {file}, Lvl: {lvl}, Beat time (samples): {beat_id}, {str(perc)} percentile, Relative' plot_perc(orig_rel,recon_rel,resamp_rel,template_sel['template']['point'],title_rel,file_name_to_save_fig_rel) #Absolute beat_id, template_id = percentile_prd_beat_abs(prd_id_coupling, perc) file_name_to_save_fig_abs = os.path.join(log_dir_this_lvl,file.split(".")[0]+"_"+str(perc)+"_perc"+str(lvl)+"_absolute.svg") template_sel['template'] = templates_collection[template_id] up_samp_beat = upsample_uniform_beat(orig[beat_id], beat_id, MULTIPLIER_FREQ) #TODO: keep track of QRS events = ADC(up_samp_beat,lvl) up_samp_beat,params = min_max_normalization_one_beat(up_samp_beat) events,_ = min_max_normalization_one_beat(events,params) resampled = resample(events, resample_type = interpolation_type, min_t = up_samp_beat['t'][0], max_t = up_samp_beat['t'][-1]) _, reconstructed, _, _ = warp(resampled,template_sel,events) orig_abs = up_samp_beat['t'],up_samp_beat['v'] recon_abs = reconstructed['t'],reconstructed['v'] resamp_abs = resampled['t'],resampled['v'] title_abs = f'File: {file}, Lvl: {lvl}, Beat time (samples): {beat_id}, {str(perc)} percentile, Absolute' plot_perc(orig_abs,recon_abs,resamp_abs,template_sel['template']['point'],title_abs,file_name_to_save_fig_abs) def stitch_segments(segments): stitched = [] for i,segment in enumerate(segments): if i == len(segments)-1: stitched.extend(segment) else: stitched.extend(segment[0:-1]) return stitched def change_sampling(vector, size_to_warp): resampled = [] if len(vector) < 3: #If the length of the vector is either 2 or 1 it will be mnorfed in a straight line and such line will be morfed equaly resampled = [vector[0]]*size_to_warp return resampled delta = abs(size_to_warp - len(vector)) # --> number of points to insert/delete #At least one more sample for each sample (except the last one)? 
--> insert k in each hole and recompute delta if delta >= len(vector)-1 and delta > 0: holes = len(vector)-1 k = delta // holes #Number of point for each hole time = range(0,len(vector)*(k+1),k+1) # [o1,i11,i12,...,i1k, o2,i21,i22,...,i2k, o3,..., o_fin] --> NOTE: last sampel is not taken # 0 k+1 2k+2 l*k +l --> Hence, we get to the exact end t_new = range(time[-1]+1) f = interp1d(time,vector, bounds_error = False, fill_value = "extrapolate") vector = list(f(t_new)) delta = abs(size_to_warp - len(vector)) if len(vector) == size_to_warp: return vector grad = np.gradient(vector) grad[-1] = max(grad) + 1 # --> we don't want to insert anything after the last sample grad_idxs = sorted(range(len(grad)), key = lambda idx: grad[idx]) idx_to_consider = grad_idxs[:delta] for i,sample in enumerate(vector): resampled.append(sample) if i in idx_to_consider: if size_to_warp < len(vector): resampled.pop() elif size_to_warp > len(vector): resampled.append((vector[i]+vector[i+1])/2) return resampled def segment_warp(segment): resampled = np.array(change_sampling(segment['segment'], segment['length_to_warp'])) resampled += (segment['v_start'] - resampled[0]) m = (segment['v_stop'] - resampled[-1])/(segment['length_to_warp']-1) coef_to_sum = [m*x for x in range(segment['length_to_warp'])] resampled += coef_to_sum return list(resampled) def warp(resamp_event, templates, events): warped = {} segments = [] # --> {"segment":v, "length_to_warp":length, "v_start":v_eb_0, "v_stop":v_eb_1} segment = {"segment":None, "length_to_warp":None, "v_start":None, "v_stop":None} segment_start = None event_start = None #ad the first and last points of the resampled and truncated beat if not already in event if resamp_event['t'][0] not in events['t']: events['t'].insert(0, resamp_event['t'][0]) events['v'].insert(0, 0) if resamp_event['t'][-1] not in events['t']: events['t'].insert(len(events['t']),resamp_event['t'][-1]) events['v'].insert(len(events['v']),0) #Apply DTW for matching resampled event to template v_src = resamp_event['v'] #This is how its actualy done on the library when calling 'warping_path' dist = float('inf') path = [] selected_template = [] disatances_vector = [] template_id = None for id in templates: t = templates[id]['point'] #dist_this_template, paths_this_template = dtw.warping_paths(v_src, t) with warnings.catch_warnings(): warnings.simplefilter("ignore") dist_this_template, _, path_this_template = dtw_std(v_src, t, dist_only=False) disatances_vector.append(dist_this_template) if dist_this_template < dist: dist = dist_this_template path = path_this_template selected_template = t template_id = id path = [(v1,v2) for v1,v2 in zip(path[0],path[1])] #Remove the "left" steps from the path and segment the template based on the events point #TODO: Problem here in the warping (some beats does not include the ending of the template, especially with low levels), check why !!!! With only 10 beats the effect is not visible # RESULTS: ALL THE TEMPLATES POINTS ARE IN THE LAST POINT OF THE WARP!! 
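    # How the segmentation below proceeds: path pairs whose source index repeats
    # ("left" steps) are skipped, keeping only the first pair for each source
    # sample; whenever a remaining source sample falls on an event time, the
    # template slice between the previous and current matched template indices
    # becomes one segment, which segment_warp() stretches to the sample count
    # between the two events and pins to the event amplitudes at both ends,
    # before stitch_segments() concatenates everything into the reconstructed
    # beat. Schematic sketch (indices made up for illustration):
    #   path = [(0,0), (1,1), (1,2), (2,3)]  ->  keep (0,0), (1,1), (2,3)
    #   events['t'] = [t0, t5, t9]  ->  warp template[i0:i5+1] onto t0..t5,
    #                                   template[i5:i9+1] onto t5..t9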
prev_idx_src = path[-1][0]+1 for idx_src, idx_temp in path: if prev_idx_src == idx_src: continue prev_idx_src = idx_src for idx, t in enumerate(events['t']): if resamp_event['t'][idx_src] == t: if segment_start == None: segment_start = idx_temp event_start = idx else: segment['segment'] = np.array(selected_template[segment_start:idx_temp+1], dtype=np.float64) segment['length_to_warp'] = events['t'][idx] - events['t'][event_start] + 1 segment['v_start'] = events['v'][event_start] segment['v_stop'] = events['v'][idx] w = segment_warp(segment) segments.append(w) segment_start = idx_temp event_start = idx break segment_stitched = stitch_segments(segments) warped = {'t':resamp_event['t'],'v':segment_stitched} return dist, warped, disatances_vector, template_id def dtw_dist(v1,v2): with warnings.catch_warnings(): warnings.simplefilter("ignore") dist = dtw_std(v1, v2) return dist def compute_new_templates(data, t_start, t_stop, old_templates, templates_info): beats_for_new_template = get_beats_in_time_span(data, t_start_seconds=t_start,t_stop_seconds=t_stop) beats_for_new_template,_ = min_max_normalization_beats_chunk(beats_for_new_template) _, new_templates_descriptor = multi_template(beats_for_new_template, percentage_each_cluster = CLUSTER_PERCENTAGE*2, freq= FREQ) lbls_new_templates = list(new_templates_descriptor.keys()) old_ids = list(old_templates.keys()) if old_ids == []: next_id = 0 else: next_id = max(old_ids) + 1 cluster_representatives = {} new_template_set = {} print(f"\n\tNew template built, number of new templates:{len(lbls_new_templates)}\n") old_templates_kept = 0 old_templates_substituted = 0 new_templates_kept = 0 new_templates_substituted = 0 # We search which of the old templates can be considered clustered with the newly founded ones # TODO: /!\ wRONG BACK PROJECTION!!!! 
need to send back the function to template prediction and measure distance from THAT template if len(lbls_new_templates) > 0: for id_old_template in old_templates: t_o = old_templates[id_old_template]['point'] dists = np.zeros((len(lbls_new_templates))) for k,t_n in enumerate(lbls_new_templates): dists[k] = dtw_dist(t_o,new_templates_descriptor[t_n]['center']) min_dist_pos = np.argmin(dists) min_val = dists[min_dist_pos] this_lbls_new_templates = lbls_new_templates[min_dist_pos] distances_intra_cluster = new_templates_descriptor[this_lbls_new_templates]['dist_all_pt_cluster'] dist_threshold = np.average(distances_intra_cluster)+np.std(distances_intra_cluster) if min_val < dist_threshold: if this_lbls_new_templates not in cluster_representatives.keys(): cluster_representatives[this_lbls_new_templates] = {"ids": [], "dists": []} cluster_representatives[this_lbls_new_templates]["ids"].append(id_old_template) cluster_representatives[this_lbls_new_templates]["dists"].append(min_val) else: #This template's minimum distance to the newly found templates is too big for it to be considered part of the same cluster new_template_set[id_old_template] = old_templates[id_old_template] old_templates_kept += 1 # Now we check whether the old templates, clustered with the newly found ones, are representative of the cluster or if we should use the new templates for local_new_id in lbls_new_templates: if local_new_id in cluster_representatives.keys(): connected_ids = cluster_representatives[local_new_id]["ids"] arg_min_dist = np.argmin(cluster_representatives[local_new_id]["dists"]) old_template_id = cluster_representatives[local_new_id]["ids"][arg_min_dist] old_template_dist = cluster_representatives[local_new_id]["dists"][arg_min_dist] # The old template is nearer to the found cluster center than the beat found by the clustering alg if old_template_dist < new_templates_descriptor[local_new_id]["dist"]: new_template_set[old_template_id] = old_templates[old_template_id] connected_ids.remove(old_template_id) templates_info[old_template_id]["connected_template"].extend(connected_ids) new_templates_substituted += 1 # The beat found by the clustering alg is nearer to the found cluster center than the old template else: new_template_set[next_id] = new_templates_descriptor[local_new_id] templates_info[next_id] = {"used": 0, "deceased": False, "connected_template": connected_ids} next_id += 1 old_templates_substituted += len(cluster_representatives[local_new_id]["ids"]) # The templates in this cluster that are not the center are now deceased for id_removed in connected_ids: templates_info[id_removed]["deceased"] = True else: new_template_set[next_id] = new_templates_descriptor[local_new_id] templates_info[next_id] = {"used": 0, "deceased": False, "connected_template": []} next_id += 1 new_templates_kept += 1 print(f"\tOld templates kept untouched: {old_templates_kept}/{len(old_templates)}") print(f"\tOld templates kept, representative of new clusters (but already present): {new_templates_substituted}/{len(old_templates)}") print(f"\tOld templates removed for old clusters (with newly defined params): {len(old_templates) - (old_templates_kept+old_templates_substituted+new_templates_substituted)}/{len(old_templates)}") print(f"\tOld templates removed for new clusters: {old_templates_substituted}/{len(old_templates)}") print(f"\n\tNew templates kept untouched: {new_templates_kept}/{len(lbls_new_templates)}") print(f"\tNew templates kept, representative of old clusters (with new params): {len(lbls_new_templates) - 
(new_templates_kept+new_templates_substituted)}/{len(lbls_new_templates)}") print(f"\tNew templates removed for old clusters: {new_templates_substituted}/{len(lbls_new_templates)}") print(f"\n\tFinal length of the new template set: {len(new_template_set)}") else: new_template_set = old_templates print("\tNo new template found, keeping the previously computed ones") return new_template_set def resample(data, resample_type = INTERPOLATION_TYPE, min_t = None, max_t = None): resampled_data = {"t":None,"v":None} t = data['t'] v = data['v'] t_r,v_r = resamp_one_signal(t,v,resample_type = resample_type, min_t = min_t, max_t = max_t) resampled_data['t'] = t_r resampled_data['v'] = v_r return resampled_data def ADC(beat, nBits, hist = 5, original_bits = 11): #ADC stats delta = 2**original_bits dV = (delta)/(2**nBits) hist = hist/100*dV min_val = -delta//2 events = {'t':[],'v':[], 'QRS_pos': beat['QRS_pos']} #init value, first sample (we assume we always sample the nearest level at start) and ADC status v_0 = beat['v'][0] lowTh = min_val+((v_0-min_val)//dV)*dV highTh = lowTh + dV events['t'].append(beat['t'][0]) events['v'].append(int(lowTh if v_0-lowTh < highTh - v_0 else highTh)) for val,time in zip(beat['v'],beat['t']): #print(f"Value: {val}, time: {time}, low_th = {lowTh - hist}, high_th = { highTh + hist}") if val > highTh + hist or val < lowTh - hist: direction = 1 if val > highTh else -1 lowTh = min_val+((val-min_val)//dV)*dV #Delta from the bottom: (val-min_val)//dV*dV then compute the actual level by summing min_val highTh = lowTh + dV events['t'].append(time) events['v'].append(int(lowTh if direction == 1 else highTh)) return events def reconstruct_beats(data_orig, lvl_number, init_templates = None, start_after = SEC_FOR_INITIAL_TEMPLATES, resample_type = INTERPOLATION_TYPE, num_beats_analyzed = None, verbose = False, log_dir = None): beat_seq_number = 0 prev_len = 0 beats_used = [] distances = [] distances_ref = [] num_distances_out = 0 skip_until = start_after time_info = {'beats_low_res_num': 0, 'tot_beats_num': 0} # COMPUTE INITIAL TEMPLATES #Init template info templates_info = {} #{$template_id: {used: 0, deceased: False, "connected_template": []}} all_templates_collection = {} if init_templates is None: print(f"\n################################################################") print(f"\nLEVELS:{lvl_number}, resample type: {resample_type}: Starting templates ... 
") + templates = compute_new_templates(data_orig, 0, start_after, {}, templates_info) else: templates = init_templates for id_new in templates: templates_info[id_new] = {"used": 0, "deceased": False, "connected_template": []} all_templates_collection = copy (templates) init_templates = copy(templates) prd_id_coupling = {"prd_resamp":[],"prd_right_resamp":[],"prd_left_resamp":[], "prd_warp": [],"prd_warp_right": [],"prd_warp_left": [], "ids":[], "beats":[]} dist_vector = [0]*len(templates) for beat in data_orig.keys(): t_beat = beat/FREQ time_info['tot_beats_num'] += 1 if t_beat < skip_until: continue time_info['beats_low_res_num'] += 1 if beat_seq_number >= num_beats_analyzed: break beat_seq_number +=1 if (beat_seq_number %(num_beats_analyzed/20)==0 or beat_seq_number == 1) and verbose: print(f"Reconstructing beat {beat} ({beat_seq_number}/{num_beats_analyzed}: {100*beat_seq_number /num_beats_analyzed}%, LEVELS:{lvl_number}, resample type: {resample_type})") up_samp_beat = upsample_uniform_beat(data_orig[beat], beat, MULTIPLIER_FREQ) events = ADC(up_samp_beat,lvl_number) # For the warping to work, both signals need to be in the same form (either normalized or un-normalized); otherwise, the distance matrix # expresses non-coherent distances between points. Because the templates NEED to be normalized (to better express a topology, instead of raw magnitude), # this obliges us to also normalize the input signals up_samp_beat, min_max_params = min_max_normalization_one_beat(up_samp_beat) events,_ = min_max_normalization_one_beat(events, min_max_params) # Re-sample and warp resampled = resample(events, resample_type = resample_type, min_t = up_samp_beat['t'][0], max_t = up_samp_beat['t'][-1]) dist, reconstructed, dist_all_template, local_template_id = warp(resampled,templates,events) # Save-back beat un_norma_and_save_back_beat(resampled,log_dir,"resampled",min_max_params) un_norma_and_save_back_beat(reconstructed,log_dir,"warped",min_max_params) un_norma_and_save_back_beat(up_samp_beat,log_dir,"original",min_max_params) #Update templates usage info template_id = list(templates_info.keys())[-len(templates)+local_template_id] templates_info[template_id]["used"] += 1 dist_vector = [dist_vector[i]+dist_all_template[i] for i in range(len(dist_vector))] #Compute PRD #{'warp':None,'resamp':None,'warp_left':None,'resamp_left':None,'warp_right':None,'resamp_right':None} prd_this_beat = PRD_one_beat(up_samp_beat,resampled,reconstructed) prd_id_coupling['ids'].append(local_template_id) prd_id_coupling['beats'].append(beat) prd_id_coupling['prd_resamp'].append(prd_this_beat['resamp']) prd_id_coupling['prd_right_resamp'].append(prd_this_beat['resamp_right']) prd_id_coupling['prd_left_resamp'].append(prd_this_beat['resamp_left']) prd_id_coupling['prd_warp'].append(prd_this_beat['warp']) prd_id_coupling['prd_warp_right'].append(prd_this_beat['warp_right']) prd_id_coupling['prd_warp_left'].append(prd_this_beat['warp_left']) # check if we need to re-compute the templates if len(distances_ref) < LEN_DISTANCE_VECTOR_REF: distances_ref.append(dist) if len(distances_ref) >= LEN_DISTANCE_VECTOR_REF: distances.append(dist) if len(distances) >= LEN_DISTANCE_VECTOR: with warnings.catch_warnings(): warnings.simplefilter("ignore") p = stats.anderson_ksamp([distances_ref,distances])[2] distances = [] if (p>= 0.05): num_distances_out = 0 else: # p value less than 0.05: null hypothesis (same distribution) rejected (please Kolmogorov forgive me) num_distances_out += 1 if num_distances_out > 2: # The acquired vector of distances was 
out for 2 times max_accum_dist = max(dist_vector) print(f"\n################################################################") print(f"\nLEVELS:{lvl_number}, resample type: {resample_type}: New template needed ... ") print(f"Beat number:{beat_seq_number} ({beat_seq_number}/{num_beats_analyzed}: {100*beat_seq_number /num_beats_analyzed}%)") print(f"\t p-value: {p}") for j in range(len(dist_vector)): print(f"\tTemplate {j}, dist: {dist_vector[j]}:\t","|"*int(20*dist_vector[j]/max_accum_dist)) print("\n") templates = compute_new_templates(data_orig, t_beat, t_beat+SEC_FOR_NEW_TEMPLATES, templates, templates_info) for t in templates: all_templates_collection[t] = templates[t] dist_vector = [0]*len(templates) print(f"\n################################################################\n") distances_ref = [] skip_until = t_beat + SEC_FOR_NEW_TEMPLATES num_distances_out = 0 #Save the used beats: new_QRS_pos = int((prev_len+up_samp_beat['QRS_pos'])/MULTIPLIER_FREQ) prev_len += len(up_samp_beat['t']) beats_used.append(new_QRS_pos) return all_templates_collection,prd_id_coupling,init_templates,time_info,beats_used def reconstruction_one_lvl_one_file_and_compare(data_orig,lvl,file,initial_templates,verbose = True): log_dir_this_file = os.path.join(log_dir,file.split(".")[0]) interpolation_type_list = None if INTERPOLATION_TYPE == 'all': interpolation_type_list = ['flat','spline','linear'] else: interpolation_type_list = [INTERPOLATION_TYPE] prd_each_interpolation_type = {} for interpolation_type in interpolation_type_list: if verbose: print(f"Level:{lvl}, Using interpolation: {interpolation_type}") log_dir_this_lvl = os.path.join(log_dir_this_file,str(lvl),interpolation_type) os.makedirs(log_dir_this_lvl, exist_ok=True) all_templates_collection, prd_id_coupling, initial_templates, time_info, beats_used = reconstruct_beats(data_orig, lvl, init_templates = initial_templates, start_after = SEC_FOR_INITIAL_TEMPLATES, resample_type = interpolation_type, num_beats_analyzed = NUM_BEAT_ANALYZED, verbose = True, log_dir = log_dir_this_lvl) #resample_type = flat vs linear ############################################################################# # RESULTS EVALUATION # ############################################################################# original_singal_folder = os.path.join(log_dir_this_lvl,"original","data.csv") resampled_singal_folder = os.path.join(log_dir_this_lvl,"resampled","data.csv") warped_singal_folder = os.path.join(log_dir_this_lvl,"warped","data.csv") #TODO: problem here: /!\ level 3- spline does not with many samples, maybe a random beats trigger a condition that breake ecgpwuave (not halting) # IT IS SOLVED BY TERMINATING (CTRL-C ing) ONCE WE USE THE LEVEL 3 --> arresting ecgpuwave stop only the delineator and everything else works # TO ANALYZE BY NOT DELETING LEVELS AND TEST FROM TERMINAL --> there are annotations where level 3 fails to have any reconizable peak around the given annotation ! 
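    # The resampled/warped/original CSV dumps are converted to WFDB records
    # (.hea/.dat) so ecgpuwave can delineate them below. Unlike wfdb.csv2mit,
    # which writes into the current working directory, csv2mit_with_dir_out
    # derives write_dir from the CSV path when write_dir is left empty, so the
    # records land next to their source CSVs and the intermediate CSVs can
    # then be removed.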
- wfdb.csv2mit(original_singal_folder,fs = FREQ, units = 'mV', header = False, fmt = '212', adc_gain=ADC_GAIN, baseline = 0) - wfdb.csv2mit(resampled_singal_folder,fs = FREQ, units = 'mV', header = False, fmt = '212', adc_gain=ADC_GAIN, baseline = 0) - wfdb.csv2mit(warped_singal_folder,fs = FREQ, units = 'mV', header = False, fmt = '212', adc_gain=ADC_GAIN, baseline = 0) + csv2mit_with_dir_out(original_singal_folder,fs = FREQ, units = 'mV', header = False, fmt = '212', adc_gain=ADC_GAIN, baseline = 0) + csv2mit_with_dir_out(resampled_singal_folder,fs = FREQ, units = 'mV', header = False, fmt = '212', adc_gain=ADC_GAIN, baseline = 0) + csv2mit_with_dir_out(warped_singal_folder,fs = FREQ, units = 'mV', header = False, fmt = '212', adc_gain=ADC_GAIN, baseline = 0) - #os.remove(original_singal_folder) - #os.remove(resampled_singal_folder) - #os.remove(warped_singal_folder) + os.remove(original_singal_folder) + os.remove(resampled_singal_folder) + os.remove(warped_singal_folder) wfdb.wrann("data","atr",np.array(beats_used), symbol= ['N']*len(beats_used),write_dir=os.path.join(log_dir_this_lvl,"original")) shutil.copy(os.path.join(log_dir_this_lvl,"original","data.atr"),os.path.join(log_dir_this_lvl,"resampled","data.atr")) shutil.copy(os.path.join(log_dir_this_lvl,"original","data.atr"),os.path.join(log_dir_this_lvl,"warped","data.atr")) cmd_delineator_original = f"(cd {os.path.join(log_dir_this_lvl,'original')} ; timeout 300 ecgpuwave -r data -a atr_new -i atr > /dev/null 2>&1)" cmd_delineator_warped = f"(cd {os.path.join(log_dir_this_lvl,'warped')} ; timeout 300 ecgpuwave -r data -a atr_new -i atr > /dev/null 2>&1)" cmd_delineator_resampled = f"(cd {os.path.join(log_dir_this_lvl,'resampled')} ; timeout 300 ecgpuwave -r data -a atr_new -i atr > /dev/null 2>&1)" os.system(cmd_delineator_original) os.system(cmd_delineator_warped) os.system(cmd_delineator_resampled) stats = avg_std_prd(prd_id_coupling) avg_prd_warp = stats['warp']['avg'] std_prd_warp = stats['warp']['std'] avg_prd_resamp = stats['resamp']['avg'] std_prd_resamp = stats['resamp']['std'] file_name_to_save = "L_"+interpolation_type+"_"+file.split(".")[0]+".log" # Particular log for each lvl with open(os.path.join(log_dir_this_lvl,file_name_to_save),"a") as f: f.write(f"Lvl: {lvl}, using the {interpolation_type} interpolation:\n") f.write(f"\tWarp: {avg_prd_warp}, +-{std_prd_warp}\n") f.write(f"\tInterpolation: {avg_prd_resamp}, +-{std_prd_resamp}\n") f.write(f"\tTime (percentage) passed in low-sampling mode: {time_info['beats_low_res_num']/time_info['tot_beats_num']*100}%\n") f.write(f"\n\n") # General log (the same but all toghether: more confusing but with all infos) with open(os.path.join(log_dir_this_file,file_name_to_save),"a") as f: f.write(f"Lvl: {lvl}, using the {interpolation_type} interpolation:\n") f.write(f"\tWarp: {avg_prd_warp}, +-{std_prd_warp}\n") f.write(f"\tInterpolation: {avg_prd_resamp}, +-{std_prd_resamp}\n") ''' for id_temp in templates_info: usage = templates_info[id_temp]["used"] deceased = templates_info[id_temp]["deceased"] f.write(f"\t\tTemplate {id_temp} was used {usage} times out of {NUM_BEAT_ANALYZED}, \ deceased? 
{deceased}\n") ''' f.write(f"\tTime (percentage) passed in low-sampling mode: {time_info['beats_low_res_num']/time_info['tot_beats_num']*100}%\n") f.write(f"\n\n") print("\n-------------------------------------------------------------------------") print(f"File:{file_name_to_save}, using the {interpolation_type} interpolation:") print(f"\tLvl: {lvl}") print(f"\t\twarp: {avg_prd_warp}, +-{std_prd_warp}") print(f"\t\tinterpolation: {avg_prd_resamp}, +-{std_prd_resamp}") print(f"\t\tTime (percentage) passed in low-sampling mode: {time_info['beats_low_res_num']/time_info['tot_beats_num']*100}%") print("\n") # 01 percentile plot_beat_prd_percentile(data_orig, prd_id_coupling, all_templates_collection, lvl, 1, log_dir_this_lvl, file, interpolation_type = interpolation_type) # 25 percentile plot_beat_prd_percentile(data_orig, prd_id_coupling, all_templates_collection, lvl, 25, log_dir_this_lvl, file, interpolation_type = interpolation_type) # 50 percentile plot_beat_prd_percentile(data_orig, prd_id_coupling, all_templates_collection, lvl, 50, log_dir_this_lvl, file, interpolation_type = interpolation_type) # 75 percentile plot_beat_prd_percentile(data_orig, prd_id_coupling, all_templates_collection, lvl, 75, log_dir_this_lvl, file, interpolation_type = interpolation_type) # 99 percentile plot_beat_prd_percentile(data_orig, prd_id_coupling, all_templates_collection, lvl, 99, log_dir_this_lvl, file, interpolation_type = interpolation_type) #Filter and save back the PRD values for better plots (the filtered data is still accounted for before) prd_id_coupling['prd_warp'] = z_score_filter(prd_id_coupling['prd_warp']) prd_id_coupling['prd_resamp'] = z_score_filter(prd_id_coupling['prd_resamp']) prd_id_coupling['prd_warp_left'] = z_score_filter(prd_id_coupling['prd_warp_left']) prd_id_coupling['prd_left_resamp'] = z_score_filter(prd_id_coupling['prd_left_resamp']) prd_id_coupling['prd_warp_right'] = z_score_filter(prd_id_coupling['prd_warp_right']) prd_id_coupling['prd_right_resamp'] = z_score_filter(prd_id_coupling['prd_right_resamp']) prd_each_interpolation_type[interpolation_type] = prd_id_coupling # Histograms file_name_to_save_fig_hist = os.path.join(log_dir_this_lvl,file.split(".")[0]+"_hist"+str(lvl)+".svg") file_name_to_save_fig_hist_left = os.path.join(log_dir_this_lvl,file.split(".")[0]+"_hist_left"+str(lvl)+".svg") file_name_to_save_fig_hist_right = os.path.join(log_dir_this_lvl,file.split(".")[0]+"_hist_right"+str(lvl)+".svg") n_bins = min(len(prd_id_coupling['prd_warp']),len(prd_id_coupling['prd_resamp']))*PERC_BINS//100 min_bin = min(min(prd_id_coupling['prd_warp']),min(prd_id_coupling['prd_resamp'])) max_bin = max(max(prd_id_coupling['prd_warp']),max(prd_id_coupling['prd_resamp'])) delta = (max_bin-min_bin)/n_bins bins = np.arange(min_bin,max_bin+delta,delta) plt.figure() plt.hist(prd_id_coupling['prd_warp'], bins = bins, alpha=0.5) plt.hist(prd_id_coupling['prd_resamp'], bins = bins, alpha=0.5) plt.title(f'File: {file}, Lvl: {lvl}, PRD histogram') plt.legend(['PRD warp','PRD resampled']) plt.savefig(file_name_to_save_fig_hist) plt.close() n_bins = min(len(prd_id_coupling['prd_warp_left']),len(prd_id_coupling['prd_left_resamp']))*PERC_BINS//100 min_bin = min(min(prd_id_coupling['prd_warp_left']),min(prd_id_coupling['prd_left_resamp'])) max_bin = max(max(prd_id_coupling['prd_warp_left']),max(prd_id_coupling['prd_left_resamp'])) delta = (max_bin-min_bin)/n_bins bins = np.arange(min_bin,max_bin+delta,delta) plt.figure() plt.hist(prd_id_coupling['prd_warp_left'], bins = bins, alpha=0.5) 
plt.hist(prd_id_coupling['prd_left_resamp'], bins = bins, alpha=0.5) plt.title(f'File: {file}, Lvl: {lvl}, left PRD histogram') plt.legend(['PRD warp','PRD resampled']) plt.savefig(file_name_to_save_fig_hist_left) plt.close() n_bins = min(len(prd_id_coupling['prd_warp_right']),len(prd_id_coupling['prd_right_resamp']))*PERC_BINS//100 min_bin = min(min(prd_id_coupling['prd_warp_right']),min(prd_id_coupling['prd_right_resamp'])) max_bin = max(max(prd_id_coupling['prd_warp_right']),max(prd_id_coupling['prd_right_resamp'])) delta = (max_bin-min_bin)/n_bins bins = np.arange(min_bin,max_bin+delta,delta) plt.figure() plt.hist(prd_id_coupling['prd_warp_right'], bins = bins, alpha=0.5) plt.hist(prd_id_coupling['prd_right_resamp'], bins = bins, alpha=0.5) plt.title(f'File: {file}, Lvl: {lvl}, right PRD histogram') plt.legend(['PRD warp','PRD resampled']) plt.savefig(file_name_to_save_fig_hist_right) plt.close() if INTERPOLATION_TYPE == 'all': log_dir_this_lvl_combined = os.path.join(log_dir_this_file,str(lvl),"combined_results") os.makedirs(log_dir_this_lvl_combined, exist_ok=True) file_name_to_save = "L_"+file.split(".")[0]+".log" global_file_name_to_save = "L_all_interpolations_"+file.split(".")[0]+".log" with open(os.path.join(log_dir_this_lvl_combined,file_name_to_save),"a") as f: f.write(f"Lvl: {lvl}\n") with open(os.path.join(log_dir_this_file,global_file_name_to_save),"a") as f: f.write(f"Lvl: {lvl}\n") # Histograms plt.figure() file_name_to_save_fig_hist = os.path.join(log_dir_this_lvl_combined,file.split(".")[0]+"_hist"+str(lvl)+".svg") n_bins = np.inf min_bin = np.inf max_bin = -np.inf legend_str = [] for interp_type in prd_each_interpolation_type: r_w = prd_each_interpolation_type[interp_type]['prd_warp'] r_r = prd_each_interpolation_type[interp_type]['prd_resamp'] n_bins = min(min(len(r_w),len(r_r))*PERC_BINS//100,n_bins) min_bin = min(min(r_w),min(r_r),min_bin) max_bin = max(max(r_w),max(r_r),max_bin) legend_str.extend([f'PRD warp {interp_type}',f'PRD resampled {interp_type}']) delta = (max_bin-min_bin)/n_bins bins = np.arange(min_bin,max_bin+delta,delta) for interp_type in prd_each_interpolation_type: stats = avg_std_prd(prd_each_interpolation_type[interp_type]) avg_prd_warp = stats['warp']['avg'] std_prd_warp = stats['warp']['std'] avg_prd_resamp = stats['resamp']['avg'] std_prd_resamp = stats['resamp']['std'] # Particular log for each lvl with open(os.path.join(log_dir_this_lvl_combined,file_name_to_save),"a") as f: f.write(f"\tWarp using the {interp_type} interpolation: {avg_prd_warp}, +-{std_prd_warp}\n") f.write(f"\tInterpolation using the {interp_type} interpolation: {avg_prd_resamp}, +-{std_prd_resamp}\n") # General log (the same but all toghether: more confusing but with all infos) with open(os.path.join(log_dir_this_file,global_file_name_to_save),"a") as f: f.write(f"\tWarp using the {interp_type} interpolation: {avg_prd_warp}, +-{std_prd_warp}\n") f.write(f"\tInterpolation using the {interp_type} interpolation: {avg_prd_resamp}, +-{std_prd_resamp}\n") plt.hist(prd_each_interpolation_type[interp_type]['prd_warp'], bins = bins, alpha=0.5) plt.hist(prd_each_interpolation_type[interp_type]['prd_resamp'], bins = bins, alpha=0.5) plt.legend(legend_str) plt.title(f'File: {file}, Lvl: {lvl}, PRD histogram') plt.savefig(file_name_to_save_fig_hist) plt.close() with open(os.path.join(log_dir_this_lvl_combined,file_name_to_save),"a") as f: f.write(f"\n\n") with open(os.path.join(log_dir_this_file,global_file_name_to_save),"a") as f: f.write(f"\n\n") return initial_templates def 
def reconstruct_and_compare_level_parallel(lvl):
    for file in FILES_SELECTED:
        verbose = True
        log_dir_this_file = os.path.join(log_dir, file.split(".")[0])
        os.makedirs(log_dir_this_file, exist_ok=True)
        init_templates = None
        if verbose:
            print(f"(Level: {lvl}): Extracting original data")
        data_orig = open_file(file, start_after=0)
        init_templates = reconstruction_one_lvl_one_file_and_compare(data_orig, lvl, file, init_templates, verbose=True)


def reconstruct_and_compare_file_parallel(file):
    verbose = True
    log_dir_this_file = os.path.join(log_dir, file.split(".")[0])
    os.mkdir(log_dir_this_file)
    init_templates = None
    if verbose:
        print(f"(File: {file}): Extracting original data")
    data_orig = open_file(file, start_after=0)
    for lvl in LEVELS:
        init_templates = reconstruction_one_lvl_one_file_and_compare(data_orig, lvl, file, init_templates, verbose=True)


def process(files, levels, parallelize_along=PARALLELIZE_ALONG, cores=1):
    # ------------ INIT ------------
    # Pick the first free numbered log directory and record the run settings
    global log_dir
    for i in range(1, 10000):
        tmp_log_dir = log_dir+str(i)
        if not os.path.isdir(tmp_log_dir):
            log_dir = tmp_log_dir
            break
    os.makedirs(log_dir, exist_ok=True)
    with open(os.path.join(log_dir, "specs.txt"), "w") as f:
        f.write(f"Results generated by script: {sys.argv[0]}\n")
        f.write(f"Time: {time.ctime(time.time())}\n\n")
        f.write(f"Files: {files}\n")
        f.write(f"Levels: {levels}\n")
        f.write(f"Parallelize along: {parallelize_along}\n")
        f.write(f"Cores: {cores}\n")
        f.write(f"Beats: {NUM_BEAT_ANALYZED}\n")
        f.write(f"Cluster percentage: {CLUSTER_PERCENTAGE}\n")
        f.write(f"Interpolation type: {INTERPOLATION_TYPE}\n")
        f.write(f"Timing mode: {TIME_MODE}\n")

    # ------------ Extract DATA & ANNOTATIONS ------------
    if cores == 1:
        print("Single core")
        if parallelize_along == 'levels':
            for lvl in levels:
                reconstruct_and_compare_level_parallel(lvl)
        elif parallelize_along == 'files':
            for f in files:
                reconstruct_and_compare_file_parallel(f)
    else:
        with Pool(cores) as pool:
            if parallelize_along == 'levels':
                print(f"parallelizing along levels: {levels}")
                pool.map(reconstruct_and_compare_level_parallel, levels)
            elif parallelize_along == 'files':
                print("parallelizing along files")
                pool.map(reconstruct_and_compare_file_parallel, files)


if __name__ == "__main__":
    import argparse
    import time

    seconds_start = time.time()
    local_time_start = time.ctime(seconds_start)
    print("\nStarted at:", local_time_start, "\n\n")

    #global NUM_BEAT_ANALYZED
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", help="Force the analysis of one specific file (or 'all') instead of the default one (the first found)")
    parser.add_argument("--levels", help="Decide how many ADC bit-depth levels to analyze: 1 -> [3], 2 -> [3,4], 3 -> [3,4,5], 4 -> ...")
    parser.add_argument("--cores", help="Force the number of cores used (default: a third of the available ones)")
    parser.add_argument("--parallelize_along", help="Whether to parallelize over 'files' or over 'levels'")
    parser.add_argument("--beats", help="Number of beats used, default: 5000")
    parser.add_argument("--cluster_opt", help="Percentage of points for a cluster to be considered")
    parser.add_argument("--interpolation_type", help="Choose between: spline, flat, linear, and all. Default: flat")
    parser.add_argument("--acquisition_time_mode", help="Manage how long the algorithm acquires the data at full speed and the time horizon used for template recomputation. Choose between: short, normal, and long. Default: normal")
    args = parser.parse_args()

    # File selection
    files = os.listdir(data_beats_dir)
    if args.file is not None:
        if args.file == 'all':
            FILES_SELECTED = files
        else:
            FILES_SELECTED = list(filter(lambda string: args.file in string, files))
    else:
        FILES_SELECTED = [files[0]]

    # Cores
    if args.cores is not None:
        used_cores = int(args.cores)
    else:
        used_cores = multiprocessing.cpu_count()//3

    # Number of beats analyzed
    if args.beats is not None:
        NUM_BEAT_ANALYZED = int(args.beats)
    else:
        NUM_BEAT_ANALYZED = 5000

    # Cluster percentage
    if args.cluster_opt is not None:
        CLUSTER_PERCENTAGE = int(args.cluster_opt)
    else:
        CLUSTER_PERCENTAGE = 3

    # Interpolation type
    if args.interpolation_type is not None:
        INTERPOLATION_TYPE = args.interpolation_type
    else:
        INTERPOLATION_TYPE = "flat"

    # Parallelization axis
    if args.parallelize_along is not None:
        PARALLELIZE_ALONG = args.parallelize_along
    else:
        PARALLELIZE_ALONG = "levels"

    # Quantization levels (ADC bit depths)
    if args.levels is not None:
        LEVELS = LEVELS[:int(args.levels)]
    else:
        LEVELS = [3, 4, 5, 6, 7, 8]

    # Acquisition-timing presets
    if args.acquisition_time_mode is not None:
        if args.acquisition_time_mode == "short":
            SEC_FOR_INITIAL_TEMPLATES = 3*60  # 5*60
            LEN_DISTANCE_VECTOR = 60          # 80
            LEN_DISTANCE_VECTOR_REF = 400     # 500
            SEC_FOR_NEW_TEMPLATES = 40        # 2*60
            TIME_MODE = 'short'
        elif args.acquisition_time_mode == "long":
            SEC_FOR_INITIAL_TEMPLATES = 8*60
            LEN_DISTANCE_VECTOR = 110
            LEN_DISTANCE_VECTOR_REF = 650
            SEC_FOR_NEW_TEMPLATES = 3*60
            TIME_MODE = 'long'
        else:
            SEC_FOR_INITIAL_TEMPLATES = 5*60
            LEN_DISTANCE_VECTOR = 80
            LEN_DISTANCE_VECTOR_REF = 500
            SEC_FOR_NEW_TEMPLATES = 2*60
            TIME_MODE = 'medium'
    else:
        SEC_FOR_INITIAL_TEMPLATES = 5*60
        LEN_DISTANCE_VECTOR = 80
        LEN_DISTANCE_VECTOR_REF = 500
        SEC_FOR_NEW_TEMPLATES = 2*60
        TIME_MODE = 'medium'

    print(f"Analyzing files: {FILES_SELECTED}")
    print(f"Extracting data with {used_cores} cores...")
    process(files=FILES_SELECTED, levels=LEVELS, parallelize_along=PARALLELIZE_ALONG, cores=used_cores)

    seconds_stop = time.time()
    local_time_stop = time.ctime(seconds_stop)
    elapsed = seconds_stop - seconds_start
    hours = elapsed//60//60
    minutes = (elapsed - hours*60*60)//60
    seconds = (elapsed - hours*60*60 - minutes*60)//1
    print("\n\n\n-----------------------------------------------------------------------------------------------------------------")
    print(f"Finished at: {local_time_stop}, elapsed: {elapsed} seconds ({hours} hours, {minutes} minutes, {seconds} seconds)")
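
# ----------------------------------------------------------------------------
# Example invocation (illustrative only: the script name below is a placeholder
# and the record selection depends on the contents of data_beats_dir):
#
#   python reconstruct_and_compare.py --file all --levels 4 --cores 8 \
#       --parallelize_along files --beats 5000 --cluster_opt 3 \
#       --interpolation_type all --acquisition_time_mode normal
#
# Assuming the module-level LEVELS list matches the default [3, 4, 5, 6, 7, 8],
# "--levels 4" restricts the analysis to its first four entries ([3, 4, 5, 6]),
# while "--file all" processes every record found in data_beats_dir.
# ----------------------------------------------------------------------------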