diff --git a/MITAnnotation.py b/MITAnnotation.py
new file mode 100644
index 0000000..23296c6
--- /dev/null
+++ b/MITAnnotation.py
@@ -0,0 +1,214 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=E1101, E0102, E0202
+"""
+Created on Mon Feb 27 12:08:29 2012
+
+Module with definition of classes to work with annotations in the MIT format.
+
+@author: T. Teijeiro
+"""
+import struct
+
+class MITAnnotation(object):
+    """
+    This class represents an annotation in the MIT format. Currently only
+    standard 2-byte annotations are supported.
+    """
+
+    #We use slots instead of __dict__ to save memory space when a lot of
+    #annotations are created and managed.
+    __slots__ = ('code', 'time', 'subtype', 'chan', 'num', 'aux')
+
+    def __init__(self, code=0, time=0, subtype=0, chan=0, num=0, aux=None):
+        self.code = code
+        self.time = time
+        self.subtype = subtype
+        self.chan = chan
+        self.num = num
+        self.aux = aux
+
+    def __str__(self):
+        return '{0} {1} {2} {3} {4} {5}'.format(self.time, self.code,
+                                                self.subtype, self.chan,
+                                                self.num, repr(self.aux))
+
+    def __repr__(self):
+        return str(self)
+
+    def __lt__(self, other):
+        return self.time < other.time
+
+#MIT format special codes
+SKIP_TIME = 1023
+AUX_CODE = 63
+SKIP_CODE = 59
+NUM_CODE = 60
+SUB_CODE = 61
+CHN_CODE = 62
+
+
+def is_qrs_annotation(annot):
+    """
+    Checks if an annotation corresponds to a QRS annotation:
+    NORMAL, UNKNOWN, SVPB, FUSION, VESC, LEARN, AESC, NESC, SVESC, PVC,
+    BBB, LBBB, ABERR, RBBB, NPC, PACE, PFUS, APC, RONT.
+    """
+    return annot.code in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+                          25, 30, 34, 35, 38, 41)
+
+
+def read_annotations(path):
+    """
+    Reads an annotation file in the MIT format.
+    See: http://www.physionet.org/physiotools/wag/annot-5.htm
+
+    Parameters
+    ----------
+    path:
+        Path to the file containing the annotations.
+
+    Returns
+    -------
+    out:
+        List of annotation objects.
+    """
+    result = []
+    f = open(path, 'rb')
+    num = 0
+    chan = 0
+    displ = 0
+    while True:
+        bann = f.read(2)
+        if not bann:
+            break
+        (b0, b1) = struct.unpack('bb', bann)
+        A = (b1 & 0xff) >> 2
+        I = ((b1 & 0x03) << 8) | (0xff & b0)
+        #Special codes parsing
+        if A == SKIP_CODE and I == 0:
+            (b0, b1, b2, b3) = struct.unpack('4b', f.read(4))
+            displ = ((b1 << 24) | ((b0 & 0xff) << 16) |
+                     ((b3 & 0xff) << 8) | (b2 & 0xff))
+        elif A == NUM_CODE:
+            num = I
+            result[-1].num = num
+        elif A == SUB_CODE:
+            result[-1].subtype = I
+        elif A == CHN_CODE:
+            chan = I
+            result[-1].chan = chan
+        elif A == AUX_CODE:
+            result[-1].aux = f.read(I)
+            if I % 2 != 0:
+                f.read(1)
+        elif A == I == 0:
+            break
+        else:
+            result.append(MITAnnotation(code=A, time=I + displ,
+                                        chan=chan, num=num))
+            displ = 0
+    f.close()
+    #Now, for each annotation we put the absolute time
+    abs_time = 0
+    for annot in result:
+        abs_time += annot.time
+        annot.time = max(0, abs_time)
+    return result
+
+def save_annotations(annots, path):
+    """
+    Saves a list of annotations in a file, in the MIT format.
+    See: http://www.physionet.org/physiotools/wag/annot-5.htm
+
+    Parameters
+    ----------
+    annots: List of MITAnnotation objects to be saved. It is sorted before
+        the writing operation.
+    path: Path to the file where the list is saved.
+    """
+    annots = sorted(annots)
+    f = open(path, 'wb')
+    prev_time = 0
+    prev_num = 0
+    prev_chn = 0
+
+    for anot in annots:
+        rel_time = anot.time - prev_time
+        #If the distance is greater than 1023 (what we can express with 10
+        #bits), we should write a skip code in the file.
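+        #A standard annotation is one 16-bit word: the annotation code A in
+        #the upper 6 bits and the time increment I in the lower 10 bits,
+        #matching the unpacking in read_annotations above
+        #(A = (b1 & 0xff) >> 2, I = ((b1 & 0x03) << 8) | (b0 & 0xff)).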
+        if rel_time > SKIP_TIME:
+            #A = SKIP_CODE, I = 0; then a 4-byte PDP-11 long integer
+            f.write(struct.pack('>H', SKIP_CODE << 2))
+            f.write(struct.pack('<H', rel_time >> 16))
+            f.write(struct.pack('<H', rel_time & 0xFFFF))
+            rel_time = 0
+        #Annotation code and time increment
+        f.write(struct.pack('<H', (anot.code << 10) | rel_time))
+        #NUM, SUB, CHN and AUX fields are written only when they change or are set
+        if anot.num != prev_num:
+            f.write(struct.pack('<H', (NUM_CODE << 10) | anot.num))
+            prev_num = anot.num
+        if anot.subtype != 0:
+            f.write(struct.pack('<H', (SUB_CODE << 10) | anot.subtype))
+        if anot.chan != prev_chn:
+            f.write(struct.pack('<H', (CHN_CODE << 10) | anot.chan))
+            prev_chn = anot.chan
+        if anot.aux is not None:
+            f.write(struct.pack('<H', (AUX_CODE << 10) | len(anot.aux)))
+            f.write(anot.aux)
+            if len(anot.aux) % 2 != 0:
+                f.write(struct.pack('b', 0))
+        prev_time = anot.time
+    #The end of the annotation file is marked with A = I = 0
+    f.write(struct.pack('<H', 0))
+    f.close()
diff --git a/PerformanceMetricsLib.py b/PerformanceMetricsLib.py
new file mode 100644
--- /dev/null
+++ b/PerformanceMetricsLib.py
+import numpy as np
+
+def calculateStartsAndStops(labels):
+    ''' detects starts and stops of (seizure) events in a binary label vector '''
+    labels = np.squeeze(labels)
+    diffSig = np.diff(np.concatenate(([0], labels, [0])))
+    starts = np.where(diffSig == 1)[0]
+    stops = np.where(diffSig == -1)[0]
+    return [[starts[i], stops[i]] for i in range(len(starts))]
+
+def calc_TPAndFP_Mod(ref, hyp, toleranceFP_bef, toleranceFP_aft):
+    ''' for a pair of ref and hyp events decides on tp and fp, with some
+    tolerance allowed before and after the ref event '''
+    ## collect start and stop times from input arg events
+    start_ref = ref[0]
+    stop_ref = ref[1]
+    start_hyp = hyp[0]
+    stop_hyp = hyp[1]
+
+    ##### detect if hyp and ref have some overlap
+    tp = 0
+    # ref:       <--------------------->
+    # hyp:  <----------------
+    if (start_hyp <= start_ref and stop_hyp > start_ref - toleranceFP_bef):  #+ tolerance
+        tp = 1
+    # ref:  <--------------------->   |
+    # hyp:              <----------------
+    elif (start_hyp < stop_ref + toleranceFP_aft and stop_hyp >= stop_ref):  #- tolerance
+        tp = 1
+    # ref:  <--------------------->
+    # hyp:        <---->
+    elif (stop_hyp <= stop_ref and start_hyp >= start_ref):
+        tp = 1
+
+    #### detect fp
+    fp = 0
+    fp_bef = 0
+    fp_aft = 0
+    # ref:   |   <--------------------->   |
+    # hyp: <----------------
+    if (start_hyp < start_ref - toleranceFP_bef):
+        fp = fp + 1
+        fp_bef = 1
+    # ref:   |   <--------------------->   |
+    # hyp:                 ------------------>
+    if (stop_hyp > stop_ref + toleranceFP_aft):
+        fp = fp + 1
+        fp_aft = 1
+
+    return (tp, fp, fp_bef, fp_aft)
+
+def calc_TPAndFP(ref, hyp, toleranceFP_bef, toleranceFP_aft):
+    ''' for a pair of ref and hyp events decides on tp and fp '''
+    ## collect start and stop times from input arg events
+    start_ref = ref[0]
+    stop_ref = ref[1]
+    start_hyp = hyp[0]
+    stop_hyp = hyp[1]
+
+    ##### detect if hyp and ref have some overlap
+    tp = 0
+    # ref:       <--------------------->
+    # hyp:  <----------------
+    if (start_hyp <= start_ref and stop_hyp > start_ref):
+        tp = 1
+    # ref:  <--------------------->
+    # hyp:              <----------------
+    elif (start_hyp < stop_ref and stop_hyp >= stop_ref):
+        tp = 1
+    # ref:  <--------------------->
+    # hyp:        <---->
+    elif (stop_hyp <= stop_ref and start_hyp >= start_ref):
+        tp = 1
+
+    #### detect fp
+    fp = 0
+    fp_bef = 0
+    fp_aft = 0
+    # ref:   |   <--------------------->   |
+    # hyp: <----------------
+    if (start_hyp < start_ref - toleranceFP_bef):
+        fp = fp + 1
+        fp_bef = 1
+    # ref:   |   <--------------------->   |
+    # hyp:                 ------------------>
+    if (stop_hyp > stop_ref + toleranceFP_aft):
+        fp = fp + 1
+        fp_aft = 1
+    return (tp, fp, fp_bef, fp_aft)
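+# Example sketch (hypothetical numbers): for ref = (100, 200), hyp = (90, 150),
+# toleranceFP_bef = 20 and toleranceFP_aft = 30, the hyp event overlaps the
+# start of the ref event and stays within both tolerance margins, so
+# calc_TPAndFP_Mod returns (tp=1, fp=0, fp_bef=0, fp_aft=0); a hyp of (60, 150)
+# would additionally count an FP before the event, giving (1, 1, 1, 0).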
+def performance_episodes(predLab, trueLab, toleranceFP_bef, toleranceFP_aft):
+    '''
+    function that detects episodes in a stream of true labels and predictions,
+    finds overlaps and measures sensitivity, precision, F1 score and number of false positives
+    '''
+    totalTP = 0
+    totalFP = 0
+    #transform to events
+    predEvents = calculateStartsAndStops(predLab)
+    trueEvents = calculateStartsAndStops(trueLab)
+    #create flags for each event marking whether it has been used
+    flag_predEvents = np.zeros(len(predEvents))
+    flag_trueEvents = np.zeros(len(trueEvents))
+    flag_trueEventsFPAround = np.zeros(len(trueEvents))
+    # goes through ref events
+    if (len(trueEvents) == 0):
+        totalFP = len(predEvents)
+    else:
+        for etIndx, eTrue in enumerate(trueEvents):
+            for epIndx, ePred in enumerate(predEvents):
+                (tp0, fp0, fp_bef, fp_aft) = calc_TPAndFP_Mod(eTrue, ePred, toleranceFP_bef, toleranceFP_aft)
+
+                #if overlap detected (tp=1) and this ref seizure hasn't been used
+                # ref:  <----->    <----->   <----->        <-------------->
+                # hyp: <---------->  <---------->  <-------------->  <----->
+                if (tp0 == 1 and flag_trueEvents[etIndx] == 0 and flag_predEvents[epIndx] == 0):
+                    totalTP = totalTP + tp0
+                    totalFP = totalFP + fp0
+                    flag_trueEvents[etIndx] = 1
+                    flag_predEvents[epIndx] = 1  #1 means match
+                    if (fp0 == 2):
+                        flag_trueEventsFPAround[etIndx] = 2
+                    else:
+                        flag_trueEventsFPAround[etIndx] = fp_aft - fp_bef  #1 if it was after, or -1 if it was before
+                #if ref event was already matched and now we have some extra predicted seizures
+                # ref:  <------------------------------------------>
+                # hyp: <---------->  <-------------->  <----->
+                elif (tp0 == 1 and flag_trueEvents[etIndx] == 1 and flag_predEvents[epIndx] == 0):
+                    #totalTP = totalTP + tp0  #we already counted that one
+                    totalFP = totalFP + fp0  #ideally fp0 should be 0, but at the end we might have 1 fp
+                    #flag_trueEvents[etIndx] = 1  #it is already 1, so not needed again
+                    flag_predEvents[epIndx] = 2  #2 means overlapping but not the first match with a seizure
+                #if one big pred seizure covers more ref seizures
+                # ref: <---------->  <-------------->  <----->
+                # hyp: <------------------------------------------>
+                elif (tp0 == 1 and flag_trueEvents[etIndx] == 0 and flag_predEvents[epIndx] == 1):
+                    #totalTP = totalTP + tp0  # HERE WE NEED TO DECIDE WHETHER WE COUNT THEM AS TP OR NOT !!!!
+                    totalFP = totalFP + fp0  #we treat this as 1 FP
+                    if (flag_trueEventsFPAround[etIndx - 1] > 0 and fp_bef == 1):  #if the FP after the previous true seizure was already counted, do not count it again as an FP before this one
+                        totalFP = totalFP - 1
+                    flag_trueEvents[etIndx] = 0  #it has to stay unmatched
+                    #flag_predEvents[epIndx] = 1  #already matched
+                #if pred seizure was named FP in a pass with a previous seizure but now matches this seizure
+                elif (tp0 == 1 and flag_trueEvents[etIndx] == 0 and flag_predEvents[epIndx] == -1):
+                    totalTP = totalTP + tp0
+                    totalFP = totalFP - 1 + fp0  #remove fp from before
+                    flag_trueEvents[etIndx] = 1  #match seizure
+                    if (fp0 == 2):
+                        flag_trueEventsFPAround[etIndx] = 2
+                    else:
+                        flag_trueEventsFPAround[etIndx] = fp_aft - fp_bef  #1 if it was after, or -1 if it was before
+                    flag_predEvents[epIndx] = 1  #relabel this pred seizure
+                #if pred seizure was named FP in a pass with a previous seizure, now overlaps a seizure, but this seizure was already matched
+                elif (tp0 == 1 and flag_trueEvents[etIndx] == 1 and flag_predEvents[epIndx] == -1):
+                    #totalTP = totalTP + tp0  #we already counted that one
+                    totalFP = totalFP - 1 + fp0  #ideally fp0 should be 0, but at the end we might have 1 fp; remove the FP from before
+                    #flag_trueEvents[etIndx] = 1  #it is already 1, so not needed again
+                    flag_predEvents[epIndx] = 2  #2 means overlapping but not the first match with a seizure
+                #prediction not matched with any true seizure
+                elif (tp0 == 0 and flag_predEvents[epIndx] == 0):
+                    totalFP = totalFP + 1  #+fp0
+                    flag_predEvents[epIndx] = -1  #-1 means used as FP
+                elif (flag_predEvents[epIndx] == 2):  #already counted as part of a previous seizure, no need to count it again
+                    pass
+                elif (flag_predEvents[epIndx] == -1):  #already counted as FP, no need to count it again
+                    pass
+                elif (flag_trueEvents[etIndx] == 1 and flag_predEvents[epIndx] == 1):  #both already matched
+                    pass
+                elif (flag_trueEvents[etIndx] == 0 and flag_predEvents[epIndx] == 1):  #pred seizure was matched, true one not found yet
+                    pass
+                else:
+                    #flag_predEvents[epIndx] = 1
+                    print('ERROR: new case not covered')
+
+    #calculating final performance
+    numTrueSeiz = len(trueEvents)
+    numPredSeiz = len(predEvents)
+
+    numMissedSeiz = numTrueSeiz - np.sum(flag_trueEvents)
+
+    #precision = TP / numPredSeiz, but if everything is one big predicted seizure then things go wrong and the value would be > 1
+    if ((totalTP + totalFP) != 0):
+        precision = totalTP / (totalTP + totalFP)
+    else:
+        precision = 0  #np.nan  #if no seizures predicted (if there were really no seizures it will be np.nan later)
+
+    #sensitivity = TP / numTrueSeiz
+    if ((numTrueSeiz) != 0):
+        sensitivity = totalTP / numTrueSeiz
+    else:
+        sensitivity = np.nan  #IF NO TRUE SEIZURES, SENSITIVITY = NAN
+        precision = np.nan
+
+    if ((sensitivity + precision) != 0):
+        F1score = (2 * sensitivity * precision) / (sensitivity + precision)
+    else:
+        F1score = 0  #np.nan
+
+    #checkups
+    # if ((totalTP + totalFP) != numPredSeiz):
+    #     print('there are long pred seiz')
+    if ((numMissedSeiz + totalTP) != numTrueSeiz):
+        print('sth wrong with counting seizures')
+    if (totalFP < len(np.where(flag_predEvents == -1)[0])):
+        print('sth wrong with counting FP')
+    if (totalTP != len(np.where(flag_predEvents == 1)[0])):
+        print('sth wrong with counting seizures 2')
+
+    # #visualize
+    # xvalues = np.arange(0, len(trueLab), 1)
+    # fig1 = plt.figure(figsize=(16, 16), constrained_layout=False)
+    # gs = GridSpec(1, 1, figure=fig1)
+    # fig1.subplots_adjust(wspace=0.4, hspace=0.6)
+    # fig1.suptitle('True and pred labels')
+    # ax1 = fig1.add_subplot(gs[0, 0])
+    # ax1.plot(xvalues, trueLab, 'r')
+    # ax1.plot(xvalues, predLab * 0.9, 'k')
+    # ax1.set_xlabel('Time')
+    # ax1.legend(['True', 'Pred'])
+    # #calculate performance for duration to put in the title
+    # (sensitivity_duration, precision_duration, F1score_duration) = performance_duration(predLab, trueLab)
+    # ax1.set_title('EPISODES: sensitivity ' + str(sensitivity) + ' precision ' + str(precision) + ' F1score ' + str(F1score) + ' totalTP ' + str(totalTP) + ' totalFP ' + str(totalFP) + '\n' +
+    #               'DURATION: sensitivity ' + str(sensitivity_duration) + ' precision ' + str(precision_duration) + ' F1score ' + str(F1score_duration))
+    # ax1.grid()
+    # fig1.show()
+    # fig1.savefig(folderOut + '/' + figName)
+    # plt.close(fig1)
+
+    return (sensitivity, precision, F1score, totalFP)
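+# Worked example (hypothetical counts): with 3 true seizures of which 2 are
+# matched (totalTP=2) and 1 unmatched prediction (totalFP=1), this gives
+# sensitivity = 2/3, precision = 2/(2+1) = 2/3 and F1score = 2/3.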
+def performance_all9(predLab, trueLab, toleranceFP_bef, toleranceFP_aft, numLabelsPerHour):
+    ''' function that returns 9 different performance measures of prediction on epilepsy:
+    - on the level of seizure episodes (sensitivity, precision and F1 score)
+    - on the level of seizure duration, i.e. each sample (sens, prec, F1)
+    - combinations of F1 scores for episodes and duration (mean and gmean)
+    - number of false positives per day '''
+    (sensE, precisE, F1E, totalFP) = performance_episodes(predLab, trueLab, toleranceFP_bef, toleranceFP_aft)
+    (sensD, precisD, F1D) = performance_duration(predLab, trueLab)
+
+    #calculate combinations
+    F1DEmean = (F1D + F1E) / 2
+    F1DEgeoMean = np.sqrt(F1D * F1E)
+
+    #calculate numFP per day
+    #numLabelsPerHour = 60 * 60 / SegSymbParams.slidWindStepSec
+    timeDurOfLabels = len(trueLab) / numLabelsPerHour
+    if (timeDurOfLabels != 0):
+        numFPperHour = totalFP / timeDurOfLabels
+    else:
+        numFPperHour = np.nan
+    numFPperDay = numFPperHour * 24
+
+    if (sensE > 1.0 or precisE > 1.0 or F1E > 1.0 or sensD > 1.0 or precisD > 1.0 or F1D > 1.0 or F1DEmean > 1.0 or F1DEgeoMean > 1.0):
+        print('ERROR - perf measures impossibly big!')
+    # if (np.sum(trueLab) == 0):
+    #     print('No Seiz in file')
+
+    return (sensE, precisE, F1E, sensD, precisD, F1D, F1DEmean, F1DEgeoMean, numFPperDay)
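+# Example of the FP-rate arithmetic (hypothetical numbers): with a 4 s label
+# step, numLabelsPerHour = 3600/4 = 900; 45000 labels then cover 50 h, so
+# totalFP = 10 yields numFPperHour = 10/50 = 0.2 and numFPperDay = 4.8.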
+def smoothenLabels(prediction, seizureStableLenToTestIndx, seizureStablePercToTest, distanceBetweenSeizuresIndx):
+    ''' returns labels after two steps of postprocessing:
+    first, a moving window with voting - if more than a threshold of the labels in the window are 1, the final label is 1, otherwise 0;
+    second, merging seizures that are too close '''
+
+    #labels = labels.reshape(len(labels))
+    smoothLabelsStep1 = np.zeros((len(prediction)))
+    smoothLabelsStep2 = np.zeros((len(prediction)))
+    try:
+        a = int(seizureStableLenToTestIndx)
+    except:
+        print('error seizureStableLenToTestIndx')
+        print(seizureStableLenToTestIndx)
+    try:
+        a = int(len(prediction))
+    except:
+        print('error prediction')
+        print(prediction)
+    #first classifying as true 1 if at least GeneralParams.seizureStableLenToTest in a row are 1
+    for i in range(int(seizureStableLenToTestIndx), int(len(prediction))):
+        s = sum(prediction[i - seizureStableLenToTestIndx + 1: i + 1]) / seizureStableLenToTestIndx
+        try:
+            if (s >= seizureStablePercToTest):  #and prediction[i] == 1
+                smoothLabelsStep1[i] = 1
+        except:
+            print('error')
+    smoothLabelsStep2 = np.copy(smoothLabelsStep1)
+
+    #second part
+    prevSeizureEnd = -distanceBetweenSeizuresIndx
+    for i in range(1, len(prediction)):
+        if (smoothLabelsStep2[i] == 1 and smoothLabelsStep2[i - 1] == 0):  # new seizure started
+            # find end of the seizure
+            j = i
+            while (smoothLabelsStep2[j] == 1 and j < len(smoothLabelsStep2) - 1):
+                j = j + 1
+            #if the current seizure is too close to the previous one, merge them
+            if ((i - prevSeizureEnd) < distanceBetweenSeizuresIndx):  # seizure started but is too close to the previous one
+                #delete second seizure
+                #prevSeizureEnd = j
+                #[i:prevSeizureEnd] = np.zeros((prevSeizureEnd - i - 1))  #delete second seizure - this was before
+                #concatenate seizures
+                if (prevSeizureEnd < 0):  #if exactly the first seizure
+                    prevSeizureEnd = 0
+                smoothLabelsStep2[prevSeizureEnd:j] = np.ones((j - prevSeizureEnd))
+            prevSeizureEnd = j
+            i = prevSeizureEnd
+
+    return (smoothLabelsStep2, smoothLabelsStep1)
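+# Example (hypothetical settings): with seizureStableLenToTestIndx = 8 and
+# seizureStablePercToTest = 0.5, sample i of smoothLabelsStep1 becomes 1 only
+# if at least 4 of the last 8 predictions are 1; detections closer than
+# distanceBetweenSeizuresIndx samples are then merged into one episode.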
+def smoothenLabels_Bayes(prediction, probability, seizureStableLenToTestIndx, probThresh, distanceBetweenSeizuresIndx):
+    ''' returns labels after Bayes postprocessing:
+    calculates the cumulative probability of seizure and non-seizure over a window of size seizureStableLenToTestIndx;
+    if log(conf_pos / conf_neg) > probThresh then seizure '''
+
+    #'probability' has the probability of the predicted class, but Bayes needs the probability per target
+    probability_pos = np.copy(probability)
+    indxs = np.where(prediction == 0)[0]
+    probability_pos[indxs] = 1 - probability[indxs]  #keeps only the target '1''s probability
+
+    conf_pos = np.zeros((len(probability)))
+    conf_neg = np.zeros((len(probability)))
+
+    try:
+        a = int(seizureStableLenToTestIndx)
+    except:
+        print('error seizureStableLenToTestIndx')
+        print(seizureStableLenToTestIndx)
+    try:
+        a = int(len(prediction))
+    except:
+        print('error prediction')
+        print(prediction)
+
+    #initialize cumulative confidences over the first window
+    conf_pos[0:seizureStableLenToTestIndx] = probability_pos[0:seizureStableLenToTestIndx]
+    conf_neg[0:seizureStableLenToTestIndx] = 1 - probability_pos[0:seizureStableLenToTestIndx]
+    for i in range(int(seizureStableLenToTestIndx), int(len(prediction))):
+        probThisWind = probability_pos[i - seizureStableLenToTestIndx: i]
+        conf_pos[i] = np.prod(probThisWind)
+        conf_neg[i] = np.prod(1 - probThisWind)
+
+    conf = np.log((conf_pos + 0.00000001) / (conf_neg + 0.00000001))
+    smoothLabels = conf > probThresh
+    #second part: merging detections that are too close into one
+    smoothLabelsStep2 = np.copy(smoothLabels)
+    prevSeizureEnd = -distanceBetweenSeizuresIndx
+    for i in range(1, len(prediction)):
+        if (smoothLabelsStep2[i] == 1 and smoothLabelsStep2[i - 1] == 0):  # new seizure started
+            # find end of the seizure
+            j = i
+            while (smoothLabelsStep2[j] == 1 and j < len(smoothLabelsStep2) - 1):
+                j = j + 1
+            #if the current seizure is too close to the previous one, merge them
+            if ((i - prevSeizureEnd) < distanceBetweenSeizuresIndx):
+                #concatenate seizures
+                if (prevSeizureEnd < 0):  #if exactly the first seizure
+                    prevSeizureEnd = 0
+                smoothLabelsStep2[prevSeizureEnd:j] = np.ones((j - prevSeizureEnd))
+            prevSeizureEnd = j
+            i = prevSeizureEnd
+
+    return (smoothLabelsStep2)
+
+
+def calculatePerformanceAfterVariousSmoothing(predLabels, label, probabilityLabels, toleranceFP_bef, toleranceFP_aft, numLabelsPerHour, seizureStableLenToTestIndx, seizureStablePercToTest, distanceBetweenSeizuresIndx, bayesWind, probThresh):
+    numTypes = 4  #no smooth, movingAvrg, movingAvrg+merging, bayes
+    numPerf = 9
+    performancesAll = np.zeros((numPerf * numTypes))
+
+    # calculate different performance measures - no smoothing
+    performancesAll[: numPerf] = performance_all9(predLabels, label, toleranceFP_bef, toleranceFP_aft, numLabelsPerHour)
+
+    #smoothing using moving average and then merging
+    (yPred_SmoothOurStep2, yPred_SmoothOurStep1) = smoothenLabels(predLabels, seizureStableLenToTestIndx, seizureStablePercToTest, distanceBetweenSeizuresIndx)
+    performancesAll[numPerf: 2 * numPerf] = performance_all9(yPred_SmoothOurStep1, label, toleranceFP_bef, toleranceFP_aft, numLabelsPerHour)
+    performancesAll[2 * numPerf: 3 * numPerf] = performance_all9(yPred_SmoothOurStep2, label, toleranceFP_bef, toleranceFP_aft, numLabelsPerHour)
+
+    #bayes smoothing
+    yPred_SmoothBayes = smoothenLabels_Bayes(predLabels, probabilityLabels, bayesWind, probThresh, distanceBetweenSeizuresIndx)
+    performancesAll[3 * numPerf: 4 * numPerf] = performance_all9(yPred_SmoothBayes, label, toleranceFP_bef, toleranceFP_aft, numLabelsPerHour)
+
+    return (performancesAll, yPred_SmoothOurStep1, yPred_SmoothOurStep2, yPred_SmoothBayes)
+
+
+if __name__ == "__main__":
+    pass
diff --git a/VariousFunctionsLib.py b/VariousFunctionsLib.py
new file mode 100644
index 0000000..7c54e1c
--- /dev/null
+++ b/VariousFunctionsLib.py
@@ -0,0 +1,1380 @@
+#!~/anaconda3/bin python
+
+__author__ = "Una Pale"
+__credits__ = ["Una Pale", "Renato Zanetti"]
+__license__ = "LGPL"
+__version__ = "1.0"
+__email__ = "una.pale at epfl.ch"
+
+import os
+import glob
+import csv
+import math
+import numpy as np
+# import sklearn
+import multiprocessing as mp
+import time
+from math import ceil
+# from sklearn import metrics
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.gridspec import GridSpec
+import pywt
+import scipy
+import sys
+import pyedflib
+import MITAnnotation as MIT
+from sklearn import svm
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import BaggingClassifier
+from sklearn.ensemble import AdaBoostClassifier
+import scipy.io
+# import random
+from PerformanceMetricsLib import *
+from scipy import signal
+# from scipy import interpolate
+import pandas as pd
+# import seaborn as sns
+# import wfdb
+from antropy import *
+
+
+########################################################
+#global variables for multi-core operation
+number_free_cores = 0
+n_cores_semaphore = 1
+
+
+def createFolderIfNotExists(folderOut):
+    ''' creates the folder if it doesn't already exist;
+    warns if the creation failed '''
+    if not os.path.exists(folderOut):
+        try:
+            os.mkdir(folderOut)
+        except OSError:
+            print("Creation of the directory %s failed" % folderOut)
+        # else:
+        #     print("Successfully created the directory %s " % folderOut)
+
+
+def calculateOtherMLfeatures_oneCh(X, ZeroCrossParams):
+    numFeat = 56  #54 from Sopic2018, plus LL and meanAmpl
+    lenSig = len(X)
+    segLenIndx = int(ZeroCrossParams.winLen * ZeroCrossParams.samplFreq)  # length of EEG segments 
in samples + slidWindStepIndx = int( ZeroCrossParams.winStep * ZeroCrossParams.samplFreq) # step of slidin window to extract segments in samples + index = np.arange(0, lenSig - segLenIndx, slidWindStepIndx).astype(int) + + featureValues=np.zeros((len(index), numFeat)) + for i in range(len(index)): + sig = X[index[i]:index[i] + segLenIndx] + feat54 = calculateMLfeatures_oneDataWindow_54(sig, ZeroCrossParams.samplFreq) + meanAmpl = np.mean(np.abs(sig)) + LL = np.mean(np.abs(np.diff(sig))) + featureValues[i, :] = np.hstack((meanAmpl, LL, feat54)) + return (featureValues) + + +def calculateMLfeatures_oneDataWindow_54(data, samplFreq): + ''' function that calculates various features relevant for epileptic seizure detection + from paper: D. Sopic, A. Aminifar, and D. Atienza, e-Glass: A Wearable System for Real-Time Detection of Epileptic Seizures, 2018 + but uses only features that are (can be) normalized + _54 means that return also absolute values of power bands + ''' + #some parameters + DWTfilterName = 'db4' # 'sym5' + DWTlevel = 7 + n1 = 2 #num dimensions for sample entropy + r1 = 0.2 # num of STD for sample entropy + r2 = 0.35 # num of STD for sample entropy + a = 2 # param for shannon, renyi and tsallis enropy + q = 2 # param for shannon, renyi and tsallis enropy + + #DWT + coeffs = pywt.wavedec(data, DWTfilterName, level=DWTlevel) + a7, d7, d6, d5, d4, d3, d2, d1= coeffs + + # #sample entropy (Obs: the function sampen2 takes too long for executing, hence it was substituted with sample_entropy) + # samp_1_d7_1 = sampen2(n1, r1 * np.std(d7), d7) + # samp_1_d6_1 = sampen2(n1, r1 * np.std(d6), d6) + # samp_2_d7_1 = sampen2(n1, r2 * np.std(d7), d7) + # samp_2_d6_1 = sampen2(n1, r2 * np.std(d6), d6) + + #sample entropy: only allows r=0.2, does we zeroed samp_2_d7_1 an samp_2_d6_1 (after first tests) + samp_1_d7_1 = sample_entropy(d7) + samp_1_d6_1 = sample_entropy(d6) + samp_2_d7_1 = 0 #sample entropy with r=0.35 was discarded for the use of 'antropy' library + samp_2_d6_1 = 0 + + #permutation entropy + perm_d7_3 = perm_entropy(d7, order=3, delay=1, normalize=True) # normalize=True instead of false as in paper + perm_d7_5 = perm_entropy(d7, order=5, delay=1, normalize=True) + perm_d7_7 = perm_entropy(d7, order=7, delay=1, normalize=True) + perm_d6_3 = perm_entropy(d6, order=3, delay=1, normalize=True) + perm_d6_5 = perm_entropy(d6, order=5, delay=1, normalize=True) + perm_d6_7 = perm_entropy(d6, order=7, delay=1, normalize=True) + perm_d5_3 = perm_entropy(d5, order=3, delay=1, normalize=True) + perm_d5_5 = perm_entropy(d5, order=5, delay=1, normalize=True) + perm_d5_7 = perm_entropy(d5, order=7, delay=1, normalize=True) + perm_d4_3 = perm_entropy(d4, order=3, delay=1, normalize=True) + perm_d4_5 = perm_entropy(d4, order=5, delay=1, normalize=True) + perm_d4_7 = perm_entropy(d4, order=7, delay=1, normalize=True) + perm_d3_3 = perm_entropy(d3, order=3, delay=1, normalize=True) + perm_d3_5 = perm_entropy(d3, order=5, delay=1, normalize=True) + perm_d3_7 = perm_entropy(d3, order=7, delay=1, normalize=True) + + #shannon renyi and tsallis entropy + (shannon_en_sig, renyi_en_sig, tsallis_en_sig) = sh_ren_ts_entropy(data, a, q) + (shannon_en_d7, renyi_en_d7, tsallis_en_d7) = sh_ren_ts_entropy(d7, a, q) + (shannon_en_d6, renyi_en_d6, tsallis_en_d6) = sh_ren_ts_entropy(d6, a, q) + (shannon_en_d5, renyi_en_d5, tsallis_en_d5) = sh_ren_ts_entropy(d5, a, q) + (shannon_en_d4, renyi_en_d4, tsallis_en_d4) = sh_ren_ts_entropy(d4, a, q) + (shannon_en_d3, renyi_en_d3, tsallis_en_d3) = sh_ren_ts_entropy(d3, a, q) 
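+    # Note (assumption - 256 Hz sampling as in the CHB-MIT recordings): the
+    # db4 decomposition levels then roughly cover d1: 64-128 Hz, d2: 32-64 Hz,
+    # d3: 16-32 Hz, d4: 8-16 Hz, d5: 4-8 Hz, d6: 2-4 Hz, d7: 1-2 Hz and
+    # a7: 0-1 Hz, which is why the entropy features above use the d7..d3
+    # sub-bands.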
+
+    #band power
+    p_tot = bandpower(data, samplFreq, 0, 45)
+    p_dc = bandpower(data, samplFreq, 0, 0.5)
+    p_mov = bandpower(data, samplFreq, 0.1, 0.5)
+    p_delta = bandpower(data, samplFreq, 0.5, 4)
+    p_theta = bandpower(data, samplFreq, 4, 8)
+    p_alfa = bandpower(data, samplFreq, 8, 13)
+    p_middle = bandpower(data, samplFreq, 12, 13)
+    p_beta = bandpower(data, samplFreq, 13, 30)
+    p_gamma = bandpower(data, samplFreq, 30, 45)
+    p_dc_rel = p_dc / p_tot
+    p_mov_rel = p_mov / p_tot
+    p_delta_rel = p_delta / p_tot
+    p_theta_rel = p_theta / p_tot
+    p_alfa_rel = p_alfa / p_tot
+    p_middle_rel = p_middle / p_tot
+    p_beta_rel = p_beta / p_tot
+    p_gamma_rel = p_gamma / p_tot
+
+    featuresAll = [samp_1_d7_1, samp_1_d6_1, samp_2_d7_1, samp_2_d6_1, perm_d7_3, perm_d7_5, perm_d7_7, perm_d6_3, perm_d6_5, perm_d6_7, perm_d5_3, perm_d5_5,
+                   perm_d5_7, perm_d4_3, perm_d4_5, perm_d4_7, perm_d3_3, perm_d3_5, perm_d3_7, shannon_en_sig, renyi_en_sig, tsallis_en_sig, shannon_en_d7, renyi_en_d7, tsallis_en_d7,
+                   shannon_en_d6, renyi_en_d6, tsallis_en_d6, shannon_en_d5, renyi_en_d5, tsallis_en_d5, shannon_en_d4, renyi_en_d4, tsallis_en_d4, shannon_en_d3, renyi_en_d3, tsallis_en_d3,
+                   p_dc_rel, p_mov_rel, p_delta_rel, p_theta_rel, p_alfa_rel, p_middle_rel, p_beta_rel, p_gamma_rel,
+                   p_dc, p_mov, p_delta, p_theta, p_alfa, p_middle, p_beta, p_gamma, p_tot]
+    return (featuresAll)
+
+
+def sh_ren_ts_entropy(x, a, q):
+    ''' function that calculates three different entropy measures from a given window:
+    shannon, renyi and tsallis entropy '''
+    p, bin_edges = np.histogram(x)
+    p = p / np.sum(p)
+    p = p[np.where(p > 0)]  # to exclude log(0)
+    shannon_en = -np.sum(p * np.log2(p))
+    renyi_en = np.log2(np.sum(pow(p, a))) / (1 - a)
+    tsallis_en = (1 - np.sum(pow(p, q))) / (q - 1)
+    return (shannon_en, renyi_en, tsallis_en)
+
+def bandpower(x, fs, fmin, fmax):
+    ''' function that calculates the energy of a specific frequency band of the FFT spectrum '''
+    f, Pxx = scipy.signal.periodogram(x, fs=fs)
+    ind_min = np.argmax(f > fmin) - 1
+    ind_max = np.argmax(f > fmax) - 1
+    return np.trapz(Pxx[ind_min: ind_max + 1], f[ind_min: ind_max + 1])
+
+def sampen2(dim, r, data):
+    ''' function that calculates sample entropy from a given window of data '''
+    epsilon = 0.001
+    N = len(data)
+    correl = np.zeros(2)
+    dataMat = np.zeros((dim + 1, N - dim))
+    for i in range(dim + 1):
+        dataMat[i, :] = data[i: N - dim + i]
+
+    for m in range(dim, dim + 2):
+        count = np.zeros(N - dim)
+        tempMat = dataMat[0:m, :]
+
+        for i in range(N - m):
+            #calculate distance, excluding the self-matching case
+            dist = np.max(np.abs(tempMat[:, i + 1: N - dim] - np.tile(tempMat[:, i], ((N - dim - i - 1), 1)).T), axis=0)
+            D = (dist < r)
+            count[i] = np.sum(D) / (N - dim - 1)
+
+        correl[m - dim] = np.sum(count) / (N - dim)
+
+    saen = np.log((correl[0] + epsilon) / (correl[1] + epsilon))
+    return saen
+
+def readEdfFile(fileName):
+    ''' reads an .edf file and returns data[numSamples, numCh], sampling frequency and names of channels '''
+    f = pyedflib.EdfReader(fileName)
+    n = f.signals_in_file
+    channelNames = f.getSignalLabels()
+    samplFreq = f.getSampleFrequency(0)
+    data = np.zeros((f.getNSamples()[0], n))
+    for i in np.arange(n):
+        data[:, i] = f.readSignal(i)
+    return (data, samplFreq, channelNames)
+
+def writeToCsvFile(data, labels, fileName):
+    outputName = fileName + '.csv'
+    myFile = open(outputName, 'w', newline='')
+    dataToWrite = np.column_stack((data, labels))
+    with myFile:
+        writer = csv.writer(myFile)
+        writer.writerows(dataToWrite)
+
+def writeToCompressedCsvFile(data, labels, 
fileName): + outputName= fileName+'.csv.gz' + dataToWrite=np.column_stack((data, labels)) + df = pd.DataFrame(data=dataToWrite) + df.to_csv(outputName, index=False, compression='gzip') + + +def extractEDFdataToCSV_originalData(parallelize, perc_cores, samedir, folderIn, folderOut, SigInfoParams, patients): + ''' converts data from edf format to csv + 20210705 UnaPale''' + global number_free_cores + print('Extracting .csv from CHB edf files') + if parallelize: + n_cores = mp.cpu_count() + n_cores = ceil(n_cores*perc_cores) + + if n_cores > len(patients): + n_cores = len(patients) + + print('Number of used cores: ' + str(n_cores)) + + pool = mp.Pool(n_cores) + number_free_cores = n_cores + + # cutting segments + for patIndx, pat in enumerate(patients): + # print('-- Patient:', pat) + PATIENT = pat if len(sys.argv) < 2 else '{0:02d}'.format(int(sys.argv[1])) + #number of Seiz and nonSeiz files + if samedir: + SeizFiles=sorted(glob.glob(f'{folderIn}chb{PATIENT}*.seizures')) + EDFNonSeizFiles=sorted(glob.glob(f'{folderIn}chb{PATIENT}*.edf')) + else: + SeizFiles=sorted(glob.glob(f'{folderIn}chb{PATIENT}/chb{PATIENT}*.seizures')) + EDFNonSeizFiles=sorted(glob.glob(f'{folderIn}chb{PATIENT}/chb{PATIENT}*.edf')) + + folderOutFiles = folderOut + 'chb' + pat + '/' + createFolderIfNotExists(folderOutFiles) + print('Patient: ' + pat + ' Number of files: ' + str(len(EDFNonSeizFiles))) + # create lists with just names, to be able to compare them + SeizFileNames = list() + for fIndx, f in enumerate(SeizFiles): + justName = os.path.split(f)[1][:-13] + if (fIndx == 0): + SeizFileNames = [justName] + else: + SeizFileNames.append(justName) + NonSeizFileNames = list() + NonSeizFileFullNames = list() + for fIndx, f in enumerate(EDFNonSeizFiles): + justName = os.path.split(f)[1][:-4] + if (justName not in SeizFileNames): + if (fIndx == 0): + NonSeizFileNames = [justName] + NonSeizFileFullNames = [f] + else: + NonSeizFileNames.append(justName) + NonSeizFileFullNames.append(f) + + if parallelize: + pool.apply_async(extractDataLabels_CHB, args=(patIndx, SeizFiles, NonSeizFileFullNames, SigInfoParams, folderOutFiles), callback=collect_result) + number_free_cores = number_free_cores -1 + if number_free_cores==0: + while number_free_cores==0: #synced in the callback + time.sleep(0.1) + pass + else: + extractDataLabels_CHB(patIndx, SeizFiles, NonSeizFileFullNames, SigInfoParams, folderOutFiles) + + if parallelize: + while number_free_cores < n_cores: #wait till all subjects have their data processed + time.sleep(0.1) + pass + pool.close() + pool.join() + +def extractDataLabels_CHB(patIndx, SeizFiles, NonSeizFileFullNames, SigInfoParams, folderOut): + + #EXPORT SEIZURE FILES + for fileIndx,fileName in enumerate(SeizFiles): + allGood=1 + + fileName0 = os.path.splitext(fileName)[0] # removing .seizures from the string + # here replaced reading .hea files with .edf reading to avoid converting !!! 
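+        # fileName0 is the .edf companion of the .seizures annotation file,
+        # so the raw signal and its seizure annotations share one base name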
(rec, samplFreq, channels) = readEdfFile(fileName0)
+        # take only the channels we need, in the correct order
+        try:
+            chToKeepAndInCorrectOrder = [channels.index(SigInfoParams.channels[i]) for i in range(len(SigInfoParams.channels))]
+        except:
+            print('Sth wrong with the channels in a file: ', fileName)
+            allGood = 0
+
+        if (allGood == 1):
+            newData = rec[1:, chToKeepAndInCorrectOrder]
+            (lenSig, numCh) = newData.shape
+            newLabel = np.zeros(lenSig)
+            # read times of seizures
+            szStart = [a for a in MIT.read_annotations(fileName) if a.code == 32]  # start marked with '[' (32)
+            szStop = [a for a in MIT.read_annotations(fileName) if a.code == 33]  # stop marked with ']' (33)
+            # for each seizure, mark the corresponding samples in the label vector
+            numSeizures = len(szStart)
+            for i in range(numSeizures):
+                seizureLen = szStop[i].time - szStart[i].time
+                newLabel[int(szStart[i].time):int(szStop[i].time)] = np.ones(seizureLen)
+
+            # saving to csv file - all seizures of a recording are saved to one file
+            pom, fileName1 = os.path.split(fileName0)
+            fileName2 = os.path.splitext(fileName1)[0]
+            fileName3 = folderOut + fileName2
+            writeToCompressedCsvFile(newData, newLabel, fileName3)
+
+    #EXPORT NON SEIZURE FILES
+    for fileIndx, fileName in enumerate(NonSeizFileFullNames):
+        allGood = 1
+
+        # here replaced reading .hea files with .edf reading to avoid converting !!!
+        (rec, samplFreq, channels) = readEdfFile(fileName)
+        # take only the channels we need, in the correct order
+        try:
+            chToKeepAndInCorrectOrder = [channels.index(SigInfoParams.channels[i]) for i in range(len(SigInfoParams.channels))]
+        except:
+            print('Sth wrong with the channels in a file: ', fileName)
+            allGood = 0
+
+        if (allGood == 1):
+            newData = rec[1:, chToKeepAndInCorrectOrder]
+            (lenSig, numCh) = newData.shape
+            newLabel = np.zeros(lenSig)
+
+            # saving to csv file
+            pom, fileName1 = os.path.split(fileName)
+            fileName2 = os.path.splitext(fileName1)[0]
+            fileName3 = folderOut + fileName2
+            writeToCompressedCsvFile(newData, newLabel, fileName3)
+
+    return (patIndx)
+
+
+def train_StandardML_moreModelsPossible(X_train, y_train, StandardMLParams):
+    #X_train0 = np.float32(X_train)
+    #y_train = np.float32(y_train)
+    #y_train = y_train.reshape((len(y_train),))
+    X_train0 = X_train
+    #replacing nan and inf values
+    #X_train0[np.where(np.isinf(X_train0))] = np.nan
+
+    if (np.size(X_train0, 0) == 0):
+        print('X train size 0 is 0!!', X_train0.shape, y_train.shape)
+    if (np.size(X_train0, 1) == 0):
+        print('X train size 1 is 0!!', X_train0.shape, y_train.shape)
+    col_mean = np.nanmean(X_train0, axis=0)
+    inds = np.where(np.isnan(X_train0))
+    X_train0[inds] = np.take(col_mean, inds[1])
+    # if still somewhere nan, replace with 0
+    X_train0[np.where(np.isnan(X_train0))] = 0
+    X_train = X_train0
+
+    #MLmodels.modelType = 'KNN'
+    if (StandardMLParams.modelType == 'KNN'):
+        model = KNeighborsClassifier(n_neighbors=StandardMLParams.KNN_n_neighbors, metric=StandardMLParams.KNN_metric)
+        model.fit(X_train, y_train)
+    elif (StandardMLParams.modelType == 'SVM'):
+        #model = svm.SVC(kernel=StandardMLParams.SVM_kernel, C=StandardMLParams.SVM_C, gamma=StandardMLParams.SVM_gamma)
+        model = svm.SVC(kernel=StandardMLParams.SVM_kernel, C=StandardMLParams.SVM_C, gamma=StandardMLParams.SVM_gamma, probability=True)  # when using bayes smoothing
+        model.fit(X_train, y_train)
+    elif (StandardMLParams.modelType == 'DecisionTree'):
+        if (StandardMLParams.DecisionTree_max_depth == 0):
+            model = DecisionTreeClassifier(random_state=0, 
criterion=StandardMLParams.DecisionTree_criterion, splitter=StandardMLParams.DecisionTree_splitter) + else: + model = DecisionTreeClassifier(random_state=0, criterion=StandardMLParams.DecisionTree_criterion, splitter=StandardMLParams.DecisionTree_splitter, max_depth=StandardMLParams.DecisionTree_max_depth) + model = model.fit(X_train, y_train) + elif (StandardMLParams.modelType=='RandomForest'): + if (StandardMLParams.DecisionTree_max_depth == 0): + model = RandomForestClassifier(random_state=0, n_estimators=StandardMLParams.RandomForest_n_estimators, criterion=StandardMLParams.DecisionTree_criterion, n_jobs=StandardMLParams.n_jobs) # min_samples_leaf=10 + else: + model = RandomForestClassifier(random_state=0, n_estimators=StandardMLParams.RandomForest_n_estimators, criterion=StandardMLParams.DecisionTree_criterion, n_jobs=StandardMLParams.n_jobs, max_depth=StandardMLParams.DecisionTree_max_depth ) # min_samples_leaf=10 + model = model.fit(X_train, y_train) + elif (StandardMLParams.modelType=='BaggingClassifier'): + if (StandardMLParams.Bagging_base_estimator=='SVM'): + model = BaggingClassifier(base_estimator=svm.SVC(kernel=StandardMLParams.SVM_kernel, C=StandardMLParams.SVM_C, gamma=StandardMLParams.SVM_gamma), n_estimators=StandardMLParams.Bagging_n_estimators,random_state=0) + elif (StandardMLParams.Bagging_base_estimator=='KNN'): + model = BaggingClassifier(base_estimator= KNeighborsClassifier(n_neighbors=StandardMLParams.KNN_n_neighbors, metric=StandardMLParams.KNN_metric), n_estimators=StandardMLParams.Bagging_n_estimators,random_state=0) + elif (StandardMLParams.Bagging_base_estimator == 'DecisionTree'): + model = BaggingClassifier(DecisionTreeClassifier(random_state=0, criterion=StandardMLParams.DecisionTree_criterion, splitter=StandardMLParams.DecisionTree_splitter), + n_estimators=StandardMLParams.Bagging_n_estimators, random_state=0) + model = model.fit(X_train, y_train) + elif (StandardMLParams.modelType=='AdaBoost'): + if (StandardMLParams.Bagging_base_estimator=='SVM'): + model = AdaBoostClassifier(base_estimator=svm.SVC(kernel=StandardMLParams.SVM_kernel, C=StandardMLParams.SVM_C, gamma=StandardMLParams.SVM_gamma), n_estimators=StandardMLParams.Bagging_n_estimators,random_state=0) + elif (StandardMLParams.Bagging_base_estimator=='KNN'): + model = AdaBoostClassifier(base_estimator= KNeighborsClassifier(n_neighbors=StandardMLParams.KNN_n_neighbors, metric=StandardMLParams.KNN_metric), n_estimators=StandardMLParams.Bagging_n_estimators,random_state=0) + elif (StandardMLParams.Bagging_base_estimator == 'DecisionTree'): + model = AdaBoostClassifier(DecisionTreeClassifier(random_state=0, criterion=StandardMLParams.DecisionTree_criterion, splitter=StandardMLParams.DecisionTree_splitter), + n_estimators=StandardMLParams.Bagging_n_estimators, random_state=0) + model = model.fit(X_train, y_train) + + return (model) + + + +def test_StandardML_moreModelsPossible_v3(data,trueLabels, model): + # number of clases + (unique_labels, counts) = np.unique(trueLabels, return_counts=True) + numLabels = len(unique_labels) + if (numLabels==1): #in specific case when in test set all the same label + numLabels=2 + + # #PREDICT LABELS + # X_test0 = np.float32(data) + # X_test0[np.where(np.isinf(X_test0))] = np.nan + # if (np.size(X_test0,0)==0): + # print('X test size 0 is 0!!', X_test0.shape) + # if (np.size(X_test0,1)==0): + # print('X test size 1 is 0!!', X_test0.shape) + # col_mean = np.nanmean(X_test0, axis=0) + # inds = np.where(np.isnan(X_test0)) + # X_test0[inds] = np.take(col_mean, inds[1]) 
+ # # if still somewhere nan replace with 0 + # X_test0[np.where(np.isnan(X_test0))] = 0 + # X_test=X_test0 + X_test = data + #calculate predictions + y_pred= model.predict(X_test) + y_probability = model.predict_proba(X_test) + + #pick only probability of predicted class + y_probability_fin=np.zeros(len(y_pred)) + indx=np.where(y_pred==1) + if (len(indx[0])!=0): + y_probability_fin[indx]=y_probability[indx,1] + else: + a=0 + print('no seiz predicted') + indx = np.where(y_pred == 0) + if (len(indx[0])!=0): + y_probability_fin[indx] = y_probability[indx,0] + else: + a=0 + print('no non seiz predicted') + + #calculate accuracy + diffLab=y_pred-trueLabels + indx=np.where(diffLab==0) + acc= len(indx[0])/len(trueLabels) + + # calculate performance and distances per class + accPerClass=np.zeros(numLabels) + distFromCorr_PerClass = np.zeros(numLabels) + distFromWrong_PerClass = np.zeros(numLabels) + for l in range(numLabels): + indx=np.where(trueLabels==l) + trueLabels_part=trueLabels[indx] + predLab_part=y_pred[indx] + diffLab = predLab_part - trueLabels_part + indx2 = np.where(diffLab == 0) + if (len(indx[0])==0): + accPerClass[l] = np.nan + else: + accPerClass[l] = len(indx2[0]) / len(indx[0]) + + return(y_pred, y_probability_fin, acc, accPerClass) + + +def calcHistogramValues_v2(sig, segmentedLabels, histbins): + '''takes one window of signal - all ch and labels, separates seiz and nonSeiz and + calculates histogram of values during seizure and non seizure ''' + numBins=int(histbins) + sig2 = sig[~np.isnan(sig)] + sig2 = sig2[np.isfinite(sig2)] + # maxValFeat=np.max(sig) + # binBorders=np.arange(0, maxValFeat+1, (maxValFeat+1)/numBins) + + # sig[sig == np.inf] = np.nan + indxs=np.where(segmentedLabels==0)[0] + nonSeiz = sig[indxs] + nonSeiz = nonSeiz[~np.isnan(nonSeiz)] + try: + nonSeiz_hist = np.histogram(nonSeiz, bins=numBins, range=(np.min(sig2), np.max(sig2))) + except: + print('Error with hist ') + + indxs = np.where(segmentedLabels == 1)[0] + Seiz = sig[indxs] + Seiz = Seiz[~np.isnan(Seiz)] + try: + Seiz_hist = np.histogram(Seiz, bins=numBins, range=(np.min(sig2), np.max(sig2))) + except: + print('Error with hist ') + + # normalizing values that are in percentage of total samples - to not be dependand on number of samples + nonSeiz_histNorm=[] + nonSeiz_histNorm.append(nonSeiz_hist[0]/len(nonSeiz)) + nonSeiz_histNorm.append(nonSeiz_hist[1]) + Seiz_histNorm=[] + Seiz_histNorm.append(Seiz_hist[0]/len(Seiz)) + Seiz_histNorm.append(Seiz_hist[1]) + # Seiz_hist[0] = Seiz_hist[0] / len(Seiz_allCh) + return( Seiz_histNorm, nonSeiz_histNorm) + +def concatenateDataFromFiles(fileNames): + dataAll = [] + for f, fileName in enumerate(fileNames): + print(os.path.split(fileName)[-1]) + data = readDataFromFile(fileName) + + if (dataAll == []): + dataAll = data + else: + dataAll = np.vstack((dataAll, data)) + + return dataAll + +def concatenateDataFromFiles_AllocateFirst(fileNames, nrows): + + data = readDataFromFile(fileNames[0]) #reads the first file to uncover the number of columns + + dataAll = np.zeros((nrows, data.shape[1])) + start=0 + for f, fileName in enumerate(fileNames): + print(os.path.split(fileName)[-1]) + data = readDataFromFile(fileName) + + dataAll[start:start+data.shape[0]] = data + start=start+data.shape[0] + # if (dataAll == []): + # dataAll = data + # else: + # dataAll = np.vstack((dataAll, data)) + + return dataAll + +def concatenateDataFromFiles_TSCVFilesInput(fileNames): + dataAll = [] + startIndxOfFiles=np.zeros(len(fileNames)) + for f, fileName in enumerate(fileNames): + data 
= readDataFromFile(fileName) + data= np.float32(data) + # reader = csv.reader(open(fileName, "r")) + # data0 = list(reader) + # data = np.array(data0).astype("float") + # separating to data and labels + # X = data[:, 0:-1] + # y = data[:, -1] + dataSource= np.ones(len(data[:, -1]))*f + + if (dataAll == []): + dataAll = data[:, 0:-1] + labelsAll = data[:, -1].astype(int) + # startIndxOfFiles[f]=0 + lenPrevFile=int(len(data[:, -1])) + startIndxOfFiles[f]=lenPrevFile + else: + dataAll = np.vstack((dataAll, data[:, 0:-1])) + labelsAll = np.hstack((labelsAll, data[:, -1].astype(int))) + # startIndxOfFiles[f]=int(lenPrevFile) + lenPrevFile = lenPrevFile+ len(data[:, -1]) + startIndxOfFiles[f] = int(lenPrevFile) + startIndxOfFiles = startIndxOfFiles.astype((int)) + return (dataAll, labelsAll, startIndxOfFiles) + + +def generateTSCVindexesFromAllFiles_storeTSCVLabels(dataset, GeneralParams, ZeroCrossFeatureParams, SegSymbParams, folderIn, folderOut, maxWinLen): + '''Reads the names of the files with labels, generating a vector with the indexes for the TSCV considering all data of a subject stacked + in one matrix. It also generate files with the labels for each CV.''' + + maxWinLenIndx=int((maxWinLen-SegSymbParams.segLenSec)/SegSymbParams.slidWindStepSec) + minFirsFileLen=int((3600-SegSymbParams.segLenSec)/SegSymbParams.slidWindStepSec*5) #number of windows of 1h x nHours min + for patIndx, pat in enumerate(GeneralParams.patients): + if dataset=='01_SWEC': + allFiles=np.sort(glob.glob(folderIn + 'ID' + pat + '/'+'ID' + pat + '*Labels.csv.gz')) + if dataset=='02_CHB': + allFiles = np.sort(glob.glob(folderIn +'chb' + pat+ '/chb' + pat + '*Labels.csv.gz')) + + # print(os.path.split(allFiles[0][:])[-1]) + #chb02_16+ commes first than chb02_16 when ordering files + if dataset=='02_CHB' and pat=='02': + if '+' in allFiles[15][:]: #chb02_16+ + cpy = allFiles[15] + allFiles[15] = allFiles[16] + allFiles[16] = cpy + + numCVThisSubj=0 + firstFileCreated=0 + startIndxOfFiles=[] + #LOAD ALL FILES ONE BY ONE + for fIndx, fileName in enumerate(allFiles): + print(os.path.split(fileName)[-1]) + + labels = readDataFromFile(fileName) + + pom, fileName1 = os.path.split(fileName) + fileNameOut = fileName1.split('_')[0] #for the labels output files to plot the TSCV sequence + + if 'chb17' in fileNameOut: #chb17 has files 'a', 'b', and 'c' + fileNameOut='chb17' + + eachSubjDataOutFolder = folderOut+ fileNameOut+'/' + createFolderIfNotExists(eachSubjDataOutFolder) #for each subject + + #if there is seizure in file find start and stops + if (np.sum(labels)!=0): + diffSig=np.diff(np.squeeze(labels)) + szStart=np.where(diffSig==1)[0] + szStop= np.where(diffSig == -1)[0] + + + if (firstFileCreated==0): #first file, append until at least one seizure + if (fIndx==0): + newLabel=labels + else: #appending to existing file + newLabel =np.vstack((newLabel,labels)) + + if (np.sum(newLabel)>0 and len(newLabel)>=minFirsFileLen): #at least min of hours and 1 seizure in the first file + firstFileCreated=1 + startIndxOfFiles = np.append(startIndxOfFiles, int(len(newLabel))) + + fileNameOut2 = eachSubjDataOutFolder + fileNameOut + '_cv' + str(numCVThisSubj).zfill(3) + '_Labels' + saveDataToFile(newLabel.reshape((-1,1)), fileNameOut2, 'gzip') + + numCVThisSubj = numCVThisSubj + 1 + + else: #not first file, just resave with different cv name + newLabel=labels + if len(newLabel) < 1.5*maxWinLenIndx: #up to 1.5x the target, we kepp the original file in full + startIndxOfFiles = np.append(startIndxOfFiles, startIndxOfFiles[-1] + 
int(len(newLabel)))
+
+                    fileNameOut2 = eachSubjDataOutFolder + fileNameOut + '_cv' + str(numCVThisSubj).zfill(3) + '_Labels'
+                    saveDataToFile(newLabel.reshape((-1,1)), fileNameOut2, 'gzip')
+
+                    numCVThisSubj = numCVThisSubj + 1
+                else:
+                    n_new_files = int(len(labels)/maxWinLenIndx)
+                    indxStart = 0
+                    dataMissing = maxWinLenIndx
+                    for i in range(n_new_files-1):
+                        #check if we would cut a seizure in half
+                        if (np.sum(labels) != 0):
+                            for s in range(len(szStart)):
+                                try:
+                                    if (szStart[s] < indxStart+dataMissing and szStop[s] > indxStart+dataMissing):  #cut would be where the seizure is
+                                        dataMissing = szStop[s] - indxStart  #move the cut to the end of the seizure
+                                except:
+                                    print('error')
+
+                        newLabel = labels[indxStart:indxStart+dataMissing, :]
+
+                        #finished this new file to save
+                        startIndxOfFiles = np.append(startIndxOfFiles, startIndxOfFiles[-1] + int(len(newLabel)))
+
+                        fileNameOut2 = eachSubjDataOutFolder + fileNameOut + '_cv' + str(numCVThisSubj).zfill(3) + '_Labels'
+                        saveDataToFile(newLabel.reshape((-1,1)), fileNameOut2, 'gzip')
+
+                        indxStart = indxStart + dataMissing  #start where we stopped
+                        numCVThisSubj = numCVThisSubj + 1
+
+                    #last part of the data of each file
+                    newLabel = labels[indxStart:len(labels), :]
+                    startIndxOfFiles = np.append(startIndxOfFiles, startIndxOfFiles[-1] + int(len(newLabel)))
+
+                    fileNameOut2 = eachSubjDataOutFolder + fileNameOut + '_cv' + str(numCVThisSubj).zfill(3) + '_Labels'
+                    saveDataToFile(newLabel.reshape((-1,1)), fileNameOut2, 'gzip')
+
+                    numCVThisSubj = numCVThisSubj + 1  #for the next file
+
+        startIndxOfFiles = startIndxOfFiles.astype((int))  #contains indexes of test data
+        fileNameOut2 = folderOut + fileNameOut + '/' + fileNameOut + '_TSCV_indexes.csv.gz'
+        np.savetxt(fileNameOut2, startIndxOfFiles)
+
+
+def concatenateFeatures_FirstFileNeedsSeizure_KeepOriginal(folderIn, folderOut, GeneralParams, SegSymbParams, SigInfoParams, patients, maxWinLen):
+    '''Concatenate data from files until the first seizure is included, and write it to another file. 
The rest of the files are copied to the cross-validation ones as + they are ''' + + createFolderIfNotExists(folderOut) + for patIndx, pat in enumerate(GeneralParams.patients): + print('Subj:'+ pat) + allFiles=np.sort(glob.glob(folderIn + '/chb' + pat + '/*chb' + pat + '*_OtherFeat.csv.gz')) + firstFileCreated=0 + numFilesThisSubj=0 + for fIndx, fileName in enumerate(allFiles): + # reader = csv.reader(open(fileName, "r")) + # data0 = np.array(list(reader)).astype("float") + data0 = readDataFromFile(fileName) + data=data0[:,0:-1] + label=data0[:,-1] + pom, fileName1 = os.path.split(fileName) + fileNameOut = os.path.splitext(fileName1)[0][0:5] + + if (firstFileCreated==0): #first file, append until at least one seizure + if (fIndx==0): + dataOut=data + labelOut=label + else: + dataOut=np.vstack((dataOut,data)) + labelOut = np.hstack((labelOut, label)) + if (np.sum(labelOut)>0 and fIndx>4): #at least 6 h or at least 1 seizure in first file + firstFileCreated=1 + fileNameOut2 = folderOut + '/' + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3) + # writeToCsvFile(dataOut, labelOut, fileNameOut2) + saveDataToFile(np.hstack((dataOut, labelOut.reshape((-1,1)))), fileNameOut2, 'gzip') + numFilesThisSubj = numFilesThisSubj + 1 + else: #not first file, just resave with different cv name + fileNameOut2 = folderOut + '/' + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3) + # writeToCsvFile(data, label, fileNameOut2) + saveDataToFile(np.hstack((data,label.reshape((-1,1)))), fileNameOut2, 'gzip') + numFilesThisSubj = numFilesThisSubj + 1 + + + +def concatenateFeatures_allDataInEqualWindows_FirstFileNeedsSeizure(parallelize, perc_cores, dataset, folderIn, folderOut, GeneralParams, SegSymbParams, ZeroCrossFeatureParams, maxWinLen): + ''' ''' + print('Reorganizing features files to TSCV approach') + global number_free_cores + + if parallelize: + n_cores = mp.cpu_count() + n_cores = ceil(n_cores*perc_cores) + + if n_cores > len(GeneralParams.patients): + n_cores = len(GeneralParams.patients) + + print('Number of used cores: ' + str(n_cores)) + + pool = mp.Pool(n_cores) + number_free_cores = n_cores + + + maxWinLenIndx=int((maxWinLen-SegSymbParams.segLenSec)/SegSymbParams.slidWindStepSec) + minFirsFileLen=int((3600-SegSymbParams.segLenSec)/SegSymbParams.slidWindStepSec*5) #number of windows of 1h x nHours min + for patIndx, pat in enumerate(GeneralParams.patients): + if dataset=='01_SWEC': + allFiles=np.sort(glob.glob(folderIn + 'ID' + pat + '/'+'ID' + pat + '*_OtherFeat.csv.gz')) + if dataset=='02_CHB': + allFiles = np.sort(glob.glob(folderIn +'chb' + pat+ '/chb' + pat + '*_OtherFeat.csv.gz')) + + # print(os.path.split(allFiles[0][:])[-1]) + #chb02_16+ commes first than chb02_16 when ordering files + if dataset=='02_CHB' and pat=='02': + if '+' in allFiles[15][:]: #chb02_16+ + cpy = allFiles[15] + allFiles[15] = allFiles[16] + allFiles[16] = cpy + + if parallelize: + pool.apply_async(concatenateFeatureFiles_SeizureInFirstFile_DivideBigFiles, args=(patIndx, allFiles, folderOut, maxWinLenIndx, ZeroCrossFeatureParams, minFirsFileLen), callback=collect_result) + number_free_cores = number_free_cores -1 + if number_free_cores==0: + while number_free_cores==0: #synced in the callback + time.sleep(0.1) + pass + else: + concatenateFeatureFiles_SeizureInFirstFile_DivideBigFiles(patIndx, allFiles, folderOut, maxWinLenIndx, ZeroCrossFeatureParams, minFirsFileLen) + + if parallelize: + while number_free_cores < n_cores: #wait till all subjects have their data processed + time.sleep(0.1) + pass + + pool.close() + 
pool.join()
+
+
+def concatenateFeatureFiles_EqualSize_SeizureInFirstFile(patIndx, allFiles, folderOut, maxWinLenIndx, ZeroCrossFeatureParams, minFirsFileLen):
+    indxStart = 0
+    dataMissing = maxWinLenIndx  #how much data we target to have per file
+    newFileToSave = 1
+    numFilesThisSubj = 0
+    firstFileCreated = 0
+    #LOAD ALL FILES ONE BY ONE
+    for fIndx, fileName in enumerate(allFiles):
+        dataOtherFeat = readDataFromFile(fileName)
+
+        fileName2 = fileName[0:-16] + 'ZCFeat.csv.gz'
+        dataZCFeat = readDataFromFile(fileName2)
+
+        numCh = int(len(dataZCFeat[0, :]) / (len(ZeroCrossFeatureParams.EPS_thresh_arr) + 1))
+        data = mergeFeatFromTwoMatrixes(dataOtherFeat, dataZCFeat, numCh)
+
+        fileName2 = fileName[0:-16] + 'Labels.csv.gz'
+        labels = readDataFromFile(fileName2)
+
+        pom, fileName1 = os.path.split(fileName)
+        fileNameOut = fileName1.split('_')[0]
+
+        print('Patient:', str(patIndx+1), ' ', fileName1)
+
+        #if there is a seizure in the file, find starts and stops
+        if (np.sum(labels) != 0):
+            diffSig = np.diff(np.squeeze(labels))
+            szStart = np.where(diffSig == 1)[0]
+            szStop = np.where(diffSig == -1)[0]
+
+        if (firstFileCreated == 0):  #first file, append until at least one seizure
+            if (fIndx == 0):
+                newData = data
+                newLabel = labels
+            else:  #appending to the existing file
+                newData = np.vstack((newData, data))
+                newLabel = np.vstack((newLabel, labels))
+
+            if (np.sum(newLabel) > 0 and len(newLabel) >= minFirsFileLen):  #at least minFirsFileLen windows and at least 1 seizure in the first file
+                firstFileCreated = 1
+                fileNameOut2 = folderOut + fileNameOut + '/' + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3)
+                # writeToCsvFile(dataOut, labelOut, fileNameOut2)
+                saveDataToFile(np.hstack((newData, newLabel.reshape((-1,1)))), fileNameOut2, 'gzip')
+
+                fileNameOut2 = folderOut + 'Labels' + '/' + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3) + '_Labels'
+                saveDataToFile(newLabel.reshape((-1,1)), fileNameOut2, 'gzip')
+
+                numFilesThisSubj = numFilesThisSubj + 1
+                del newData
+                del newLabel
+        else:  #not the first file, just resave with a different cv name
+            thisFileStillHasData = 1
+            while (thisFileStillHasData == 1):
+                #if enough data in the file
+                if (indxStart + dataMissing <= len(labels)):
+                    #in case we are about to create a new file but the data remaining after it would be
+                    #too little, the remainder is added to the current file
+                    rest = len(labels) - indxStart - dataMissing
+                    if rest < int(0.05*maxWinLenIndx) and newFileToSave == 1:
+                        dataMissing = len(labels)
+
+                    #check if we would cut a seizure in half
+                    if (np.sum(labels) != 0):
+                        for s in range(len(szStart)):
+                            try:
+                                if (szStart[s] < indxStart+dataMissing and szStop[s] > indxStart+dataMissing):  #cut would be where the seizure is
+                                    dataMissing = szStop[s] - indxStart  #move the cut to the end of the seizure
+                            except:
+                                print('error')
+
+                    if (newFileToSave == 1):
+                        newData = data[indxStart:indxStart+dataMissing, :]
+                        newLabel = labels[indxStart:indxStart+dataMissing, :]
+                    else:  #appending to the existing file
+                        newData = np.vstack((newData, data[indxStart:indxStart+dataMissing, :]))
+                        newLabel = np.vstack((newLabel, labels[indxStart:indxStart+dataMissing, :]))
+
+                    #finished this new file to save
+                    fileNameOut2 = folderOut + '/' + fileNameOut + '/' + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3)
+                    saveDataToFile(np.hstack((newData, newLabel)), fileNameOut2, 'gzip')
+
+                    fileNameOut2 = folderOut + 'Labels' + '/' + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3) + '_Labels'
+                    saveDataToFile(newLabel.reshape((-1,1)), fileNameOut2, 'gzip')
+
+                    numFilesThisSubj = numFilesThisSubj + 1
+                    newFileToSave = 1
+                    indxStart = indxStart + dataMissing  #start where we stopped
+                    dataMissing = maxWinLenIndx
thisFileStillHasData=1 + if indxStart < len(labels): + thisFileStillHasData=1 + else: + thisFileStillHasData=0 + indxStart = 0 + else: #not enough data in file + if (newFileToSave==1): + newData=data[indxStart:,:] #read until the end of the file + newLabel=labels[indxStart:,:] + else: #appending to existing file + newData= np.vstack((newData,data[indxStart:,:])) + newLabel =np.vstack((newLabel,labels[indxStart:,:])) + dataMissing = maxWinLenIndx - len(newLabel) #calculate how much data is missing + indxStart = 0 #in next file start from beginning + thisFileStillHasData=0 #this file has no more data, need to load new one + newFileToSave=0 + + + # save last file's remaining data + if (len(labels)- indxStart> int(0.05*maxWinLenIndx) and thisFileStillHasData): + fileNameOut2 = folderOut + '/' + fileNameOut + '/' + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3) + saveDataToFile(np.hstack((newData, newLabel)), fileNameOut2, 'gzip') + + fileNameOut2 = folderOut + 'Labels' + '/' + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3) + '_Labels' + saveDataToFile(newLabel.reshape((-1,1)), fileNameOut2, 'gzip') + + return patIndx + + +def concatenateFeatureFiles_SeizureInFirstFile_DivideBigFiles(patIndx, allFiles, folderOut, maxWinLenIndx, ZeroCrossFeatureParams, minFirsFileLen): + + numFilesThisSubj=0 + firstFileCreated=0 + #LOAD ALL FILES ONE BY ONE + for fIndx, fileName in enumerate(allFiles): + print(os.path.split(fileName)[-1]) + dataOtherFeat=readDataFromFile(fileName) + + fileName2=fileName[0:-16]+'ZCFeat.csv.gz' + dataZCFeat = readDataFromFile(fileName2) + + numCh=int(len(dataZCFeat[0, :])/(len(ZeroCrossFeatureParams.EPS_thresh_arr)+1)) + data= mergeFeatFromTwoMatrixes(dataOtherFeat, dataZCFeat, numCh) + + fileName2=fileName[0:-16]+'Labels.csv.gz' + labels = readDataFromFile(fileName2) + + pom, fileName1 = os.path.split(fileName) + fileNameOut = fileName1.split('_')[0] + + if 'chb17' in fileNameOut: #chb17 has files 'a', 'b', and 'c' + fileNameOut='chb17' + + eachSubjDataOutFolder = folderOut+fileNameOut + '/' + createFolderIfNotExists(eachSubjDataOutFolder) #for each subject + + #if there is seizure in file find start and stops + if (np.sum(labels)!=0): + diffSig=np.diff(np.squeeze(labels)) + szStart=np.where(diffSig==1)[0] + szStop= np.where(diffSig == -1)[0] + + + if (firstFileCreated==0): #first file, append until at least one seizure + if (fIndx==0): + newData=data + newLabel=labels + else: #appending to existing file + newData= np.vstack((newData,data)) + newLabel =np.vstack((newLabel,labels)) + + if (np.sum(newLabel)>0 and len(newLabel)>=minFirsFileLen): #at least min of hours and 1 seizure in the first file + firstFileCreated=1 + fileNameOut2 = eachSubjDataOutFolder + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3) + # writeToCsvFile(dataOut, labelOut, fileNameOut2) + saveDataToFile(np.hstack((newData, newLabel.reshape((-1,1)))), fileNameOut2, 'gzip') + + LabelsOutFolder = folderOut + 'Labels' + '/' + createFolderIfNotExists(LabelsOutFolder) #for each subject + + fileNameOut2 = LabelsOutFolder + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3) + '_Labels' + saveDataToFile(newLabel.reshape((-1,1)), fileNameOut2, 'gzip') + + numFilesThisSubj = numFilesThisSubj + 1 + del newData + del newLabel + else: #not first file, just resave with different cv name + if len(labels) < 1.5*maxWinLenIndx: #up to 1.5x the target, we kepp the original file in full + fileNameOut2 = eachSubjDataOutFolder + fileNameOut + '_cv' + str(numFilesThisSubj).zfill(3) + saveDataToFile(np.hstack((data, 
def mergeFeatFromTwoMatrixes(mat1, mat2, numCh): + numFeat1=int(len(mat1[0, :])/numCh) + numFeat2=int(len(mat2[0, :])/numCh) + numColPerCh=numFeat1+numFeat2 + matFinal=np.zeros((len(mat1[:,0]), numColPerCh*numCh)) + for ch in range(numCh): + matFinal[:, ch*numColPerCh : ch*numColPerCh + numFeat1]=mat1[ :, ch*numFeat1: (ch+1)*numFeat1] + matFinal[:, ch * numColPerCh + numFeat1 : ch * numColPerCh + numColPerCh ] = mat2[:, ch * numFeat2: (ch + 1) * numFeat2] + return matFinal + +def plotRawDataLabels(dataset, folderIn, GeneralParams,SegSymbParams): + + print('Print labels for all subjects extracted feature files') + + folderOut=folderIn +'LabelsInTime/' + createFolderIfNotExists(folderOut) + + seizureStableLenToTestIndx = int(GeneralParams.seizureStableLenToTest / SegSymbParams.slidWindStepSec) + seizureStablePercToTest = GeneralParams.seizureStablePercToTest + distanceBetweenSeizuresIndx = int(GeneralParams.distanceBetween2Seizures / SegSymbParams.slidWindStepSec) + numLabelsPerHour = 60 * 60 / SegSymbParams.slidWindStepSec + toleranceFP_bef = int(GeneralParams.toleranceFP_befSeiz / SegSymbParams.slidWindStepSec) + toleranceFP_aft = int(GeneralParams.toleranceFP_aftSeiz / SegSymbParams.slidWindStepSec) + + for patIndx, pat in enumerate(GeneralParams.patients): + + if dataset=='01_SWEC': + allFiles=np.sort(glob.glob(folderIn + 'ID' + pat + '/ID' + pat + '*_Labels*')) + if dataset=='02_CHB': + allFiles = np.sort(glob.glob(folderIn +'chb' + pat+ '/chb' + pat +'*_Labels*')) + + if dataset=='02_CHB' and pat=='02': + if '+' in allFiles[15][:]: #chb02_16+ + cpy = allFiles[15] + allFiles[15] = allFiles[16] + allFiles[16] = cpy + + #concatenating predictions so that predictions for one
seizure are based on train set with all seizures before it + for fIndx, fileName in enumerate(allFiles): + data=readDataFromFile(fileName) + if fIndx==0: + labels = np.squeeze(data) + testIndx=np.ones(len(data))*(fIndx+1) + else: + labels = np.hstack((labels, np.squeeze(data))) + testIndx= np.hstack((testIndx, np.ones(len(data))*(fIndx+1))) + + (yPred_SmoothOurStep2, yPred_SmoothOurStep1) = smoothenLabels(labels, seizureStableLenToTestIndx, seizureStablePercToTest, distanceBetweenSeizuresIndx) + + + #Plot predictions in time + fig1 = plt.figure(figsize=(16, 8), constrained_layout=False) + gs = GridSpec(4, 1, figure=fig1) + fig1.subplots_adjust(wspace=0.2, hspace=0.2) + xValues = np.arange(0, len(labels), 1) / (60*60*2) + ax1 = fig1.add_subplot(gs[0,0]) + ax1.plot(xValues, labels , 'r') + ax1.set_ylabel('TrueLabel') + ax1.set_title('Subj'+pat) + ax1.grid() + ax1 = fig1.add_subplot(gs[1, 0]) + ax1.plot(xValues, yPred_SmoothOurStep1, 'b') + ax1.set_ylabel('Step1') + ax1.grid() + ax1 = fig1.add_subplot(gs[2, 0]) + ax1.plot(xValues, yPred_SmoothOurStep2, 'm') + ax1.set_ylabel('Step2') + ax1.grid() + ax1 = fig1.add_subplot(gs[3, 0]) + ax1.plot(xValues, testIndx , 'k') + ax1.set_ylabel('FileNr') + ax1.grid() + ax1.set_xlabel('Time') + fig1.show() + fig1.savefig(folderOut + 'Subj' + pat + '_RawLabels.png', bbox_inches='tight') + plt.close(fig1) + + +def kl_divergence(p,q): + delta=0.000001 + deltaArr=np.ones(len(p))*delta + p=p+deltaArr + q=q+deltaArr + res=sum(p[i] * math.log2(p[i]/q[i]) for i in range(len(p))) + return res + +def js_divergence(p,q): + m=0.5* (p+q) + res=0.5* kl_divergence(p,m) +0.5* kl_divergence(q,m) + return (res) + + + +def saveDataAndLabelsToFile( data, labels, fileName, type): + outputName= fileName+'.csv' + df = pd.DataFrame(data=np.hstack((data, labels.reshape((-1, 1))))) + if (type=='gzip'): + df.to_csv(outputName + '.gz', index=False, compression='gzip') + else: + df.to_csv(outputName, index=False) + +def saveDataToFile( data, outputName, type): + if ('.csv' not in outputName): + outputName= outputName+'.csv' + df = pd.DataFrame(data=data) + if (type=='gzip'): + df.to_csv(outputName + '.gz', index=False, compression='gzip') + else: + df.to_csv(outputName, index=False) + +def readDataFromFile( inputName): + + if ('.h5' in inputName): + df = pd.read_hdf(inputName) + elif ('.csv.gz' in inputName): + df= pd.read_csv(inputName, compression='gzip') + else: + df= pd.read_csv(inputName) + data=df.to_numpy() + return (data) + + +def func_calculatePerformance_AppendingTestData_TSCV(folderIn, GeneralParams, SegSymbParams, dataset): + folderOut=folderIn +'PerformanceWithAppendedTests/' + createFolderIfNotExists(folderOut) + + AllSubjDiffPerf_test = np.zeros((len(GeneralParams.patients), 4* 9)) + seizureStableLenToTestIndx = int(GeneralParams.seizureStableLenToTest / SegSymbParams.slidWindStepSec) + seizureStablePercToTest = GeneralParams.seizureStablePercToTest + distanceBetweenSeizuresIndx = int(GeneralParams.distanceBetween2Seizures / SegSymbParams.slidWindStepSec) + numLabelsPerHour = 60 * 60 / SegSymbParams.slidWindStepSec + toleranceFP_bef = int(GeneralParams.toleranceFP_befSeiz / SegSymbParams.slidWindStepSec) + toleranceFP_aft = int(GeneralParams.toleranceFP_aftSeiz / SegSymbParams.slidWindStepSec) + for patIndx, pat in enumerate(GeneralParams.patients): + print('Patient: ' + pat) + filesAll = np.sort(glob.glob(folderIn + pat + '/Subj' + pat + '*TestPredictions.csv.gz')) + + numFiles=len(filesAll) + for cv in range(len(filesAll)): + fileName2 = 'Subj' + pat + '_cv' + 
str(cv).zfill(3) + print(fileName2) + data= readDataFromFile(filesAll[cv]) + dataSource_test = data[:, -1] + # indxs = np.where(dataSource_test == 1)[0] # 0 is the first next seizure after the trained one + if cv==0: + trueLabels_AllCV = data[:, 0] + probabLabels_AllCV=data[:,1] + predLabels_AllCV=data[ :,2] + #predictionsSmoothed=data[ indxs,3:-1] + dataSource_AllCV= data[ :,3] #np.ones(len(indxs))*(cv+1) + else: + trueLabels_AllCV = np.hstack((trueLabels_AllCV, data[:, 0])) + probabLabels_AllCV = np.hstack((probabLabels_AllCV, data[: ,1])) + predLabels_AllCV=np.hstack((predLabels_AllCV,data[:,2])) + #predictionsSmoothed = np.vstack((predictionsSmoothed,data[indxs, 3:-1])) + dataSource_AllCV= np.hstack((dataSource_AllCV, data[ :,3]))#np.ones(len(indxs))*(cv+1))) + + + (performanceTest, yPredTest_MovAvrgStep1, yPredTest_MovAvrgStep2,yPredTest_SmoothBayes) = calculatePerformanceAfterVariousSmoothing(predLabels_AllCV, trueLabels_AllCV,probabLabels_AllCV, + toleranceFP_bef, toleranceFP_aft, + numLabelsPerHour, + seizureStableLenToTestIndx, + seizureStablePercToTest, + distanceBetweenSeizuresIndx, + GeneralParams.bayesWind, + GeneralParams.bayesProbThresh) + dataToSave = np.vstack((trueLabels_AllCV, probabLabels_AllCV, predLabels_AllCV, yPredTest_MovAvrgStep1,yPredTest_MovAvrgStep2, yPredTest_SmoothBayes, dataSource_AllCV)).transpose() # added from which file is specific part of test set + outputName = folderOut + '/Subj' + pat + '_AppendedTest_Predictions.csv' + saveDataToFile(dataToSave, outputName, 'gzip') + + AllSubjDiffPerf_test[patIndx, :] =performanceTest + + #plot predictions in time + # Plot predictions in time + fig1 = plt.figure(figsize=(16, 8), constrained_layout=False) + gs = GridSpec(5, 1, figure=fig1) + fig1.subplots_adjust(wspace=0.2, hspace=0.2) + xValues = np.arange(0, len(trueLabels_AllCV), 1)*SegSymbParams.slidWindStepSec/3600 + ax1 = fig1.add_subplot(gs[0, 0]) + ax1.plot(xValues, predLabels_AllCV, 'k') + ax1.set_ylabel('NoSmooth') + ax1.set_title('Subj' + pat) + ax1.grid() + ax1 = fig1.add_subplot(gs[1, 0]) + ax1.plot(xValues, yPredTest_MovAvrgStep1 * 0.8, 'b') + ax1.plot(xValues, yPredTest_MovAvrgStep2, 'c') + ax1.set_ylabel('Step1&2') + ax1.grid() + ax1 = fig1.add_subplot(gs[2, 0]) + ax1.plot(xValues, yPredTest_SmoothBayes, 'm') + ax1.set_ylabel('Bayes') + ax1.grid() + ax1 = fig1.add_subplot(gs[3, 0]) + ax1.plot(xValues, trueLabels_AllCV, 'r') + ax1.set_ylabel('TrueLabel') + ax1.grid() + ax1 = fig1.add_subplot(gs[4, 0]) + ax1.plot(xValues, dataSource_AllCV, 'k') + ax1.set_ylabel('FileNr') + ax1.grid() + ax1.set_xlabel('Time (h)') + fig1.show() + fig1.savefig(folderOut + 'Subj' + pat + '_Appended_TestPredictions.png', bbox_inches='tight') + plt.close(fig1) + + # ------------------------------------------------------------------------------------------------ + font_size=11 + fig1 = plt.figure(figsize=(6, 3), constrained_layout=False) + gs = GridSpec(2, 1, figure=fig1) + fig1.subplots_adjust(wspace=0.2, hspace=0.2) + + xValues = np.arange(0, len(trueLabels_AllCV), 1)*SegSymbParams.slidWindStepSec/3600 + ax1 = fig1.add_subplot(gs[0, 0]) + ax1.plot(xValues, trueLabels_AllCV, color='g', alpha=0.75, linewidth=1) + ax1.set_ylabel('Label', fontsize=font_size) + # plt.yticks([0, 1], fontsize=font_size) + ax1.grid(axis='x', alpha=0.5, linestyle='--', linewidth=1) + ax1.spines['top'].set_visible(False) + ax1.spines['right'].set_visible(False) + ax1.spines['left'].set_visible(False) + ax1.spines['bottom'].set_visible(False) + ax1.set_xlim(left=0, right=np.max(xValues)) + # 
ax1.set_ylim(0, 1) + # ax1.legend('Predicted',fontsize=font_size-1) + plt.tick_params( + axis='both', # changes apply to the x-axis + which='both', # both major and minor ticks are affected + bottom=False, # ticks along the bottom edge are off + left=False, + labelbottom=False, # labels along the bottom edge are off + labelleft=False) + + ax1 = fig1.add_subplot(gs[1, 0]) + ax1.plot(xValues, yPredTest_SmoothBayes, color='r', alpha=0.75, linewidth=1) + ax1.set_ylabel('Predicted', fontsize=font_size) + + plt.yticks([0, 1], fontsize=font_size) + ax1.grid(axis='x', alpha=0.5, linestyle='--', linewidth=1) + ax1.spines['top'].set_visible(False) + ax1.spines['right'].set_visible(False) + ax1.spines['left'].set_visible(False) + # ax1.spines['bottom'].set_visible(False) + ax1.set_xlim(left=0, right=np.max(xValues)) + # ax1.set_ylim(0, 1) + + plt.tick_params( + axis='y', # changes apply to the x-axis + which='both', # both major and minor ticks are affected + left=False, # ticks along the bottom edge are off + labelleft=False) # labels along the bottom edge are off + # ax1.legend('Label',fontsize=font_size-1) + ax1.set_xlabel('Time (h)', fontsize=font_size) + # plt.xticks(xValues, xValues, fontsize=font_size) + + fig1.show() + + plt.savefig(folderOut+ dataset + '_Subj' + pat + '_Appended_TestPredictions_onlyBayes.pdf', bbox_inches='tight', dpi=200, format='pdf') + # fig1.savefig(folderOut + '/Subj' + pat + '_Appended_TestPredictions_onlyBayes.png', bbox_inches='tight') + plt.close(fig1) + + # SAVE PERFORMANCE MEASURES FOR ALL SUBJ + outputName = folderOut + 'AllSubj_AppendedTest_AllPerfMeasures.csv' + saveDataToFile(AllSubjDiffPerf_test, outputName, 'gzip') + + + + +def plotRearangedDataLabelsInTime(dataset, folderIn, GeneralParams,SegSymbParams): + folderOut=folderIn +'/LabelsInTime/' + createFolderIfNotExists(folderOut) + + print('Printing labels for TSCV reorganized files') + + seizureStableLenToTestIndx = int(GeneralParams.seizureStableLenToTest / SegSymbParams.slidWindStepSec) + seizureStablePercToTest = GeneralParams.seizureStablePercToTest + distanceBetweenSeizuresIndx = int(GeneralParams.distanceBetween2Seizures / SegSymbParams.slidWindStepSec) + numLabelsPerHour = 60 * 60 / SegSymbParams.slidWindStepSec + toleranceFP_bef = int(GeneralParams.toleranceFP_befSeiz / SegSymbParams.slidWindStepSec) + toleranceFP_aft = int(GeneralParams.toleranceFP_aftSeiz / SegSymbParams.slidWindStepSec) + + # CALCULATING SEIZURE INFORMATION FOR TABLE FOR PAPER + numSubj=len(GeneralParams.patients) + totalDataLenInHours=np.zeros((numSubj)) + testDataLenInHours=np.zeros((numSubj)) + firstFileDataLenInHours=np.zeros((numSubj)) + totalNumSeiz=np.zeros((numSubj)) + testNumSeiz=np.zeros((numSubj)) + + for patIndx, pat in enumerate(GeneralParams.patients): + if dataset=='01_SWEC': + allFiles=np.sort(glob.glob(folderIn + 'ID'+ pat +'/ID' + pat + '*_Labels*')) + if dataset=='02_CHB': + allFiles = np.sort(glob.glob(folderIn +'chb' + pat +'/chb' + pat +'*_Labels*')) + numFiles = len(allFiles) + + print('Pat = ', pat) + #concatinatin predictions so that predictions for one seizure are based on train set with all seizures before it + for fIndx, fileName in enumerate(allFiles): + data=readDataFromFile(fileName) + if fIndx==0: + labels = np.squeeze(data[:,-1]) + testIndx=np.ones(len(data[:,-1]))*(fIndx+1) + firstFileDataLenInHours[patIndx]= len(labels)/(2*60*60) + startIndx=np.where(np.diff(labels)==1)[0] + firstFileNumSeiz=len(startIndx) + else: + labels = np.hstack((labels, np.squeeze(data[:,-1]))) + testIndx = 
np.hstack((testIndx, np.ones(len(data[:,-1]))*(fIndx+1))) + + #Information for the paper table + totalDataLenInHours[patIndx] = len(labels)/(2*60*60) + startIndx=np.where(np.diff(labels)==1)[0] + totalNumSeiz[patIndx]=len(startIndx) + testNumSeiz[patIndx]=totalNumSeiz[patIndx]-firstFileNumSeiz + + #Plot predictions in time + fig1 = plt.figure(figsize=(16, 8), constrained_layout=False) + gs = GridSpec(2, 1, figure=fig1) + fig1.subplots_adjust(wspace=0.2, hspace=0.2) + xValues = np.arange(0, len(labels), 1) / (60*60*2) + ax1 = fig1.add_subplot(gs[0,0]) + ax1.plot(xValues, labels , 'r') + ax1.set_ylabel('TrueLabel') + ax1.set_title('Subj'+pat) + ax1.grid() + + ax1 = fig1.add_subplot(gs[1, 0]) + ax1.plot(xValues, testIndx , 'k') + ax1.set_ylabel('FileNr') + ax1.grid() + ax1.set_xlabel('Time [h]') + fig1.show() + fig1.savefig(folderOut + 'Subj' + pat + '_RawLabels.png', bbox_inches='tight') + plt.close(fig1) + + testDataLenInHours = totalDataLenInHours - firstFileDataLenInHours + dataToSave=np.vstack(( totalDataLenInHours, testDataLenInHours, totalNumSeiz, testNumSeiz )) + dataToSave=(dataToSave*10000).astype(int) /10000 + + outputName = folderOut + '/SeizureInfo_ForPaperTable' + saveDataToFile(dataToSave, outputName, 'csv') + + +#callback for the apply_async process parallelization +def collect_result(result): + global number_free_cores + global n_cores_semaphore + + while n_cores_semaphore==0: #block the callback in case of multiple concurrent accesses + pass + + if n_cores_semaphore: + n_cores_semaphore=0 + number_free_cores = number_free_cores+1 + n_cores_semaphore=1 + +if __name__ == "__main__": + pass + diff --git a/ZeroCrossLibrary.py b/ZeroCrossLibrary.py new file mode 100644 index 0000000..b514314 --- /dev/null +++ b/ZeroCrossLibrary.py @@ -0,0 +1,375 @@ +#!~/anaconda3/bin python + +'''script that includes all functions related to the use of the AZC feature''' + +__authors__ = "Una Pale, Renato Zanetti, and Tomas Teijeiro" +__license__ = "LGPL" +__version__ = "1.0" +__email__ = "{una.pale, renato.zaneti} at epfl.ch" + + +import os +import glob +# import csv +import matplotlib +matplotlib.use('Agg') #so that figures are not shown +from scipy.io import loadmat +from scipy.signal import decimate +from VariousFunctionsLib import * + +######################################################## +#global variables for multi-core operation +files_processed=[] +n_files_per_patient=[] +number_free_cores = 0 +number_free_cores_pat = 0 +n_cores_semaphore = 1 + + +########################################################################### +##defining functions needed +def polygonal_approx(arr, epsilon): + """ + Performs an optimized version of the Ramer-Douglas-Peucker algorithm assuming as an input + an array of single values, considered consecutive points, and **taking into account only the + vertical distances**. + """ + def max_vdist(arr, first, last): + """ + Obtains the distance and the index of the point in *arr* with maximum vertical distance to + the line delimited by the first and last indices. Returns a tuple (dist, index). + """ + if first == last: + return (0.0, first) + frg = arr[first:last+1] + leng = last-first+1 + dist = np.abs(frg - np.interp(np.arange(leng),[0, leng-1], [frg[0], frg[-1]])) + idx = np.argmax(dist) + return (dist[idx], first+idx) + + if epsilon <= 0.0: + raise ValueError('Epsilon must be > 0.0') + if len(arr) < 3: + return arr + result = set() + stack = [(0, len(arr) - 1)] + while stack: + first, last = stack.pop() + max_dist, idx = max_vdist(arr, first, last) + if max_dist > epsilon: + stack.extend([(first, idx),(idx, last)]) + else: + result.update((first, last)) + return np.array(sorted(result)) + +def zero_crossings(arr): + """Returns the positions of zero-crossings in the derivative of an array, as a binary vector""" + return np.diff(np.sign(np.diff(arr))) != 0 + + +def calculateMovingAvrgMeanWithUndersampling_v2(data, winLen, winStep): + lenSig=len(data) + index = np.arange(0, lenSig - winLen, winStep) + + segmData = np.zeros(len(index)) + for i in range(len(index)): #-1 + x = data[index[i]:index[i] + winLen] + segmData[i]=np.mean(x) + return(segmData)
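# A minimal usage sketch of the AZC idea with the two helpers above (illustration only,
# on a synthetic signal with an assumed epsilon; the real pipeline band-pass filters the
# EEG first and then averages the counts per analysis window, see readFileCalculateFeatures):
import numpy as np

fs = 256
t = np.arange(4 * fs) / fs
sig = np.sin(2 * np.pi * 3 * t) + 0.2 * np.random.randn(len(t))  # 3 Hz tone plus noise
# 1) simplify the signal, keeping only vertices that deviate more than epsilon vertically
keep = polygonal_approx(sig, epsilon=0.5)
sigApprox = np.interp(np.arange(len(sig)), keep, sig[keep])
# 2) count sign changes of the derivative (local extrema) accumulated over a 1 s window
azc_raw = np.convolve(zero_crossings(sig), np.ones(fs), mode='same')
azc_approx = np.convolve(zero_crossings(sigApprox), np.ones(fs), mode='same')
# the approximation suppresses small noisy oscillations, so azc_approx should
# typically stay below azc_raw
print(azc_raw.mean(), azc_approx.mean())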
def calculateFeaturesPerEachFile_CLFAndAZCFeatures_CHB(parallelize, perc_cores, EPS_thresh_arr,folderIn, folderOutFeatures, patients, SigInfoParams, ZeroCrossFeatureParams, calculateOtherFeat, calculateZCFeat): + global files_processed + global n_files_per_patient + global number_free_cores + + print('Extracting features from all subjects files') + + numFeat = len(EPS_thresh_arr) + 1 + + if parallelize: + n_cores = mp.cpu_count() + n_cores = ceil(n_cores*perc_cores) + print('Number of cores: ' + str(n_cores)) + + pool = mp.Pool(n_cores) + number_free_cores = n_cores + + files_processed=np.zeros(len(patients)) + n_files_per_patient = np.zeros(len(patients)) + + # go through all patients + for patIndx, pat in enumerate(patients): + filesIn=np.sort(glob.glob(folderIn + 'chb' + pat+ '/chb' + pat + '*.csv.gz')) #only the data files + numFiles=len(filesIn) + n_files_per_patient[patIndx]=numFiles + + print('Pat: ' + str(pat) + ' Nfiles: ' + str(numFiles)) + + folderOutFeaturesSub = folderOutFeatures + 'chb' + pat + '/' + + if not os.path.exists(folderOutFeaturesSub): + try: + os.mkdir(folderOutFeaturesSub) + except OSError: + print("Creation of the directory %s failed" % folderOutFeaturesSub) + + for fileIndx, file in enumerate(filesIn): + pom, fileName1 = os.path.split(file) + fileName2 = fileName1[:-7] + print('File: '+ fileName2 + ' NFilesprocessed: '+ str(files_processed[patIndx]) + ' Out of: ' + str(n_files_per_patient[patIndx])) + fileName3 = fileName2.split('_') + file_number_str = fileName3[1].zfill(3) #for file-reading proper ordering + + if '+' in fileName2 and pat=='02': #chb02_16+ + file_number_str = '0'+ file_number_str + + numFeat = len(EPS_thresh_arr) + 1 + + # reading data + data = readDataFromFile(file) + # separating to data and labels + X = data[:, SigInfoParams.chToKeep] + y = data[:, -1] + (lenData, numCh) = X.shape + labels = y[0:lenData - 2] + + index = np.arange(0, lenData - int(ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winLen), + int(ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winStep)) + + labelsSegm = calculateMovingAvrgMeanWithUndersampling_v2(labels, int( + ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winLen), int( + ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winStep)) + labelsSegm = (labelsSegm > 0.5) * 1 + + + if parallelize: + pool.apply_async(readFileCalculateFeatures, args=(patIndx, data, labelsSegm, index, numCh, numFeat, ZeroCrossFeatureParams, EPS_thresh_arr, calculateOtherFeat, calculateZCFeat, folderOutFeaturesSub, fileName3, file_number_str), callback=collect_result) + number_free_cores = number_free_cores -1 + if number_free_cores==0: + while number_free_cores==0: #synced in the callback + time.sleep(0.1) + pass + else: + readFileCalculateFeatures(patIndx, data, labelsSegm, index, numCh, numFeat, ZeroCrossFeatureParams, EPS_thresh_arr, calculateOtherFeat, calculateZCFeat, folderOutFeaturesSub, fileName3, file_number_str) + + if parallelize: + while number_free_cores < n_cores: #wait till all subjects have their data processed + time.sleep(0.1) + pass + pool.close() + pool.join()
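# How per-sample labels become per-window labels above: the mean label within each 4 s
# window (0.5 s step) is thresholded at 0.5, so a window is marked ictal only when more
# than half of its samples are ictal. A small self-contained check (illustrative values):
import numpy as np

fs, winLen, winStep = 256, 4, 0.5
labels = np.zeros(20 * fs); labels[6 * fs:9 * fs] = 1  # a 3 s seizure in 20 s of data
labelsSegm = calculateMovingAvrgMeanWithUndersampling_v2(
    labels, int(fs * winLen), int(fs * winStep))
labelsSegm = (labelsSegm > 0.5) * 1
print(np.where(labelsSegm == 1)[0])  # only windows whose majority overlaps the seizure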
def calculateFeaturesPerEachFile_CLFAndAZCFeatures_SWEC(parallelize, perc_cores, EPS_thresh_arr,folderIn, folderOutFeatures, patients, SigInfoParams, ZeroCrossFeatureParams, calculateOtherFeat, calculateZCFeat): + global files_processed + global n_files_per_patient + global number_free_cores + + print('Extracting features from all subjects files') + + numFeat = len(EPS_thresh_arr) + 1 + + diff_files_seiz=0 + + if parallelize: + n_cores = mp.cpu_count() + n_cores = ceil(n_cores*perc_cores) + print('Number of cores: ' + str(n_cores)) + + pool = mp.Pool(n_cores) + number_free_cores = n_cores + + files_processed=np.zeros(len(patients)) + n_files_per_patient = np.zeros(len(patients)) + + # go through all patients + for patIndx, pat in enumerate(patients): + filesIn=np.sort(glob.glob(folderIn + 'ID' + pat + '/'+ 'ID' + pat + '*h.mat')) #only the data files + numFiles=len(filesIn) + n_files_per_patient[patIndx]=numFiles + + print('Pat: ' + str(pat) + ' Nfiles: ' + str(numFiles)) + + f = loadmat(folderIn + 'ID' + pat + '/'+ 'ID' + pat +'_info.mat', simplify_cells=True) + seizure_begin = np.array(f['seizure_begin']) #given in seconds + seizure_end = np.array(f['seizure_end']) + fs = np.array(f['fs']).astype(int) + + file_index = (seizure_begin/3600 + 1).astype(int) #add 1 to map seizure start times to 1 h file indexes + print('file_index: ' + str(file_index)) + seiz_count=0 + + folderOutFeaturesSub = folderOutFeatures + 'ID' + pat + '/' + + if not os.path.exists(folderOutFeaturesSub): + try: + os.mkdir(folderOutFeaturesSub) + except OSError: + print("Creation of the directory %s failed" % folderOutFeaturesSub) + + for fileIndx, file in enumerate(filesIn): + pom, fileName1 = os.path.split(file) + fileName2 = os.path.splitext(fileName1)[0] + print('File: '+ fileName2 + ' NFilesprocessed: '+ str(files_processed[patIndx]) + ' Out of: ' + str(n_files_per_patient[patIndx])) + fileName3 = fileName2.split('_') + file_number_str = fileName3[1][:-1] + file_number = int(file_number_str) + + file_number_str = file_number_str.zfill(3) #for file-reading proper ordering + + numFeat = len(EPS_thresh_arr) + 1 + + # reading data + reader = loadmat(file, simplify_cells=True) + data0 = np.array(reader['EEG']).astype("float").T + + #downsample to the expected processing frequency + data = decimate(data0, int(fs/ZeroCrossFeatureParams.samplFreq), axis=0, zero_phase=True) + + (lenData, numCh) = data.shape + y=np.zeros((lenData,)) + + if diff_files_seiz: #previous file had a seizure which spans into the current file + y[old_beg:old_end]=1 + diff_files_seiz=0 + + idx_seiz = np.where(file_index==file_number)[0] + for i, v in enumerate(idx_seiz): + print('Seizure found at file: ' + str(file_index[v])) + beg = int((seizure_begin[v]%3600*fs)*(ZeroCrossFeatureParams.samplFreq/fs)) + end = int((seizure_end[v]%3600*fs)*(ZeroCrossFeatureParams.samplFreq/fs)) + + if end < beg: #seizure spans two files + diff_files_seiz = 1 + old_end = end + old_beg = 0 + end = int((3600*fs)*(ZeroCrossFeatureParams.samplFreq/fs)) + + y[beg:end]=1 + + + labels = y[0:lenData - 2] + # windowing the data: 'index' contains start:stop indexes + index = np.arange(0, lenData - int(ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winLen), + int(ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winStep)) + + #'labels' contains one value per datapoint. 'labelsSegm' contains one value per window (> 50% ictal data --> ictal label) + labelsSegm = calculateMovingAvrgMeanWithUndersampling_v2(labels, int( + ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winLen), int( + ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winStep)) + labelsSegm = (labelsSegm > 0.5) * 1 + + if parallelize: + pool.apply_async(readFileCalculateFeatures, args=(patIndx, data, labelsSegm, index, numCh, numFeat, ZeroCrossFeatureParams, EPS_thresh_arr, calculateOtherFeat, calculateZCFeat, folderOutFeaturesSub, fileName3, file_number_str), callback=collect_result) + number_free_cores = number_free_cores -1 + if number_free_cores==0: + while number_free_cores==0: #synced in the callback 'collect_result' + time.sleep(0.1) + pass + else: + readFileCalculateFeatures(patIndx, data, labelsSegm, index, numCh, numFeat, ZeroCrossFeatureParams, EPS_thresh_arr, calculateOtherFeat, calculateZCFeat, folderOutFeaturesSub, fileName3, file_number_str) + + if parallelize: + while number_free_cores < n_cores: #wait till all subjects have their data processed + time.sleep(0.1) + pass + + pool.close() + pool.join()
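# The SWEC labels above are built from seizure times given in seconds: each seizure is
# assigned to a 1 h file, and its within-hour offset is rescaled from the native rate fs
# to the processing rate. A compact sketch of that mapping (hypothetical numbers;
# to_file_sample is illustrative only, not part of this library):
def to_file_sample(t_sec, fs_native, fs_proc):
    """Map an absolute seizure time to (1-based file index, sample index in that file)."""
    file_idx = int(t_sec // 3600) + 1  # each file holds one hour of data
    sample = int((t_sec % 3600) * fs_native * (fs_proc / fs_native))
    return file_idx, sample

# a seizure starting 2 h 30 min into the recording, native 512 Hz decimated to 256 Hz
print(to_file_sample(9000, 512, 256))  # -> (3, 460800), i.e. file 3, sample 1800 s * 256 Hz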
def readFileCalculateFeatures(patIndx, data, labelsSegm, index, numCh, numFeat, ZeroCrossFeatureParams, EPS_thresh_arr, calculateOtherFeat, calculateZCFeat, folderOutFeaturesSub, fileName3, file_number_str): + + #band-pass filter 1-20 Hz, applied zero-phase + sos = signal.butter(4, [1, 20], 'bandpass', fs=ZeroCrossFeatureParams.samplFreq, output='sos') + allsigFilt = signal.sosfiltfilt(sos, data, axis=0) + del data + zeroCrossStandard = np.zeros((len(index), numCh)) + zeroCrossApprox = np.zeros((len(index), numCh)) + zeroCrossFeaturesAll = np.zeros((len(index), numFeat * numCh)) + for ch in range(numCh): + + # t=time.time() + sigFilt=allsigFilt[:,ch] + # calculate CLF + if (calculateOtherFeat==1): + featOther = calculateOtherMLfeatures_oneCh(np.copy(sigFilt), ZeroCrossFeatureParams) + if (ch == 0): + AllFeatures = featOther + else: + AllFeatures = np.hstack((AllFeatures, featOther)) + + #AZC + if (calculateZCFeat==1): + # Zero-crossing of the original signal, counted in 1-second continuous sliding window + # zeroCrossStandard[:,ch] = np.convolve(zero_crossings(sigFilt), np.ones(ZeroCrossFeatures.samplFreq), mode='same') + x = np.convolve(zero_crossings(sigFilt), np.ones(ZeroCrossFeatureParams.samplFreq), mode='same') + # zeroCrossStandard[:,ch] =calculateMovingAvrgMeanWithUndersampling_v2(x, ZeroCrossFeatureParams.samplFreq) + zeroCrossStandard[:, ch] = calculateMovingAvrgMeanWithUndersampling_v2(x, int( + ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winLen), int( + ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winStep)) + zeroCrossFeaturesAll[:, numFeat * ch] = zeroCrossStandard[:, ch] + + + for EPSthrIndx, EPSthr in enumerate(EPS_thresh_arr): + # Signal simplification at the given threshold, and zero crossing count in the same way + sigApprox = polygonal_approx(sigFilt, epsilon=EPSthr) + # axs[0].plot(sigApprox, sigFilt[sigApprox], alpha=0.6) + sigApproxInterp = np.interp(np.arange(len(sigFilt)), sigApprox, sigFilt[sigApprox]) + # 
zeroCrossApprox[:,ch] = np.convolve(zero_crossings(sigApproxInterp), np.ones(ZeroCrossFeatures.samplFreq), mode='same') + x = np.convolve(zero_crossings(sigApproxInterp), np.ones(ZeroCrossFeatureParams.samplFreq), + mode='same') + # zeroCrossApprox[:, ch] = calculateMovingAvrgMeanWithUndersampling_v2(x, ZeroCrossFeatureParams.samplFreq) + zeroCrossApprox[:, ch] = calculateMovingAvrgMeanWithUndersampling_v2(x, int( + ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winLen), int( + ZeroCrossFeatureParams.samplFreq * ZeroCrossFeatureParams.winStep)) + # save to matrix with all features that will be saved + zeroCrossFeaturesAll[:, numFeat * ch + EPSthrIndx + 1] = zeroCrossApprox[:, ch] + + if (calculateZCFeat == 1): + # save for this file Zero cross features + df = pd.DataFrame(data=zeroCrossFeaturesAll) + outputName = folderOutFeaturesSub + fileName3[0]+ '_' + file_number_str+ 'h_ZCFeat.csv.gz' + #to read from disk: df = pd.read_csv('dfsavename.csv.gz', compression='gzip') + df.to_csv(outputName, index=False, compression='gzip') + # np.savetxt(outputName, zeroCrossFeaturesAll, delimiter=",") + + if (calculateOtherFeat == 1): + # save for this file all other features + df = pd.DataFrame(data=AllFeatures) + outputName = folderOutFeaturesSub + fileName3[0]+ '_' + file_number_str+ 'h_OtherFeat.csv.gz' + df.to_csv(outputName, index=False, compression='gzip') + # np.savetxt(outputName, AllFeatures, delimiter=",") + + # save for this file labels + df = pd.DataFrame(data=labelsSegm) + outputName = folderOutFeaturesSub + fileName3[0]+ '_' + file_number_str+ 'h_Labels.csv.gz' + df.to_csv(outputName, index=False, compression='gzip') + # np.savetxt(outputName, labelsSegm, delimiter=",") + + return (patIndx) + + + +#callback for the apply_async process paralization +def collect_result(result): + global files_processed + global number_free_cores + global n_cores_semaphore + global n_files_per_patient + + while n_cores_semaphore==0: #block callback in case of multiple accesses + pass + + if n_cores_semaphore: + n_cores_semaphore=0 + files_processed[result] += 1 + number_free_cores = number_free_cores+1 + n_cores_semaphore=1 + +if __name__ == "__main__": + pass diff --git a/bash_run_python_variosPat.sh b/bash_run_python_variosPat.sh new file mode 100644 index 0000000..1eefcba --- /dev/null +++ b/bash_run_python_variosPat.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +for i in {1..18} #! e.g. of use: 1, {1,5,9,11,24}, and {1..18}. 
+do + python script_AZC_Classification.py $i & +done \ No newline at end of file diff --git a/parametersSetup.py b/parametersSetup.py new file mode 100644 index 0000000..69e94b6 --- /dev/null +++ b/parametersSetup.py @@ -0,0 +1,86 @@ +#!~/anaconda3/bin python + +__author__ = "Una Pale" +__credits__ = ["Una Pale", "Renato Zanetti"] +__license__ = "LGPL" +__version__ = "1.0" +__email__ = "una.pale at epfl.ch" + +''' file with all parameters''' +import numpy as np +import pickle + +class GeneralParams: + #Label smoothing with moving average + seizureStableLenToTest=5 #in seconds - window for performing label voting + seizureStablePercToTest=0.5 #fraction of 1 labels in the last seizureStableLenToTest values needed to keep label 1 + distanceBetween2Seizures=30 #in seconds - if seizures are closer than this they are merged + timeBeforeSeizureConsideredAsSeizure=30 #in seconds - a detection starting this soon before the true seizure is still considered correct + numFPperDayThr=1 #number of FP seizures per day considered acceptable (additional performance measure) + #Label smoothing with bayes + bayesWind=10 #original paper uses 5 windows + bayesProbThresh= 1.5 #smoothing with cumulative probabilities, threshold from Valentin's paper + #tolerance before and after a seizure in which detections are not counted as FP + toleranceFP_befSeiz=10 #in sec + toleranceFP_aftSeiz=30 #in sec + + patients=[] #on which subjects to train and test + plottingON=0 #determines whether some additional plots are plotted + PersGenApproach='personalized' #'personalized', 'generalized' approaches - generalized not used here + +class SigInfoParams: + #CHB-MIT: common channels among all subjects + channels = ['FP1-F7', 'F7-T7', 'T7-P7', 'P7-O1', 'FP1-F3', 'F3-C3', 'C3-P3', 'P3-O1', + 'FP2-F4', 'F4-C4', 'C4-P4', 'P4-O2', 'FP2-F8', 'F8-T8', 'T8-P8', 'P8-O2', 'FZ-CZ', 'CZ-PZ'] + chToKeep=np.array([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]) #which channels to keep - after reordering them in the above order + samplFreq=256 #sampling frequency of data + +class SegSymbParams: + # window discretization + segLenSec=4 #length of discrete EEG windows on which to perform analysis + slidWindStepSec=0.5 #step of sliding window + labelVotingType='majority' #'majority', 'atLeastOne' or 'allOne' #defines how the final label of a segment is chosen + +class StandardMLParams: + modelType='DecisionTree' #'KNN', 'SVM', 'DecisionTree', 'RandomForest','BaggingClassifier','AdaBoost' + trainingDataResampling='NoResampling' #'NoResampling','ROS','RUS','TomekLinks','ClusterCentroids','SMOTE','SMOTEtomek' + samplingStrategy='default' # 'default' uses each resampler's default strategy; a custom value is currently implemented only for RUS + #KNN parameters + KNN_n_neighbors=5 + KNN_metric='euclidean' #'euclidean', 'manhattan' + #SVM parameters + SVM_kernel = 'linear' # 'linear', 'rbf','poly' + SVM_C = 1 # 1,100,1000 + SVM_gamma = 'auto' # 0 # 0,10,100 + #DecisionTree and random forest parameters + DecisionTree_criterion = 'gini' # 'gini', 'entropy' + DecisionTree_splitter = 'best' # 'best','random' + DecisionTree_max_depth = 0 # 0, 2, 5,10,20 + RandomForest_n_estimators = 100 #10,50, 100,250 + #Bagging, boosting classifier parameters + Bagging_base_estimator='SVM' #'SVM','KNN', 'DecisionTree' + Bagging_n_estimators = 100 # 10,50, 100,250 + n_jobs = 1
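# train_StandardML_moreModelsPossible (defined in VariousFunctionsLib.py) consumes the
# settings above. A minimal sketch of how the RandomForest branch could be instantiated
# with scikit-learn (illustrative only; build_model is not part of this repository):
from sklearn.ensemble import RandomForestClassifier

def build_model(params=StandardMLParams):
    if params.modelType == 'RandomForest':
        # DecisionTree_max_depth == 0 is taken here to mean 'no depth limit'
        max_depth = params.DecisionTree_max_depth if params.DecisionTree_max_depth > 0 else None
        return RandomForestClassifier(n_estimators=params.RandomForest_n_estimators,
                                      criterion=params.DecisionTree_criterion,
                                      max_depth=max_depth,
                                      n_jobs=params.n_jobs)
    raise NotImplementedError(params.modelType)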
+class FeaturesParams: + numStandardFeat=62 #56 features from Sopic2018 + 6 AZC features + featNorm='noNorm' #'Norm','noNorm' #feature normalization flag + + +class ZeroCrossFeatureParams: + EPS_thresh_arr = [16, 32, 64, 128, 256] #thresholds for AZC calculation + buttFilt_order=4 #Butterworth filter order + buttFilt_lfreq = 1 #low freq + buttFilt_hfreq=20 #high freq + samplFreq=256 #expected sampling frequency for the input signal + winLen=4 # data window length in sec + winStep=0.5 # window step in sec, i.e. the amount of new data per window + + +#SAVING SETUP once again, to update it if there is new info +with open('../PARAMETERS.pickle', 'wb') as f: + # pickle.dump([GeneralParams, SegSymbParams, SigInfoParams, EEGfreqBands, StandardMLParams, FeaturesParams, ZeroCrossFeatureParams, patients], f) + pickle.dump([GeneralParams, SegSymbParams, SigInfoParams, StandardMLParams, FeaturesParams, ZeroCrossFeatureParams], f) + + \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..b55fecb --- /dev/null +++ b/readme.md @@ -0,0 +1,27 @@ +This repository includes the code used for the paper entitled *Approximate Zero-Crossing: a new interpretable, highly discriminative and low-complexity feature for EEG and iEEG seizure detection* + +License: LGPL + +We have used the Anaconda toolset, including the following packages (original and added later): + + - Python (v3.9.7) + - numpy (v1.20.3) + - pandas (v1.3.5) + - scikit-learn (v1.0.2) + - scipy (v1.7.3) + - seaborn (v0.11.2) + - matplotlib (v3.5.0) + - pyedflib (v0.1.25) + - antropy (v0.1.4) + - PyWavelets (v1.1.1) + +Script descriptions: + 1) `script_AZC_datasetPreProcessing.py`: **to be executed first**; performs feature extraction and data preparation for the TSCV execution. + 2) `script_AZC_FeatDivergence.py`: calculates the KL divergence and generates the related plots. + 3) `script_AZC_Classification.py`: used for seizure classification (TSCV approach). + 4) `script_AZC_ConsolidateResults.py`: consolidates the results and produces the related plots. + +All scripts require a few user settings before execution (e.g., the path of the folder containing the datasets) and also observe the parameters set in `parametersSetup.py`. + +It's important to note that the script `script_AZC_datasetPreProcessing.py` employs a pool of processing cores when the variable **parallelize** is set. Similar behaviour can be achieved for the `script_AZC_Classification.py` script, but in that case you should launch the execution via **bash** using the `bash_run_python_variosPat.sh` script. 
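The parallel mode described above follows one pattern throughout the repo: `mp.Pool.apply_async` with a completion callback that returns a core to a shared counter, while the submitting loop throttles itself when no core is free. A minimal self-contained sketch of that pattern (illustrative only; `process_file` and `on_done` are hypothetical names, not part of the repository):

import multiprocessing as mp
import time

number_free_cores = 0

def process_file(idx):
    time.sleep(0.1)  # stand-in for feature extraction on one file
    return idx

def on_done(result):
    global number_free_cores
    number_free_cores += 1  # free a core when a file finishes

if __name__ == '__main__':
    n_cores = 4
    number_free_cores = n_cores
    pool = mp.Pool(n_cores)
    for idx in range(10):
        pool.apply_async(process_file, args=(idx,), callback=on_done)
        number_free_cores -= 1
        while number_free_cores == 0:  # throttle until the callback frees a core
            time.sleep(0.1)
    while number_free_cores < n_cores:  # wait for the last tasks to finish
        time.sleep(0.1)
    pool.close()
    pool.join()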
+ diff --git a/script_AZC_Classification.py b/script_AZC_Classification.py new file mode 100644 index 0000000..e34a5a4 --- /dev/null +++ b/script_AZC_Classification.py @@ -0,0 +1,250 @@ +#!~/anaconda3/bin python + +__authors__ = "Una Pale and Renato Zanetti" +__license__ = "LGPL" +__version__ = "1.0" +__email__ = "{una.pale, renato.zaneti} at epfl.ch" + +''' script that performs seizure detection based on the TSCV approach''' + +from VariousFunctionsLib import * +from parametersSetup import * +from PerformanceMetricsLib import * + +import sys +import tracemalloc + +################################### +# Initial setup +################################### + +Dataset='01_SWEC' #'01_SWEC', '02_CHB' #select the dataset to be processed +Server = 1 #0 - local execution, 1- server execution +bash = 1 #using bash to launch script execution: patient as input, hence we can parellelize in more cores +datasetPreparationType='Filt_1to20Hz' # 'Raw', 'Filt_1to20Hz' + +if bash: + if int(str(sys.argv[1]))<10: + patients=['0'+str(sys.argv[1])] + else: + patients=[str(sys.argv[1])] + +# DEFINING INPUT/OUTPUT FOLDERS +if Dataset=='01_SWEC': + if Server: + folderInData = '/shares/eslfiler1/databases/medical/swec-ethz/ieeg/long-term-dataset/' + os.nice(20) #set low priority level for longer-running process + else: + folderInData = '../iEEG/' + + if bash==0: + patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18'] + +if Dataset=='02_CHB': + if Server: + folderInData = '/shares/eslfiler1/databases/medical/chb-mit/edf/' #expect all subject's files in the same directory + os.nice(20) #set low priority level for longer-running process + else: + folderInData = '../chbmit/' + + if bash==0: + patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24'] + +#folder that store partial results +if Server: + folderResults = '/shares/eslfiler1/scratch/ZeroCrossFeatEpilepsy/' +else: + folderResults = '../AZC_results/' + +# ########################################################################################### +#SETUPS +GeneralParams.patients= patients +GeneralParams.plottingON=0 +GeneralParams.PersGenApproach='personalized' #'personalized', 'generalized' +StandardMLParams.modelType = 'RandomForest' +FeaturesParams.numStandardFeat=62 #56 (Sopic2018) + 6 AZC + +############################### +## FEATURE SET +#features subset for evaluation +FeatSetNamesArray = ['StandardFeat'] #[ 'ZeroCross', 'StandardFeat'] + +# ############################### +# DEFINING INPUT/OUTPUT FOLDERS +windowSize=60*60 #files would have 1h data +folderOutTSCV=folderResults+Dataset+'/03_Features_' +datasetPreparationType+'_TSCVprep_'+str(windowSize) +'s_DivBig/' +folderOutFeatures= folderResults+Dataset+'/02_Features_' +datasetPreparationType + '/' +############################################################################################################### +# RUNNING FOR ALL COMBINATIONS OF FEATURES THAT ARE DEFINED ABOVE + +featNamesAll=np.array(['meanAmpl', 'LineLength', 'samp_1_d7_1', 'samp_1_d6_1', 'samp_2_d7_1', 'samp_2_d6_1', 'perm_d7_3', + 'perm_d7_5', 'perm_d7_7', 'perm_d6_3', 'perm_d6_5', 'perm_d6_7', 'perm_d5_3', 'perm_d5_5', 'perm_d5_7', + 'perm_d4_3', 'perm_d4_5', 'perm_d4_7', 'perm_d3_3', 'perm_d3_5', 'perm_d3_7', + 'shannon_en_sig', 'renyi_en_sig', 'tsallis_en_sig', 'shannon_en_d7', 'renyi_en_d7', 'tsallis_en_d7', + 'shannon_en_d6', 'renyi_en_d6', 'tsallis_en_d6', 'shannon_en_d5', 'renyi_en_d5', 'tsallis_en_d5', + 
'shannon_en_d4', 'renyi_en_d4', 'tsallis_en_d4', 'shannon_en_d3', 'renyi_en_d3', 'tsallis_en_d3', + 'p_dc_rel', 'p_mov_rel', 'p_delta_rel', 'p_theta_rel', 'p_alfa_rel', 'p_middle_rel', 'p_beta_rel', + 'p_gamma_rel', 'p_dc', 'p_mov', 'p_delta', 'p_theta', 'p_alfa', 'p_middle', 'p_beta', 'p_gamma', 'p_tot', + 'ZC_NoApprox', 'ZC_approx_16', 'ZC_approx_32', 'ZC_approx_64', 'ZC_approx_128', 'ZC_approx_256']) + +for fs in range( len(FeatSetNamesArray)): + FeatureSetName=FeatSetNamesArray[fs] + + if FeatureSetName=='All': + inportCLF = 1 + inportAZC = 1 + featNames = featNamesAll + elif FeatureSetName=='StandardFeat': + inportCLF = 1 + inportAZC = 0 + featNames = np.array(['meanAmpl', 'LineLength', 'samp_1_d7_1', 'samp_1_d6_1', 'samp_2_d7_1', 'samp_2_d6_1', 'perm_d7_3', + 'perm_d7_5', 'perm_d7_7', 'perm_d6_3', 'perm_d6_5', 'perm_d6_7', 'perm_d5_3', 'perm_d5_5', 'perm_d5_7', + 'perm_d4_3', 'perm_d4_5', 'perm_d4_7', 'perm_d3_3', 'perm_d3_5', 'perm_d3_7', + 'shannon_en_sig', 'renyi_en_sig', 'tsallis_en_sig', 'shannon_en_d7', 'renyi_en_d7', 'tsallis_en_d7', + 'shannon_en_d6', 'renyi_en_d6', 'tsallis_en_d6', 'shannon_en_d5', 'renyi_en_d5', 'tsallis_en_d5', + 'shannon_en_d4', 'renyi_en_d4', 'tsallis_en_d4', 'shannon_en_d3', 'renyi_en_d3', 'tsallis_en_d3', + 'p_dc_rel', 'p_mov_rel', 'p_delta_rel', 'p_theta_rel', 'p_alfa_rel', 'p_middle_rel', 'p_beta_rel', + 'p_gamma_rel', 'p_dc', 'p_mov', 'p_delta', 'p_theta', 'p_alfa', 'p_middle', 'p_beta', 'p_gamma', 'p_tot']) + + elif FeatureSetName == 'ZeroCross': + inportCLF = 0 + inportAZC = 1 + featNames = np.array(['ZC_NoApprox', 'ZC_approx_16', 'ZC_approx_32', 'ZC_approx_64', 'ZC_approx_128', 'ZC_approx_256']) + else: + inportCLF = 1 + inportAZC = 1 + featNames = np.array( + ['meanAmpl', 'LineLength', 'samp_1_d7_1', 'samp_1_d6_1', 'samp_2_d7_1', 'samp_2_d6_1', 'perm_d7_3', + 'perm_d7_5', 'perm_d7_7', 'perm_d6_3', 'perm_d6_5', 'perm_d6_7', 'perm_d5_3', 'perm_d5_5', 'perm_d5_7', + 'perm_d4_3', 'perm_d4_5', 'perm_d4_7', 'perm_d3_3', 'perm_d3_5', 'perm_d3_7', + 'shannon_en_sig', 'renyi_en_sig', 'tsallis_en_sig', 'shannon_en_d7', 'renyi_en_d7', 'tsallis_en_d7', + 'shannon_en_d6', 'renyi_en_d6', 'tsallis_en_d6', 'shannon_en_d5', 'renyi_en_d5', 'tsallis_en_d5', + 'shannon_en_d4', 'renyi_en_d4', 'tsallis_en_d4', 'shannon_en_d3', 'renyi_en_d3', 'tsallis_en_d3', + 'p_dc_rel', 'p_mov_rel', 'p_delta_rel', 'p_theta_rel', 'p_alfa_rel', 'p_middle_rel', 'p_beta_rel', + 'p_gamma_rel', 'p_dc', 'p_mov', 'p_delta', 'p_theta', 'p_alfa', 'p_middle', 'p_beta', 'p_gamma', 'p_tot', + 'ZC_NoApprox', 'ZC_approx_16', 'ZC_approx_32', 'ZC_approx_64', 'ZC_approx_128', 'ZC_approx_256']) + + + #creating output folder + folderOut = folderResults + Dataset + '/05_RF_TSCV/' + createFolderIfNotExists(folderOut) + folderOut = folderOut + datasetPreparationType +FeatureSetName +'/' + createFolderIfNotExists(folderOut) + + ############################################################################################ + seizureStableLenToTestIndx = int(GeneralParams.seizureStableLenToTest / SegSymbParams.slidWindStepSec) + seizureStablePercToTest = GeneralParams.seizureStablePercToTest + distanceBetweenSeizuresIndx = int(GeneralParams.distanceBetween2Seizures / SegSymbParams.slidWindStepSec) + numLabelsPerHour = 60 * 60 / SegSymbParams.slidWindStepSec + toleranceFP_bef = int(GeneralParams.toleranceFP_befSeiz / SegSymbParams.slidWindStepSec) + toleranceFP_aft = int(GeneralParams.toleranceFP_aftSeiz / SegSymbParams.slidWindStepSec) + tracemalloc.start() + for patIndx, pat in 
enumerate(GeneralParams.patients): + print('Patient:', pat) + #load all files only once and mark where each file starts + + if Dataset=='01_SWEC': + filesAll=np.sort(glob.glob(folderOutFeatures + 'ID' + pat + '/ID' + pat + '*Labels.csv.gz')) + labelsAll = concatenateDataFromFiles(filesAll).reshape(-1) + nrows = np.size(labelsAll) + + if inportCLF: + filesAll=np.sort(glob.glob(folderOutFeatures + 'ID' + pat + '/ID' + pat + '*OtherFeat.csv.gz')) + OtherFeat = concatenateDataFromFiles_AllocateFirst(filesAll, nrows) + + if inportAZC: + filesAll=np.sort(glob.glob(folderOutFeatures + 'ID' + pat + '/ID' + pat + '*ZCFeat.csv.gz')) + ZCFeat = concatenateDataFromFiles_AllocateFirst(filesAll, nrows) + + fileNameIndexes = folderOutTSCV + 'ID' + pat + '/ID' + pat + '_TSCV_indexes.csv' + + if Dataset=='02_CHB': + filesAll = np.sort(glob.glob(folderOutFeatures +'chb' + pat+ '/chb' + pat + '*Labels.csv.gz')) + #chb02_16+ comes before chb02_16 when sorting the files + if pat=='02' and '+' in filesAll[15][:]: #chb02_16+ + cpy = filesAll[15] + filesAll[15] = filesAll[16] + filesAll[16] = cpy + + labelsAll = concatenateDataFromFiles(filesAll).reshape(-1) + nrows = np.size(labelsAll) + if inportCLF: + filesAll = np.sort(glob.glob(folderOutFeatures +'chb' + pat+ '/chb' + pat + '*OtherFeat.csv.gz')) + if pat=='02' and '+' in filesAll[15][:]: #chb02_16+ + cpy = filesAll[15] + filesAll[15] = filesAll[16] + filesAll[16] = cpy + OtherFeat = concatenateDataFromFiles_AllocateFirst(filesAll, nrows) + + if inportAZC: + filesAll = np.sort(glob.glob(folderOutFeatures +'chb' + pat+ '/chb' + pat + '*ZCFeat.csv.gz')) + if pat=='02' and '+' in filesAll[15][:]: #chb02_16+ + cpy = filesAll[15] + filesAll[15] = filesAll[16] + filesAll[16] = cpy + ZCFeat = concatenateDataFromFiles_AllocateFirst(filesAll, nrows) + + fileNameIndexes = folderOutTSCV +'chb' + pat+ '/chb' + pat + '_TSCV_indexes.csv' + + folderOutPat = folderOut + pat + '/' + createFolderIfNotExists(folderOutPat) + #select the data to be used + startIndxOfFiles = np.loadtxt(fileNameIndexes).astype(int) #startIndxOfFiles holds the TSCV file boundaries, assuming all the subject's data is stacked in the same matrix + + if inportAZC and inportCLF: #in case we use both features + numCh=int(len(ZCFeat[0, :])/(len(ZeroCrossFeatureParams.EPS_thresh_arr)+1)) + dataAll= mergeFeatFromTwoMatrixes(OtherFeat, ZCFeat, numCh) + del OtherFeat #as all data is merged into another matrix, we can free the previously allocated memory + del ZCFeat + elif inportAZC: + dataAll = ZCFeat + else: + dataAll = OtherFeat + + n_cores = mp.cpu_count() + StandardMLParams.n_jobs = int(n_cores/24) #split the available cores among subjects, assuming all the patients are processed at once + + if StandardMLParams.n_jobs < 2: + StandardMLParams.n_jobs=2 #at least 2 cores allocated for training + + # remove nan and inf from the matrix + dataAll[np.where(np.isinf(dataAll))] = np.nan + col_mean = np.nanmean(dataAll, axis=0) + inds = np.where(np.isnan(dataAll)) + dataAll[inds] = np.take(col_mean, inds[1]) + # if still somewhere nan, replace with 0 + dataAll[np.where(np.isnan(dataAll))] = 0 + + + current, peak = tracemalloc.get_traced_memory() + print('peak memory (GB): ' + str(peak/1024**3)) + + numCV = len(startIndxOfFiles) - 1 #the first position is the beginning of the testing data and the last position is the length of the total input data + performanceTrain = np.zeros((numCV, 4*9)) # 4 smoothing variants (noSmooth, step1, step2, bayes) x 9 performance measures + performanceTest = np.zeros((numCV, 4*9)) # 4 smoothing variants x 9 performance measures + performanceTest_justNext = np.zeros(( numCV-1, 4*9)) # 4 smoothing variants x 9 performance measures
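# The loop below implements the TSCV scheme: for split cv, all data before
# startIndxOfFiles[cv] is the (expanding) training set and the single block ending at
# startIndxOfFiles[cv+1] is the test set. A toy illustration (boundary values made up):
import numpy as np

startIndxOfFiles_demo = np.array([100, 250, 400])  # hypothetical block boundaries
for cv in range(len(startIndxOfFiles_demo) - 1):
    train_rows = (0, startIndxOfFiles_demo[cv])                             # expanding train window
    test_rows = (startIndxOfFiles_demo[cv], startIndxOfFiles_demo[cv + 1])  # next block only
    print('cv', cv, 'train', train_rows, 'test', test_rows)
# cv 0: train rows [0, 100), test rows [100, 250); cv 1: train rows [0, 250), test rows [250, 400)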
+ for cv in range(numCV): + print('Pat', pat, 'CV', cv+1, 'Out of', numCV, 'CVs') + + # create train and test data + dataTest = dataAll[startIndxOfFiles[cv]:startIndxOfFiles[cv + 1], :] # the test data is the single file right after the training period + label_test = labelsAll[startIndxOfFiles[cv]:startIndxOfFiles[cv + 1]] + dataTrain = dataAll[0:startIndxOfFiles[cv], :] + label_train = labelsAll[0:startIndxOfFiles[cv]] + dataSource_test = (cv + 1) * np.ones( (startIndxOfFiles[cv + 1] - startIndxOfFiles[cv])) # value 1 means the file after the trained one + + # initializing the model and then training + MLmodel = train_StandardML_moreModelsPossible(dataTrain, label_train, StandardMLParams) + + #testing + (predLabels_test, probabLab_test, acc_test, accPerClass_test)= test_StandardML_moreModelsPossible_v3(dataTest,label_test, MLmodel) + + # save predictions + fileName2='Subj' + pat + '_cv'+str(cv).zfill(3) + dataToSave = np.vstack((label_test, probabLab_test, predLabels_test, dataSource_test)).transpose() # also record which file each part of the test set comes from + outputName = folderOutPat + fileName2 + '_TestPredictions.csv' + saveDataToFile(dataToSave, outputName, 'gzip') + + + diff --git a/script_AZC_ConsolidateResults.py b/script_AZC_ConsolidateResults.py new file mode 100644 index 0000000..a5ea7b8 --- /dev/null +++ b/script_AZC_ConsolidateResults.py @@ -0,0 +1,259 @@ +#!~/anaconda3/bin python + +__authors__ = "Una Pale and Renato Zanetti" +__license__ = "LGPL" +__version__ = "1.0" +__email__ = "{una.pale, renato.zaneti} at epfl.ch" + +''' script that consolidates the classification results obtained for both +datasets, hence the partial results should be available before its execution ''' + +from VariousFunctionsLib import * +from parametersSetup import * + + +################################### +# Initial setup +################################### + +Server = 1 #0 - local execution, 1- server execution +datasetPreparationType='Filt_1to20Hz' # 'Raw', 'Filt_1to20Hz' + +#folder that stores partial results +if Server: + folderResults = '/shares/eslfiler1/scratch/ZeroCrossFeatEpilepsy/' +else: + folderResults = '../AZC_results/' + + +#------------------------------------------------------------------------------------------------------ +#Consolidating results +#------------------------------------------------------------------------------------------------------ +#chb-mit +Dataset='02_CHB' +patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24'] +GeneralParams.patients = patients + +folderOut = folderResults + Dataset + '/05_RF_TSCV/' + datasetPreparationType +'ZeroCross/' +func_calculatePerformance_AppendingTestData_TSCV(folderOut, GeneralParams, SegSymbParams, Dataset) + +folderOut = folderResults + Dataset + '/05_RF_TSCV/' + datasetPreparationType +'StandardFeat/' +func_calculatePerformance_AppendingTestData_TSCV(folderOut, GeneralParams, SegSymbParams, Dataset) + + +#------------------------------------------------------------------------------------------------------ +#swec-ethz +Dataset='01_SWEC' #02_CHBMIT, 01_iEEG_Bern +patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18'] +GeneralParams.patients = patients + +folderOut = folderResults + Dataset + '/05_RF_TSCV/' + datasetPreparationType +'ZeroCross/' +func_calculatePerformance_AppendingTestData_TSCV(folderOut, GeneralParams,
SegSymbParams, Dataset) + +folderOut = folderResults + Dataset + '/05_RF_TSCV/' + datasetPreparationType +'StandardFeat/' +func_calculatePerformance_AppendingTestData_TSCV(folderOut, GeneralParams, SegSymbParams, Dataset) + +#--------------------------------------------------------------------------------------------------------- +#plots for paper +#--------------------------------------------------------------------------------------------------------- +font_size=11 +mksize=7 +fig1 = plt.figure(figsize=(15, 6), constrained_layout=False) +gs = GridSpec(2, 2, figure=fig1, width_ratios=[10, 7.5]) +fig1.subplots_adjust(wspace=0.15, hspace=0.4) + +#------------------------------------------------------------------------------------------------------- +#Subplot CHB - AZC +Dataset='02_CHB' +patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24'] +GeneralParams.patients = patients + +folderOut = folderResults + Dataset + '/05_RF_TSCV/' + datasetPreparationType +'ZeroCross/' +folderOut=folderOut +'PerformanceWithAppendedTests/' +outputName = folderOut + 'AllSubj_AppendedTest_AllPerfMeasures.csv.gz' +AllSubjDiffPerf_test= readDataFromFile(outputName) + +f1 = np.zeros(len(GeneralParams.patients)) +recall = np.zeros(len(GeneralParams.patients)) +precision = np.zeros(len(GeneralParams.patients)) +far = np.zeros(len(GeneralParams.patients)) +for patIndx, pat in enumerate(GeneralParams.patients): + recall[patIndx] = AllSubjDiffPerf_test[patIndx, 27] + precision[patIndx] = AllSubjDiffPerf_test[patIndx, 28] + f1[patIndx] = AllSubjDiffPerf_test[patIndx, 29] + far[patIndx] = AllSubjDiffPerf_test[patIndx, 35] + +f1_azc_chb = f1 +recall_azc_chb = recall +precision_azc_chb = precision + +idx_sorted = np.argsort(-f1_azc_chb) #always ascending, so the minus is used to invert the order +p = np.arange(np.size(GeneralParams.patients)) +# xValues = GeneralParams.patients[idx_sorted] +ax1 = fig1.add_subplot(gs[0,0]) +ax1.plot(f1[idx_sorted]*100, '*', color='r', alpha=0.5, markersize=mksize) +ax1.plot(recall[idx_sorted]*100, '+', color='b', alpha=0.75, markersize=mksize) +ax1.plot(precision[idx_sorted]*100, '.', color='g', alpha=0.75, markersize=mksize) + +ax1.set_ylabel('%', fontsize=font_size) +# ax1.set_xlabel('Subjects', fontsize=font_size+1) +ax1.set_title('AZC - CHB-MIT', fontsize=font_size+2) +# ax1.set_xlabel('a) AZC on CHB-MIT', fontsize=font_size+1) +ax1.spines['top'].set_visible(False) +ax1.spines['right'].set_visible(False) +ax1.grid(alpha=0.5, linestyle='--', linewidth=1) +# plt.plot(xValues, far, 'o', color='r') +perfNames = ['F1= ' + str(np.mean(f1_azc_chb)*100)[0:4] + ' ± ' + str(np.std(f1_azc_chb)*100)[0:4],\ + 'Sens= ' + str(np.mean(recall_azc_chb)*100)[0:4] + ' ± ' + str(np.std(recall_azc_chb)*100)[0:4],\ + 'Prec= ' + str(np.mean(precision_azc_chb)*100)[0:4] + ' ± ' + str(np.std(precision_azc_chb)*100)[0:4]] +ax1.legend(perfNames,fontsize=font_size-1) +plt.xticks(p, p[idx_sorted]+1, fontsize=font_size) +plt.yticks([0, 20, 40, 60, 80, 100], fontsize=font_size) + + +#------------------------------------------------------------------------------------------------------- +#Subplot CHB - CLF +folderOut = folderResults + Dataset + '/05_RF_TSCV/' + datasetPreparationType +'StandardFeat/' +folderOut=folderOut +'PerformanceWithAppendedTests/' +outputName = folderOut + 'AllSubj_AppendedTest_AllPerfMeasures.csv.gz' +AllSubjDiffPerf_test= readDataFromFile(outputName) + +f1 = np.zeros(len(GeneralParams.patients)) +recall = 
np.zeros(len(GeneralParams.patients)) +precision = np.zeros(len(GeneralParams.patients)) +far = np.zeros(len(GeneralParams.patients)) +for patIndx, pat in enumerate(GeneralParams.patients): + recall[patIndx] = AllSubjDiffPerf_test[patIndx, 27] + precision[patIndx] = AllSubjDiffPerf_test[patIndx, 28] + f1[patIndx] = AllSubjDiffPerf_test[patIndx, 29] + far[patIndx] = AllSubjDiffPerf_test[patIndx, 35] + +f1_std_chb = f1 +recall_std_chb = recall +precision_std_chb = precision +# idx_sorted = np.argsort(-f1) #always ascending, so the minus is used to invert the order +# p = np.arange(np.size(GeneralParams.patients)) +# xValues = GeneralParams.patients[idx_sorted] +ax1 = fig1.add_subplot(gs[1,0]) +ax1.plot(f1[idx_sorted]*100, '*', color='r', alpha=0.5, markersize=mksize) +ax1.plot(recall[idx_sorted]*100, '+', color='b', alpha=0.75, markersize=mksize) +ax1.plot(precision[idx_sorted]*100, '.', color='g', alpha=0.75, markersize=mksize) + +# ax1.set_title('CHB-MIT') +ax1.set_ylabel('%', fontsize=font_size) +ax1.set_xlabel('Subjects', fontsize=font_size+1) +ax1.set_title('CLF - CHB-MIT', fontsize=font_size+2) +ax1.spines['top'].set_visible(False) +ax1.spines['right'].set_visible(False) +ax1.grid(alpha=0.5, linestyle='--', linewidth=1) +# plt.plot(xValues, far, 'o', color='r') +perfNames = ['F1= ' + str(np.mean(f1_std_chb)*100)[0:4] + ' ± ' + str(np.std(f1_std_chb)*100)[0:4],\ + 'Sens= ' + str(np.mean(recall_std_chb)*100)[0:4] + ' ± ' + str(np.std(recall_std_chb)*100)[0:4],\ + 'Prec= ' + str(np.mean(precision_std_chb)*100)[0:4] + ' ± ' + str(np.std(precision_std_chb)*100)[0:4]] +ax1.legend(perfNames,fontsize=font_size-1) +plt.xticks(p, p[idx_sorted]+1, fontsize=font_size) +plt.yticks([0, 20, 40, 60, 80, 100], fontsize=font_size) + + + +#------------------------------------------------------------------------------------------------------- +#Subplot SWEC - AZC + +Dataset='01_SWEC' +patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18'] +GeneralParams.patients = patients +folderOut = folderResults + Dataset + '/05_RF_TSCV/' + datasetPreparationType +'ZeroCross/' +folderOut=folderOut +'PerformanceWithAppendedTests/' +outputName = folderOut + 'AllSubj_AppendedTest_AllPerfMeasures.csv.gz' +AllSubjDiffPerf_test= readDataFromFile(outputName) + +p = np.arange(np.size(GeneralParams.patients)) + +f1 = np.zeros(len(GeneralParams.patients)) +recall = np.zeros(len(GeneralParams.patients)) +precision = np.zeros(len(GeneralParams.patients)) +far = np.zeros(len(GeneralParams.patients)) +for patIndx, pat in enumerate(GeneralParams.patients): + recall[patIndx] = AllSubjDiffPerf_test[patIndx, 27] + precision[patIndx] = AllSubjDiffPerf_test[patIndx, 28] + f1[patIndx] = AllSubjDiffPerf_test[patIndx, 29] + far[patIndx] = AllSubjDiffPerf_test[patIndx, 35] + +f1_azc_ieeg = f1 +recall_azc_ieeg = recall +precision_azc_ieeg = precision +idx_sorted = np.argsort(-f1_azc_ieeg) #always ascending, so the minus is used to invert the order + +ax1 = fig1.add_subplot(gs[0,1]) +ax1.plot(f1[idx_sorted]*100, '*', color='r', alpha=0.5, markersize=mksize) +ax1.plot(recall[idx_sorted]*100, '+', color='b', alpha=0.75, markersize=mksize) +ax1.plot(precision[idx_sorted]*100, '.', color='g', alpha=0.75, markersize=mksize) +ax1.grid(alpha=0.5, linestyle='--', linewidth=1) +ax1.set_title('AZC - SWEC-ETHZ', fontsize=font_size+2) +# ax1.set_ylabel('%', fontsize=font_size) +# ax1.set_xlabel('Subjects', fontsize=font_size+1) +# ax1.set_title('SWEC-ETHZ') +ax1.spines['top'].set_visible(False) 
+
+
+#-------------------------------------------------------------------------------------------------------
+#Subplot SWEC - AZC
+
+Dataset='01_SWEC'
+patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18']
+GeneralParams.patients = patients
+folderOut = folderResults + Dataset + '/05_RF_TSCV/' + datasetPreparationType +'ZeroCross/'
+folderOut=folderOut +'PerformanceWithAppendedTests/'
+outputName = folderOut + 'AllSubj_AppendedTest_AllPerfMeasures.csv.gz'
+AllSubjDiffPerf_test= readDataFromFile(outputName)
+
+p = np.arange(np.size(GeneralParams.patients))
+
+f1 = np.zeros(len(GeneralParams.patients))
+recall = np.zeros(len(GeneralParams.patients))
+precision = np.zeros(len(GeneralParams.patients))
+far = np.zeros(len(GeneralParams.patients))
+for patIndx, pat in enumerate(GeneralParams.patients):
+    recall[patIndx] = AllSubjDiffPerf_test[patIndx, 27]
+    precision[patIndx] = AllSubjDiffPerf_test[patIndx, 28]
+    f1[patIndx] = AllSubjDiffPerf_test[patIndx, 29]
+    far[patIndx] = AllSubjDiffPerf_test[patIndx, 35]
+
+f1_azc_ieeg = f1
+recall_azc_ieeg = recall
+precision_azc_ieeg = precision
+idx_sorted = np.argsort(-f1_azc_ieeg) #np.argsort sorts ascending, so the minus inverts the order
+
+ax1 = fig1.add_subplot(gs[0,1])
+ax1.plot(f1[idx_sorted]*100, '*', color='r', alpha=0.5, markersize=mksize)
+ax1.plot(recall[idx_sorted]*100, '+', color='b', alpha=0.75, markersize=mksize)
+ax1.plot(precision[idx_sorted]*100, '.', color='g', alpha=0.75, markersize=mksize)
+ax1.grid(alpha=0.5, linestyle='--', linewidth=1)
+ax1.set_title('AZC - SWEC-ETHZ', fontsize=font_size+2)
+# ax1.set_ylabel('%', fontsize=font_size)
+# ax1.set_xlabel('Subjects', fontsize=font_size+1)
+# ax1.set_title('SWEC-ETHZ')
+ax1.spines['top'].set_visible(False)
+ax1.spines['right'].set_visible(False)
+
+perfNames = ['F1= ' + str(np.mean(f1_azc_ieeg)*100)[0:4] + ' ± ' + str(np.std(f1_azc_ieeg)*100)[0:4],\
+             'Sens= ' + str(np.mean(recall_azc_ieeg)*100)[0:4] + ' ± ' + str(np.std(recall_azc_ieeg)*100)[0:4],\
+             'Prec= ' + str(np.mean(precision_azc_ieeg)*100)[0:4] + ' ± ' + str(np.std(precision_azc_ieeg)*100)[0:4]]
+ax1.legend(perfNames,fontsize=font_size-1)
+plt.xticks(p, p[idx_sorted]+1, fontsize=font_size)
+plt.yticks([0, 20, 40, 60, 80, 100], fontsize=font_size)
+
+
+#-------------------------------------------------------------------------------------------------------
+#Subplot SWEC - CLF
+folderOut = folderResults + Dataset + '/05_RF_TSCV/' + datasetPreparationType +'StandardFeat/'
+folderOut=folderOut +'PerformanceWithAppendedTests/'
+outputName = folderOut + 'AllSubj_AppendedTest_AllPerfMeasures.csv.gz'
+AllSubjDiffPerf_test= readDataFromFile(outputName)
+
+p = np.arange(np.size(GeneralParams.patients))
+
+f1 = np.zeros(len(GeneralParams.patients))
+recall = np.zeros(len(GeneralParams.patients))
+precision = np.zeros(len(GeneralParams.patients))
+far = np.zeros(len(GeneralParams.patients))
+for patIndx, pat in enumerate(GeneralParams.patients):
+    recall[patIndx] = AllSubjDiffPerf_test[patIndx, 27]
+    precision[patIndx] = AllSubjDiffPerf_test[patIndx, 28]
+    f1[patIndx] = AllSubjDiffPerf_test[patIndx, 29]
+    far[patIndx] = AllSubjDiffPerf_test[patIndx, 35]
+
+#NOTE: idx_sorted from the AZC panel above is reused on purpose, so subjects keep the same ordering in both SWEC-ETHZ panels
+# idx_sorted = np.argsort(-f1) #np.argsort sorts ascending, so the minus inverts the order
+f1_std_ieeg = f1
+recall_std_ieeg = recall
+precision_std_ieeg = precision
+
+ax1 = fig1.add_subplot(gs[1,1])
+ax1.plot(f1[idx_sorted]*100, '*', color='r', alpha=0.5, markersize=mksize)
+ax1.plot(recall[idx_sorted]*100, '+', color='b', alpha=0.75, markersize=mksize)
+ax1.plot(precision[idx_sorted]*100, '.', color='g', alpha=0.75, markersize=mksize)
+ax1.grid(alpha=0.5, linestyle='--', linewidth=1)
+# ax1.set_ylabel('%', fontsize=font_size)
+ax1.set_xlabel('Subjects', fontsize=font_size+1)
+ax1.set_title('CLF - SWEC-ETHZ', fontsize=font_size+2)
+# ax1.set_title('SWEC-ETHZ')
+ax1.spines['top'].set_visible(False)
+ax1.spines['right'].set_visible(False)
+
+perfNames = ['F1= ' + str(np.mean(f1_std_ieeg)*100)[0:4] + ' ± ' + str(np.std(f1_std_ieeg)*100)[0:4],\
+             'Sens= ' + str(np.mean(recall_std_ieeg)*100)[0:4] + ' ± ' + str(np.std(recall_std_ieeg)*100)[0:4],\
+             'Prec= ' + str(np.mean(precision_std_ieeg)*100)[0:4] + ' ± ' + str(np.std(precision_std_ieeg)*100)[0:4]]
+ax1.legend(perfNames,fontsize=font_size-1)
+plt.xticks(p, p[idx_sorted]+1, fontsize=font_size)
+plt.yticks([0, 20, 40, 60, 80, 100], fontsize=font_size)
+
+
+print('AZC- Sen > 0.7: ' + str((np.sum(recall_azc_chb>0.7) + np.sum(recall_azc_ieeg>0.7))/42)) #42 = 24 CHB-MIT + 18 SWEC-ETHZ subjects
+print('STD- Sen > 0.7: ' + str((np.sum(recall_std_chb>0.7) + np.sum(recall_std_ieeg>0.7))/42))
+
+print('AZC- F1 > 0.7: ' + str((np.sum(f1_azc_chb>0.7) + np.sum(f1_azc_ieeg>0.7))/42))
+print('STD - F1 > 0.7: ' + str((np.sum(f1_std_chb>0.7) + np.sum(f1_std_ieeg>0.7))/42))
+
+# plt.show()
+plt.savefig(folderResults+'AllSubj_Consolidate_PaperPerfMeasures.pdf', bbox_inches='tight', dpi=200, format='pdf')
+plt.close()
diff --git a/script_AZC_FeatDivergence.py b/script_AZC_FeatDivergence.py
new file mode 100644
index 0000000..d201a24
--- /dev/null
+++ b/script_AZC_FeatDivergence.py
@@ -0,0 +1,288 @@
+#!~/anaconda3/bin python
+
+__authors__ = "Una Pale and Renato Zanetti"
+__license__ = "LGPL"
+__version__ = "1.0"
+__email__ = "{una.pale, renato.zaneti} at epfl.ch"
+
+'''script that:
+- calculates KL
divergence values (KL_NSS - non-seizure to seizure, JS - Jensen-Shannon)
+- calculates them from raw feature values and saves them for each subject
+- calculates the average over all subjects and plots it in different ways
+'''
+
+from parametersSetup import *
+from VariousFunctionsLib import *
+
+from matplotlib.patches import Polygon
+
+###################################
+# Initial setup
+###################################
+
+Dataset='01_SWEC' #'01_SWEC', '02_CHB' #select the dataset to be processed
+Server = 1 #0 - local execution, 1 - server execution
+datasetPreparationType='Filt_1to20Hz' # 'Raw', 'Filt_1to20Hz'
+
+# DEFINING INPUT/OUTPUT FOLDERS
+if Dataset=='01_SWEC':
+    if Server:
+        folderInData = '/shares/eslfiler1/databases/medical/swec-ethz/ieeg/long-term-dataset/'
+        os.nice(20) #lower the process priority for this long-running job
+    else:
+        folderInData = '../iEEG/'
+    patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18']
+
+
+if Dataset=='02_CHB':
+    if Server:
+        folderInData = '/shares/eslfiler1/databases/medical/chb-mit/edf/' #expects all subjects' files in the same directory
+        os.nice(20) #lower the process priority for this long-running job
+    else:
+        folderInData = '../chbmit/'
+    patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24']
+
+#folder that stores partial results
+if Server:
+    folderResults = '/shares/eslfiler1/scratch/ZeroCrossFeatEpilepsy/'
+else:
+    folderResults = '../AZC_results/'
+
+folderOutFeatures= folderResults+Dataset+'/02_Features_' +datasetPreparationType + '/'
+folderDiverg= folderResults+Dataset+'/04_FeatureDivergence/'
+createFolderIfNotExists(folderDiverg)
+
+# ###########################################################################################
+#SETUPS
+GeneralParams.patients= patients
+GeneralParams.plottingON=0
+GeneralParams.PersGenApproach='personalized' #'personalized', 'generalized'
+StandardMLParams.modelType = 'RandomForest' # 'KNN', 'SVM', 'DecisionTree', 'RandomForest','BaggingClassifier','AdaBoost'
+
+FeaturesParams.numStandardFeat=62 #56 (Sopic2018) + 6 AZC
+
+numBins=100 #(30,100) bins used for the histogram calculation
+FeatNames=np.array(['meanAmpl', 'LineLength', 'samp_1_d7_1', 'samp_1_d6_1', 'samp_2_d7_1', 'samp_2_d6_1', 'perm_d7_3', 'perm_d7_5', 'perm_d7_7', 'perm_d6_3', 'perm_d6_5', 'perm_d6_7', 'perm_d5_3', 'perm_d5_5', 'perm_d5_7', 'perm_d4_3', 'perm_d4_5', 'perm_d4_7', 'perm_d3_3', 'perm_d3_5', 'perm_d3_7',
+                    'shannon_en_sig', 'renyi_en_sig', 'tsallis_en_sig', 'shannon_en_d7', 'renyi_en_d7', 'tsallis_en_d7', 'shannon_en_d6', 'renyi_en_d6', 'tsallis_en_d6', 'shannon_en_d5', 'renyi_en_d5', 'tsallis_en_d5', 'shannon_en_d4', 'renyi_en_d4', 'tsallis_en_d4', 'shannon_en_d3', 'renyi_en_d3', 'tsallis_en_d3',
+                    'p_dc_rel', 'p_mov_rel', 'p_delta_rel', 'p_theta_rel', 'p_alfa_rel', 'p_middle_rel', 'p_beta_rel', 'p_gamma_rel', 'p_dc', 'p_mov', 'p_delta', 'p_theta', 'p_alfa', 'p_middle', 'p_beta', 'p_gamma', 'p_tot',
+                    'zeroCrossNoApprox', 'ZC_approx_16', 'ZC_approx_32', 'ZC_approx_64', 'ZC_approx_128', 'ZC_approx_256' ])
+
+
+############################################################################################
+# CALCULATING KL DIVERGENCE
+numFeat=FeaturesParams.numStandardFeat* len(SigInfoParams.chToKeep)
+JSdiverg=np.zeros((len(GeneralParams.patients), numFeat))
+KLdiverg_NSS = np.zeros((len(GeneralParams.patients), numFeat))
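+#NOTE (illustrative): kl_divergence and js_divergence come from VariousFunctionsLib
+#and are not part of this diff; the sketches below only document their assumed
+#behaviour on the normalised histograms computed later, and are never called here.
+def _kl_divergence_sketch(p, q, eps=1e-12):
+    '''assumed behaviour of kl_divergence: D_KL(p||q) for two discrete
+    distributions of equal length; eps avoids log(0) and division by zero'''
+    p = np.asarray(p, dtype=float) + eps
+    q = np.asarray(q, dtype=float) + eps
+    return np.sum(p * np.log(p / q))
+
+def _js_divergence_sketch(p, q):
+    '''assumed behaviour of js_divergence: symmetric Jensen-Shannon divergence,
+    built from two KL terms against the mixture m'''
+    m = 0.5 * (np.asarray(p, dtype=float) + np.asarray(q, dtype=float))
+    return 0.5 * _kl_divergence_sketch(p, m) + 0.5 * _kl_divergence_sketch(q, m)
+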
+for patIndx, pat in enumerate(GeneralParams.patients):
+    if Dataset=='01_SWEC':
+        filesIn=np.sort(glob.glob(folderOutFeatures + 'ID' + pat + '/ID' + pat + '*_Labels.csv.gz'))
+    if Dataset=='02_CHB':
+        filesIn = np.sort(glob.glob(folderOutFeatures +'chb' + pat+ '/chb' + pat + '*_Labels.csv.gz'))
+
+    numFiles=len(filesIn)
+    print('-- Patient:', pat, 'NumFiles:', numFiles)
+
+    #load all files of that subject and concatenate feature values
+    for fIndx, fileName in enumerate(filesIn):
+        print('----- File:', fIndx)
+        #load labels
+        labels0 = readDataFromFile(fileName)
+        #load feature values
+        fileName2 = fileName[:-14] + '_OtherFeat.csv.gz'
+        OtherFeat0 = readDataFromFile(fileName2)
+        fileName2 = fileName[:-14] + '_ZCFeat.csv.gz'
+        ZCFeat0 = readDataFromFile(fileName2)
+
+        nch = np.int32((ZCFeat0.shape[1] + OtherFeat0.shape[1])/FeaturesParams.numStandardFeat)
+        AllFeat0=mergeFeatFromTwoMatrixes(OtherFeat0, ZCFeat0, nch)
+        if (fIndx==0):
+            LabelsAll=labels0
+            FeatAll=AllFeat0
+        else:
+            LabelsAll=np.vstack((LabelsAll, labels0))
+            FeatAll = np.vstack((FeatAll, AllFeat0))
+
+    #calculate histograms for seizure and non-seizure data
+    for f in range(numFeat):
+        (SeizHist, nonSeizHist) = calcHistogramValues_v2(FeatAll[:,f], LabelsAll,numBins)
+        KLdiverg_NSS[patIndx, f] = kl_divergence(nonSeizHist[0],SeizHist[0])
+        JSdiverg[patIndx,f] = js_divergence(nonSeizHist[0], SeizHist[0])
+
+    # save divergence values (re-written after every subject, so partial results are kept)
+    outputName = folderDiverg + 'AllSubj_KLdivergence_NSS.csv'
+    np.savetxt(outputName, KLdiverg_NSS, delimiter=",")
+    outputName = folderDiverg + 'AllSubj_JSdivergence.csv'
+    np.savetxt(outputName, JSdiverg, delimiter=",")
+
+############################################################################################
+############################################################################################
+# PLOTTING
+reader = csv.reader(open(folderDiverg + 'AllSubj_KLdivergence_NSS.csv', "r"))
+KLdiverg_NSS = np.array(list(reader)).astype("float")
+reader = csv.reader(open(folderDiverg + 'AllSubj_JSdivergence.csv', "r"))
+JSdiverg = np.array(list(reader)).astype("float")
+
+#analysing per channel
+KLdiverg_NSS_reshaped=np.reshape(KLdiverg_NSS, (len(GeneralParams.patients),-1,FeaturesParams.numStandardFeat ))
+KLdiverg_NSS_meanForCh=np.nanmean(KLdiverg_NSS_reshaped,1)
+KLdiverg_NSS_stdForCh=np.nanstd(KLdiverg_NSS_reshaped,1)
+JSdiverg_reshaped=np.reshape(JSdiverg, (len(GeneralParams.patients),-1,FeaturesParams.numStandardFeat ))
+JSdiverg_meanForCh=np.nanmean(JSdiverg_reshaped,1)
+JSdiverg_stdForCh=np.nanstd(JSdiverg_reshaped,1)
+
+####################################
+#PLOTTING KL DIVERGENCE PER SUBJECT
+fig1 = plt.figure(figsize=(16, 16), constrained_layout=False)
+gs = GridSpec(6, 4, figure=fig1)
+fig1.subplots_adjust(wspace=0.4, hspace=0.6)
+xValues = np.arange(0, FeaturesParams.numStandardFeat, 1)
+for p, pat in enumerate(GeneralParams.patients):
+    ax1 = fig1.add_subplot(gs[int(np.floor(p / 4)), np.mod(p, 4)])
+    ax1.errorbar(xValues, KLdiverg_NSS_meanForCh[p, :], yerr=KLdiverg_NSS_stdForCh[p, :], fmt='b', label='KL_NSS')
+    ax1.errorbar(xValues, JSdiverg_meanForCh[p, :], yerr=JSdiverg_stdForCh[p, :], fmt='m', label='JS')
+    ax1.legend()
+    ax1.set_xlabel('Feature')
+    ax1.set_ylabel('Divergence')
+    ax1.set_title('Subj ' + pat)
+    ax1.grid()
+fig1.show()
+fig1.savefig(folderDiverg + 'AllSubj_DifferentDivergenceMeasures_perSubj.png', bbox_inches='tight')
+plt.close(fig1)
+
+
+####################################
+# CALCULATING AVERAGE KL VALUES FOR ALL SUBJECTS
+KLdiverg_NSS_meanAllSubj=np.nanmean(KLdiverg_NSS_meanForCh,0)
+JSdiverg_meanAllSubj=np.nanmean(JSdiverg_meanForCh,0)
+KLdiverg_NSS_stdAllSubj=np.nanstd(KLdiverg_NSS_meanForCh,0)
+JSdiverg_stdAllSubj=np.nanstd(JSdiverg_meanForCh,0)
+# save averages over all subjects (mean in the first row, std in the second)
+outputName = folderDiverg + 'AllSubjAvrg_KLdivergence_NSS.csv'
+np.savetxt(outputName, np.vstack((KLdiverg_NSS_meanAllSubj, KLdiverg_NSS_stdAllSubj)), delimiter=",")
+outputName = folderDiverg + 'AllSubjAvrg_JSdivergence.csv'
+np.savetxt(outputName, np.vstack((JSdiverg_meanAllSubj, JSdiverg_stdAllSubj)), delimiter=",")
+
+#plotting
+fig1 = plt.figure(figsize=(16, 16), constrained_layout=False)
+gs = GridSpec(2, 1, figure=fig1)
+fig1.subplots_adjust(wspace=0.4, hspace=0.2)
+#fig1.suptitle('Feature ')
+xValues = np.arange(0, FeaturesParams.numStandardFeat, 1)
+ax1 = fig1.add_subplot(gs[0,0])
+ax1.errorbar(xValues, KLdiverg_NSS_meanAllSubj, yerr=KLdiverg_NSS_stdAllSubj, fmt='b', label='KL_NSS')
+#ax1.errorbar(xValues, JSdiverg_meanAllSubj, yerr=JSdiverg_stdAllSubj, fmt='k', label='JS')
+ax1.legend()
+ax1.set_xlabel('Feature')
+ax1.set_ylabel('Divergence')
+ax1.set_title('Kullback Leibler divergence')
+ax1.grid()
+ax1 = fig1.add_subplot(gs[1,0])
+ax1.errorbar(xValues, JSdiverg_meanAllSubj, yerr=JSdiverg_stdAllSubj, fmt='m', label='JS')
+ax1.legend()
+ax1.set_xlabel('Feature')
+ax1.set_ylabel('Divergence')
+ax1.set_title('Jensen Shannon divergence')
+ax1.grid()
+fig1.show()
+fig1.savefig(folderDiverg + 'AllSubj_DifferentDivergenceMeasures_avrgAllSubj.png', bbox_inches='tight')
+plt.close(fig1)
+
+#PLOTTING ONLY NSS WITH BOXPLOTS
+fig1 = plt.figure(figsize=(16, 16), constrained_layout=False)
+gs = GridSpec(2, 1, figure=fig1)
+fig1.subplots_adjust(wspace=0.4, hspace=0.2)
+#fig1.suptitle('Feature ')
+xValues = np.arange(0, FeaturesParams.numStandardFeat, 1)
+ax1 = fig1.add_subplot(gs[0,0])
+ax1.boxplot(KLdiverg_NSS_meanForCh, medianprops=dict(color='red', linewidth=2),
+            boxprops=dict(linewidth=2), capprops=dict(linewidth=2), whiskerprops=dict(linewidth=2), showfliers=False)
+ax1.set_xlabel('Feature')
+ax1.set_xticks(np.arange(1, FeaturesParams.numStandardFeat+1, 1))
+ax1.set_xticklabels(FeatNames, fontsize=10, rotation=45, ha='right', rotation_mode='anchor')
+ax1.set_ylabel('Divergence')
+ax1.set_title('Kullback Leibler divergence')
+ax1.grid()
+fig1.show()
+fig1.savefig(folderDiverg + 'AllSubj_KLdiverg_NSS_avrgAllSubj_BoxplotAllCh.png', bbox_inches='tight')
+fig1.savefig(folderDiverg + 'AllSubj_KLdiverg_NSS_avrgAllSubj_BoxplotAllCh.svg', bbox_inches='tight')
+plt.close(fig1)
+
+
+
+############################################################################################
+## RANK THE BEST FEATURES BASED ON KL AND JS DIVERGENCE, AND SAVE THE SORTED LISTS
+featNames=FeatNames #same list (and ordering) as FeatNames defined above
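+#NOTE (illustrative): np.argsort sorts in ascending order, so negating the
+#array below ranks features from most to least divergent, e.g.
+#   np.argsort(-np.array([0.1, 0.5, 0.3]))  ->  array([1, 2, 0])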
+indexSorted=np.argsort(-JSdiverg_meanAllSubj)
+ranking=np.arange(1,len(indexSorted)+1,1)
+dataAppend=np.vstack((ranking, indexSorted, featNames[indexSorted],JSdiverg_meanAllSubj[indexSorted])).transpose()
+FeaturesSorted_JS = pd.DataFrame(dataAppend, columns=['Ranking', 'FeatIndex', 'FeatName', 'JS_divergence'])
+fileOutputName=folderDiverg + '/AllSubj_FeaturesSorted_JS.csv'
+FeaturesSorted_JS.to_csv(fileOutputName)
+
+indexSorted=np.argsort(-KLdiverg_NSS_meanAllSubj)
+ranking=np.arange(1,len(indexSorted)+1,1)
+dataAppend=np.vstack((ranking, indexSorted, featNames[indexSorted],KLdiverg_NSS_meanAllSubj[indexSorted])).transpose()
+FeaturesSorted_KL_NSS = pd.DataFrame(dataAppend, columns=['Ranking', 'FeatIndex', 'FeatName', 'KL_NSS_divergence'])
+fileOutputName=folderDiverg + '/AllSubj_FeaturesSorted_KL_NSS.csv'
+FeaturesSorted_KL_NSS.to_csv(fileOutputName)
+
+
+#--------------------------------------------------------------------------------------------------------------------
+# PLOT FOR PAPER
+numFeat =FeaturesParams.numStandardFeat
+font_size=14
+fig1, ax1 = plt.subplots(figsize=(14, 7))
+
+fig1.canvas.set_window_title('KL divergence')
+fig1.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.25)
+
+bp = ax1.boxplot(KLdiverg_NSS_meanForCh, medianprops=dict(color='k', linewidth=1), boxprops=dict(linewidth=0.5, color='k'),
+                 capprops=dict(linewidth=1), whiskerprops=dict(linewidth=1), showfliers=False, showcaps=False)
+
+box_colors = ['lightcoral','#FFA07A', 'papayawhip', '#BDFCC9', 'mediumseagreen', 'olive', '#33A1C9'] #mint=BDFCC9, peacock = 33A1C9
+num_colors = len(box_colors)
+num_boxes = [2,4,15,18,8,9,6] #number of features per feature type
+
+medians = np.empty(numFeat)
+n_used_box=0
+for l in range(num_colors):
+    for i in range(num_boxes[l]):
+        box = bp['boxes'][i+n_used_box]
+        boxX = []
+        boxY = []
+        for j in range(5):
+            boxX.append(box.get_xdata()[j])
+            boxY.append(box.get_ydata()[j])
+        box_coords = np.column_stack([boxX, boxY])
+        # fill each box with the colour of its feature group
+        ax1.add_patch(Polygon(box_coords, facecolor=box_colors[l]))
+
+    n_used_box = n_used_box + num_boxes[l]
+
+colorLegends = ['Time domain', 'Sample Entropy', 'Permutation Entropy', 'Shan_Ren_Tsa Entropy', 'Relative Power', 'Bandpower', 'AZC']
+ax1.legend(colorLegends,fontsize=font_size-2)
+
+ax1.set_xlabel('Features', fontsize=font_size+1)
+# ax1.set_xticks(np.arange(1, FeaturesParams.numStandardFeat+1, 1))
+ax1.set_xticks([])
+ax1.tick_params(axis='y', which='major', labelsize=font_size-1)
+# ax1.set_axisbelow(True)
+# ax1.set_xticklabels(FeatNames, fontsize=10, rotation=45, ha='right', rotation_mode='anchor')
+ax1.set_ylabel('KL Scores', fontsize=font_size+1)
+ax1.spines['top'].set_visible(False)
+ax1.spines['right'].set_visible(False)
+ax1.spines['bottom'].set_visible(False)
+ax1.spines['left'].set_visible(False)
+# ax1.set_title('Kullback Leibler divergence')
+ax1.grid(True, linestyle='-', which='major', color='lightgrey',
+         alpha=0.5)
+fig1.show()
+fig1.savefig(folderDiverg + '/'+ Dataset+ '_AllSubj_KLdiverg_NSS_avrgAllSubj_BoxplotAllCh.svg', bbox_inches='tight', dpi=200, format='svg')
+plt.close(fig1)
+
+
+
+
diff --git a/script_AZC_datasetPreProcessing.py b/script_AZC_datasetPreProcessing.py
new file mode 100644
index 0000000..901d321
--- /dev/null
+++ b/script_AZC_datasetPreProcessing.py
@@ -0,0 +1,99 @@
+#!~/anaconda3/bin python
+
+__authors__ = "Una Pale and Renato Zanetti"
+__license__ = "LGPL"
+__version__ = "1.0"
+__email__ = "{una.pale, renato.zaneti} at epfl.ch"
+
+'''script that:
+ - converts edf files to csv files without
changing the data (CHB-MIT only), adding the labels
+ - calculates features for each input file and saves them
+ - reorders feature data on a rolling basis (TSCV approach) and stores an index vector for the future train/test division
+ - plots labels in time (before/after TSCV reordering)
+'''
+
+from parametersSetup import *
+from ZeroCrossLibrary import *
+
+###################################
+# Initial setup
+###################################
+
+Dataset='01_SWEC' #'01_SWEC', '02_CHB' #select the dataset to be processed
+Server = 1 #0 - local execution, 1 - server execution
+parallelize=1 #choose whether to parallelize feature extraction (one file per core)
+perc_cores= 0.5 #percentage of cores targeted to be charged with data processing
+
+datasetPreparationType='Filt_1to20Hz' # 'Raw', 'Filt_1to20Hz'
+
+# DEFINING INPUT/OUTPUT FOLDERS
+if Dataset=='01_SWEC':
+    if Server:
+        folderInData = '/shares/eslfiler1/databases/medical/swec-ethz/ieeg/long-term-dataset/'
+        os.nice(20) #lower the process priority for this long-running job
+    else:
+        folderInData = '../iEEG/'
+    patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18']
+
+if Dataset=='02_CHB':
+    if Server:
+        folderInData = '/shares/eslfiler1/databases/medical/chb-mit/edf/' #expects all subjects' files in the same directory
+        os.nice(20) #lower the process priority for this long-running job
+        samedir=1 #1 - in case all chb files are in the same directory
+    else:
+        folderInData = '../chbmit/'
+        samedir=0 #1 - in case all chb files are in the same directory
+    patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24']
+
+#folder that stores partial results
+if Server:
+    folderResults = '/shares/eslfiler1/scratch/ZeroCrossFeatEpilepsy/'
+else:
+    folderResults = '../AZC_results/'
+
+createFolderIfNotExists(folderResults)
+createFolderIfNotExists( folderResults+Dataset)
+
+#folder for feature files
+folderOutFeatures= folderResults+Dataset+'/02_Features_' +datasetPreparationType + '/'
+createFolderIfNotExists(folderOutFeatures)
+
+###################################
+# Parameter updates with respect to the definitions in 'parametersSetup.py'
+GeneralParams.patients = patients
+
+# ############################################################################################
+#CHB-MIT edf files are converted to csv, adding an extra column for labels
+if Dataset=='02_CHB':
+    folderOutCSV= folderResults+Dataset+'/01_DataCSV_raw/'
+    createFolderIfNotExists(folderOutCSV)
+    # the next function assumes that all CHB-MIT edf files are in the same directory
+    extractEDFdataToCSV_originalData(parallelize, perc_cores, samedir, folderInData, folderOutCSV, SigInfoParams, patients)
+
+#########################################################################
+# CALCULATE FEATURES FOR EACH DATA FILE
+calculateCLFFeat = 1 #enable the calculation of each group of features
+calculateAZCFeat = 1
+if Dataset=='01_SWEC':
+    calculateFeaturesPerEachFile_CLFAndAZCFeatures_SWEC(parallelize, perc_cores, ZeroCrossFeatureParams.EPS_thresh_arr,folderInData, folderOutFeatures, GeneralParams.patients, SigInfoParams, ZeroCrossFeatureParams, calculateCLFFeat, calculateAZCFeat)
+
+if Dataset=='02_CHB':
+    calculateFeaturesPerEachFile_CLFAndAZCFeatures_CHB(parallelize, perc_cores, ZeroCrossFeatureParams.EPS_thresh_arr,folderOutCSV, folderOutFeatures, GeneralParams.patients, SigInfoParams, ZeroCrossFeatureParams, calculateCLFFeat, calculateAZCFeat)
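+
+#NOTE (illustrative, not the library code): 'parallelize' and 'perc_cores' are
+#assumed to drive a per-file worker pool inside the feature-extraction helpers
+#above; a minimal sketch of that idea, with hypothetical names, never called here:
+import multiprocessing as mp
+def _processAllFilesSketch(files, perc_cores, processOneFile):
+    '''hypothetical helper: processes one file per worker, using only the
+    requested fraction of the available cores'''
+    nWorkers = max(1, int(perc_cores * mp.cpu_count()))
+    with mp.Pool(nWorkers) as pool:
+        pool.map(processOneFile, files)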
+
+# Plotting data labels before reorganizing files for the TSCV approach
+plotRawDataLabels(Dataset, folderOutFeatures, GeneralParams,SegSymbParams)
+
+###########################################################################################
+# Reorganizing feature files for TSCV (first file containing min 6h of data and a seizure)
+windowSize=60*60 #target file length, in seconds of data
+folderOutTSCV=folderResults+Dataset+'/03_Features_' +datasetPreparationType+'_TSCVprep_'+str(windowSize) +'s_DivBig/'
+createFolderIfNotExists(folderOutTSCV)
+
+#Since CHB-MIT has big time gaps between some files of a subject, and some files hold more than 1 hour of data, we generate the indexes
+#for TSCV classification so that the first file contains a minimum of 5 hours of data and a seizure, trying to keep data grouped by origin
+#file and only subdividing the data of a file in case it is 1.5x bigger than the target 'windowSize'. Test data always originates from a
+#single file, so no data is merged from multiple files.
+generateTSCVindexesFromAllFiles_storeTSCVLabels(Dataset, GeneralParams, ZeroCrossFeatureParams, SegSymbParams, folderOutFeatures, folderOutTSCV, windowSize)
+
+plotRearangedDataLabelsInTime(Dataset, folderOutTSCV, GeneralParams,SegSymbParams)
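+
+#NOTE (illustrative sketch of the TSCV idea described above, with hypothetical
+#names; the actual index generation lives in generateTSCVindexesFromAllFiles_storeTSCVLabels):
+def _rollingOriginSplitsSketch(numFiles, firstTrainFiles):
+    '''hypothetical helper: in time-series cross-validation every test file is
+    predicted from all files that precede it in time, so training data only grows'''
+    for testIndx in range(firstTrainFiles, numFiles):
+        yield (np.arange(testIndx), testIndx)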