diff --git a/VariousFunctionsLib.py b/VariousFunctionsLib.py index 73103f8..148bc53 100644 --- a/VariousFunctionsLib.py +++ b/VariousFunctionsLib.py @@ -1,867 +1,902 @@ ''' library including various functions for HD project but not necessarily related to HD vectors''' __author__ = "Una Pale" __email__ = "una.pale at epfl.ch" import os import glob import csv import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from matplotlib.gridspec import GridSpec from entropy import * import sys import pyedflib import MITAnnotation as MIT from scipy import signal - +import pandas as pd def createFolderIfNotExists(folderOut): ''' creates folder if doesnt already exist warns if creation failed ''' if not os.path.exists(folderOut): try: os.mkdir(folderOut) except OSError: print("Creation of the directory %s failed" % folderOut) # else: # print("Successfully created the directory %s " % folderOut) def calculateFeatures_MeanAmpl(data, index, segLenIndx): numCh=len(data[0,:]) featureValues=np.zeros((len(index), numCh)) for i in range(len(index)): sig = data[index[i]:index[i] + segLenIndx, :] featureValues[i, :] = np.mean(np.abs(sig), 0) return (featureValues) def calculateFeatures_LineLength(data, index, segLenIndx): numCh=len(data[0,:]) featureValues=np.zeros((len(index), numCh)) for i in range(len(index)): sig = data[index[i]:index[i] + segLenIndx, :] featureValues[i, :] = np.mean(np.abs(np.diff(sig,n=1, axis=0)),0) return (featureValues) def calculateLabelPerSegment(labelsPerSample, type): """ calculate one label from labels of all samples in a segment assumes labels are only 0 and 1 three types of voting are possible """ (numSeg, segLen) = labelsPerSample.shape labelsPerSeg = np.zeros(numSeg) for s in range(numSeg): if type == 'majority': labelsPerSeg[s] = np.round(np.average(labelsPerSample[s, :])+0.001) #labelsPerSeg[s] = math.ceil(np.average(labelsPerSample[s, :])) was wrong!! 
everythign that way above 0 was 1 elif type == 'atLeastOne': labelsPerSeg[s] = int(1 in labelsPerSample[s, :]) elif type == 'allOne': labelsPerSeg[s] = int(sum(labelsPerSample[s, :]) == segLen) return labelsPerSeg def readEdfFile (fileName): ''' reads .edf file and returnes data[numSamples, numCh], sampling frequency, names of channels''' f = pyedflib.EdfReader(fileName) n = f.signals_in_file channelNames = f.getSignalLabels() f.getSampleFrequency(0) samplFreq= data = np.zeros(( f.getNSamples()[0], n)) for i in np.arange(n): data[:, i] = f.readSignal(i) return (data, samplFreq, channelNames) -def writeToCsvFile( data, labels, fileName): +def writeToCsvFile_old( data, labels, fileName): outputName= fileName+'.csv' myFile = open(outputName, 'w',newline='') dataToWrite=np.column_stack((data, labels)) with myFile: writer = csv.writer(myFile) writer.writerows(dataToWrite) +def readDataAndLabelsToFile( inputName, type): + #inputName= fileName+'.csv' + if (type=='gzip'): + df= pd.read_csv(inputName + '.gz', compression='gzip') + else: + df= pd.read_csv(inputName) + data0=df.to_numpy() + data=data0[:,0:-1] + labels=data0[:,-1] + return (data, labels) + +def saveDataAndLabelsToFile( data, labels, fileName, type): + outputName= fileName+'.csv' + df = pd.DataFrame(data=np.hstack((data, labels.reshape((-1, 1))))) + if (type=='gzip'): + df.to_csv(outputName + '.gz', index=False, compression='gzip') + else: + df.to_csv(outputName, index=False) def extractEDFdataToCSV_originalData(folderIn, folderOut, Params, patients): ''' converts data from edf format to csv 20210705 UnaPale''' createFolderIfNotExists(folderOut) #goes through each patient one by one for pat in patients: print('-- Patient:', pat) PATIENT = pat if len(sys.argv) < 2 else '{0:02d}'.format(int(sys.argv[1])) #number of Seiz and nonSeiz files SeizFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.seizures')) AllFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.edf')) #CREATING LIST OF SEIZURE AND NON SEIZURE FILES # 
create lists with just names (without path), to be able to compare them (SeizFileNames, NonSeizFileNames, NonSeizFileFullNames)=createListOfSeizAndNonSeizFiles(SeizFiles, AllFiles) #EXPORT SEIZURE FILES for fileIndx,fileName in enumerate(SeizFiles): # file name extraction fileName0 = os.path.splitext(fileName)[0] # removing .seizures from the string pom, fileName1 = os.path.split(fileName0) fileName2 = os.path.splitext(fileName1)[0] # read data # here replaced reading .hea files with .edf reading to avoid converting !!! (rec, samplFreq, channels) = readEdfFile(fileName0) # take only the channels we need and in correct order allGood=0 try: # keep only ch of interest chToKeepAndInCorrectOrder=[channels.index(Params.channelNamesToKeep[i]) for i in range(len(Params.channelNamesToKeep))] allGood=1 except: print('Sth wrong with the channels in a file: ', fileName) if allGood==1: newData = rec[1:, chToKeepAndInCorrectOrder] (lenSig, numCh) = newData.shape newLabel = np.zeros(lenSig) # read times of seizures szStart = [a for a in MIT.read_annotations(fileName) if a.code == 32] # start marked with '[' (32) szStop = [a for a in MIT.read_annotations(fileName) if a.code == 33] # start marked with ']' (33) # create label that will be added as the last column numSeizures = len(szStart) for i in range(numSeizures): seizureLen = szStop[i].time - szStart[i].time newLabel[int(szStart[i].time):int(szStop[i].time)] = np.ones(seizureLen) - # saving to csv file - saving all seizures to one file,with name of how many seizures is there - fileNameOut = folderOut + '/' + fileName2 + '_' + str(i+1) + 's' # 's' marks it is file with seizure - writeToCsvFile(newData, newLabel, fileNameOut) + + #saving all seizures to one file,with name of how many seizures is there + fileNameOut = folderOut + '/' + fileName2 + '_' + str(i + 1) + 's' # 's' marks it is file with seizure + saveDataAndLabelsToFile(newData, newLabel, fileNameOut, Params.SaveType) + # else: #gzip + # 
df=pd.DataFrame(data=np.hstack((newData, newLabel.reshape((-1,1)))) ) + # df.to_csv(fileNameOut+'.csv.gz', index=False, compression='gzip') #EXPORT NON SEIZURE FILES for fileIndx,fileName in enumerate(NonSeizFileFullNames): # file name extraction pom, fileName1 = os.path.split(fileName) fileName2 = os.path.splitext(fileName1)[0] # read data # # here replaced reading .hea files with .edf reading to avoid converting !!! (rec, samplFreq, channels) = readEdfFile(fileName) # take only the channels we need and in correct order allGood=0 try: # keep only ch of interest chToKeepAndInCorrectOrder=[channels.index(Params.channelNamesToKeep[i]) for i in range(len(Params.channelNamesToKeep))] allGood=1 except: print('Sth wrong with the channels in a file: ', fileName) if allGood==1: newData = rec[1:, chToKeepAndInCorrectOrder] # create label that will be added as the last column (lenSig, numCh) = newData.shape newLabel = np.zeros(lenSig) - # saving to csv file + + #saving fileNameOut = folderOut + '/' + fileName2 - writeToCsvFile(newData, newLabel, fileNameOut) + saveDataAndLabelsToFile(newData, newLabel, fileNameOut, Params.SaveType) + # if (Params.SaveType=='csv'): + # writeToCsvFile(newData, newLabel, fileNameOut) + # else: #gzip + # df=pd.DataFrame(data=np.hstack((newData, newLabel.reshape((-1,1)))) ) + # df.to_csv(fileNameOut, index=False, compression='gzip') + + def readDataAndLabelFromEDFFile_SeizFile(fileName, Params): fileNameHead=fileName+'.seizures' fileName0 = os.path.splitext(fileName)[0] # removing .seizures from the string # here replaced reading .hea files with .edf reading to avoid converting !!! 
(rec, samplFreq, channels) = readEdfFile(fileName) # take only the channels we need and in correct order allGood = 0 try: # keep only ch of interest chToKeepAndInCorrectOrder = [channels.index(Params.channelNamesToKeep[i]) for i in range(len(Params.channelNamesToKeep))] allGood = 1 except: print('Sth wrong with the channels in a file: ', fileName) allGood=0 newData=[] newLabel=[] if allGood == 1: newData = rec[1:, chToKeepAndInCorrectOrder] (lenSig, numCh) = newData.shape newLabel = np.zeros(lenSig) # read times of seizures szStart = [a for a in MIT.read_annotations(fileNameHead) if a.code == 32] # start marked with '[' (32) szStop = [a for a in MIT.read_annotations(fileNameHead) if a.code == 33] # start marked with ']' (33) # create label that will be added as the last column numSeizures = len(szStart) for i in range(numSeizures): seizureLen = szStop[i].time - szStart[i].time newLabel[int(szStart[i].time):int(szStop[i].time)] = np.ones(seizureLen) return( allGood, newData, newLabel) def readDataAndLabelFromEDFFile_NonSeizFile(fileName, Params): (rec, samplFreq, channels) = readEdfFile(fileName) # take only the channels we need and in correct order allGood = 0 try: # keep only ch of interest chToKeepAndInCorrectOrder = [channels.index(Params.channelNamesToKeep[i]) for i in range(len(Params.channelNamesToKeep))] allGood = 1 except: print('Sth wrong with the channels in a file: ', fileName) newData=[] newLabel=[] if allGood == 1: newData = rec[1:, chToKeepAndInCorrectOrder] # create label that will be added as the last column (lenSig, numCh) = newData.shape newLabel = np.zeros(lenSig) return (allGood, newData, newLabel) def createListOfSeizAndNonSeizFiles(SeizFiles, AllFiles): SeizFileNames = list() for fIndx, f in enumerate(SeizFiles): justName = os.path.split(f)[1][:-13] if (fIndx == 0): SeizFileNames = [justName] else: SeizFileNames.append(justName) NonSeizFileNames = list() NonSeizFileFullNames = list() for fIndx, f in enumerate(AllFiles): justName = 
os.path.split(f)[1][:-4] if (justName not in SeizFileNames): if (fIndx == 0): NonSeizFileNames = [justName] NonSeizFileFullNames = [f] else: NonSeizFileNames.append(justName) NonSeizFileFullNames.append(f) return (SeizFileNames, NonSeizFileNames, NonSeizFileFullNames) def extractEDFdataToCSV_KeepAllData_StoSFiles(folderIn, folderOut, Params, patients): ''' goes through original edf files and rearanges them in a way that each file contains data from end of previous seizure to the end of the current seizure this way every file contains exactly one seizure that is at the end of the file only the last file doesnt contain any seizure but all data after the last seizure this way files are of different length, and there should be numSeiz + 1 file for each subject files are renamed in form Subj__cv____.csv''' createFolderIfNotExists(folderOut) lenBeforeSeizIndx=int(Params.PreIctalTimeToRemove*Params.samplFreq) lenAfterSeizIndx=int(Params.PostIctalTimeToRemove*Params.samplFreq) #goes through each patient one by one for pat in patients: print('-- Patient:', pat) PATIENT = pat if len(sys.argv) < 2 else '{0:02d}'.format(int(sys.argv[1])) #number of Seiz and nonSeiz files SeizFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.seizures')) AllFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.edf')) #CREATING LIST OF SEIZURE AND NON SEIZURE FILES # create lists with just names (without path), to be able to compare them (SeizFileNames, NonSeizFileNames, NonSeizFileFullNames)=createListOfSeizAndNonSeizFiles(SeizFiles, AllFiles) #GO THROUGH ONE BY ONE FILE newStart=1 numSeiz=0 posStartNextTime=0 for fileIndx, fileName in enumerate(AllFiles): # file name extraction fileName0 = os.path.splitext(fileName)[0] # removing .seizures from the string pom, fileName1 = os.path.split(fileName0) fileName2 = os.path.splitext(fileName1)[0] #read data if (fileName2 not in SeizFileNames):# if no seizures SeizInFile=0 (allGood, data, labels) = readDataAndLabelFromEDFFile_NonSeizFile(fileName, Params) 
else: SeizInFile = 1 (allGood, data, labels) = readDataAndLabelFromEDFFile_SeizFile(fileName, Params) if allGood==0: print(' Sth wrong with channels in file:', fileName2) break if (SeizInFile==0): # if no seizures if (newStart == 1): allData = data[posStartNextTime:, :] allLabels = labels[posStartNextTime:] newStart = 0 else: allData = np.vstack((allData, data[posStartNextTime:, :])) allLabels = np.hstack((allLabels, labels[posStartNextTime:])) posStartNextTime = 0 elif (SeizInFile==1): # if seizures differ = np.diff(np.squeeze(labels)) seizStarts = np.where(differ == 1)[0] seizStops = np.where(differ == -1)[0] print('NumSeiz in file=', len(seizStarts)) for sIndx in range(len(seizStarts)): seizIndxStart = int(seizStarts[sIndx]) seizIndxStop = int(seizStops[sIndx]) if (sIndx != len(seizStarts) - 1): endIndxForNextFile = int(seizStarts[sIndx + 1]) else: endIndxForNextFile = len(labels) Indx1 = seizIndxStart - lenBeforeSeizIndx Indx2 = seizIndxStop + lenAfterSeizIndx # data to add if (Indx1 < 0 and newStart != 1): allData = allData[0:Indx1, :] # remove previous data allLabels = allLabels[0:Indx1, :] elif (Indx1 > 0 and newStart == 0): if (sIndx == 0): # if first seizure in a file # add non seizure data enough far from seizure start allData = np.vstack((allData, data[posStartNextTime:Indx1, :])) allLabels = np.hstack((allLabels, labels[posStartNextTime:Indx1])) # add sizure data allData = np.vstack((allData, data[seizIndxStart:seizIndxStop, :])) allLabels = np.hstack((allLabels, labels[seizIndxStart:seizIndxStop])) elif (Indx1 > 0 and newStart == 1): # if first file and seizure # add non seizure data enough far from seizure start allData = data[posStartNextTime:Indx1, :] allLabels = labels[posStartNextTime:Indx1] # add sizure data allData = np.vstack((allData, data[seizIndxStart:seizIndxStop, :])) allLabels = np.hstack((allLabels, labels[seizIndxStart:seizIndxStop])) # save file fileNameOut = folderOut + '/Subj'+pat + '_cv' + str(numSeiz).zfill(3) + '.csv' - dataToSave 
= np.hstack((allData, np.reshape(allLabels, (-1,1)))) - np.savetxt(fileNameOut, dataToSave, delimiter=",") + # dataToSave = np.hstack((allData, np.reshape(allLabels, (-1,1)))) + # np.savetxt(fileNameOut, dataToSave, delimiter=",") + saveDataAndLabelsToFile(allData, allLabels, fileNameOut, Params.SaveType) print('Saved file:', fileNameOut) numSeiz = numSeiz + 1 # start new data collection if (Indx2 > len(labels)): # some amount of next file should not be used posStartNextTime = Indx2 - len(labels) newStart = 1 else: # part of this file should be used posStartNextTime = 0 newStart = 0 allData = data[Indx2:endIndxForNextFile, :] allLabels = labels[Indx2:endIndxForNextFile] # save file - last file that will only contain nonseizure, but sometimes it is lot of data fileNameOut = folderOut + '/Subj'+pat + '_cv' + str(numSeiz).zfill(3) + '.csv' - dataToSave = np.hstack((allData, np.reshape(allLabels, (-1,1)))) - np.savetxt(fileNameOut, dataToSave, delimiter=",") + saveDataAndLabelsToFile(allData, allLabels, fileNameOut, Params.SaveType) + # dataToSave = np.hstack((allData, np.reshape(allLabels, (-1,1)))) + # np.savetxt(fileNameOut, dataToSave, delimiter=",") print('Saved file:', fileNameOut) def extractEDFdataToCSV_KeepAllData_FixedSizeFiles(folderIn, folderOut, Params, patients): ''' goes through original edf files and rearanges them in a way that each file is equally long (defined by Params.FileLen in minutes) this can mean that some files might contain more then 1 seizure depending on how far they are, and also no seizures at all (then we must be carefull how we asses performance in this file) if one seizure would be split in half by this then this file is bit longer in order to contain whole seizure files are renamed in form Subj__cv____.csv''' createFolderIfNotExists(folderOut) lenBeforeSeizIndx=int(Params.PreIctalTimeToRemove*Params.samplFreq) lenAfterSeizIndx=int(Params.PostIctalTimeToRemove*Params.samplFreq) fileSize=int(Params.FileLen*60*Params.samplFreq) 
#originaly given in minutes, calculate as number of samples #goes through each patient one by one for pat in patients: print('-- Patient:', pat) PATIENT = pat if len(sys.argv) < 2 else '{0:02d}'.format(int(sys.argv[1])) #number of Seiz and nonSeiz files SeizFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.seizures')) AllFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.edf')) #CREATING LIST OF SEIZURE AND NON SEIZURE FILES # create lists with just names (without path), to be able to compare them (SeizFileNames, NonSeizFileNames, NonSeizFileFullNames)=createListOfSeizAndNonSeizFiles(SeizFiles, AllFiles) indxStart=0 dataMissing=fileSize newFileToSave=1 numFilesThisSubj=0 #GO THROUGH ONE BY ONE FILE for fileIndx, fileName in enumerate(AllFiles): # file name extraction fileName0 = os.path.splitext(fileName)[0] # removing .seizures from the string pom, fileName1 = os.path.split(fileName0) fileName2 = os.path.splitext(fileName1)[0] #read data if (fileName2 not in SeizFileNames):# if no seizures SeizInFile=0 (allGood, data, labels) = readDataAndLabelFromEDFFile_NonSeizFile(fileName, Params) else: SeizInFile = 1 (allGood, data, labels) = readDataAndLabelFromEDFFile_SeizFile(fileName, Params) if allGood==0: print(' Sth wrong with channels in file:', fileName2) break # if there is seizure in file if (SeizInFile==1): #find start and stop of seizures diffSig = np.diff(np.squeeze(labels)) szStart = np.where(diffSig == 1)[0] szStop = np.where(diffSig == -1)[0] # remove data after seizure (unless seizre within that period) (data, labels) = removePreAndPostIctalAreas(data, labels, szStart, szStop, lenBeforeSeizIndx,lenAfterSeizIndx) # update now position of seizrues diffSig = np.diff(np.squeeze(labels)) szStart = np.where(diffSig == 1)[0] szStop = np.where(diffSig == -1)[0] if (len(szStart) != 0 and len(szStop) == 0): szStop = [len(labels)] thisFileStillHasData = 1 while (thisFileStillHasData == 1): # if enough data in file if (indxStart + dataMissing < len(labels)): # check if 
we would cut seizure in half if (np.sum(labels) != 0): for s in range(len(szStart)): try: if (szStart[s] < indxStart + dataMissing and szStop[ s] > indxStart + dataMissing): # cut would be where seizure is dataMissing = szStop[s] - indxStart # move cut to the end of the seizure except: print('error') if (newFileToSave == 1): newData = data[indxStart:indxStart + dataMissing, :] newLabel = labels[indxStart:indxStart + dataMissing] else: # appending to existing file newData = np.vstack((newData, data[indxStart:indxStart + dataMissing, :])) newLabel = np.hstack((newLabel, labels[indxStart:indxStart + dataMissing])) # finished this new file to save fileNameOut = folderOut + '/Subj' + pat + '_cv' + str(numFilesThisSubj).zfill(3) - writeToCsvFile(newData, newLabel, fileNameOut) + saveDataAndLabelsToFile(newData, newLabel, fileNameOut, Params.SaveType) print('Saved file:', fileNameOut) numFilesThisSubj = numFilesThisSubj + 1 newFileToSave = 1 indxStart = indxStart + dataMissing # start where we stopped dataMissing = fileSize thisFileStillHasData = 1 else: # not enough data in file if (newFileToSave == 1): newData = data[indxStart:, :] # read until the end of the file newLabel = labels[indxStart:] else: # appending to existing file newData = np.vstack((newData, data[indxStart:, :])) newLabel = np.hstack((newLabel, labels[indxStart:])) dataMissing = fileSize - len(newLabel) # calculate how much data is missing indxStart = 0 # in next file start from beginning thisFileStillHasData = 0 # this file has no more data, need to load new one newFileToSave = 0 # save file - last file that will only contain nonseizure, but sometimes it is lot of data fileNameOut = folderOut + '/Subj' + pat + '_cv' + str(numFilesThisSubj).zfill(3) - writeToCsvFile(newData, newLabel, fileNameOut) + saveDataAndLabelsToFile(newData, newLabel, fileNameOut, Params.SaveType) print('Saved file:', fileNameOut) def extractEDFdataToCSV_KeepSubselectionOfData_NonSeizAroundSeiz(folderIn, folderOut, Params, 
patients): ''' goes through original edf files and rearanges them in a way that each file seizure data (whole duration) and Fact times more nonSeizure data is kept non seizure data is chosen in a way that half of it is before and half after seizure if there is not enough non seizure data before or after it is set to nan files are renamed in form Subj__cv____.csv''' createFolderIfNotExists(folderOut) lenBeforeSeizIndx=int(Params.PreIctalTimeToRemove*Params.samplFreq) lenAfterSeizIndx=int(Params.PostIctalTimeToRemove*Params.samplFreq) factor=Params.RatioNonSeizSeiz #goes through each patient one by one for pat in patients: print('-- Patient:', pat) PATIENT = pat if len(sys.argv) < 2 else '{0:02d}'.format(int(sys.argv[1])) #number of Seiz and nonSeiz files SeizFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.seizures')) AllFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.edf')) #GO THROUGH ONE BY ONE SEIZURE FILE for fileIndx, fileName in enumerate(SeizFiles): # file name extraction fileName0 = os.path.splitext(fileName)[0] # removing .seizures from the string pom, fileName1 = os.path.split(fileName0) fileName2 = os.path.splitext(fileName1)[0] #read data (allGood, data, labels) = readDataAndLabelFromEDFFile_SeizFile(fileName0, Params) if (allGood == 1): # find start and stop of seizures diffSig = np.diff(np.squeeze(labels)) szStart = np.where(diffSig == 1)[0] szStop = np.where(diffSig == -1)[0] # remove data after seizure (unless seizre within that period) (data, labels) = removePreAndPostIctalAreas(data, labels, szStart, szStop, lenBeforeSeizIndx,lenAfterSeizIndx) # update now position of seizrues diffSig = np.diff(np.squeeze(labels)) szStart = np.where(diffSig == 1)[0] szStop = np.where(diffSig == -1)[0] if (len(szStart) != 0 and len(szStop) <= len(szStart) ): szStop=np.hstack((szStop, len(labels))) # for each seizure cut it out and save numSeizures = len(szStart) for i in range(numSeizures): # prepare where to save new cutout seizureLen = int(szStop[i] - 
szStart[i]) newLabel = np.zeros(seizureLen * (factor + 1)) # both for seizure nad nonSeizure lavels newData = np.zeros((seizureLen * (factor + 1), len(data[0,:]))) # save seizure part nonSeizLen = int(factor * seizureLen) newData[int(nonSeizLen / 2):int(nonSeizLen / 2) + seizureLen] = data[(szStart[i]): ( szStart[i] + seizureLen), :] newLabel[int(nonSeizLen / 2):int(nonSeizLen / 2) + seizureLen] = np.ones(seizureLen) #put non seizure data before seizure if (szStart[i]> int(nonSeizLen/2)): #if enough non seizure data before seizure newData[0:int(nonSeizLen / 2)] = data[szStart[i]-int(nonSeizLen / 2): szStart[i], :] else: #if not enough data before (put nan values before) print( 'Not enough nonSeizure data before seizure, file: ', fileName2) borderIndx=int(nonSeizLen /2)-szStart[i] newData[0:borderIndx] =np.nan try: newData[borderIndx:int(nonSeizLen / 2)] = data[0: szStart[i], :] except: print ('dsada') # put non seizure data after seizure if ( (len(labels)-szStop[i]) > int(nonSeizLen / 2) ): # if enough non seizure data after seizure newData[int(nonSeizLen / 2) + seizureLen:int(nonSeizLen / 2) + seizureLen +int(nonSeizLen / 2) , :] = data[szStop[i]: szStop[i]+ int(nonSeizLen / 2), :] else: # if not enough data after (put nan values after) print('Not enough nonSeizure data after seizure, file: ', fileName2) borderIndx = len(labels) -szStop[i] # how much data after seizure there is newData[int(nonSeizLen / 2) + seizureLen: int(nonSeizLen / 2) + seizureLen+borderIndx,:] = data[szStop[i]:, :] newData[int(nonSeizLen / 2) + seizureLen+borderIndx:,:] = np.nan #save file fileNameOut = folderOut + '/Subj' + pat + '_cv' + str(fileIndx).zfill(3) - writeToCsvFile(newData, newLabel, fileNameOut) + saveDataAndLabelsToFile(newData, newLabel, fileNameOut, Params.SaveType) print('Saved file:', fileNameOut) def extractEDFdataToCSV_KeepSubselectionOfData_NonSeizRandomlySelected(folderIn, folderOut, Params, patients): ''' goes through original edf files and rearanges them in a way that 
each file contains seizure data (whole duration) and Fact times more nonSeizure data non seizure data is chose from randomly selected non seizure file and is arranged in a way that half of it is before and half after seizure files are renamed in form Subj__cv____.csv''' createFolderIfNotExists(folderOut) lenBeforeSeizIndx=int(Params.PreIctalTimeToRemove*Params.samplFreq) lenAfterSeizIndx=int(Params.PostIctalTimeToRemove*Params.samplFreq) factor=Params.RatioNonSeizSeiz #goes through each patient one by one for pat in patients: print('-- Patient:', pat) PATIENT = pat if len(sys.argv) < 2 else '{0:02d}'.format(int(sys.argv[1])) #number of Seiz and nonSeiz files SeizFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.seizures')) AllFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.edf')) #CREATING LIST OF SEIZURE AND NON SEIZURE FILES # create lists with just names (without path), to be able to compare them (SeizFileNames, NonSeizFileNames, NonSeizFileFullNames)=createListOfSeizAndNonSeizFiles(SeizFiles, AllFiles) NonSeizFileFullNames_Orig = np.copy(NonSeizFileFullNames) NonSeizFileNames_Orig=np.copy(NonSeizFileNames) np.random.shuffle(NonSeizFileFullNames) nonSeizUsed=np.zeros((len(NonSeizFileFullNames))) #GO THROUGH ONE BY ONE SEIZURE FILE pairedFiles = [] outputFileIndx=0 IndxNonSeizFile=0 for fileIndx, fileName in enumerate(SeizFiles): # file name extraction fileName0 = os.path.splitext(fileName)[0] # removing .seizures from the string pom, fileName1 = os.path.split(fileName0) fileNameS = os.path.splitext(fileName1)[0] #read data (allGood, data, labels) = readDataAndLabelFromEDFFile_SeizFile(fileName0, Params) if (allGood==1): # find start and stop of seizures diffSig = np.diff(np.squeeze(labels)) szStart = np.where(diffSig == 1)[0] szStop = np.where(diffSig == -1)[0] if (data==[]): print ('dsdasd') # remove data after seizure (unless seizre within that period) (data, labels) = removePreAndPostIctalAreas(data, labels, szStart, szStop, 
lenBeforeSeizIndx,lenAfterSeizIndx) # update now position of seizrues diffSig = np.diff(np.squeeze(labels)) szStart = np.where(diffSig == 1)[0] szStop = np.where(diffSig == -1)[0] if (len(szStart) != 0 and len(szStop) <= len(szStart) ): szStop=np.hstack((szStop, len(labels))) # for each seizure cut it out and save numSeizures = len(szStart) for i in range(numSeizures): # prepare where to save new cutout seizureLen = int(szStop[i] - szStart[i]) newLabel = np.zeros(seizureLen * (factor + 1)) # both for seizure nad nonSeizure lavels newData = np.zeros((seizureLen * (factor + 1), len(data[0,:]))) # save seizure part nonSeizLen = int(factor * seizureLen) newData[int(nonSeizLen / 2):int(nonSeizLen / 2) + seizureLen] = data[(szStart[i]): ( szStart[i] + seizureLen), :] newLabel[int(nonSeizLen / 2):int(nonSeizLen / 2) + seizureLen] = np.ones(seizureLen) # LOAD NON SEIZURUE DATA for fns in range(IndxNonSeizFile, len(NonSeizFileFullNames)): #load non seizure data fileNameNS0 = NonSeizFileFullNames[fns] (allGood, dataNS, labelsNS) = readDataAndLabelFromEDFFile_NonSeizFile(fileNameNS0, Params) lenSigNonSeiz = len(dataNS[:, 0]) #enough non seizure data in this file if (lenSigNonSeiz > nonSeizLen): nonSeizUsed[fns]=1 # cut nonseizure part nonSeizStart = np.random.randint(lenSigNonSeiz - nonSeizLen - 1) #randomly initialize where we take non seizure data nonSeizCutout = dataNS[nonSeizStart: nonSeizStart + nonSeizLen, :] newData[0:int(nonSeizLen / 2)] = nonSeizCutout[0:int(nonSeizLen / 2)] newData[int(nonSeizLen / 2) + seizureLen:] = nonSeizCutout[int(nonSeizLen / 2):] # SAVING TO CSV FILE pom, fileName1 = os.path.split(fileNameNS0) fileNameNS = os.path.splitext(fileName1)[0] # save file fileNameOut = folderOut + '/Subj' + pat + '_cv' + str(outputFileIndx).zfill(3) - writeToCsvFile(newData, newLabel, fileNameOut) + saveDataAndLabelsToFile(newData, newLabel, fileNameOut, Params.SaveType) print('Saved file:', fileNameOut) outputFileIndx = outputFileIndx + 1 IndxNonSeizFile = 
IndxNonSeizFile + 1 #print and save which file was paired with which print('PAIRED: ', fileNameS, '- ', fileNameNS) pairedFiles.append('Subj' + pat + '_cv' + str(outputFileIndx).zfill(3) + ' : ' + fileNameS + ' -- ' + fileNameNS) # in cases when there is more seizure files then non seizure ones, we will not save this seizures # because there is no seizure file to randomly select from # thus we can start from firs non seizure file again # or if we want to be absolutely sure there is no overlap of non seizure files we can comment this, # but we will loose some seizures (or we need to think of smarter way to do this matching) if (IndxNonSeizFile == len(NonSeizFileFullNames)): IndxNonSeizFile = 0 break else: # fns = fns + 1 print('not enough nonSeiz data in this file') #remove it and put to the end toMove=NonSeizFileFullNames[fns] NonSeizFileFullNames = np.delete(NonSeizFileFullNames, fns) NonSeizFileFullNames = np.append(NonSeizFileFullNames, toMove) # save paired files file = open(folderOut + '/Subj' +pat+'_PairedFiles.txt', 'w') for i in range(len(pairedFiles)): file.write(pairedFiles[i] + '\n') file.close() def analyseSeizureDurations(folderIn, folderOut, patients): ''' loads one by one patient from raw data folder and if it is seizure file detects where is seizure, and prints to .txtx positions and duration 20200510 UnaPale''' createFolderIfNotExists(folderOut) seizLenFolder=folderOut +'/SeizLens/' createFolderIfNotExists(seizLenFolder) avrgLenPerSubj=np.zeros(len(patients)) for patIndx, pat in enumerate(patients): print('-- Patient:', pat) PATIENT = pat if len(sys.argv) < 2 else '{0:02d}'.format(int(sys.argv[1])) SeizFiles=sorted(glob.glob(f'{folderIn}/chb{PATIENT}*.seizures')) numSeiz=0 pairedFiles=[] seizLensThisSubj=[] for fileIndx,fileName in enumerate(SeizFiles): fileName0 = os.path.splitext(fileName)[0] # removing .seizures from the string pom, fileName1 = os.path.split(fileName0) fileNameShort = os.path.splitext(fileName1)[0] print('FILE:', fileNameShort) 
# here replaced reading .hea files with .edf reading to avoid converting !!! (data, samplFreq, channels) = readEdfFile(fileName0) (lenSig, numCh) = data.shape # read times of seizures szStart = [a for a in MIT.read_annotations(fileName) if a.code == 32] # start marked with '[' (32) szStop = [a for a in MIT.read_annotations(fileName) if a.code == 33] # start marked with ']' (33) # for each seizure cut it out and save (with few parameters) numSeizures = len(szStart) for i in range(numSeizures): seizureLen = szStop[i].time - szStart[i].time numSeiz=numSeiz+1 print('SEIZ NR ', numSeiz, ': ',szStart[i].time /256, '-', szStop[i].time /256, ' dur: ', seizureLen/256) pairedFiles.append('SEIZ NR '+ str(numSeiz)+': '+ str(szStart[i].time /256)+ ' - '+ str(szStop[i].time /256) + ' dur: '+ str(seizureLen/256) + ' seizFile: '+fileNameShort) seizLensThisSubj=np.append(seizLensThisSubj, seizureLen/256) avrgLenPerSubj[patIndx]=np.mean(seizLensThisSubj) # save paired files file = open(folderOut + '/Subj' +pat+ '_SeizureInformation.txt', 'w') for i in range(len(pairedFiles)): file.write(pairedFiles[i] + '\n') file.close() #save lens for this subj only outputName = seizLenFolder + '/Subj'+pat+'_SeizLens.csv' np.savetxt(outputName, seizLensThisSubj, delimiter=",") # save avrg Lens All Subj outputName = seizLenFolder + '/AllSubj_AvrgSeizLens.csv' np.savetxt(outputName, avrgLenPerSubj, delimiter=",") def removePreAndPostIctalAreas(data, labels, szStart, szStop, lenBeforeSeizIndx, lenAfterSeizIndx): keepIndxs=np.ones(len(labels)) for s in range(len(szStart)): #pre seizure part if (s == 0): #if first seizure, so no seizure before keepIndxs[ np.max([0,szStart[s]-lenBeforeSeizIndx ]): szStart[s]]=0 elif (szStop[s - 1] in range(szStart[s] - lenBeforeSeizIndx, szStart[s])): #previous seizure would we cut keepIndxs[szStop[s - 1]: szStart[s]] = 0 else: # seizure in the middle and all ok keepIndxs[szStart[s]-lenBeforeSeizIndx : szStart[s]] = 0 #post seizure part if (s == (len(szStart) - 1)): #if 
last seizure, so no seizure after keepIndxs[szStop[s]: np.min([szStop[s]+lenAfterSeizIndx,len(labels)])] = 0 elif (szStart[s + 1] in range(szStop[s], szStop[s] + lenAfterSeizIndx)): #if next seizure in postictal of this, dont cut keepIndxs[szStop[s]: szStart[s+1]] = 0 else: #seizure in the middle and all ok keepIndxs[szStop[s]: szStop[s]+lenAfterSeizIndx] = 0 pos=np.where(keepIndxs==1)[0] try: dataNew=data[pos, :] except: print('dsadas') labelsNew=labels[pos] return(dataNew, labelsNew) def plotRawDataLabelsPerSubject(folderIn, patients ): ''' loads all fiels of each subject, appends its labels and plots them in the end also tracks from which file which labels are ''' folderOut=folderIn +'/LabelsInTime' createFolderIfNotExists(folderOut) for patIndx, pat in enumerate(patients): inputFiles = np.sort(glob.glob(folderIn + '/*Subj' + pat + '*.csv')) #concatinatin predictions from all files for fIndx, fileName in enumerate(inputFiles): reader = csv.reader(open(fileName, "r")) data = np.array(list(reader)).astype("float") if fIndx==0: labels = np.squeeze(data[:,-1]) testIndx=np.ones(len(data))*(fIndx+1) else: labels = np.hstack((labels, np.squeeze(data[:,-1]))) testIndx= np.hstack((testIndx, np.ones(len(data[:,-1]))*(fIndx+1))) #Plot predictions in time fig1 = plt.figure(figsize=(16, 8), constrained_layout=False) gs = GridSpec(2, 1, figure=fig1) fig1.subplots_adjust(wspace=0.2, hspace=0.2) xValues = np.arange(0, len(labels), 1) ax1 = fig1.add_subplot(gs[0,0]) ax1.plot(xValues, labels , 'r') ax1.set_ylabel('TrueLabel') ax1.set_title('Subj'+pat) ax1.grid() ax1 = fig1.add_subplot(gs[1, 0]) ax1.plot(xValues, testIndx , 'k') ax1.set_ylabel('FileNr') ax1.grid() ax1.set_xlabel('Time') fig1.show() fig1.savefig(folderOut + '/Subj' + pat + '_RawLabels.png', bbox_inches='tight') plt.close(fig1) def decideForFinalLabel(data, winLen, winStep, type): lenSig=len(data) index = np.arange(0, lenSig - winLen, winStep) segmData = np.zeros(len(index)) for i in range(len(index)): #-1 x = 
def filterSignal(data, Params):
    ''' band-pass filters every channel of data with a 4th order butterworth filter

    data   - 2D array [numSamples, numCh]
    Params - object with attributes BorderFreqLow, BorderFreqHigh (band edges in Hz)
             and samplFreq (sampling frequency in Hz)

    returns filtered data with the same [numSamples, numCh] layout, also when numCh is 1
    filtering is forward-backward (sosfiltfilt), applied to each channel independently
    '''
    # butterworth filter initialization
    sos = signal.butter(4, [Params.BorderFreqLow, Params.BorderFreqHigh], 'bandpass',
                        fs=Params.samplFreq, output='sos')
    # axis=0 runs the filter along time for all channels in one call, which keeps the
    # 2D shape for a single channel and avoids re-stacking the output channel by channel
    return signal.sosfiltfilt(sos, data, axis=0)
- one label per window labelsSegm=decideForFinalLabel(labels,int(PreprocessParams.samplFreq * FeatureParams.winLen), int( PreprocessParams.samplFreq * FeatureParams.winStep), FeatureParams.LabelVotingType) #filter whole data (all channels) - but this might not be implementable in real time, maybe filtering of each window is more appropriate Xfilt= filterSignal(X, PreprocessParams) for fIndx, fName in enumerate(FeatureParams.featNames): if (fName=='MeanAmpl'): features=calculateFeatures_MeanAmpl(Xfilt, index, int(PreprocessParams.samplFreq * FeatureParams.winLen)) elif (fName=='LineLength'): features = calculateFeatures_LineLength(Xfilt, index, int(PreprocessParams.samplFreq * FeatureParams.winLen)) if fIndx==0: featuresAll=features else: featuresAll=np.hstack((featuresAll, features)) #save features outputName = folderOut + '/' + fileName2 + '_Features.csv' np.savetxt(outputName, featuresAll, delimiter=",") # save for this file labels outputName = folderOut + '/' + fileName2 + '_Labels.csv' np.savetxt(outputName, labelsSegm, delimiter=",") \ No newline at end of file diff --git a/parametersSetup.py b/parametersSetup.py index 792eae1..9f4c434 100644 --- a/parametersSetup.py +++ b/parametersSetup.py @@ -1,46 +1,49 @@ ''' file with all parameters''' import numpy as np import pickle Dataset='01_CHBMIT' #'01_CHBMIT', '01_iEEG_Bern' patients =['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16'] class DatasetPreprocessParams: samplFreq = 256 # sampling frequency of data #channels to keep channelNamesToKeep = ['FP1-F7', 'F7-T7', 'T7-P7', 'P7-O1', 'FP1-F3', 'F3-C3', 'C3-P3', 'P3-O1', 'FP2-F4', 'F4-C4', 'C4-P4', 'P4-O2', 'FP2-F8', 'F8-T8', 'T8-P8', 'P8-O2', 'FZ-CZ', 'CZ-PZ'] #pre and post ictal data to be removed PreIctalTimeToRemove=60 #in seconds PostIctalTimeToRemove=600 #in seconds #how to select and rearange data in files before feature extraction and training - FileRearangeAllData='AllData_FixedSize' #'AllData_FixedSize','AllData_StoS', 
'SubselData_NonSeizAroundSeiz', 'SubselData_NonSeizRandom' + FileRearangeAllData='SubselData_NonSeizRandom' #'AllData_FixedSize','AllData_StoS', 'SubselData_NonSeizAroundSeiz', 'SubselData_NonSeizRandom' FileLen=60 #60, 240 in minutes - only for AllData_FixedSize needed RatioNonSeizSeiz=1 #1, 10 - only for SubselData needed #filtering parameters BorderFreqLow=1#Hz for the bandpass butterworth filter BorderFreqHigh=30 #Hz for the bandpass butterworth filter + #saving type + SaveType='gzip' #'csv, 'gzip'' #gzip saves a lot of memory so is recommended + class FeaturesUsedParams: #window size and step in which is moved winLen= 4 #in seconds, window length on which to calculate features winStep=0.5 #in seconds, step of moving window length #when we have more labels in one window, how final label is chosen LabelVotingType='atLeastOne' #'majority', 'allOne', 'atLeastOne' #features extracted from data featNames = np.array( ['MeanAmpl', 'LineLength']) #SAVING SETUP once again to update if new info with open('../PARAMETERS.pickle', 'wb') as f: pickle.dump([DatasetPreprocessParams, FeaturesUsedParams, patients], f) \ No newline at end of file diff --git a/script_prepareDatasets.py b/script_prepareDatasets.py index 4635125..7af947a 100644 --- a/script_prepareDatasets.py +++ b/script_prepareDatasets.py @@ -1,104 +1,104 @@ __author__ = "Una Pale" __email__ = "una.pale at epfl.ch" '''script that does several things (comment out things not needed) - converts edf files to csv files without any changes in data - calculates statistics for each subject and its seizures (start, stop, duration) - selects data and rearanges in in different ways to use later for feature calculation - plots data for each subject after searangemens - calculated features for each input file and saves them ''' from parametersSetup import * from VariousFunctionsLib import * ######################################################################### ################################### #CHBMIT Dataset='01_CHBMIT' 
#'01_CHBMIT', '01_iEEG_Bern' createFolderIfNotExists( Dataset) folderEDF = '../../../../databases/medical/chb-mit/edf/' patients =['01','02','03','04','05','06','07','08','09','10','11', '12','13','14','15','16','17','18','19','20','21','22','23','24'] ####################################################################### #CONVERT EDF TO CSV DATA '''converting original EDF files and .seizure files to one .csv file per input file keeping only channels of interedt from DatasetPreprocessParams.channelNamesToKeep renaming that output file keeps the same name except that files with seizures have extension '_Ns.csv' where N is number of seizures that is in that file ''' folderOut= Dataset+'/01_datasetProcessed_RawData' extractEDFdataToCSV_originalData(folderEDF, folderOut, DatasetPreprocessParams, patients) #EXPORTING INFORMATION ABOUT SEIZURE STRUCTURE PER FILE -'''printing seizure start, ends and duration per file +'''printing seizure start, ends and duration per file ''' folderOutStats= Dataset+'/00_SeizureInfoOriginalData/' analyseSeizureDurations(folderEDF, folderOutStats, patients) # ####################################################################### # SELECT AND REARANGE DATA # ALL DATA - SEIZURE TO SEIZURE CUT '''important parameters are DatasetPreprocessParams.PreIctalTimeToRemove and DatasetPreprocessParams.PostIctalTimeToRemove, they determine hwo much data before and after seizure will be removed this can be set to 0 and then removed also after features are calculated ''' if (DatasetPreprocessParams.FileRearangeAllData=='AllData_StoS'): folderOut = Dataset + '/01_datasetProcessed_' + DatasetPreprocessParams.FileRearangeAllData - extractEDFdataToCSV_KeepAllData_StoSFiles(folderEDF, DatasetPreprocessParams, patients) + extractEDFdataToCSV_KeepAllData_StoSFiles(folderEDF, folderOut, DatasetPreprocessParams, patients) # # ALL DATA - FIXED SIZE FILES (e.g. 
1h, 4h) '''important parameters are: DatasetPreprocessParams.FileLen - in min how long files we want DatasetPreprocessParams.PreIctalTimeToRemove and DatasetPreprocessParams.PostIctalTimeToRemove, they determine hwo much data before and after seizure will be removed ''' if (DatasetPreprocessParams.FileRearangeAllData=='AllData_FixedSize'): folderOut= Dataset+'/01_datasetProcessed_'+DatasetPreprocessParams.FileRearangeAllData+'_'+str(DatasetPreprocessParams.FileLen)+'min' extractEDFdataToCSV_KeepAllData_FixedSizeFiles(folderEDF, folderOut, DatasetPreprocessParams, patients) # SUBSELECTION OF DATA - NON SEIZURE AROUND SEIZURE, FROM THE SAME FILE, ALSO FACTOR x NON SEIZURE DATA MORE THEN SEIZRUE '''important parameters are: DatasetPreprocessParams.RatioNonSeizSeiz DatasetPreprocessParams.PreIctalTimeToRemove and DatasetPreprocessParams.PostIctalTimeToRemove, they determine hwo much data before and after seizure will be removed ''' if (DatasetPreprocessParams.FileRearangeAllData=='SubselData_NonSeizAroundSeiz'): folderOut= Dataset+'/01_datasetProcessed_'+DatasetPreprocessParams.FileRearangeAllData+'_Fact'+str(DatasetPreprocessParams.RatioNonSeizSeiz) extractEDFdataToCSV_KeepSubselectionOfData_NonSeizAroundSeiz(folderEDF, folderOut, DatasetPreprocessParams, patients) # SUBSELECTION OF DATA - NON SEIZURE RANDOMLY SELECTED FROM NON SEIZURE FILES, ALSO FACTOR x NON SEIZURE DATA MORE THEN SEIZRUE '''important parameters are: DatasetPreprocessParams.RatioNonSeizSeiz DatasetPreprocessParams.PreIctalTimeToRemove and DatasetPreprocessParams.PostIctalTimeToRemove, they determine hwo much data before and after seizure will be removedOut, DatasetPreprocessParams, patients) ''' if (DatasetPreprocessParams.FileRearangeAllData=='SubselData_NonSeizRandom'): folderOut= Dataset+'/01_datasetProcessed_'+DatasetPreprocessParams.FileRearangeAllData+'_Fact'+str(DatasetPreprocessParams.RatioNonSeizSeiz) extractEDFdataToCSV_KeepSubselectionOfData_NonSeizRandomlySelected(folderEDF, folderOut, 
DatasetPreprocessParams, patients) ############################################################################################ # PLOTTING RAW DATA LABELS (FOR CHECKING WITH ROLL BASE ONES) ''' -Plotting labels appended one after antoher for each subject to check files rearanging +Plotting labels appended one after antoher for each subject to check files rearanging ''' plotRawDataLabelsPerSubject(folderOut, patients ) # # ######################################################################### # CALCULATE FEATURES FOR EACH FILE ''' parameters are: FeaturesUsedParams.winLen and FeaturesUsedParams.winStep which data preparation type (from above ones is used) ''' folderOutFeatures= Dataset+'/02_features_'+DatasetPreprocessParams.FileRearangeAllData+'_Fact'+str(DatasetPreprocessParams.RatioNonSeizSeiz) createFolderIfNotExists(folderOutFeatures) folderOutFeatures= folderOutFeatures +'/WinLen'+str(FeaturesUsedParams.winLen)+'_'+str(FeaturesUsedParams.winStep) createFolderIfNotExists(folderOutFeatures) calculateFeaturesPerEachFile(folderOut, folderOutFeatures, DatasetPreprocessParams, FeaturesUsedParams, patients)