diff --git a/.directory b/.directory deleted file mode 100644 index ffb53b7..0000000 --- a/.directory +++ /dev/null @@ -1,6 +0,0 @@ -[Dolphin] -Timestamp=2020,1,27,11,5,50 -Version=4 - -[Settings] -HiddenFilesShown=true diff --git a/.gitignore b/.gitignore index 61e8ba1..60be3b3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ - *.directory - data/ - *data - *data/ - **.ipynb_checkpoints - temp/ - .directory +*.directory +data/ +*data +*data/ +**.ipynb_checkpoints +temp/ +*temp +/temp +.directory diff --git a/preProc/ECG_polyapprox.py b/preProc/ECG_polyapprox.py index 54afe01..3f5dfd1 100755 --- a/preProc/ECG_polyapprox.py +++ b/preProc/ECG_polyapprox.py @@ -1,128 +1,152 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # pylint: disable-msg= C0103 """ Created on Fri Feb 22 09:07:30 2019 Simple script to perform a non-uniform subsampling of an ECG record in the MIT-BIH format, using the Wall-Danielsson algorithm. As output, it produces a 2-column csv file with the sample times and values. Dependencies: - numpy - tqdm (https://pypi.org/project/tqdm/) - wfdb-python (https://pypi.org/project/wfdb/) @author: T. Teijeiro """ import numpy as np from tqdm import trange import os def WD_polyapprox(arr, epsilon=75, isDir = False): """ Performs a piecewise linear approximation to *arr* by means of the Wall-Danielsson algorithm, described in "A fast sequential method for polygonal approximation of digitized curves". It includes a correction to be more sensitive to prominent peaks in the signal, that is also described in the same paper. Parameters ---------- arr: Sequence object (list, numpy array...) containing the signal. epsilon: Error threshold accepted in the approximation. It corresponds to the area deviation between the signal and the approximation. By default, it corresponds to an area of 0.25mm2 in common ECG scales (25mm/s, 10mm/mV), assuming an input sampling frequency of 360Hz and an ADC gain of 200 units. Returns ------- out: Numpy array with the indices of the samples selected in the approximation. """ if len(arr) < 3: return np.arange(len(arr)) dx = 1.0 result = [0] f = x = y = p = L = 0 if isDir: ite = lambda x:range(1,x) else: ite = lambda x: trange(1,x) for i in ite(len(arr)): dy = arr[i]-arr[i-1] x += dx y += dy f += x*dy - y*dx #Peak-test correction Ln = abs(y)+x p = (Ln < L) * ((p == 0)*(i-1) + p) L = Ln if abs(f) > epsilon: t = i-1 if p == 0 else p result.append(t) f, x, y = 0, (i-t)*dx, arr[i]-arr[t] p, L = 0, abs(y) + x result.append(len(arr)-1) return np.array(result) if __name__ == "__main__": import argparse import pandas as pd parser = argparse.ArgumentParser(description='Performs a nonuniform subsampling of a csv ' 'file, generating as output a two-column ' 'csv file with the samples time (position) and value.') parser.add_argument('-i', metavar='input', required=True, help='Input record to be processed') parser.add_argument('-o', metavar='output', required=True, help='Name of the output csv file') parser.add_argument('-e', metavar='extension', default='csv', help='extension of input (and output) file') parser.add_argument('--threshold', default=75, type=float, help=('Threshold used in the polygonal approximation method. It ' 'corresponds to the maximum area deviation accepted by the linear ' - 'approximation, in digital units. By default, it corresponds to an ' - 'area of 0.25mm2 in common ECG scales (25mm/s, 10mm/mV), assuming an ' - 'input sampling frequency of 360Hz and an ADC gain of 200 units. ' - 'This value is very conservative, ensuring a good reconstruction ' - 'appropriate for P, QRS and T wave delineation.')) + 'approximation, in digital units.')) args = parser.parse_args() if os.path.isdir(args.i): if not os.path.exists(args.o): os.makedirs(args.o) files = [os.path.join(args.i,x) for x in os.listdir(args.i) if (os.path.isfile(os.path.join(args.i,x)) and args.e in x)] n = 1 listOfFiles = list(set(files)) for i in trange(0,len(listOfFiles)): f = listOfFiles[i] print("working on file: ",f,"( ",n,"/",len(listOfFiles),")") n+=1 #Input signal as a plain array - s = np.array(pd.read_csv(f,header = None).iloc[:,0]) + sTot = np.array(pd.read_csv(f))#.iloc[:,0]) + sx = sTot[:,0]*1000 + sy = sTot[:,1]*1000 + sz = sTot[:,2]*1000 #Subsampling if args.threshold == 0: - subsampled = list(range(len(s))) + subsampledx = list(range(len(sx))) + subsampledy = list(range(len(sy))) + subsampledz = list(range(len(sz))) else: - subsampled = WD_polyapprox(s, args.threshold, isDir = True) + subsampledx = WD_polyapprox(sx, args.threshold, isDir = True) + subsampledy = WD_polyapprox(sy, args.threshold, isDir = True) + subsampledz = WD_polyapprox(sz, args.threshold, isDir = True) + #Result storage - fileName = os.path.basename(os.path.normpath(f)) - outPath = os.path.join(args.o,fileName) - np.savetxt(outPath, np.column_stack([subsampled, s[subsampled]]), fmt='%d', delimiter=', ') + fileNamex = os.path.basename(os.path.normpath(f)).split(".")[0]+'x.csv' + outPathx = os.path.join(args.o,fileNamex) + toSavex = pd.DataFrame({'tx':subsampledx}) + toSavex['x'] = sTot[subsampledx,0] + toSavex['lb'] = sTot[subsampledx,3].astype('int') + + fileNamey = os.path.basename(os.path.normpath(f)).split(".")[0]+'y.csv' + outPathy = os.path.join(args.o,fileNamey) + toSavey = pd.DataFrame({'ty':subsampledy}) + toSavey['y'] = sTot[subsampledy,1] + toSavey['lb'] = sTot[subsampledy,3].astype('int') + + fileNamez = os.path.basename(os.path.normpath(f)).split(".")[0]+'z.csv' + outPathz = os.path.join(args.o,fileNamez) + toSavez = pd.DataFrame({'ty':subsampledz}) + toSavez['z'] = sTot[subsampledz,2] + toSavez['lb'] = sTot[subsampledz,3].astype('int') + + + toSavex.to_csv(outPathx,header = None, index = False) + toSavey.to_csv(outPathy,header = None, index = False) + toSavez.to_csv(outPathz,header = None, index = False) else: #Record reading #Input signal as a plain array - s = np.array(pd.read_csv(args.i,header = None).iloc[:,0]) + sTot = np.array(pd.read_csv(args.i))#.iloc[:,0]) + s = np.linalg.norm(sTot[:,0:3],axis = 1)*1000 #Subsampling if args.threshold == 0: subsampled = list(range(len(s))) else: subsampled = WD_polyapprox(s, args.threshold) #Result storage - np.savetxt(args.o, np.column_stack([subsampled, s[subsampled]]), fmt='%d', delimiter=', ') + np.savetxt(args.o, np.column_stack([subsampled, sTot[subsampled]]), fmt='%d', delimiter=', ') diff --git a/preProc/extractAndSub.py b/preProc/extractAndSub.py index cb1757d..76a4b14 100644 --- a/preProc/extractAndSub.py +++ b/preProc/extractAndSub.py @@ -1,142 +1,35 @@ # -*- coding: utf-8 -*- """ Spyder Editor This is a temporary script file. """ -import scipy as sp -import scipy.io import numpy as np import pandas as pd import matplotlib.pyplot as plt import os import math -def subNan(x): - if math.isnan(x): - return 0 - else: - return x -def getNum(name): - startNum = name.find("_")+1 - endNum = len(name)-4 - num = int(name[startNum:endNum]) - return num if __name__ == "__main__": - matDir = "../data/eslppg" - csvDir1 = "../data/data64" - csvDir2 = "../data/data2k" - csvDir3 = "../data/data250" + dataDir = "../data/dataLabeld" + outDir = "../data/dataEB" - print("Opening matrix files...") - files = [name for name in os.listdir(matDir) if name.endswith(".mat")] - paths = [(os.path.join(matDir,f),os.path.splitext(f)[0]+".csv") for f in files] - mats = [(sp.io.loadmat(p[0])['sample'][0][0][0][:,0],p[1]) for p in paths] - - print("Purging NaN elements...") - nanElem = list(filter(lambda x: len([nan for nan in x[0] if math.isnan(nan)])>0,mats)) - nanName = sorted(list(map(lambda x: x[1], nanElem))) - # 'zeroing' the Nan array's elements - final = [] - n = 1 - for elem in mats: - #print(str(n)+"/"+str(len(mats))) - n+=1 - vec = [] - for e in elem[0]: - vec.append(subNan(e)) - final.append([np.array(vec),elem[1]]) - print("Finished") - mats = final - - print("Normalizing...") - temp64 = [] - temp250 = [] - temp2k = [] - for elem in mats: - #print(elem[1]) - if(getNum(elem[1])<11): - temp64.extend(elem[0]) - if(getNum(elem[1])>25): - temp250.extend(elem[0]) - if(getNum(elem[1])<26 and getNum(elem[1])>10): - temp2k.extend(elem[0]) - - print("64 Hz files") - max64 = np.percentile(temp64,97) - min64 = np.percentile(temp64,3) - print("Max percentile for 64: ",str(max64)) - print("Min percentile for 64: ",str(min64)) - print("Delta: ",str(max64-min64)) - - print("250 Hz files") - max250 = np.percentile(temp250,97) - min250 = np.percentile(temp250,3) - print("-"*20) - print("Max percentile for 250: ",str(max250)) - print("Min percentile for 250: ",str(min250)) - print("Delta: ",str(max250-min250)) - - print("2k Hz files") - max2k = np.percentile(temp2k,97) - min2k = np.percentile(temp2k,3) - print("-"*20) - print("Max percentile for 2k: ",str(max2k)) - print("Min percentile for 2k: ",str(min2k)) - print("Delta: ",str(max2k-min2k)) - print("-"*20) - - print("\nSaving original files (purged and normalized)") - n = 1 - saved = [] - for elem in mats: - print(str(n)+"/"+str(len(mats))) - print("\tsaving ",elem[1]) - startNum = elem[1].find("_")+1 - endNum = len(elem[1])-4 - num = int(elem[1][startNum:endNum]) - if num < 11: - #64 - csvDir = csvDir1 - div = max64-min64 - elif num < 26: - #2k - csvDir = csvDir2 - div = max2k-min2k - else: - #250 - csvDir = csvDir3 - div = max250-min250 - - print("\tOriginal dynamic: ",max(elem[0])-min(elem[0]),"(max = ",str(max(elem[0]))," min = ",str(min(elem[0]))) - elem[0] = 1024 * elem[0]/div - mats[n-1][0] =elem[0] - print("\tNormalized dynamic: ",max(elem[0])-min(elem[0]),"(max = ",str(max(elem[0]))," min = ",str(min(elem[0]))) - df = pd.DataFrame(elem[0]) - if not os.path.exists(csvDir): - os.makedirs(csvDir) - #df.to_csv(os.path.join(csvDir,elem[1]),header = False, index = False) - saved.append(os.path.join(csvDir,elem[1])) - n+=1 - - - + print("Opening files...") + files = [name for name in os.listdir(dataDir) if name.endswith(".csv")] + paths = [(os.path.join(dataDir,f),os.path.splitext(f)[0]+".csv") for f in files] th = range(4,100,4) - dirs = ["../data/data250/","../data/data64/","../data/data2k/"] - print("Subsampling and saving...") for t in th: print("\n\n############################## THRESHOLD: ",t,"##############################") - for d in dirs: - print("\n############################## DIR: ",d,"##############################\n") - destDir = d[0:len(d)-1]+"sub"+str(t)+"/" - print(destDir) - command = "./ECG_polyapprox.py -i "+str(d)+" -o "+str(destDir)+" --threshold "+str(t) - os.system(command) + + destDir = os.path.join(outDir,"sub"+str(t)) + print(destDir) + command = "./ECG_polyapprox.py -i "+dataDir+" -o "+destDir+" --threshold "+str(t) + os.system(command) \ No newline at end of file diff --git a/preProc/reWriteData.py b/preProc/reWriteData.py index cb95d73..1a4fa91 100644 --- a/preProc/reWriteData.py +++ b/preProc/reWriteData.py @@ -1,34 +1,34 @@ import os import numpy as np import pandas as pd import sys import matplotlib.pyplot as plt def extractExpSubjSensor(fileName): - name = os.path.basename(fileName).split(".")[0] - temp = name.find("exp")+len("exp") - expNo = int(name[temp:temp+2]) - temp = name.find("user")+len("user") - subjNo = int(name[temp:temp+2]) - temp = name.find("_exp") - sensor = name[0:temp] - return expNo,subjNo,sensor + name = os.path.basename(fileName).split(".")[0] + temp = name.find("exp")+len("exp") + expNo = int(name[temp:temp+2]) + temp = name.find("user")+len("user") + subjNo = int(name[temp:temp+2]) + temp = name.find("_exp") + sensor = name[0:temp] + return expNo,subjNo,sensor if __name__ == "__main__": filenames = [os.path.join("../data/RawData",x) for x in os.listdir("../data/RawData") if ".txt" in x and "user" in x] labels = pd.read_csv("../data/RawData/labels.txt", names = ["experiment","subject","label","start","end"],sep = " ") for f in filenames: exp,subj, sensor = extractExpSubjSensor(f) fileToSave = "../data/dataLabeld/"+str(exp)+sensor+".csv" labelsThisExp = labels.loc[labels["experiment"] == exp] dfFile = pd.read_csv(f,names = ["x","y","z"],sep = " ") dfFile["lbl"] = -1 for lbl in labelsThisExp.iterrows(): start = lbl[1]["start"] stop = lbl[1]["end"] l = lbl[1]["label"] - dfFile.loc[start:stop,"lbl"] = l + dfFile.loc[start:stop+1,"lbl"] = l unlabeldLen = len(dfFile.loc[dfFile["lbl"] == -1]) dfFile.to_csv(fileToSave,index = False) print("Wrote: {}. File is {}s long, total unlabeld time {}s (unlabeld percentage: {})".format(fileToSave, len(dfFile)/50, unlabeldLen/50,100*(unlabeldLen/50)/(len(dfFile)/50))) \ No newline at end of file diff --git a/proc/Untitled.ipynb b/proc/Untitled.ipynb new file mode 100644 index 0000000..5215390 --- /dev/null +++ b/proc/Untitled.ipynb @@ -0,0 +1,291 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-06T13:42:53.520785Z", + "start_time": "2020-02-06T13:42:52.997099Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os\n", + "import pandas as pd \n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "from eventBased.descriptor import signalDescriptor, signalDescriptorVector\n", + "from eventBased.EBfilter import FIR\n", + "\n", + "dataFolder = '../data'\n", + "experiment = 4\n", + "sub = 72\n", + "dataRwLb = os.path.join(dataFolder, \"dataLabeld\")\n", + "dataEBLb = os.path.join(dataFolder, \"dataEB/sub\"+str(sub))\n", + "baseFreq = 50\n", + "dT = 1/baseFreq\n", + "\n", + "start = 10000\n", + "stop = int(start+baseFreq*2.56)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-06T15:23:23.840896Z", + "start_time": "2020-02-06T15:23:23.838748Z" + } + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-06T13:42:53.562846Z", + "start_time": "2020-02-06T13:42:53.525038Z" + } + }, + "outputs": [], + "source": [ + "%matplotlib auto\n", + "np.set_printoptions(precision=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-06T13:42:53.565987Z", + "start_time": "2020-02-06T13:42:53.564297Z" + } + }, + "outputs": [], + "source": [ + "import eventBased as eb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-06T13:42:53.573045Z", + "start_time": "2020-02-06T13:42:53.567187Z" + } + }, + "outputs": [], + "source": [ + "eb.signalDescriptor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## load data " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-06T13:42:53.654021Z", + "start_time": "2020-02-06T13:42:53.574079Z" + } + }, + "outputs": [], + "source": [ + "def getExpNo(s):\n", + " idx = 0\n", + " for i in range(len(s)):\n", + " idx = i\n", + " if s[i].isalpha():\n", + " break\n", + " return int(s[0:idx])\n", + "def pik(ls, s):\n", + " for e in ls:\n", + " if s in e:\n", + " return e\n", + "def generateFreqScale(dt,N, correctCoeff = None):\n", + " if correctCoeff:\n", + " dt *= correctCoeff\n", + " fs = np.arange(-1/(2*dt),1/(2*dt)-dT/10,1/(dt*N))\n", + " return fs \n", + "\n", + "pathToRwFiles = sorted([os.path.join(dataRwLb,x) for x in os.listdir(dataRwLb) if getExpNo(x) == experiment])\n", + "pathToEBFiles = sorted([os.path.join(dataEBLb,x) for x in os.listdir(dataEBLb) if getExpNo(x) == experiment])\n", + "\n", + "#Original data ---------------------------------------\n", + "dataAccRw = pd.read_csv(pik(pathToRwFiles, \"acc\"))\n", + "dataGyroRw = pd.read_csv(pik(pathToRwFiles, \"gyro\"))\n", + "\n", + "#EB data ---------------------------------------\n", + "dataAccX = pd.read_csv(pik(pathToEBFiles, \"accx\"), header = None)\n", + "dataAccY = pd.read_csv(pik(pathToEBFiles, \"accy\"), header = None)\n", + "dataAccZ = pd.read_csv(pik(pathToEBFiles, \"accz\"), header = None)\n", + "dataGyroX = pd.read_csv(pik(pathToEBFiles, \"gyrox\"), header = None)\n", + "dataGyroY = pd.read_csv(pik(pathToEBFiles, \"gyroy\"), header = None)\n", + "dataGyroZ = pd.read_csv(pik(pathToEBFiles, \"gyroz\"), header = None)\n", + "\n", + "#Some infos:\n", + "lengthT = len(dataAccRw)/baseFreq\n", + "print(\"Length of the signal, in seconds: {}s\".format(lengthT))\n", + "print(\"--------------------------------------------------------------------\")\n", + "avgFreqAccX = len(dataAccX)/lengthT\n", + "print(\"Average frequency for the x accelerometer: {}Hz\".format(avgFreqAccX))\n", + "avgFreqAccY = len(dataAccY)/lengthT\n", + "print(\"Average frequency for the y accelerometer: {}Hz\".format(avgFreqAccY))\n", + "avgFreqAccZ = len(dataAccZ)/lengthT\n", + "print(\"Average frequency for the z accelerometer: {}Hz\".format(avgFreqAccZ))\n", + "print(\"--------------------------------------------------------------------\")\n", + "avgFreqGyroX = len(dataGyroX)/lengthT\n", + "print(\"Average frequency for the x gyroscope: {}Hz\".format(avgFreqGyroX))\n", + "avgFreqGyroY = len(dataGyroY)/lengthT\n", + "print(\"Average frequency for the y gyroscope: {}Hz\".format(avgFreqGyroY))\n", + "avgFreqGyroZ = len(dataGyroZ)/lengthT\n", + "print(\"Average frequency for the z gyroscope: {}Hz\".format(avgFreqGyroZ))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-06T13:42:53.696208Z", + "start_time": "2020-02-06T13:42:53.655195Z" + } + }, + "outputs": [], + "source": [ + "eb = signalDescriptor(dataAccX[0],dataAccX[1])\n", + "rw = signalDescriptor(np.arange(len(dataAccRw['x'])),dataAccRw['x'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-06T13:42:54.010494Z", + "start_time": "2020-02-06T13:42:53.697294Z" + } + }, + "outputs": [], + "source": [ + "fEB = eb.nudft(startT = start, endT = stop)\n", + "fRW = rw.nudft(startT = start, endT = stop-1)\n", + "print(\"Finished nuDft\")\n", + "fEBres = eb.resampledDFT(startT = start, endT = stop)\n", + "fRWres = rw.resampledDFT(startT = start, endT = stop)\n", + "print(\"Finished reDft\")\n", + "f = np.fft.fft(dataAccRw['x'][start:stop])\n", + "print(\"Finished numpy fft\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-06T13:42:54.560252Z", + "start_time": "2020-02-06T13:42:54.011527Z" + } + }, + "outputs": [], + "source": [ + "fRWScale = generateFreqScale(dT,len(f))\n", + "fEBScale = generateFreqScale(dT,len(fEB),len(f)/len(fEB))\n", + "#--------------------------------------\n", + "plt.figure()\n", + "plt.title(\"data Raw\")\n", + "plt.plot(dataAccRw['x'][start:stop])\n", + "\n", + "plt.figure()\n", + "plt.title(\"data EB sampled\")\n", + "t,v = eb.findInterval(start,stop)\n", + "plt.plot(t,v)\n", + "#--------------------------------------\n", + "plt.figure()\n", + "plt.title(\"Non uniform DFT on Event Based\")\n", + "plt.plot(fEBScale,np.abs(eb.shift(fEB)))\n", + "\n", + "plt.figure()\n", + "plt.title(\"Non uniform DFT on Raw\")\n", + "plt.plot(fRWScale,np.abs(rw.shift(fRW)))\n", + "#---------------------------------------\n", + "plt.figure()\n", + "plt.title(\"uniform DFT on re-sampled Event Based\")\n", + "plt.plot(fRWScale,np.abs(eb.shift(fEBres)))\n", + "\n", + "plt.figure()\n", + "plt.title(\"uniform DFT on raw signal\")\n", + "plt.plot(fRWScale,np.abs(rw.shift(fRWres)))\n", + "#---------------------------------------\n", + "plt.figure()\n", + "plt.title(\"Numpy FFT on the original signal\")\n", + "plt.plot(fRWScale,np.abs(rw.shift(f)))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}