% ======================================================================
% File: helpers/gather_apex_data.m
% ======================================================================
% GATHER_APEX_DATA gathers all the data from APEX compressed archives
% into one single CSV file that can be loaded by Octave.
%
% CSV_FILES = GATHER_APEX_DATA(FNAMES) unpacks the .tgz archive(s) in
% FNAMES (a file name, a folder containing archives, or a cell array of
% file names), parses the XML 'datalog' files they contain and returns
% the list of CSV files written into ./TmpData.
%
% Blanchoud Group, UNIFR
% Simon Blanchoud
% 29/11/2020
function csv_files = gather_apex_data(fnames)

  % load the required packages
  pkg load io

  % Format the input as a list of cells
  if ~iscell(fnames)
    if (isfile(fnames))
      fnames = {fnames};
    elseif (isfolder(fnames))
      fnames = glob(fullfile(fnames, '*.tgz'));
    else
      % FIX: previously a char array that was neither a file nor a folder
      % fell through unchanged, and fnames{i} below failed cryptically.
      error(['gather_apex_data: not a file nor a folder: ' fnames]);
    end
  end

  % Prepare the temporary folder, wiping any CSV left over from a
  % previous run (the files are opened in append mode later on)
  tmpdir = fullfile(pwd, 'TmpData');
  if (~isempty(glob(fullfile(tmpdir, '*.csv'))))
    delete(fullfile(tmpdir, '*.csv'));
  end

  % Prepare the output files
  csv_files = {};

  % Loop through all files to gather
  for i=1:length(fnames)

    % Extract the actual files
    files = unpack(fnames{i}, tmpdir, 'tgz');

    % Loop through those files
    for j=1:length(files)
      xml = fullfile(tmpdir, files{j});

      % Make sure this is an actual file
      if isfile(xml)

        % Just try loading it as an XML file, ignore other types
        % (FIX: the caught exception object was never used, so drop it)
        try
          dom = xmlread(xml);
        catch
          continue;
        end

        % Get the content of the file
        content = dom.getDocumentElement();
        content.normalize();

        % Check which type of file this is
        type = content.getNodeName();
        switch type
          case 'datalog'
            node = 'record';
            target = 'probe';

          % Currently we only parse the datalog
          otherwise
            node = '';
            target = '';
        end

        % Get the nodes that are actually useful
        nodes = content.getElementsByTagName(node);

        % And convert them to CSV (column-oriented so the vertical
        % concatenation below never fails on a row-shaped cell)
        new_files = convert_xml(nodes, target, tmpdir);
        csv_files = [csv_files; new_files(:)];
      end
    end

    % Delete the temporary files
    [dname, fname, ext] = fileparts(fnames{i});
    delete(fullfile(tmpdir, fname, '*'));
    rmdir(fullfile(tmpdir, fname));
  end

  % Remove duplicates (append mode means a probe seen in several
  % archives registers its CSV file more than once)
  csv_files = unique(csv_files);

  return;
end

% Here we physically copy the XML data into a CSV file.
%
% FILES = CONVERT_XML(XML, TARGET, FDIR) writes one 'NAME.csv' file in
% FDIR per distinct probe NAME found in the NodeList XML, appending one
% 'timestamp,value' row per TARGET node, and returns the list of files.
function files = convert_xml(xml, target, fdir)

  % Some handlers for the files to be written
  files = {};
  fids = struct();

  % FIX: curr_time was referenced before assignment whenever a target
  % node appeared before its sibling 'date' node inside a record; keep
  % it defined so the failure mode below is diagnosable instead.
  curr_time = NaN;

  % We loop over all the nodes in the XML (Java NodeList is 0-indexed)
  for i=1:xml.getLength()
    item = xml.item(i-1);

    % FIX(review): the original called item.getLength()/item.item() on a
    % DOM Node; that only works because Xerces' internal ParentNode class
    % happens to implement NodeList. getChildNodes() is the DOM-portable
    % equivalent and iterates the exact same children.
    children = item.getChildNodes();

    % We loop over all the attributes of each node
    for j=1:children.getLength()
      node = children.item(j-1);

      % We check which type of node this is
      name = node.getNodeName();
      switch name

        % We extract the data from the target type
        case target
          % We get all the data from the child nodes
          content = node2cell(node.getChildNodes());

          % We loop through all the cells and copy the proper data
          name = '';
          type = '';
          val = NaN;
          for k=1:size(content, 1)
            switch content{k,1}
              case 'name'
                name = content{k,2};
              case 'type'
                type = content{k,2};
              case 'value'
                val = str2double(content{k,2});
            end
          end

          % If we got all the data we need, then we write it
          if (~isnan(val) && ~isempty(name))

            % We store the file handlers in a structure, which we
            % need to create if it isn't ready yet
            if (~isfield(fids, name))
              fname = fullfile(fdir, [name '.csv']);
              fids.(name) = fopen(fname, 'a');

              if (fids.(name) > -1)
                files{end+1} = fname;
              else
                error(['Cannot create the proper CSV file at ' fname])
              end
            end

            % FIX: fail with an explicit message rather than an
            % undefined-variable error when no date preceded this probe
            if (isnan(curr_time))
              error('convert_xml: no date node found before the first probe');
            end

            % Actually write the data on disk
            fprintf(fids.(name), '%d,%f\n', curr_time, val);
          end

        % We store the date for proper ordering of the CSV
        case 'date'
          val = node.getTextContent();
          [curr_time, indx] = strptime(val, '%m/%d/%Y %H:%M:%S');

          % strptime consumed the whole string only on a full match
          if (indx > length(val))
            curr_time = mktime(curr_time);
          else
            error(['Cannot interpret the time format ' val]);
          end

        otherwise
          val = '';
      end
    end
  end

  % We close all the handlers
  fields = fieldnames(fids);
  for i=1:length(fields)
    fclose(fids.(fields{i}));
  end

  return;
end

% Here we extract the nodes into a cell matrix.
%
% DATA = NODE2CELL(NODES) returns an Nx2 cell {name, text} for every
% child of the NodeList NODES that is a named element (i.e. skipping
% '#text' and other '#'-prefixed DOM node types).
function data = node2cell(nodes)

  % We populate a {name, text} structure
  data = cell(0,2);
  for i=1:nodes.getLength()
    node = nodes.item(i-1);
    node.normalize();

    % Get the two fields
    name = node.getNodeName();
    val = node.getTextContent();

    % If there is something, store it
    if (~isempty(name) && name(1)~='#')
      data{end+1, 1} = name;
      data{end, 2} = val;
    end
  end

  return;
end

% ======================================================================
% File: helpers/parse_strains_db.m
% ======================================================================
% PARSE_STRAINS_DB extracts from a SciNote export CSV file the clones'
% history and relations.
%
% STRAINS = PARSE_STRAINS_DB(CSV_FILE) extracts STRAINS from CSV_FILE.
% STRAINS is a matrix with the following format:
% [ StrainID CloneID ParentSlide SlideID CreationDate DeathDate ]
%
% Blanchoud Group, UNIFR
% Simon Blanchoud
% 01/12/2020
function strains = parse_strains_db(fname)

  % Prepare the output
  strains = NaN(0,6);

  % FIX: data was undefined when fopen failed or the file was empty,
  % which crashed the size() call below
  data = cell(0,0);

  % Only process files
  if (isfile(fname))
    fid = fopen(fname, 'r');

    % Make sure we can read it
    if fid >= 0

      % Process the header line (FIX: guard against an empty file,
      % where fgetl returns -1 and strsplit would error out)
      line = fgetl(fid);
      if ischar(line)
        headers = strsplit(line, ',');

        % Prepare a cell matrix to store the CSV file
        data = cell(0, length(headers));

        % Process the CSV file
        line = fgetl(fid);
        while ischar(line)

          % In case there are ", then there is likely a problematic comma
          if (any(line == '"'))
            ids = find(line == '"');
            ids = reshape(ids, 2, []);

            % We simply replace the commas with ;
            for i=1:size(ids, 2)
              tmpstr = line(ids(1,i):ids(2,i));
              tmpstr(tmpstr==',') = ';';
              line(ids(1,i):ids(2,i)) = tmpstr;
            end

            % And we delete the "
            line(line == '"') = [];
          end

          % We simplify a bit the symbols used
          line(line=='.') = '/';
          line(line==':') = ';';

          % And we store the line of the CSV into a cell matrix
          data(end+1,:) = strsplit(line, ',', 'COLLAPSEDELIMITERS', false);

          % Continue the file
          line = fgetl(fid);
        end
      end

      % FIX: the file handle was never closed (handle leak)
      fclose(fid);
    end

    % Assign the variables
    nclones = size(data,1);
    nchars = 1;
    strains = NaN(nclones, 6);

    % Loop through the CSV table
    for i=1:nclones

      % Make sure this is a proper strain data
      % (FIX: guard empty names before indexing nstrain(1))
      nstrain = data{i,1};
      if (isempty(nstrain) || nstrain(1) ~= 'S')
        continue;
      end

      % Get the strain and clone IDs from the name
      % NOTE(review): assumes a fixed-width 'SxxxCyyy' naming scheme —
      % confirm against the SciNote export format
      strains(i,1) = str2double(nstrain(2:4));
      strains(i,2) = str2double(nstrain(6:8));

      % The parent ID
      parent = data{i, 3};
      strains(i,3) = str2double(parent);

      % The slide ID
      slide = data{i, 2};
      strains(i,4) = str2double(slide);

      % And the record creation date as initial creation date
      % (colons were replaced by ';' during the cleanup above)
      val = data{i, end};
      [curr_time, indx] = strptime(val, '%m/%d/%Y %H;%M');
      if (indx > length(val))
        strains(i, 5) = mktime(curr_time);
      end

      % Now we process the comments
      % Start by splitting them
      comments = strsplit(data{i,4}, ';');

      % We match the expression of interest once
      deads = regexpi(comments, '(dead)|(empty)|(sacrificed)|(removed)|(fixation)|(slipped)|(missing)|(died)|(death)');
      clone = regexpi(comments, 'subcloned');

      % Loop over each comment
      for j=1:length(comments)

        % Temporary variables in case of no match
        indx = NaN;
        num = NaN;

        % Loop over each word of the comment
        words = strsplit(comments{j});
        for k=1:length(words)

          % Word cleanup to maximize date matching
          w = words{k};
          if (~isempty(w) && w(1)=='(')
            w = w(2:end-1);
          end
          if (~isempty(w) && w(end)==')')
            w = w(1:end-1);
          end
          if (~isempty(w) && w(end)=='/')
            w = w(1:end-1);
          end

          % Short values might be number
          nchars = length(w);
          if (nchars<4)
            tmp = str2double(w);
            if (~isnan(tmp))
              num = tmp;
            end

          % Longer ones could be dates, in several formats
          elseif (nchars>7)
            [curr_time, indx] = strptime(w, '%d/%m/%y');
            if (indx <= nchars)
              [curr_time, indx] = strptime(w, '%d/%m/%Y');
              if (indx <= nchars)
                [curr_time, indx] = strptime(w, '%m/%d/%Y');
                if (indx <= nchars)
                  [curr_time, indx] = strptime(w, '%m/%d/%y');
                end
              end
            end
          end
        end

        % If we found a mention of colony death
        % (FIX: length(deads{j}>0) compared element-wise then took the
        % length, a no-op detour — ~isempty states the intent directly)
        if (~isempty(deads{j}))
          % Store the death date if any found
          if (indx > nchars)
            strains(i,6) = mktime(curr_time);
          end
          % And the slide number if any
          if (~isnan(num) && strains(i,4)==0)
            strains(i,4) = num;
          end

        % If it mentioned subcloning
        elseif (~isempty(clone{j}))
          % Store the date if it is anterior to the creation of the record
          % (FIX: mktime was called unconditionally, so a comment without
          % any parsed date hit a stale or undefined curr_time)
          if (indx > nchars)
            tmptime = mktime(curr_time);
            if (strains(i,5) > tmptime)
              strains(i,5) = tmptime;
            end
          end
        end
      end
    end
  end

  % Keep only the rows that were actually parsed, sorted by strain/clone
  strains = strains(~isnan(strains(:,1)),:);
  [junk, indx] = sortrows(strains(:,1:2));
  strains = strains(indx,:);

  return;
end