diff --git a/helpers/gather_apex_data.m b/helpers/gather_apex_data.m
index 65c464c..6651760 100644
--- a/helpers/gather_apex_data.m
+++ b/helpers/gather_apex_data.m
@@ -1,193 +1,194 @@
 % GATHER_APEX_DATA gathers all the data from APEX compressed archives
-% into one single CSV file that can be loaded by Octave.
+% into individual CSV files that can be loaded by Octave.
 %
 % Blanchoud Group, UNIFR
 % Simon Blanchoud
 % 29/11/2020
 function csv_files = gather_apex_data(fnames)
   % load the required packages
   pkg load io

   % Format the input as a list of cells
   if ~iscell(fnames)
     if (isfile(fnames))
       fnames = {fnames};
     elseif (isfolder(fnames))
       fnames = glob(fullfile(fnames, '*.tgz'));
     end
   end

   % Prepare the temporary folder
   tmpdir = fullfile(pwd, 'TmpData');
   if (length(glob(fullfile(tmpdir, '*.csv')))>0)
     delete(fullfile(tmpdir, '*.csv'));
   end

   % Prepare the output files
   csv_files = {};

   % Loop through all files to gather
   for i=1:length(fnames)
     % Extract the actual files
     files = unpack(fnames{i}, tmpdir, 'tgz');

     % Loop through those files
     for j=1:length(files)
       xml = fullfile(tmpdir, files{j});

       % Make sure this is an actual file
       if isfile(xml)
         % Just try loading it as an XML file, ignore other types
         try
           dom = xmlread(xml);
         catch ME
+          disp(ME.message)
           continue;
         end

         % Get the content of the file
         content = dom.getDocumentElement();
         content.normalize();

         % Check which type of file this is
         type = content.getNodeName();
         switch type
           case 'datalog'
             node = 'record';
             target = 'probe';

           % Currently we only parse the datalog
           otherwise
             node = '';
             target = '';
         end

         % Get the nodes that are actually useful
         nodes = content.getElementsByTagName(node);

         % And convert them to CSV
         new_files = convert_xml(nodes, target, tmpdir);
         csv_files = [csv_files; new_files(:)];
       end
     end

     % Delete the temporary files
     [dname, fname, ext] = fileparts(fnames{i});
     delete(fullfile(tmpdir, fname, '*'));
     rmdir(fullfile(tmpdir, fname));
   end

   % Remove duplicates
   csv_files = unique(csv_files);

   return;
 end

 % Here we physically copy the XML data into a CSV file
 function files = convert_xml(xml, target, fdir)
   % Some handlers for the files to be written
   files = {};
   fids = struct();

   % We loop over all the nodes in the XML
   for i=1:xml.getLength()
     item = xml.item(i-1);

     % We loop over all the attributes of each node
     for j=1:item.getLength()
       node = item.item(j-1);

       % We check which type of node this is
       name = node.getNodeName();
       switch name
         % We extract the data from the target type
         case target
           % We get all the data from the child nodes
           content = node2cell(node.getChildNodes());

           % We loop through all the cells and copy the proper data
           name = '';
           type = '';
           val = NaN;
           for k=1:size(content, 1)
             switch content{k,1}
               case 'name'
                 name = content{k,2};
               case 'type'
                 type = content{k,2};
               case 'value'
                 val = str2double(content{k,2});
             end
           end

           % If we got all the data we need, then we write it
           if (~isnan(val) && ~isempty(name))
             % We store the file handlers in a structure, which we
             % need to create if it isn't ready yet
             if (~isfield(fids, name))
               fname = fullfile(fdir, [name '.csv']);
               fids.(name) = fopen(fname, 'a');
               if (fids.(name) > -1)
                 files{end+1} = fname;
               else
                 error(['Cannot create the proper CSV file at ' fname])
               end
             end

             % Actually write the data on disk
             fprintf(fids.(name), '%d,%f\n', curr_time, val);
           end

         % We store the date for proper ordering of the CSV
         case 'date'
           val = node.getTextContent();
           [curr_time, indx] = strptime(val, '%m/%d/%Y %H:%M:%S');
           if (indx > length(val))
             curr_time = mktime(curr_time);
           else
             error(['Cannot interpret the time format ' val]);
           end
         otherwise
           val = '';
       end
     end
   end

   % We close all the handlers
   fields = fieldnames(fids);
   for i=1:length(fields)
     fclose(fids.(fields{i}));
   end

   return;
 end

 % Here we extract the nodes into a cell matrix
 function data = node2cell(nodes)
   % We populate a {name, text} structure
   data = cell(0,2);
   for i=1:nodes.getLength()
     node = nodes.item(i-1);
     node.normalize();

     % Get the two fields
     name = node.getNodeName();
     val = node.getTextContent();

     % If there is something, store it
     if (~isempty(name) && name(1)~='#')
       data{end+1, 1} = name;
       data{end, 2} = val;
     end
   end

   return;
 end
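Each probe found in the datalog's record nodes ends up in its own <name>.csv under TmpData, one "epoch,value" row per record (the '%d,%f' format written above). A minimal sketch of consuming one of these files, assuming a hypothetical probe named Tmp_pH; rows are appended in archive order, so they need sorting:

  data = csvread(fullfile('TmpData', 'Tmp_pH.csv'));     % columns: [epoch seconds, probe value]
  [t, order] = sort(data(:,1));                          % chronological order
  plot(t, data(order,2));
  disp(strftime('%d/%m/%Y %H:%M:%S', localtime(t(1))));  % first record, human-readable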
diff --git a/helpers/parse_strains_db.m b/helpers/parse_strains_db.m
index d88cb32..8f717c7 100644
--- a/helpers/parse_strains_db.m
+++ b/helpers/parse_strains_db.m
@@ -1,178 +1,211 @@
 % PARSE_STRAINS_DB extracts from a SciNote export CSV file the clones'
 % history and relations.
 %
 % STRAINS = PARSE_STRAINS_DB(CSV_FILE) extracts STRAINS from CSV_FILE.
 % STRAINS is a matrix with the following format:
 % [ StrainID CloneID ParentSlide SlideID CreationDate DeathDate ]
 %
 % Blanchoud Group, UNIFR
 % Simon Blanchoud
 % 01/12/2020
 function strains = parse_strains_db(fname)
   % Prepare the output
   strains = NaN(0,6);

   % Only process files
   if (isfile(fname))
     fid = fopen(fname, 'r');

     % Make sure we can read it
     if fid >= 0
       % Process the header line
       line = fgetl(fid);
       headers = strsplit(line, ',');

       % Prepare a cell matrix to store the CSV file
       data = cell(0, length(headers));

       % Process the CSV file
       line = fgetl(fid);
       while ischar(line)
         % In case there are \", then there is likely a problematic comma
         if (any(line == '"'))
           ids = find(line == '"');
           ids = reshape(ids, 2, []);

           % We simply replace the commas with \;
           for i=1:size(ids, 2)
             tmpstr = line(ids(1,i):ids(2,i));
             tmpstr(tmpstr==',') = ';';
             line(ids(1,i):ids(2,i)) = tmpstr;
           end

           % And we delete the \"
           line(line == '"') = [];
         end

         % We simplify a bit the symbols used
         line(line=='.') = '/';
         line(line==':') = ';';

         % And we store the line of the CSV into a cell matrix
         data(end+1,:) = strsplit(line, ',', 'COLLAPSEDELIMITERS', false);

         % Continue the file
         line = fgetl(fid);
       end
     end

     % Assign the variables
     nclones = size(data,1);
     nchars = 1;
     strains = NaN(nclones, 6);

     % Loop through the CSV table
     for i=1:nclones
       % Make sure this is a proper strain data
       nstrain = data{i,1};
       if (nstrain(1) ~= 'S')
         continue;
       end

       % Get the strain and clone IDs from the name
       strains(i,1) = str2double(nstrain(2:4));
       strains(i,2) = str2double(nstrain(6:8));

       % The parent ID
       parent = data{i, 3};
       strains(i,3) = str2double(parent);

       % The slide ID
       slide = data{i, 2};
       strains(i,4) = str2double(slide);

       % And the record creation date as initial creation date
       val = data{i, end};
       [curr_time, indx] = strptime(val, '%m/%d/%Y %H;%M');
       if (indx > length(val))
         strains(i, 5) = mktime(curr_time);
       end

       % Now we process the comments
       % Start by splitting them
       comments = strsplit(data{i,4}, ';');

       % We match the expression of interest once
-      deads = regexpi(comments, '(dead)|(empty)|(sacrificed)|(removed)|(fixation)|(slipped)|(missing)|(died)|(death)');
-      clone = regexpi(comments, 'subcloned');
+      deads = regexpi(comments, '(dead)|(empty)|(sacrificed)|(removed)|(fixation)|(slipped)|(missing)|(died)|(death)|(virtual)|(isolated)|(deattached)');
+      clone = regexpi(comments, '(subcloned)|(sub.cloned)|(positioned)|(repositioned)');

       % Loop over each comment
       for j=1:length(comments)
         % Temporary variables in case of no match
         indx = NaN;
         num = NaN;
+        is_date = false;

         % Loop over each word of the comment
         words = strsplit(comments{j});
         for k=1:length(words)
           % Word cleanup to maximize date matching
           w = words{k};
           if (~isempty(w) && w(1)=='(')
             w = w(2:end-1);
           end
           if (~isempty(w) && w(end)==')')
             w = w(1:end-1);
           end
           if (~isempty(w) && w(end)=='/')
             w = w(1:end-1);
           end

           % Short values might be numbers
           nchars = length(w);
           if (nchars<4)
             tmp = str2double(w);
             if (~isnan(tmp))
               num = tmp;
             end

           % Longer ones could be dates, in several formats
           elseif (nchars>7)
             [curr_time, indx] = strptime(w, '%d/%m/%y');
             if (indx <= nchars)
               [curr_time, indx] = strptime(w, '%d/%m/%Y');
               if (indx <= nchars)
                 [curr_time, indx] = strptime(w, '%m/%d/%Y');
                 if (indx <= nchars)
                   [curr_time, indx] = strptime(w, '%m/%d/%y');
                 end
               end
             end
+            is_date = (indx > nchars);
           end
         end

         % If we found a mention of colony death
         if (length(deads{j})>0)
           % Store the death date if any found
-          if (indx > nchars)
+          tmptime = mktime(curr_time);
+          if (is_date && ~(strains(i,6) > tmptime))
             strains(i,6) = mktime(curr_time);
           end

           % And the slide number if any
           if (~isnan(num) && strains(i,4)==0)
             strains(i,4) = -num;
           end

         % If it mentioned subcloning
         elseif (length(clone{j})>0)
           % Store the date if it is anterior to the creation of the record
           tmptime = mktime(curr_time);
-          if (indx > nchars && strains(i,5) > tmptime)
+          if (is_date && strains(i,5) > tmptime)
             strains(i,5) = tmptime;
           end
         end
       end
+
+      if (strains(i,1)==131 && strains(i,2)==-1)
+        disp(data(i,:));
+        if (isnan(strains(i,6)))
+          disp(['-> ' strftime('%d/%m/%y', localtime(strains(i,5))) ' : NaN']);
+        else
+          disp(['-> ' strftime('%d/%m/%y', localtime(strains(i,5))) ' : ' strftime('%d/%m/%y', localtime(strains(i,6)))]);
+        end
+        keyboard
+      end
     end
   end

   % Clean and sort the output
   strains = strains(~isnan(strains(:,1)),:);
   [junk, indx] = sortrows(strains(:,1:2));
   strains = strains(indx,:);

+  % Try to infer a minimal death date based on subcloning information
+  unknown = (strains(:,4)<0 & isnan(strains(:,6)));
+  ids = abs(strains(unknown, [1 4 5]));
+  subclones = strains(~isnan(strains(:,3)), [1 3 5]);
+
+  % Loop over each missing value and check if some clones were subcloned from it
+  alive = NaN(size(ids, 1), 1);
+  for i=1:numel(alive)
+    hits = (subclones(:,1)==ids(i,1) & subclones(:,2)==ids(i,2));
+    tmax = max(subclones(hits, 3));
+
+    % Make sure we get something valid
+    if (~isempty(tmax) && ids(i,3) < tmax)
+      alive(i) = tmax;
+    end
+  end
+
+  % Copy the results
+  strains(unknown, 6) = alive;

   return;
 end
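Throughout these helpers, date parsing relies on Octave's strptime returning both a broken-down time structure and the index one past the last matched character: a word is a valid date in a given format only when that index exceeds the string length, which is exactly what the new is_date flag above captures. A minimal sketch of the convention (the date literal is illustrative):

  str = '03/12/2020';
  [tm, indx] = strptime(str, '%d/%m/%Y');
  if (indx > length(str))     % the whole string matched the format
    epoch = mktime(tm);       % epoch seconds, as stored in strains(:,5:6)
  else
    epoch = NaN;              % partial or failed match: not a date in this format
  end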
diff --git a/process_results/parse_husbandry.m b/process_results/parse_husbandry.m
index 029b033..5e3713c 100644
--- a/process_results/parse_husbandry.m
+++ b/process_results/parse_husbandry.m
@@ -1,150 +1,213 @@
 % PARSE_HUSBANDRY produces the plots necessary to analyze our husbandry setup.
 %
 % Blanchoud Group, UNIFR
 % Simon Blanchoud
 % 01/12/2020
 function parse_husbandry(recompute)
+  % Check if we need to recompute everything
   if (nargin==0)
     recompute = false;
   else
     recompute = logical(recompute);
   end

-  db_file = '/home/blanchou/Documents/Projects/Culture/Long-term/export_2020-11-27_14-55-07_UTC.csv';
-  water = '/home/blanchou/Documents/Projects/Culture/Long-term/Water_quality.csv';
+  % Location of the reference files
+  %db_file = '/home/blanchou/Documents/Projects/Culture/Long-term/export_2020-11-27_14-55-07_UTC.csv';
+  db_file = '/home/blanchou/Documents/Manuscripts/Tunicate_husbandry/data/export_strains.csv';
+  %water = '/home/blanchou/Documents/Projects/Culture/Long-term/Water_quality.csv';
+  water = '/home/blanchou/Documents/Manuscripts/Tunicate_husbandry/data/Water_quality.csv';

   apex_dir = '/home/blanchou/Code/Apex/Apex/';
   tmpfiles = glob('/home/blanchou/Code/TmpData/*.csv');
+  strain_id = 131;

+  % Either recompute or gather the existing files
   if (recompute || length(tmpfiles) == 0)
     disp('Recomputing Apex data...')
     files = gather_apex_data(apex_dir);
     disp('done!')
   else
     files = tmpfiles;
   end

+  % Parse the data from Apex and dispatch it on several panels
   hfig = figure;hold on;
   nsensors = length(files);
   sensor_label = cell(nsensors, 1);
   plot_indx = [1 1 2 3 3 4 4 4 4 4 5];
+
+  % Read each file, extract its name and plot it accordingly
   for i=1:nsensors
     data = csvread(files{i});
     [x, indxs] = sort(data(:,1));
     [dir, sensor_label{i}, ext] = fileparts(files{i});

     subplot(2,3, plot_indx(i)); hold on;
     plot(x, data(indxs,2));
   end
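The hard-coded plot_indx vector dispatches the sensor files (in the alphabetical order returned by glob) onto the six panels of the 2x3 figure; the legends set further down regroup the labels per panel with the same mask. A small sketch of the mapping, with hypothetical sensor names:

  files = {'Amp.csv', 'Cond.csv', 'ORP.csv'};   % hypothetical names
  plot_indx = [1 1 2];                          % Amp and Cond share panel 1
  figure;
  for i=1:numel(files)
    subplot(2,3, plot_indx(i)); hold on;
    plot(1:10, i*ones(1,10));                   % stand-in for the sensor curve
  end
  subplot(2,3,1);
  legend({'Amp', 'Cond'});                      % what labels(plot_indxs==1) selects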
-  fid = fopen(water, 'r');
-  if fid > 0
-    line = fgetl(fid);
-    headers = strsplit(line, ',');
+  % Gather the data of our manual measurements of water quality
+  fid = fopen(water, 'r');
+  if fid > 0
+    line = fgetl(fid);
+    headers = strsplit(line, ',');
     data = NaN(500, length(headers));

+    % Parse the CSV manually
     i = 1;
-    line = fgetl(fid);
+    line = fgetl(fid);
     while ischar(line)
-      columns = strsplit(line, ',', 'COLLAPSEDELIMITERS', false);
+      columns = strsplit(line, ',', 'COLLAPSEDELIMITERS', false);
       for j=1:length(columns)
         if (~isempty(columns{j}))
           val = columns{j};
           if (j == 1)
             [curr_time, indx] = strptime(val, '%d/%m/%Y');
             if (indx > length(val))
               data(i, j) = mktime(curr_time);
             end
           else
             data(i, j) = str2double(val);
           end
         end
       end
-
-      line = fgetl(fid);
+      line = fgetl(fid);
       i = i + 1;
     end
     fclose(fid);
     data = data(1:i-1, :);
   end

+  % Plot the various columns on top of the Apex data
   plot_indx2 = [6 6 6 1 3 5 6];
   for i=2:size(data,2)
     subplot(2,3, plot_indx2(i-1)); hold on;
     plot(data(:,1), data(:,i));
   end

+  % Set the axis, legends and labels of all panels
   labels = [sensor_label; headers(2:end).'];
   plot_indxs = [plot_indx plot_indx2];
   limx = xlim;
   limy = [0 300; 0 600; 0 50; 0 30; 7 9; 0 6];

+  % Set the limits and legends
   for i=1:6
     subplot(2,3,i);
     xlim(limx);
     ylim(limy(i,:));
     legend(labels(plot_indxs==i));
   end

+  % Format the axis into readable format
   t = get(gca, 'XTick');
-  tdate = cell(size(t));
   for i=1:numel(t)
-    tdate{i} = strftime('%m/%y', gmtime(t(i)));
+    tdate{i} = strftime('%m/%y', localtime(t(i)));
+  end
+  for i=1:6
+    hax = subplot(2,3,i);
+    set(hax, 'XTickLabel', tdate);
+    xlabel('date (mm/yy)');
   end
-  set(gca, 'XTickLabel', tdate);
-  xlabel('date (mm/yy)');
-
+  % Parse the strains CSV file
   strains = parse_strains_db(db_file);

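parse_strains_db returns one row per clone, [StrainID CloneID ParentSlide SlideID CreationDate DeathDate], with dates as epoch seconds, a negative SlideID when the slide number had to be recovered from a death-related comment, and NaN where nothing could be parsed. A sketch of typical queries on this matrix (strain 131 is the one singled out as strain_id above):

  strains = parse_strains_db(db_file);
  sel = strains(:,1)==131;                     % all clones of strain 131
  open_ended = sel & isnan(strains(:,6));      % clones with no recovered death date
  ages_days = (mktime(localtime(time())) - strains(open_ended,5)) / (60*60*24);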
+  % Estimate the time range of our data
   today = mktime(localtime(time()));
   month = 60*60*24*365.25/12;
-  start = min(strains(:,5));
+  [start, nindx] = min(strains(:,5));
   nmonths = ceil((today - start) / month);
+  if (nmonths>200)
+    disp(strains(nindx,:))
+    error(['Some date issue with entry ' num2str(nindx) ' !']);
+  end
   nstrains = max(strains(:,1));

+  % Bin the data to count the number of clones per strain over time
+  populations = zeros(nmonths, nstrains);
   death = strains(:,6);
   death(strains(:,4) > 0) = today;
-  lifespan = max(ceil((death - strains(:,5))/month), 1);
+  % Count one "bin" for unknown death times
+  lifespan = max(ceil((death - strains(:,5))/month), 1);
   ticks = (start-1)+[0:nmonths]*month;
-  populations = zeros(nmonths, nstrains);

+  % Loop over each clone to add its lifespan over the count of its strain
   for i=1:size(strains, 1)
     id = strains(i, 1);
     new = strains(i, 5);

+    % Add one count to the proper strain over the lifespan of the clone
     indx = find(new>ticks(1:end-1) & new<=ticks(2:end));
     populations(indx+[0:lifespan(i)-1],id) = populations(indx+[0:lifespan(i)-1],id) + 1;
   end

+  % Don't show ongoing periods
+  populations = populations(1:end-1, :);
+  ticks = ticks(1:end-2);

-  figure;bar(populations, 'stacked')
-  hold on
-  plot([0 nmonths+1], [128 128], 'k')
+  % Display the statistics using stacked bars
+  figure;bar(populations, 'stacked');
+  hold on;
+  plot([0 nmonths], [128 128], 'k');
+  xlim([0 nmonths]);

+  % Update the axes, labels and strain index
   pdate = cell(size(ticks));
   for i=1:2:numel(ticks)
-    pdate{i} = strftime('%m/%y', gmtime(ticks(i)));
+    pdate{i} = strftime('%m/%y', localtime(ticks(i)));
   end
   set(gca, 'XTick', [1:length(ticks)], 'XTickLabel', pdate);
   xlabel('date (mm/yy)');
   ylabel('clone number');
   title('Husbandry population')
-
   hbar = colorbar();
   t = get(hbar, 'YTick');
   set(hbar, 'YTickLabel', cellstr(num2str(round(t.'*(nstrains-1))+1)))

+  % Parse one single strain (strain_id) to show its lineage
+  clones = strains(strains(:,1)==strain_id, :);
+
+  % Infer death dates (today for living clones)
+  missings = isnan(clones(:,end));
+  clones(missings, end) = clones(missings, end-1)+month;
+  clones(missings & clones(:,4)>0, end) = today;
+
+  % Plot the lifespan of each clone, and link it to its parent clone
+  figure;hold on;
+  for i=1:size(clones, 1)
+    plot(clones(i,[end-1 end]), clones(i, [2 2]), 'r', 'LineWidth', 2);
+    mom = find(abs(clones(1:i-1, 4))==clones(i,3), 1, 'last');
+    if (~isempty(mom))
+      plot([clones(i, end-1)-month/12 clones(i, end-1)], [clones(mom, 2) clones(i,2)], 'k');
+
+    % Here we manually fix a few links to unknown slides
+    else
+      if clones(i, 3) == 31
+        plot([clones(i, end-1)-month/12 clones(i, end-1)], [37 clones(i,2)], 'k');
+      elseif clones(i, 3) == 96
+        plot([clones(i, end-1)-month/12 clones(i, end-1)], [126 clones(i,2)], 'k');
+      elseif ~isnan(clones(i, 3))
+        disp(clones(i,[1 3]))
+      end
+    end
+  end
+
+  % Update the axes, labels and strain index
+  xval = get(gca, 'XTick');
+  xdate = cell(size(xval));
+  for i=1:numel(xval)
+    xdate{i} = strftime('%m/%y', localtime(xval(i)));
+  end
+  set(gca, 'XTickLabel', xdate);
+  xlabel('date (mm/yy)');
+  ylabel('clone ID');

   keyboard
   return;
 end
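The population plot accumulates, per strain and per average month (365.25/12 days), one count for every clone alive in that bin: a clone created in bin b with a lifespan of m months increments bins b..b+m-1 of its strain's column, and clones still on a slide are counted up to today. A minimal self-contained example of the binning step used above:

  % 3 monthly bins, one strain; a clone created in bin 1 that lived 2 months
  % contributes to bins 1 and 2 only.
  nmonths = 3;
  populations = zeros(nmonths, 1);
  b = 1; lifespan = 2; id = 1;
  populations(b+[0:lifespan-1], id) = populations(b+[0:lifespan-1], id) + 1;
  disp(populations.')   % -> 1 1 0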