diff --git a/process_results/parse_proteomics.m b/process_results/parse_proteomics.m new file mode 100644 index 0000000..f98ca0a --- /dev/null +++ b/process_results/parse_proteomics.m @@ -0,0 +1,268 @@ +% PARSE_PROTEOMICS produces the plots necessary to analyze our proteomics data. +% +% Blanchoud Group, UNIFR +% Simon Blanchoud +% 18/05/2021 +function parse_proteomics + + % Location of the reference files + prot_file = '/home/blanchou/Documents/Projects/Techniques/MolecularBiology/Proteomics/proteinGroups_PP2101_normalized.csv'; + + % Gather the data of our manual measurements of water quality + fid = fopen(prot_file, 'r'); + if fid > 0 + line = fgetl(fid); + headers = strsplit(line, ','); + ncol = length(headers)-1; + data = NaN(5000, ncol); + ids = cell(5000, 1); + + % Parse the CSV manually + i = 1; + line = fgetl(fid); + while ischar(line) + columns = strsplit(line, ',', 'COLLAPSEDELIMITERS', false); + + ids{i} = columns{1}; + for j=1:ncol + if (~isempty(columns{j+1})) + data(i, j) = str2double(columns{j+1}); + end + end + + line = fgetl(fid); + i = i + 1; + end + + fclose(fid); + data = data(1:i-1, :); + ids = ids(1:i-1); + end + + nprots = data(:,1); + + confs = data(:,2:4); + conf = sum(confs, 2); + conf(conf==0) = 1; + + valids = (nprots == 1) & all(confs > 1, 2) & any(confs == 3, 2); + + dprots = hist(nprots, [1:5]); + dconfs = hist(confs, [0:3]); + dvals = hist(confs(valids,:), [0:3]); + + figure; + + subplot(2,2,1); + bar([1:5], dprots, 'facecolor', 'k'); + xt = get(gca, 'xticklabel'); + xt{end} = ['>' xt{end}]; + set(gca, 'yscale', 'log', 'xticklabel', xt); + title('Ambiguous peptide assignments') + xlabel('Number of possible proteins per peptide'); + ylabel('Peptide number'); + + subplot(2,2,2); + bar([0:3], dconfs); + legend(headers(3:5)) + + subplot(2,2,3); + bar([0:3], dconfs); + set(gca, 'yscale', 'log'); + y = ylim(); + title('Peptide detection over biological replica') + xlabel('Number of detections per peptide'); + ylabel('Peptide number'); + + subplot(2,2,4); + bar([0:3], dvals); + set(gca, 'yscale', 'log', 'ylim', y); + title('Peptide detection of most robust peptides') + xlabel('Number of detections per peptide'); + ylabel('Peptide number'); + + disp('N. proteins') + disp(dprots) + disp('N. peptides') + disp(dconfs) + disp('N. robust') + disp(dvals) + + keyboard + + data = data(:,5:end); + + figure; + subplot(2,2,1); + [up1,dn1,col] = volcano_plot(data(:,1), data(:,2), conf); + axis([-12 12 0 25]) + ylabel('-log_2 (p-value)') + xlabel('-log_2 (fold changes)') + title('Control - Takeover') + subplot(2,2,2); + [x,y]=meshgrid([1:3],[1:size(col,1)/3]); + scatter(x,y,[],col, 'filled'); + axis([-1 5 -1 11]) + xlabel('Groups') + ylabel('Confidence') + title('colorcode legend') + subplot(2,2,3); + [up2,dn2] = volcano_plot(data(:,3), data(:,4), conf); + axis([-12 12 0 25]) + ylabel('-log_2 (p-value)') + xlabel('-log_2 (fold changes)') + title('Control - WBR 3 dpa') + subplot(2,2,4); + [up3,dn3] = volcano_plot(data(:,5), data(:,6), conf); + axis([-12 12 0 25]) + ylabel('-log_2 (p-value)') + xlabel('-log_2 (fold changes)') + title('Takeover - WBR 3 dpa') + + keyboard + + figure; + subplot(2,2,1); + [vup1,vdn1,col] = volcano_plot(data(:,1), data(:,2), conf, valids); + axis([-12 12 0 25]) + ylabel('-log_2 (p-value)') + xlabel('-log_2 (fold changes)') + title('Control - Takeover') + subplot(2,2,2); + [x,y]=meshgrid([1:3],[1:size(col,1)/3]); + scatter(x,y,[],col, 'filled'); + axis([-1 5 -1 11]) + xlabel('Groups') + ylabel('Confidence') + title('colorcode legend') + subplot(2,2,3); + [vup2,vdn2] = volcano_plot(data(:,3), data(:,4), conf, valids); + axis([-12 12 0 25]) + ylabel('-log_2 (p-value)') + xlabel('-log_2 (fold changes)') + title('Control - WBR 3 dpa') + subplot(2,2,4); + [vup3,vdn3] = volcano_plot(data(:,5), data(:,6), conf, valids); + axis([-12 12 0 25]) + ylabel('-log_2 (p-value)') + xlabel('-log_2 (fold changes)') + title('Takeover - WBR 3 dpa') + + keyboard + + figure; + + vuu = vup2 & vup3; + vud = vup2 & vdn3; + vdu = vdn2 & vup3; + vdd = vdn2 & vdn3; + vun = vup2 & ~(vup3 | vdn3); + vdn = vdn2 & ~(vup3 | vdn3); + vnu = vup3 & ~(vup2 | vdn2); + vnd = vdn3 & ~(vup2 | vdn2); + vnn = valids & ~(vup2 | vdn2 | vup3 | vdn3); + + cpurp = brewermap(3,'Purples'); + corag = brewermap(3,'Oranges'); + + subplot(2,2,1); + hold on; + scatter(0.5*(data(vnu,3)+data(vnu,5)),0.5*(data(vnu,4)+data(vnu,6)),[],corag(2,:), 'filled'); + scatter(0.5*(data(vuu,3)+data(vuu,5)),0.5*(data(vuu,4)+data(vuu,6)),[],corag(3,:), 'filled'); + scatter(0.5*(data(vdn,3)+data(vdn,5)),0.5*(data(vdn,4)+data(vdn,6)),[],cpurp(2,:), 'filled'); + scatter(0.5*(data(vdd,3)+data(vdd,5)),0.5*(data(vdd,4)+data(vdd,6)),[],cpurp(3,:), 'filled'); + axis([-12 12 0 25]) + ylabel('-log_2 (p-value)') + xlabel('-log_2 (fold changes)') + title('Avg - WBR 3 dpa') + + subplot(2,2,2); + scatter([2 2 1 1],[1 2 1 2],[],[corag(2:3,:); cpurp(2:3,:)], 'filled'); + axis([0 3 0 3]) + xlabel('Down - Up') + ylabel('Not significant - significant') + title('colorcode legend') + + subplot(2,2,3); + hold on; + scatter(data(vnu,6),data(vnu,4),[],corag(2,:), 'filled'); + scatter(data(vuu,6),data(vuu,4),[],corag(3,:), 'filled'); + scatter(data(vdn,6),data(vdn,4),[],cpurp(2,:), 'filled'); + scatter(data(vdd,6),data(vdd,4),[],cpurp(3,:), 'filled'); + axis([0 25 0 25]) + xlabel('Takeover') + ylabel('Control') + title('p-value (-log_2)') + + subplot(2,2,4); + hold on; + scatter(data(vnu,5),data(vnu,3),[],corag(2,:), 'filled'); + scatter(data(vuu,5),data(vuu,3),[],corag(3,:), 'filled'); + scatter(data(vdn,5),data(vdn,3),[],cpurp(2,:), 'filled'); + scatter(data(vdd,5),data(vdd,3),[],cpurp(3,:), 'filled'); + axis([-12 12 -12 12]) + xlabel('Takeover') + ylabel('Control') + title('Fold change (-log_2)') + + disp([sum(vuu) sum(vud) sum(vun); sum(vdu) sum(vdd) sum(vdn); sum(vnu) sum(vnd) sum(vnn)]) + + keyboard + + export_ids('./ups_ids.txt', ids(vuu | vnu), data(vuu | vnu, 3:6)); + export_ids('./downs_ids.txt', ids(vdn | vdd), data(vdn | vdd, 3:6)); + dlmwrite('./is_significant.csv', [vdd vdn vnu vuu]); + + keyboard + + return; +end + +function export_ids(fname, ids, params) + + nparams = size(params, 2)/2; + + fid = fopen(fname, 'w'); + for i=1:length(ids) + fprintf(fid, '%s', ids{i}); + + for j=1:nparams + fprintf(fid, ' (%f, %f)', params(i, 2*j-1), params(i, 2*j)); + end + + fprintf(fid, '\n'); + end + fclose(fid); + + return; +end + +function [ups, dwn, colors] = volcano_plot(fold, pval, conf, valids) + + if (nargin<4) + valids = ones(size(conf)); + end + + [indxs, junk, map] = unique(conf); + nindx = length(indxs); + + cgray = brewermap(nindx+1,'Greys'); + creds = brewermap(nindx+1,'Reds'); + cblue = brewermap(nindx+1,'Blues'); + + colors = [cgray(2:end,:); creds(2:end,:); cblue(2:end,:)]; + + ups = (fold>1 & pval>-log2(0.05)) & valids; + dwn = (fold<-1 & pval>-log2(0.05)) & valids; + + cindx = map + nindx*ups + 2*nindx*dwn; + + hold on; + for i=1:3*nindx + curr = (cindx == i) & valids; + scatter(fold(curr), pval(curr), [], colors(i,:), 'filled'); + end + hold off; + + return; +end diff --git a/scripts/fasta_filter.py b/scripts/fasta_filter.py new file mode 100755 index 0000000..78378b9 --- /dev/null +++ b/scripts/fasta_filter.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +from __future__ import print_function +import sys +import os +import re + +if __name__=='__main__': + helpmess="""Usage: +fasta_filter seq_list fasta_file [-invert] + +Filters out the sequences in seq_list from fasta_file. +If -invert is specified, filters out the sequences NOT included. +""" + # Inputs + if len(sys.argv)<3: + print(helpmess) + sys.exit(0) + else: + infile=os.path.realpath(sys.argv[1]) + fasta=os.path.realpath(sys.argv[2]) + + # Output folder + out_folder='filtered' + if os.path.exists(out_folder)==False: + os.system('mkdir %s' % out_folder) + + # Get already sone sequences + dones = dict() + with open(infile) as f: + for line in f: + val = line.strip() + dones[val.split()[0]] = val + + # Default name for the file + basedir,tmpfile=os.path.split(fasta) + poutname=os.path.join(out_folder, 'inc_'+tmpfile) + noutname=os.path.join(out_folder, 'mis_'+tmpfile) + + # Copy the content of the file + with open(poutname, 'w') as p, open(noutname, 'w') as n, open(fasta) as f: + seq=False + for line in f: + if line[0]=='>': + val = line[1:].strip() + seq = (val in dones) + if seq: + p.write('>{}\n'.format(dones[val])) + else: + n.write(line) + else: + if seq: + p.write(line) + else: + n.write(line)