diff --git a/scripts/10xgenomics_PBMC_5k/analysis_peaks.sh b/scripts/10xgenomics_PBMC_5k/analysis_peaks.sh deleted file mode 100755 index 2c95a12..0000000 --- a/scripts/10xgenomics_PBMC_5k/analysis_peaks.sh +++ /dev/null @@ -1,68 +0,0 @@ -# some paths -## directories -results_dir='results/10xgenomics_PBMC_5k' -data_dir='data' -read_dir="$data_dir/10xgenomics_PBMC_5k" -seq_dir="$data_dir/genomes" -## input -file_bed=$read_dir'/atac_v1_pbmc_5k_peaks.bed' -file_bed_rmsk=$read_dir'/atac_v1_pbmc_5k_peaks_rmsk.bed' -file_bam_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" -file_bai_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" -file_bam_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam" -file_bai_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai" -file_hg19="$seq_dir/hg19.fasta" -file_rmsk="$seq_dir/hg19_rmsk.bed" - -mkdir -p $results_dir - -# repeat mask -# remove any peak that has at least 50% of its length overlapping a repeated region (its -# center is inside the region, this is somewhat equivalent to what is done on ccg webinterface -# when checking the repeatMask on option) -bin/bedtools/subtractBed -f 0.5 -A -a data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed -b data/genomes/hg19_rmsk.bed > $file_bed_rmsk - - -# sampled from bed -file_bed_rmsk_2=$read_dir'/atac_v1_pbmc_5k_peaks_rmsk_sampled.bed' -shuf $file_bed_rmsk | head -n 10000 > $file_bed_rmsk_2 - -# matrix creation -## 1kb sequences -file_mat_seq_1kb_1="$results_dir/peaks_rmsk_sequences_1kb.mat" -file_mat_seq_1kb_2="$results_dir/peaks_rmsk_sampled_sequences_1kb.mat" -bin/SequenceMatrixCreator --bed $file_bed_rmsk --fasta $file_hg19 --from -500 --to 500 > $file_mat_seq_1kb_1 -bin/SequenceMatrixCreator --bed $file_bed_rmsk_2 --fasta $file_hg19 --from -500 --to 500 > $file_mat_seq_1kb_2 -## 2kb sequences -file_mat_seq_2kb_1="$results_dir/peaks_rmsk_sequences_2kb.mat" 
-file_mat_seq_2kb_2="$results_dir/peaks_rmsk_sampled_sequences_2kb.mat" -bin/SequenceMatrixCreator --bed $file_bed_rmsk --fasta $file_hg19 --from -1000 --to 1000 > $file_mat_seq_2kb_1 -bin/SequenceMatrixCreator --bed $file_bed_rmsk_2 --fasta $file_hg19 --from -1000 --to 1000 > $file_mat_seq_2kb_2 - -## open chromatin around peaks -for method in 'read_atac' -do - file_mat_open_1kb_1="$results_dir/peaks_rmsk_open_bin1bp_1kb_$method.mat" - file_mat_open_1kb_2="$results_dir/peaks_rmsk_sampled_open_bin1bp_1kb_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_open --bai $file_bai_open --from -500 --to 500 --binSize 1 --method $method > $file_mat_open_1kb_1 - bin/CorrelationMatrixCreator --bed $file_bed_rmsk_2 --bam $file_bam_open --bai $file_bai_open --from -500 --to 500 --binSize 1 --method $method > $file_mat_open_1kb_2 - file_mat_open_2kb_1="$results_dir/peaks_rmsk_open_bin1bp_2kb_$method.mat" - file_mat_open_2kb_2="$results_dir/peaks_rmsk_sampled_open_bin1bp_2kb_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_open_2kb_1 - bin/CorrelationMatrixCreator --bed $file_bed_rmsk_2 --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_open_2kb_2 -done - -## all nucleosomes around peaks -for method in 'fragment_center' -do - file_mat_nucl_1kb_1="$results_dir/peaks_rmsk_nucleosomes_bin1bp_1kb_$method.mat" - file_mat_nucl_1kb_2="$results_dir/peaks_rmsk_sampled_nucleosomes_bin1bp_1kb_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_nucl --bai $file_bai_nucl --from -500 --to 500 --binSize 1 --method $method > $file_mat_nucl_1kb_1 - bin/CorrelationMatrixCreator --bed $file_bed_rmsk_2 --bam $file_bam_nucl --bai $file_bai_nucl --from -500 --to 500 --binSize 1 --method $method > $file_mat_nucl_1kb_2 - 
file_mat_nucl_2kb_1="$results_dir/peaks_rmsk_nucleosomes_bin1bp_2kb_$method.mat" - file_mat_nucl_2kb_2="$results_dir/peaks_rmsk_sampled_nucleosomes_bin1bp_2kb_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_nucl_2kb_1 - bin/CorrelationMatrixCreator --bed $file_bed_rmsk_2 --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_nucl_2kb_2 -done - - diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.R similarity index 100% rename from scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.R diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.sh similarity index 97% rename from scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.sh index e135353..32860f2 100755 --- a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.sh @@ -1,90 +1,89 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k' -data_dir='data' -read_dir="$data_dir/10xgenomics_PBMC_5k" -seq_dir="$data_dir/genomes" +results_dir='data/10xgenomics_PBMC_5k_motifs' +read_dir="data/10xgenomics_PBMC_5k" +seq_dir="data/genomes" ## input1 file_bed=$read_dir'/ctcf_motifs_10e-6.bed' file_bam_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" file_bai_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" file_bam_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam" file_bai_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai" file_bam_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam" 
file_bai_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai" file_bam_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam" file_bai_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai" file_bam_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam" file_bai_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai" file_hg19="$seq_dir/hg19.fasta" mkdir -p $results_dir # matrix creation ## sequences file_mat_seq="$results_dir/ctcf_motifs_10e-6_sequences.mat" bin/SequenceMatrixCreator --bed $file_bed --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq ## open chromatin around CTCF motif for method in 'read' 'read_atac' 'fragment' do file_mat_open_1="$results_dir/ctcf_motifs_10e-6_open_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1 file_mat_open_2="$results_dir/ctcf_motifs_10e-6_open_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2 file_mat_open_10="$results_dir/ctcf_motifs_10e-6_open_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10 done ## mono around CTCF motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_1nucl_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_1nucl_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > 
$file_mat_1nucl_2 file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_1nucl_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 done ## di nucleosomes around CTCF motif for method in 'read' 'fragment' 'fragment_center' do ### di nucleosomes file_mat_2nucl_1="$results_dir/ctcf_motifs_10e-6_2nucl_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1 file_mat_2nucl_2="$results_dir/ctcf_motifs_10e-6_2nucl_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2 file_mat_2nucl_10="$results_dir/ctcf_motifs_10e-6_2nucl_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10 done ## mono nucleosomes from processed di-nucleosome data around CTCF motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 done ## all nucleosomes 
around CTCF motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_nucl_1="$results_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_nucl_1 file_mat_nucl_2="$results_dir/ctcf_motifs_10e-6_nucleosomes_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_nucl_2 file_mat_nucl_10="$results_dir/ctcf_motifs_10e-6_nucleosomes_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_nucl_10 done diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ebf1_motif.R similarity index 100% rename from scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs/analysis_ebf1_motif.R diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ebf1_motif.sh similarity index 100% rename from scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs/analysis_ebf1_motif.sh diff --git a/scripts/10xgenomics_PBMC_5k/analysis_myc_motif.R b/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.R similarity index 100% rename from scripts/10xgenomics_PBMC_5k/analysis_myc_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.R diff --git a/scripts/10xgenomics_PBMC_5k/analysis_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.sh similarity index 100% rename from scripts/10xgenomics_PBMC_5k/analysis_myc_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.sh diff --git a/scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.R 
b/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.R similarity index 100% rename from scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.R diff --git a/scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.sh similarity index 100% rename from scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.sh diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.R index 8df2587..9a0ddf8 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.R @@ -1,172 +1,172 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## open chromatin patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", 
"10xgenomics_PBMC_5k_classification_1", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 
plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } ################## nucleosomes chromatin patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_open_read_atac_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_classification_1nucl_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, 
decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh similarity index 89% rename from scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh index da45b83..f5a0b24 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh +++ 
b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh @@ -1,52 +1,52 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_1' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=8 +n_core=28 # open chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done # 1nucl chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) 
file_prob=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.R index 96841cf..41a905f 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.R @@ -1,96 +1,96 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = 
file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ebf1 motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ebf1_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, 
labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh similarity index 89% rename from scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh index 931e907..431cde7 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh @@ -1,52 +1,52 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_1' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" 
file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=8 +n_core=28 # open chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d' file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat' file_mod3=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat' file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done # 1nucl chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat' file_mod3=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat' file_aic=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - 
bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.R index 1293d93..57ae656 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.R @@ -1,96 +1,96 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around myc motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = 
read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("myc_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("myc_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, 
top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh similarity index 89% rename from scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh index e95bb7e..ec4656a 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh @@ -1,53 +1,53 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_1' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/myc_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=8 +n_core=28 # open chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d' file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' 
file_mod2=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat' file_mod3=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat' file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done # 1nucl chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat' file_mod3=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat' file_aic=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 
done diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R index 9097f86..53fede4 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R @@ -1,96 +1,96 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around sp1 motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", 
sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("sp1_motifs_10e-7_classification_open_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } 
dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh similarity index 89% rename from scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh index c7ca927..d9104d0 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh @@ -1,52 +1,52 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_1' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" ## file with seeds file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=8 +n_core=28 # open chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat' file_mod3=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_sequences_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMRead --read $file_mat_open --class $k --shift 
$n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done # 1nucl chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat' file_mod3=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/run_all.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/run_all.sh new file mode 100755 index 0000000..11b7a41 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/run_all.sh @@ -0,0 +1,14 @@ + +dir='scripts/10xgenomics_PBMC_5k_motifs_classification_1' + +# classification +$dir/classification_ctcf_motif.sh +$dir/classification_myc_motif.sh +$dir/classification_ebf1_motif.sh 
+$dir/classification_sp1_motif.sh + +# analysis of classification results +Rscript $dir/classification_ctcf_motif.R +Rscript $dir/classification_myc_motif.R +Rscript $dir/classification_ebf1_motif.R +Rscript $dir/classification_sp1_motif.R diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.R index 193ff0b..5343c07 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.R @@ -1,98 +1,98 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = 
read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ctcf_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ctcf_motifs_10e-6_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) 
lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = seq(1, length(idx), length.out = 3) x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh similarity index 90% rename from scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh index 341fe6a..247a840 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh @@ -1,36 +1,36 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_2' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=12 +n_core=24 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' 
file_mod3=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.R index e3efefd..564f066 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.R @@ -1,98 +1,98 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ebf1 motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + data = 
read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ebf1_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ebf1_motifs_10e-6_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) 
lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = seq(1, length(idx), length.out = 3) x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh similarity index 90% rename from scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh index ce5cdc0..2204ba9 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh @@ -1,36 +1,36 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_2' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt' mkdir -p 
$results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=12 +n_core=24 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ebf1_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.R index c79b248..c7c2f80 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.R @@ -1,98 +1,98 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", 
"functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around myc motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("myc_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("myc_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("myc_motifs_10e-6_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, 
path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = seq(1, length(idx), length.out = 3) x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh similarity index 91% rename from scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh index 231485b..6ea2fcf 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh @@ -1,36 +1,36 @@ # some paths ## directories 
-results_dir='results/10xgenomics_PBMC_5k_classification_2' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/myc_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' n_core=12 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'myc_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.R similarity index 94% rename from 
scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.R index 24d95e0..9cb993f 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.R @@ -1,98 +1,98 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around sp1 motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("sp1_motifs_10e-7_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", + png(filename=file.path("results", 
"10xgenomics_PBMC_5k_motifs_classification_2", sprintf("sp1_motifs_10e-7_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = seq(1, length(idx), length.out = 3) x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh 
b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh similarity index 91% rename from scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh index 7ba3cf0..5f561db 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh @@ -1,35 +1,35 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_2' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" ## file with seeds file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' n_core=12 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core 
--out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/run_all.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/run_all.sh new file mode 100755 index 0000000..ef78973 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/run_all.sh @@ -0,0 +1,14 @@ + +dir='scripts/10xgenomics_PBMC_5k_motifs_classification_2' + +# classification +$dir/classification_ctcf_motif.sh +$dir/classification_myc_motif.sh +$dir/classification_ebf1_motif.sh +$dir/classification_sp1_motif.sh + +# analysis of classification results +Rscript $dir/classification_ctcf_motif.R +Rscript $dir/classification_myc_motif.R +Rscript $dir/classification_ebf1_motif.R +Rscript $dir/classification_sp1_motif.R diff --git a/scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.R index 2cd7e46..8e71caa 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.R @@ -1,95 +1,95 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") 
################## sequence patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_3", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_3", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_3", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_3", sprintf("ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_3", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_3", sprintf("ctcf_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_3", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_3", sprintf("ctcf_motifs_10e-6_classification_sequences_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) 
axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh similarity index 89% rename from scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh index 452d48c..61a8ded 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh @@ -1,38 +1,38 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_3' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_3' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat" 
file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=12 +n_core=24 # sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.R index 3b6d495..1dea217 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.R +++ 
b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.R @@ -1,95 +1,95 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around sp1 motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_3", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_3", sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_3", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_3", sprintf("sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_3", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_3", sprintf("sp1_motifs_10e-7_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_3", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_3", sprintf("sp1_motifs_10e-7_classification_sequences_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, 
decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh similarity index 92% rename from scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh index a37b533..740d6f3 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh +++ 
b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh @@ -1,38 +1,38 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_3' -data_dir='results/10xgenomics_PBMC_5k' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat" file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" ## file with seeds file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=12 +n_core=24 # sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> 
$file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/run_all.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/run_all.sh new file mode 100755 index 0000000..a636758 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/run_all.sh @@ -0,0 +1,10 @@ + +dir='scripts/10xgenomics_PBMC_5k_motifs_classification_3' + +# classification +$dir/classification_ctcf_motif.sh +$dir/classification_sp1_motif.sh + +# analysis of classification results +Rscript $dir/classification_ctcf_motif.R +Rscript $dir/classification_sp1_motif.R diff --git a/scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.R index 93d8eae..ae22d2a 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.R @@ -1,96 +1,96 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_4", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob 
data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_4", sprintf("ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_4", sprintf("ctcf_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_4", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_4", sprintf("ctcf_motifs_10e-6_classification_sequences_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 
par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh similarity index 89% rename from scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh index e91d7fb..a34f0ec 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh @@ -1,38 +1,38 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_4' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_4' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='1' -n_core=12 +n_core=24 # sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d' 
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.R similarity index 94% rename from scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.R rename to scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.R index 3dc0ab1..5d2c6b4 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.R @@ -1,96 +1,96 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = 
file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around sp1 motifs ################## for(k in k.min:k.max) { # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_4", sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_4", sprintf("sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_4", sprintf("sp1_motifs_10e-7_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_4", + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_4", sprintf("sp1_motifs_10e-7_classification_sequences_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, 
at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh similarity index 89% rename from scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.sh rename to scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh index 5381f5c..cf0d6b3 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh @@ -1,38 +1,38 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_4' -data_dir='results/10xgenomics_PBMC_5k' +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_4' +data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat" 
file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" ## file with seeds file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='1' -n_core=12 +n_core=24 # sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/run_all.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/run_all.sh new file mode 100755 index 0000000..4b406e0 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/run_all.sh @@ -0,0 +1,10 @@ + +dir='scripts/10xgenomics_PBMC_5k_motifs_classification_4' + +# classification +$dir/classification_ctcf_motif.sh 
+$dir/classification_sp1_motif.sh + +# analysis of classification results +Rscript $dir/classification_ctcf_motif.R +Rscript $dir/classification_sp1_motif.R diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh index 618a604..34da2c9 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_1" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='981' -n_core=24 +n_core=8 # classify for k in 10 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --bgclass --iter $n_iter --seed $seed --thread $n_core > $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff 
--git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R index d4fc044..e369c8d 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R @@ -1,95 +1,103 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) # the number of classes searched n.classes = c(23) # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ctcf motifs ################## for(k in n.classes) { # sequence data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k))) model.seq = data$models model.prob = data$prob data = NULL # open chromatin model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") X11(width=26, height=12) # png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), # units="in", res=720, width=18, height=12) m = matrix(1:24, nrow=6, ncol=4, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) - ref.open = model.open[ord,, drop=F][,316:716] - ref.nucl = model.nucl[ord,, drop=F][,316:716] - ref.seq = model.seq[,,ord, 
drop=F][,316:716,] + ref.open = model.open[ord,, drop=F][,] + ref.nucl = model.nucl[ord,, drop=F][,] + ref.seq = model.seq[,,ord, drop=F][,,] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center - # row_n = 1 # row counter - # col_n = 1 # column counter - # for(i in 1:nrow(ref.open)) - # { # plot logo center - # right = 0.25*col_n + 0.03 - # left = right - 0.15 - # bottom = 1-(row_n*(0.2))+0.05 - # top = bottom + 0.15 - # par(fig=c(left, right, bottom, top), new=T) - # idx = (516-1-10):(516+1+10) - # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) - # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) - # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) - # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) - # # xaxis - # x.at = seq(1, length(idx), length.out = 3) - # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] - # axis(1, at=x.at, labels=x.lab) - # # yaxis - # axis(2, at=y.at, labels=y.lab) - # row_n = row_n + 1 - # if(i %% 5 == 0) - # { col_n = col_n + 1 - # row_n = 1 - # } - # } + # inlets with center + row_n = 1 # row counter + col_n = 1 # column counter + row_h = 1/nrow(m) # height of row + col_w = 1/ncol(m) # width of column + row_cor = row_h / 2 + col_cor = col_w / 3 + for(i in 1:nrow(ref.open)) + { # plot logo center + left = 
(col_w*col_n) - col_w + right = left + col_w + left = right - col_cor + bottom = 1 - (row_h*row_n) + top = bottom + row_h + bottom = top - row_cor + + par(fig=c(left, right, bottom, top), new=T) + idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10) + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = ceiling(length(idx)/2) + x.lab = 0 + axis(1, at=x.at, labels=x.lab) + # yaxis + axis(2, at=y.at, labels=y.lab) + row_n = row_n + 1 + if(i %% nrow(m) == 0) + { col_n = col_n + 1 + row_n = 1 + } + } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R index 05b1f43..0531449 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R @@ -1,103 +1,180 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) # the number of classes searched n.classes = c(23) # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ctcf motifs ################## for(k in n.classes) { # sequence data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", - sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k))) + sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k))) model.seq = data$models model.prob = data$prob data = NULL # open chromatin model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", - 
sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models + sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", - sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models + sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") X11(width=26, height=12) - # png(filename=file.path("results", "test_1kb", + # png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), # units="in", res=720, width=18, height=12) m = matrix(1:24, nrow=6, ncol=4, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) - ref.open = model.open[ord,, drop=F] - ref.nucl = model.nucl[ord,, drop=F] - ref.seq = model.seq[,,ord, drop=F] + ref.open = model.open[ord,, drop=F][,] + ref.nucl = model.nucl[ord,, drop=F][,] + ref.seq = model.seq[,,ord, drop=F][,,] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo + par(mar=c(2,2,2,0)) plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } + # inlets with center - row_n = 1 # row counter - col_n = 1 # column counter + row_n = 1 # row counter + col_n = 1 
# column counter + row_h = 1/nrow(m) # height of row + col_w = 1/ncol(m) # width of column + row_cor = row_h / 2 + col_cor = col_w / 3 for(i in 1:nrow(ref.open)) { # plot logo center - right = 0.5*col_n - 0.01 - left = right - 0.2 - bottom = 1-(row_n*(0.2))+0.05 - top = bottom + 0.15 + left = (col_w*col_n) - col_w + right = left + col_w + left = right - col_cor + bottom = 1 - (row_h*row_n) + top = bottom + row_h + bottom = top - row_cor + par(fig=c(left, right, bottom, top), new=T) - idx = (516-1-20):(516+1+20) + idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis - x.at = seq(1, length(idx), length.out = 3) - x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + x.at = ceiling(length(idx)/2) + x.lab = 0 axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 - if(i %% 5 == 0) + if(i %% nrow(m) == 0) { col_n = col_n + 1 row_n = 1 } } # dev.off() } -m = matrix(1:24, nrow=6, ncol=4, byrow=F) -layout(m) -col=brewer.pal(3,"Set1") -for(i in 1:nrow(model.open)) -{ plot(model.open[i,]/max(model.open[i,]), type='l', lwd=2, col=col[1]) - lines(model.nucl[i,]/max(model.nucl[i,]), type='l', lwd=2, col=col[2]) -} + +# PU.1 +X11(width=18, height=9) + par(mfrow=c(2,1)) + par(mar=c(2,2,2,0)) + plot.logo(model.seq[,,4], path.a, path.c, path.g, path.t, + main=sprintf("class 4 (p=%.2f)", model.prob[4])) + # x-axis + x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3) + x.at = seq(1, ncol(model.seq), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, 
length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(model.open[4,] / max(model.open[4,])), lwd=1, col=col[1]) + lines(2*(model.nucl[4,] / max(model.nucl[4,])), lwd=1, col=col[2]) + + par(mar=c(2,2,2,0)) + plot.logo(model.seq[,495:535,4], path.a, path.c, path.g, path.t, + main=sprintf("class 4 (p=%.2f)", model.prob[4])) + lines(2*(model.open[4,495:535] / max(model.open[4,])), lwd=1, col=col[1]) + lines(2*(model.nucl[4,495:535] / max(model.nucl[4,])), lwd=1, col=col[2]) + +# NFE2 +X11(width=18, height=9) + par(mfrow=c(2,1)) + par(mar=c(2,2,2,0)) + plot.logo(model.seq[,,14], path.a, path.c, path.g, path.t, + main=sprintf("class 14 (p=%.2f)", model.prob[14])) + # x-axis + x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3) + x.at = seq(1, ncol(model.seq), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(model.open[14,] / max(model.open[14,])), lwd=1, col=col[1]) + lines(2*(model.nucl[14,] / max(model.nucl[14,])), lwd=1, col=col[2]) + + par(mar=c(2,2,2,0)) + plot.logo(model.seq[,490:540,14], path.a, path.c, path.g, path.t, + main=sprintf("class 14 (p=%.2f)", model.prob[14])) + lines(2*(model.open[14,490:540] / max(model.open[14,])), lwd=1, col=col[1]) + lines(2*(model.nucl[14,490:540] / max(model.nucl[14,])), lwd=1, col=col[2]) + +# CEPBB +X11(width=18, height=9) + par(mfrow=c(2,1)) + par(mar=c(2,2,2,0)) + plot.logo(model.seq[,,5], path.a, path.c, path.g, path.t, + main=sprintf("class 5 (p=%.2f)", model.prob[5])) + # x-axis + x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3) + x.at = seq(1, ncol(model.seq), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = 
seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(model.open[5,] / max(model.open[5,])), lwd=1, col=col[1]) + lines(2*(model.nucl[5,] / max(model.nucl[5,])), lwd=1, col=col[2]) + + par(mar=c(2,2,2,0)) + plot.logo(model.seq[,495:535,5], path.a, path.c, path.g, path.t, + main=sprintf("class 5 (p=%.2f)", model.prob[5])) + lines(2*(model.open[5,495:535] / max(model.open[5,])), lwd=1, col=col[1]) + lines(2*(model.nucl[5,495:535] / max(model.nucl[5,])), lwd=1, col=col[2]) + \ No newline at end of file diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh index 9f2a21d..80b9afd 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh @@ -1,76 +1,76 @@ # paths ## dir data_dir_p="data/10xgenomics_PBMC_5k_peaks" data_dir="data/10xgenomics_PBMC_5k" pwm_dir="data/pwm/jaspar_2018_clustering/" hg19_dir="data/genomes" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_7" ## matrix files file_mat_open=$data_dir_p/'peaks_rmsk_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir_p/'peaks_rmsk_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir_p/'peaks_rmsk_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='1' n_shift='971' -n_core=8 +n_core=24 ## PWM files jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" 
lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat" nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat" egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat" gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat" nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat" runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat" # classify for k in 23 do ## results files file_prob=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread 
$n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 # extend models file_mod1_ext=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model_extended.mat' file_mod2_ext=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat' file_mod3_ext=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model_extended.mat' file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk.bed' file_fasta=$hg19_dir/'hg19.fasta' file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam' file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai' file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam' file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai' bin/ReadModelExtender --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext done diff --git a/src/Applications/EMJointApplication.cpp b/src/Applications/EMJointApplication.cpp index 77c18da..22b9caa 100644 --- a/src/Applications/EMJointApplication.cpp +++ b/src/Applications/EMJointApplication.cpp @@ -1,175 +1,184 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument #include #include // boost::split() #include namespace po = boost::program_options ; EMJointApplication::EMJointApplication(int argn, char** argv) - : files_read(""), file_sequence(""), n_class(0), n_iter(0), n_shift(0), flip(false), + : files_read(""), file_sequence(""), file_out(""), + n_class(0), n_iter(0), 
n_shift(0), flip(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMJointApplication::run() { if(this->runnable) { // read data std::vector read_paths ; boost::split(read_paths, this->files_read, [](char c){return c == ',';}) ; std::vector> data_read ; for(const auto& path : read_paths) { if(path == "") { continue ; } data_read.push_back(Matrix2D(path)) ; } // sequence data EMJoint* em = nullptr ; if(this->file_sequence == "") { em = new EMJoint(std::move(data_read), this->n_class, this->n_iter, this->n_shift, this->flip, this->seed, this->n_threads) ; } else { Matrix2D data_seq(this->file_sequence) ; em = new EMJoint(std::move(data_read), std::move(data_seq), this->n_class, this->n_iter, this->n_shift, this->flip, this->seed, this->n_threads) ; } em->classify() ; - std::cout << em->get_post_prob() << std::endl ; + em->get_post_prob().save(this->file_out) ; delete em ; em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMJointApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMJoint is a probabilistic partitioning algorithm that \n" "sofetly assigns genomic regions to classes given 1) the shapes \n" "of the read densities over the regions and 2) the region sequence \n" "motif contents. \n " "The assignment probabilitiesare returned through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations, \n" "by default 0 (no parallelization)." ; std::string opt_read_msg = "A coma separated list of paths to the file containing the \n" "read density data. At least one path is needed." ; std::string opt_seq_msg = "The path to the file containing the sequence data. 
If no path is \n" "given, the classification is only cares about the read density \n" "shapes." ; + std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" + "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting " "freedom. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping."; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; desc.add_options() ("help,h", opt_help_msg.c_str()) - ("read", po::value(&(this->files_read)), opt_read_msg.c_str()) + ("read", po::value(&(this->files_read)), opt_read_msg.c_str()) ("seq", po::value(&(this->file_sequence)), opt_read_msg.c_str()) + ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->files_read == "" and this->file_sequence == "" and (not help)) { std::string msg("Error! 
No data were given (--read and --seq)!") ; throw std::invalid_argument(msg) ; } if(this->files_read == "" and (not help)) { std::string msg("Error! No read density data were given (--read)!") ; throw std::invalid_argument(msg) ; } + if(this->file_out == "" and + (not help)) + { std::string msg("Error! No output file given (--out)!") ; + throw std::invalid_argument(msg) ; + } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { EMJointApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/EMJointApplication.hpp b/src/Applications/EMJointApplication.hpp index 4fa806c..abe5296 100644 --- a/src/Applications/EMJointApplication.hpp +++ b/src/Applications/EMJointApplication.hpp @@ -1,101 +1,106 @@ #ifndef EMJOINTAPPLICATION_HPP #define EMJOINTAPPLICATION_HPP #include #include #include /*! * \brief The EMJointApplication class is a wrapper around an EMJoint * instance creating an autonomous application to classify data by directly * passing all the options and parameters from the command line. */ class EMJointApplication: public ApplicationInterface { public: EMJointApplication() = delete ; EMJointApplication(const EMJointApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ EMJointApplication(int argn, char** argv) ; /*! * \brief Runs the application. 
The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief a coma separated list of paths to the files * containing the read density data */ std::string files_read ; /*! * \brief the path to the file containing the * sequence data. */ std::string file_sequence ; + /*! + * \brief the path to the file in which the probability + * matrix will be saved. + */ + std::string file_out ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. 
*/ bool runnable ; } ; #endif // EMJOINTAPPLICATION_HPP diff --git a/src/Applications/EMReadApplication.cpp b/src/Applications/EMReadApplication.cpp index 5b8e842..fb521ce 100644 --- a/src/Applications/EMReadApplication.cpp +++ b/src/Applications/EMReadApplication.cpp @@ -1,136 +1,147 @@ #include #include #include #include #include // std::invalid_argument #include #include +#include namespace po = boost::program_options ; EMReadApplication::EMReadApplication(int argn, char** argv) - : file_read(""), n_class(0), n_iter(0), n_shift(0), flip(false), + : file_read(""), file_out(""), + n_class(0), n_iter(0), n_shift(0), flip(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMReadApplication::run() { if(this->runnable) { EMRead em(Matrix2D(this->file_read), this->n_class, this->n_iter, this->n_shift, this->flip, this->seed, this->n_threads) ; em.classify() ; - std::cout << em.get_post_prob() << std::endl ; + em.get_post_prob().save(this->file_out) ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMReadApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMRead is a probabilistic partitioning algorithm that \n" "sofetly assigns genomic regions to classes given the shape \n" "of the read density over the region. The assignment \n" "probabilities are returned through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " "by default 0 (no parallelization)." ; std::string opt_read_msg = "The path to the file containing the read density data" ; + std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" + "in binary format." 
; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting " "freedom to realign the data. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping to realign the data."; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string seeding_tmp ; desc.add_options() ("help,h", opt_help_msg.c_str()) - ("read", po::value(&(this->file_read)), opt_read_msg.c_str()) + ("read", po::value(&(this->file_read)), opt_read_msg.c_str()) - ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) - ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) - ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) + ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) + + ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) + ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) + ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) - ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) - ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) + ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_read == "" and (not help)) { std::string msg("Error! 
No data were given (--read)!") ; throw std::invalid_argument(msg) ; } + if(this->file_out == "" and + (not help)) + { std::string msg("Error! No output file given (--out)!") ; + throw std::invalid_argument(msg) ; + } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { EMReadApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/EMReadApplication.hpp b/src/Applications/EMReadApplication.hpp index 66cb1be..fc98551 100644 --- a/src/Applications/EMReadApplication.hpp +++ b/src/Applications/EMReadApplication.hpp @@ -1,91 +1,96 @@ #ifndef EMREADAPPLICATION_HPP #define EMREADAPPLICATION_HPP #include #include /*! * \brief The EMReadApplication class is a wrapper around an EMRead * instance creating an autonomous application to classify data by directly * passing all the options and parameters from the command line. */ class EMReadApplication: public ApplicationInterface { public: EMReadApplication() = delete ; EMReadApplication(const EMReadApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ EMReadApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. 
* \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the paths to the file containing the read * density data. */ std::string file_read ; + /*! + * \brief the path to the file in which the probability + * matrix will be saved. + */ + std::string file_out ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. 
*/ bool runnable ; } ; #endif // EMREADAPPLICATION_HPP diff --git a/src/Applications/EMSequenceApplication.cpp b/src/Applications/EMSequenceApplication.cpp index 7c061ac..3260df7 100644 --- a/src/Applications/EMSequenceApplication.cpp +++ b/src/Applications/EMSequenceApplication.cpp @@ -1,279 +1,283 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument #include #include // boost::split() #include +#include #include namespace po = boost::program_options ; -template -std::ostream& operator << (std::ostream& stream, - const std::vector& v) -{ for(const auto& x : v) - { stream << x << " " ; } - return stream ; -} EMSequenceApplication::EMSequenceApplication(int argn, char** argv) - : file_seq(""), files_motif(""), + : file_seq(""), files_motif(""), file_out(""), n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMSequenceApplication::run() { if(this->runnable) { EMSequence* em(nullptr) ; // data Matrix2D data(this->file_seq) ; // seeds motifs randomly if(this->files_motif == "") { em = new EMSequence(std::move(data), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } // seeds motifs with the given matrices else { // model std::vector motif_paths ; boost::split(motif_paths, this->files_motif, [](char c){return c == ',';}) ; // this->n_class = motif_paths.size() + this->bckg_class ; size_t model_ncol = data.get_ncol() - this->n_shift + 1 ; // add the given motif, random motifs (if needed) and // background class (if needed) Matrix3D model = this->init_model(model_ncol, data, motif_paths) ; em = new EMSequence(std::move(data), std::move(model), this->n_iter, this->flip, this->n_threads) ; } // classify em->classify() ; - std::cout << em->get_post_prob() << std::endl ; + em->get_post_prob().save(this->file_out) ; // clean delete em 
; em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMSequenceApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMSequence is a probabilistic partitioning algorithm that \n" - "sofetly assigns sequences to classes given their motif content \n" - "The assignment probabilities are returned through stdout.\n\n" ; + "sofetly assigns sequences to classes given their motif content.\n" + "The assignment probabilities are written in binary format as a 4D " + "matrix.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " "by default 0 (no parallelization)." ; std::string opt_seq_msg = "The path to the file containing the sequences" ; std::string opt_motifs_msg = "A coma separated list of path to files containing the initial motifs\n" "values. The motifs should be probability matrices in horizontal format.\n" "If the motifs are too short after accounting for shifting, extra\n" "columns with uniform probabilities will be added on each side. The\n" "given number of classes (--class) should at least be the number of\n" "initial motifs. If the number of classes is bigger than the number of" "given motifs, the remaining classes are initialised randomly\n." ; + std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" + "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n" "the data. By default, shifting is disabled (equivalent to\n" "--shift 1)." 
; std::string opt_flip_msg = "Enables flipping to realign the data."; std::string opt_bckg_msg = "Adds a class to model the sequence background. This class\n" "contains the sequence background probabilities at each position\n" "and is never updated." ; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string seeding_tmp ; desc.add_options() ("help,h", opt_help_msg.c_str()) - ("seq", po::value(&(this->file_seq)), opt_seq_msg.c_str()) + ("seq", po::value(&(this->file_seq)), opt_seq_msg.c_str()) + + ("motifs", po::value(&(this->files_motif)), opt_motifs_msg.c_str()) - ("motifs", po::value(&(this->files_motif)), opt_motifs_msg.c_str()) + ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) - ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) - ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) - ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) + ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) + ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) + ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) - ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) - ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) + ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) 
{ throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_seq == "" and (not help)) { std::string msg("Error! No data were given (--seq)!") ; throw std::invalid_argument(msg) ; } + if(this->file_out == "" and + (not help)) + { std::string msg("Error! No output file given (--out)!") ; + throw std::invalid_argument(msg) ; + } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } Matrix3D EMSequenceApplication::init_model(size_t model_len, const Matrix2D& data, const std::vector& motif_paths) const { int n_class_given = motif_paths.size() ; int n_class_bckg = this->bckg_class ; int n_class_rand = this->n_class - n_class_given - n_class_bckg ; // number of classes should at least be number of motifs if(n_class_given > (int)this->n_class) { char msg[4096] ; sprintf(msg, "Error! number of class given (--class %zu) should at " "least be equal to number of motifs (--motifs %d)", this->n_class, n_class_given) ; throw std::invalid_argument(msg) ; } // check if there is room for a background class if((int)this->n_class < n_class_given+this->bckg_class) { char msg[4096] ; sprintf(msg, "Error! 
no class left to add a background " "class (--bgclass) with the given motifs (--motifs) (--class %zu)", this->n_class) ; throw std::invalid_argument(msg) ; } // init empty model Matrix3D model(this->n_class, model_len, 4, 0.25) ; // add given motifs for(size_t i=0; i matrix(motif_paths[i]) ; // motif is too big for this shift if(matrix.get_ncol() > model_len) { char msg[4096] ; sprintf(msg, "Error! In %s, motif column number is bigger " "than data column number - shift + 1 " "(%zu > %zu - %zu + 1)", motif_paths[i].c_str(), matrix.get_ncol(), data.get_ncol(), this->n_shift) ; throw std::invalid_argument(msg) ; } // insert motif in middle of matrix else { // size_t j_model = this->n_shift / 2 ; size_t j_model = (model_len - matrix.get_ncol()) / 2 ; for(size_t j_mat=0, j_mod=j_model; j_mat 0) { // initialise randomly EMSequence em(data, n_class_rand, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; Matrix3D model_rand = em.get_sequence_models() ; // copy them into model for(int i_rand=0, i_mod=n_class_given; i_rand #include #include #include #include /*! * \brief The EMSequenceApplication class is a wrapper around an EMSequence * instance creating an autonomous application to classify sequences by directly * passing all the options and parameters from the command line. */ class EMSequenceApplication: public ApplicationInterface { public: EMSequenceApplication() = delete ; EMSequenceApplication(const EMSequenceApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ EMSequenceApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. 
* \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief Initialise the class models if matrices * are given as initial class motifs. * If the given class motifs are shorter than the * model after accounting for shifting, extra columns * with uniform probabilities will be added on each * side. * If the number of classes is higher than the * number of given motifs, extra classes will be * initialised randomly.A background class is included * if needed. * \param model_len the number of positions (columns) * of the model to initialise. * \param data the sequence matrix, in integer format. * \param motif_paths the paths to the files containing * the probability matrices to use to initialise the * class motifs. * \return */ Matrix3D init_model(size_t model_len, const Matrix2D& data, const std::vector& motif_paths) const ; /*! * \brief the paths to the file containing the sequence * data. */ std::string file_seq ; /*! * \brief a coma separated list of files containing the * initial motif matrices. */ std::string files_motif ; + /*! + * \brief the path to the file in which the probability + * matrix will be saved. + */ + std::string file_out ; + /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! 
* \brief whether a constant class to model the * sequence background should be added. This * class has the sequence background probabilities * at each position. */ bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; #endif // EMSEQUENCEAPPLICATION_HPP diff --git a/src/Applications/ProbToModelApplication.cpp b/src/Applications/ProbToModelApplication.cpp index 25d8c0d..65334f9 100644 --- a/src/Applications/ProbToModelApplication.cpp +++ b/src/Applications/ProbToModelApplication.cpp @@ -1,213 +1,214 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument, std::runtime_error #include #include #include #include #include namespace po = boost::program_options ; typedef std::vector vector_d ; ProbToModelApplication::ProbToModelApplication(int argn, char** argv) : file_read(""), file_seq(""), file_prob(""), n_threads(0), runnable(false) { this->parseOptions(argn, argv) ; } ProbToModelApplication::~ProbToModelApplication() {} int ProbToModelApplication::run() { if(this->runnable) { // load data std::string file_data ; bool read_data = false ; bool seq_data = false ; if(this->file_read != "") { file_data = this->file_read ; read_data = true ; seq_data = false ; } else if(this->file_seq != "") { file_data = this->file_seq ; read_data = false ; seq_data = true ; } else { std::string msg("Error! Could not determine the type of the data!") ; throw std::runtime_error(msg) ; } Matrix2D data(file_data) ; - Matrix4D prob(this->file_prob) ; + // Matrix4D prob(this->file_prob) ; + Matrix4D prob ; prob.load(this->file_prob) ; if(data.get_nrow() != prob.get_dim()[0]) { char msg[4096] ; sprintf(msg, "Error! 
data and prob matrices have unequal " "row numbers (%zu / %zu)!", data.get_nrow(), prob.get_dim()[0]) ; throw std::runtime_error(msg) ; } else if(data.get_ncol() < prob.get_dim()[2]) { char msg[4096] ; sprintf(msg, "Error! too many shift states for the data!" "%zu shift states and %zu columns in data)!", prob.get_dim()[2], data.get_ncol()) ; throw std::runtime_error(msg) ; } // get the data model ModelComputer* ptr = nullptr ; if(read_data) { ptr = new ReadModelComputer(std::move(data), prob, this->n_threads) ; } else if(seq_data) { ptr = new SequenceModelComputer(std::move(data), prob, this->n_threads) ; } Matrix2D model = ptr->get_model() ; delete ptr ; ptr = nullptr ; // compute the class prob size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; vector_d class_prob(n_class, 0.) ; double p_tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), model.get_ncol() + 1) ; // 1st column contain the class prob if(read_data) { for(size_t i=0; i(&(this->file_read)), opt_read_msg.c_str()) ("seq,", po::value(&(this->file_seq)), opt_seq_msg.c_str()) ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if((this->file_read == "") and (this->file_seq == "") and (not help)) { std::string msg("Error! No data file was given (--read or --seq)!") ; throw std::invalid_argument(msg) ; } else if((this->file_read != "") and (this->file_seq != "") and (not help)) { std::string msg("Error! 
--read and --seq are mutually exclusive!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No posterior probabily file was given (--prob)!") ; throw std::invalid_argument(msg) ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { ProbToModelApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ReadModelExtenderApplication.cpp b/src/Applications/ReadModelExtenderApplication.cpp index bfb5b2b..bf1d9b6 100644 --- a/src/Applications/ReadModelExtenderApplication.cpp +++ b/src/Applications/ReadModelExtenderApplication.cpp @@ -1,272 +1,273 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument, std::runtime_error #include #include #include #include #include namespace po = boost::program_options ; // the valid values for --method option std::string method_read = "read" ; std::string method_read_atac = "read_atac" ; std::string method_fragment = "fragment" ; std::string method_fragment_center = "fragment_center" ; ReadModelExtenderApplication::ReadModelExtenderApplication(int argn, char** argv) : file_bed(""), file_bam(""), file_bai(""), file_prob(""), from(0), to(0), ext(0), bin_size(0), method(CorrelationMatrixCreator::FRAGMENT), n_threads(0), runnable(false) { this->parseOptions(argn, argv) ; } ReadModelExtenderApplication::~ReadModelExtenderApplication() {} int ReadModelExtenderApplication::run() { if(this->runnable) { // extend limits int ext_right = this->ext/2 ; int ext_left = this->ext - ext_right ; this->from -= ext_left ; this->to += ext_right ; // create extended matrix CorrelationMatrixCreator mc(this->file_bed, this->file_bam, this->file_bai, this->from, this->to, this->bin_size, this->method) ; Matrix2D data = mc.create_matrix() ; // compute 
model - Matrix4D prob(this->file_prob) ; + // Matrix4D prob(this->file_prob) ; + Matrix4D prob ; prob.load(this->file_prob) ; if(prob.get_dim()[0] != data.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! data matrix and probability matrix have " "unequal row numbers (%zu and %zu)", prob.get_dim()[0], data.get_nrow()) ; throw std::invalid_argument(msg) ; } size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; ReadModelComputer model_cp(std::move(data), prob, this->n_threads) ; Matrix2D model = model_cp.get_model() ; // compute class prob vector_d class_prob(n_class, 0.) ; double p_tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), model.get_ncol() + 1) ; // 1st column contain the class prob for(size_t i=0; i(&(this->file_bed)), opt_bed_msg.c_str()) ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ("ext", po::value(&(this->ext)), opt_ext_msg.c_str()) ("binSize", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) ("method", po::value(&(method)), opt_method_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! 
No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bam == "" and (not help)) { std::string msg("Error! No BAM file was given (--bam)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bai == "" and (not help)) { std::string msg("Error! No BAM index file was given (--bai)!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No posterior probability file was given (--prob)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(ext <= 0 and (not help)) { std::string msg("Error! the number of columns to add should be > 0 (--ext)!") ; throw std::invalid_argument(msg) ; } else if(this->bin_size <= 0 and (not help)) { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; throw std::invalid_argument(msg) ; } else if(method != method_read and method != method_read_atac and method != method_fragment and method != method_fragment_center) { char msg[4096] ; sprintf(msg, "Error! 
method should be %s, %s, %s or %s (--method)", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; throw std::invalid_argument(msg) ; } // set method if(method == method_read) { this->method = CorrelationMatrixCreator::READ ; } else if(method == method_read_atac) { this->method = CorrelationMatrixCreator::READ_ATAC ; } else if(method == method_fragment) { this->method = CorrelationMatrixCreator::FRAGMENT ; } else if(method == method_fragment_center) { this->method = CorrelationMatrixCreator::FRAGMENT_CENTER ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { ReadModelExtenderApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/SequenceModelExtenderApplication.cpp b/src/Applications/SequenceModelExtenderApplication.cpp index 1000b56..1d50194 100644 --- a/src/Applications/SequenceModelExtenderApplication.cpp +++ b/src/Applications/SequenceModelExtenderApplication.cpp @@ -1,215 +1,216 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument, std::runtime_error #include #include #include #include #include namespace po = boost::program_options ; SequenceModelExtenderApplication::SequenceModelExtenderApplication(int argn, char** argv) : file_bed(""), file_fasta(""), file_prob(""), from(0), to(0), ext(0), n_threads(0), runnable(false) { this->parseOptions(argn, argv) ; } SequenceModelExtenderApplication::~SequenceModelExtenderApplication() {} int SequenceModelExtenderApplication::run() { if(this->runnable) { // extend limits int ext_right = this->ext/2 ; int ext_left = this->ext - ext_right ; this->from -= ext_left ; this->to += ext_right ; // create extended matrix SequenceMatrixCreator mc(this->file_bed, this->file_fasta, this->from, this->to) ; Matrix2D data = 
mc.create_matrix() ; // compute model - Matrix4D prob(this->file_prob) ; + // Matrix4D prob(this->file_prob) ; + Matrix4D prob ; prob.load(this->file_prob) ; if(prob.get_dim()[0] != data.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! data matrix and probability matrix have " "unequal row numbers (%zu and %zu)", prob.get_dim()[0], data.get_nrow()) ; throw std::invalid_argument(msg) ; } size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; SequenceModelComputer model_cp(std::move(data), prob, this->n_threads) ; Matrix2D model = model_cp.get_model() ; // compute class prob vector_d class_prob(n_class, 0.) ; double p_tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), model.get_ncol() + 1) ; // 1st column contain the class prob size_t i_class = 0 ; for(size_t i=0; i(&(this->file_bed)), opt_bed_msg.c_str()) ("fasta", po::value(&(this->file_fasta)), opt_fasta_msg.c_str()) ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ("ext", po::value(&(this->ext)), opt_ext_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_fasta == "" and (not help)) { std::string msg("Error! 
No fasta file was given (--fasta)!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No posterior probability file was given (--prob)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(ext <= 0 and (not help)) { std::string msg("Error! the number of columns to add should be > 0 (--ext)!") ; throw std::invalid_argument(msg) ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { SequenceModelExtenderApplication app(argn, argv) ; return app.run() ; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6926729..245382c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,122 +1,122 @@ # compiler options add_compile_options(-std=c++14) add_compile_options(-O3) add_compile_options(-Wall) add_compile_options(-Wextra) add_compile_options(-Werror) add_compile_options(-Wfatal-errors) add_compile_options(-pedantic) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}") add_definitions (${SEQAN_DEFINITIONS}) # include file location include_directories(${Boost_INCLUDE_DIRS}) include_directories(${SEQAN_INCLUDE_DIRS}) include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/Clustering") include_directories("${scATACseq_SOURCE_DIR}/src/Random") include_directories("${scATACseq_SOURCE_DIR}/src/Parallel") include_directories("${scATACseq_SOURCE_DIR}/src/Statistics") include_directories("${scATACseq_SOURCE_DIR}/src/GUI") 
include_directories("${scATACseq_SOURCE_DIR}/src/Applications") include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools") include_directories("${scATACseq_SOURCE_DIR}/src/Utility") # compile modules into static libraries ## set output directory set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib") ## build instructions add_library(Clustering "Clustering/DataLayer.cpp" "Clustering/ReadLayer.cpp" "Clustering/SequenceLayer.cpp" "Clustering/ModelComputer.cpp" "Clustering/ReadModelComputer.cpp" "Clustering/SequenceModelComputer.cpp" "Clustering/EMBase.cpp" "Clustering/EMRead.cpp" "Clustering/EMSequence.cpp" "Clustering/EMJoint.cpp") add_library(Random "Random/Random.cpp" "Random/RandomNumberGenerator.cpp") add_library(Parallel "Parallel/ThreadPool.cpp") add_library(Statistics "Statistics/Statistics.cpp") add_library(GUI "GUI/ConsoleProgressBar.cpp" "GUI/Diplayable.cpp" "GUI/Updatable.cpp") add_library(GenomicTools "GenomicTools/MatrixCreator.cpp" "GenomicTools/ReadMatrixCreator.cpp" "GenomicTools/CorrelationMatrixCreator.cpp" "GenomicTools/SequenceMatrixCreator.cpp" "GenomicTools/GenomeRegion.cpp") add_library(Utility "Utility/matrices.cpp" "Utility/dna_utility.cpp") ## resolve dependencies target_link_libraries(Utility ${SEQAN_LIBRARIES}) target_link_libraries(Clustering Utility Random Statistics GUI Parallel ${SEQAN_LIBRARIES}) target_link_libraries(Parallel Threads::Threads) target_link_libraries(GenomicTools Utility ${SEQAN_LIBRARIES}) # executables ## a toy for seqan -set(EXE_MAIN_SEQAN "main_seqan") -add_executable(${EXE_MAIN_SEQAN} "main_seqan.cpp") -target_link_libraries(${EXE_MAIN_SEQAN} ${SEQAN_LIBRARIES} GenomicTools Clustering) -set_target_properties(${EXE_MAIN_SEQAN} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +set(EXE_MAIN_TEST "main_test") +add_executable(${EXE_MAIN_TEST} "main_test.cpp") +target_link_libraries(${EXE_MAIN_TEST} GenomicTools Clustering) 
+set_target_properties(${EXE_MAIN_TEST} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## a toy for correlation matrix set(EXE_MAIN_CORMAT "main_cormat") add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp") target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} Utility GenomicTools Random) set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an application to create a matrix from BED and a BAM file set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator") add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Utility Boost::program_options) set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an application to create a sequence matrix from BED and a fasta file set(EXE_MAIN_SEQMATRIX "SequenceMatrixCreator") add_executable(${EXE_MAIN_SEQMATRIX} "Applications/SequenceMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_MAIN_SEQMATRIX} GenomicTools Utility Boost::program_options) set_target_properties(${EXE_MAIN_SEQMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMRead standalone set(EXE_EMREAD "EMRead") add_executable(${EXE_EMREAD} "Applications/EMReadApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMREAD} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMREAD} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMSequence standalone set(EXE_EMSEQ "EMSequence") add_executable(${EXE_EMSEQ} "Applications/EMSequenceApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMSEQ} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY 
"${scATACseq_SOURCE_DIR}/bin") ## an EMJoint standalone set(EXE_EMJOINT "EMJoint") add_executable(${EXE_EMJOINT} "Applications/EMJointApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMJOINT} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMJOINT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to compute data models from the data and the post prob of an EM classification set(EXE_PROB2REF "ProbToModel") add_executable(${EXE_PROB2REF} "Applications/ProbToModelApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_PROB2REF} Clustering Utility Boost::program_options) set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extend read models from an EM classification set(EXE_READMODELEXTENDER "ReadModelExtender") add_executable(${EXE_READMODELEXTENDER} "Applications/ReadModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_READMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options) set_target_properties(${EXE_READMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extend read models from an EM classification set(EXE_SEQUENCEMODELEXTENDER "SequenceModelExtender") add_executable(${EXE_SEQUENCEMODELEXTENDER} "Applications/SequenceModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_SEQUENCEMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options) set_target_properties(${EXE_SEQUENCEMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## a test suite set(EXE_TESTS "unittests") add_executable(${EXE_TESTS} "unittests.cpp" "Unittests/unittests_matrix.cpp" "Unittests/unittests_genomictools.cpp") target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools) 
set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") diff --git a/src/Clustering/EMBase.cpp b/src/Clustering/EMBase.cpp index 3a45de9..570866c 100644 --- a/src/Clustering/EMBase.cpp +++ b/src/Clustering/EMBase.cpp @@ -1,298 +1,305 @@ #include #include #include // std::invalid_argument #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // std::iota() #include // std::mt19937 #include #include #include #include // beta_distribution() #include // rand_string() #include // getRandomNumberGenerator() #include // sd(), normal_pmf() EMBase::EMBase(size_t n_row, size_t n_col, size_t n_class, size_t n_iter, size_t n_shift, bool flip, size_t n_threads=0) : n_row(n_row), n_col(n_col), n_class(n_class), n_shift(n_shift), flip(flip), n_flip(flip+1), n_iter(n_iter), l_model(n_col - n_shift + 1), + loglikelihood(n_row, n_class, n_shift, n_flip, 0.), + post_prob(n_row, n_class, n_shift, n_flip, 0.), + post_state_prob(n_class, n_shift, n_flip, 0.), + post_class_prob(n_class, 0.), + post_prob_rowsum(n_row, 0.), + post_prob_colsum(n_class, 0.), post_prob_tot(0.), threads(nullptr) { // check n_shift value if(this->n_col < this->n_shift) { char msg[4096] ; sprintf(msg, "Error! Shift is bigger than data column number " "(%zu / %zu)!", this->n_shift, this->n_col) ; throw std::invalid_argument(msg) ; } - + /* // data structures this->loglikelihood = Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.) ; this->post_prob = Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.) ; this->post_state_prob = Matrix3D(this->n_class, this->n_shift, this->n_flip, 0.) 
; this->post_class_prob = vector_d(this->n_class, 0) ; this->post_prob_rowsum = vector_d(this->n_row, 0) ; this->post_prob_colsum = vector_d(this->n_class, 0) ; this->post_prob_tot = 0 ; + */ // threads if(n_threads) { this->threads = new ThreadPool(n_threads) ; } } EMBase::~EMBase() { // threads if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } } Matrix4D EMBase::get_post_prob() const { return this->post_prob ; } vector_d EMBase::get_post_class_prob() const { return this->post_class_prob ; } void EMBase::set_state_prob_uniform() { double sum = this->n_class * this->n_shift * this->n_flip ; for(size_t i=0; in_class; i++) { for(size_t j=0; jn_shift; j++) { for(size_t k=0; kn_flip; k++) { this->post_state_prob(i,j,k) = 1./sum ; } } } } void EMBase::set_post_prob_random(const std::string& seed) { // set random number generator // will be used to generate thread private seeds getRandomGenerator(seed) ; // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->set_post_prob_random_routine(0, this->n_row, seed, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. 
; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; // private seeds std::vector private_seeds(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMBase::set_post_prob_random_routine, this, slice.first, slice.second, private_seeds[i], std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } // compute class and state probs this->compute_class_prob() ; } void EMBase::set_post_prob_random_routine(size_t from, size_t to, const std::string& seed, std::promise& post_prob_colsum) { // random number generator std::mt19937 generator ; std::seed_seq seed_sequence(seed.begin(),seed.end()) ; generator.seed(seed_sequence) ; // this->post_prob_tot = 0. ; // this->post_prob_colsum = vector_d(this->n_class, 0.) ; vector_d colsums = vector_d(this->n_class, 0.) 
; vector_d rowsums(this->n_row, 0) ; // random sampling beta_distribution beta(1, this->n_row) ; for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { double p = beta(generator) ; this->post_prob(i,j,k,l) = p ; rowsums[i] += p ; } } } } // normalization for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { double p = this->post_prob(i,j,k,l) / rowsums[i] ; this->post_prob(i,j,k,l) = p ; // this->post_prob_tot += p ; // this->post_prob_colsum[j] += p ; colsums[j] += p ; } } } } // compute class and state probs // this->compute_class_prob() ; post_prob_colsum.set_value(colsums) ; } void EMBase::compute_class_prob() { for(size_t n_class=0; n_classn_class; n_class++) { // reset total this->post_class_prob[n_class] = 0. ; for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t flip=0; flipn_flip; flip++) { // sum this->post_state_prob(n_class,n_shift,flip) = 0. ; for(size_t i=0; in_row; i++) { this->post_state_prob(n_class,n_shift,flip) += this->post_prob(i,n_class,n_shift,flip) ; } // normalize this->post_state_prob(n_class,n_shift,flip) /= this->post_prob_tot ; this->post_class_prob[n_class] += this->post_state_prob(n_class,n_shift,flip) ; } } } } void EMBase::center_post_state_prob() { if(this->n_shift == 1) { return ; } // the possible shift states vector_d shifts(this->n_shift) ; std::iota(shifts.begin(), shifts.end(), 1.) ; // the shift probabilities and the class probabilies // (no need to norm., class_prob sums to 1) double shifts_prob_measured_tot = 0. 
; vector_d shifts_prob_measured(this->n_shift) ; for(size_t s=0; sn_shift; s++) { for(size_t k=0; kn_class; k++) { for(size_t f=0; fn_flip; f++) { shifts_prob_measured[s] += this->post_state_prob(k,s,f) ; shifts_prob_measured_tot += this->post_state_prob(k,s,f) ; } } } // the shift mean and (biased) standard deviation double shifts_sd = sd(shifts, shifts_prob_measured, false) ; // the shift probabilities under the assumption that is // distributed as a gaussian centered on // the central shift state with sd and mean as in the data // sd as the data vector_d shifts_prob_centered(shifts.size(), 0.) ; double shifts_prob_centered_tot = 0. ; for(size_t i=0; in_shift/2)+1, shifts_sd) ; shifts_prob_centered_tot += shifts_prob_centered[i] ; } for(size_t k=0; kn_class; k++) { for(size_t f=0; fn_flip; f++) { for(size_t s=0; sn_shift; s++) { this->post_state_prob(k,s,f) = this->post_class_prob[k] * shifts_prob_centered[s] / (this->n_flip * shifts_prob_centered_tot) ; } } } // shifts_prob_measured_tot = 0. 
; shifts_prob_measured.clear() ; shifts_prob_measured.resize(this->n_shift) ; for(size_t s=0; sn_shift; s++) { for(size_t k=0; kn_class; k++) { for(size_t f=0; fn_flip; f++) { shifts_prob_measured[s] += this->post_state_prob(k,s,f) ; } } } } diff --git a/src/Clustering/EMJoint.cpp b/src/Clustering/EMJoint.cpp index 537fd3c..29de0b7 100644 --- a/src/Clustering/EMJoint.cpp +++ b/src/Clustering/EMJoint.cpp @@ -1,584 +1,576 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include #include #include #include #include #include #include // getRandomNumberGenerator() #include // ConsoleProgressBar -template -std::ostream& operator << (std::ostream& stream, - const std::vector& v) -{ for(const auto& t : v) - { stream << t << " " ; } - return stream ; -} - EMJoint::EMJoint(const std::vector>& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! 
Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } } EMJoint::EMJoint(std::vector>&& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! 
Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } } EMJoint::EMJoint(const std::vector>& read_matrices, const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! 
A sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } // create sequence layer and initialise the models from the post prob this->seq_layer = new SequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, false) ; this->seq_layer->update_model(this->post_prob, this->threads) ; } EMJoint::EMJoint(std::vector>&& read_matrices, Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! 
A sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } // create sequence layer and initialise the models from the post prob this->seq_layer = new SequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, false) ; this->seq_layer->update_model(this->post_prob, this->threads) ; } EMJoint::~EMJoint() { // join the threads in case // deleted by EMBase destructor this->threads->join() ; // read data and models for(auto& ptr : this->read_layers) { if(ptr != nullptr) { delete ptr ; ptr = nullptr ; } } // sequence data and models if(seq_layer != nullptr) { delete seq_layer ; seq_layer = nullptr ; } } std::vector> EMJoint::get_read_models() const { std::vector> models ; for(const auto& ptr : this->read_layers) { models.push_back(ptr->get_model()) ; } return models ; } Matrix3D EMJoint::get_sequence_models() const { return this->seq_layer->get_model() ; } EMJoint::exit_codes EMJoint::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; 
this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMJoint::exit_codes::ITER_MAX ; } void EMJoint::compute_loglikelihood() { // compute the loglikelihood for each layer size_t i = 0 ; for(auto& ptr : this->read_layers) { ptr->compute_loglikelihoods(this->loglikelihood_layer[i], this->loglikelihood_max[i], this->threads) ; i++ ; } this->seq_layer->compute_loglikelihoods(this->loglikelihood_layer[i], this->loglikelihood_max[i], this->threads) ; i++ ; /* // sum the likelihood for each state, over all layers // this is the "joint likelihood" for(size_t i=0; in_row; i++) { for(size_t j=0; jn_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { // reset this->loglikelihood(i,j,k,l) = 0. ; // sum for(size_t m=0; mn_layer; m++) { this->loglikelihood(i,j,k,l) += (this->loglikelihood_layer[m](i,j,k,l) - this->loglikelihood_max[m][i]) ; } } } } } */ // sum the likelihood for each state, over all layers // and rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMJoint::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMJoint::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // limite value range for(size_t i=from; in_class; j++) { 
for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { // reset this->loglikelihood(i,j,k,l) = 0. ; // sum for(size_t m=0; mn_layer; m++) { this->loglikelihood(i,j,k,l) += (this->loglikelihood_layer[m](i,j,k,l) - this->loglikelihood_max[m][i]) ; } } } } } done.set_value(true) ; } void EMJoint::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMJoint::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) 
; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMJoint::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ReadLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMJoint::update_models() { // read data and models for(auto& ptr : this->read_layers) { ptr->update_model(this->post_prob, this->post_prob_colsum, this->threads) ; } // sequence data and models this->seq_layer->update_model(this->post_prob, this->threads) ; } diff --git a/src/Clustering/EMRead.cpp b/src/Clustering/EMRead.cpp index f8aa775..6535873 100644 --- a/src/Clustering/EMRead.cpp +++ b/src/Clustering/EMRead.cpp @@ -1,295 +1,309 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // exp() #include // ReadLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool +template +std::ostream& operator << (std::ostream& stream, const std::vector& v) +{ 
for(const auto x : v) + { stream << x << " " ; } + return stream ; +} + EMRead::EMRead(const Matrix2D& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, const std::string& seed, size_t n_threads) : EMBase(read_matrix.get_nrow(), read_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), read_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models this->read_layer = new ReadLayer(read_matrix, this->n_class, this->n_shift, flip, this->threads) ; // intialise the models with the post prob this->read_layer->update_model(this->post_prob, this->threads) ; } EMRead::EMRead(Matrix2D&& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, const std::string& seed, size_t n_threads) : EMBase(read_matrix.get_nrow(), read_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), read_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models this->read_layer = new ReadLayer(std::move(read_matrix), this->n_class, this->n_shift, flip, this->threads) ; // intialise the models with the post prob this->read_layer->update_model(this->post_prob, this->threads) ; } EMRead::~EMRead() { if(this->read_layer == nullptr) { delete this->read_layer ; this->read_layer = nullptr ; } } Matrix3D EMRead::get_read_models() const { return read_layer->get_model() ; } EMRead::exit_codes EMRead::classify() -{ size_t bar_update_n = this->n_iter ; +{ + size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) - { // E-step + { + // E-step this->compute_loglikelihood() ; + // std::cerr << this->post_prob_rowsum << std::endl ; + // std::cerr << this->post_prob_colsum << std::endl ; this->compute_post_prob() ; // M-step + // std::cerr << this->post_prob_rowsum << std::endl ; + // std::cerr << this->post_prob_colsum << std::endl ; this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMRead::exit_codes::ITER_MAX ; } void EMRead::compute_loglikelihood() { // compute the loglikelihood this->read_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; /* // rescale the values for(size_t i=0; in_row; i++) { for(size_t j=0; jn_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } */ // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute 
the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMRead::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMRead::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } void EMRead::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. 
; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMRead::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMRead::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. 
; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } + // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) - { + { // avoid p=0. by rounding errors double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ReadLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMRead::update_models() { this->read_layer->update_model(this->post_prob, this->post_prob_colsum, this->threads) ; } diff --git a/src/Clustering/EMSequence.cpp b/src/Clustering/EMSequence.cpp index 34bfb99..89eba6f 100644 --- a/src/Clustering/EMSequence.cpp +++ b/src/Clustering/EMSequence.cpp @@ -1,404 +1,398 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // exp() #include // SequenceLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool #include // dna::base_composition() -template -std::ostream& operator << (std::ostream& stream, const std::vector& v) -{ for(const auto& x : v) - { stream << x << " " ; } - return stream ; -} EMSequence::EMSequence(const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // compute background before giving data to // SequenceLayer Matrix2D bckg_motif ; if(bckg_class) { // sequence composition std::vector base_comp = dna::base_composition(seq_matrix, flip) ; // create a motif bckg_motif = Matrix2D(4, seq_matrix.get_ncol()-this->n_shift+1) ; for(size_t i=0; iseq_layer = new SequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; // overwrite last class as background class if(bckg_class) { this->seq_layer->set_class(this->n_class-1, bckg_motif) ; } } EMSequence::EMSequence(Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // compute background before giving data to // SequenceLayer Matrix2D bckg_motif ; if(bckg_class) { // sequence composition std::vector base_comp = dna::base_composition(seq_matrix, flip) ; // create a motif bckg_motif = Matrix2D(4, seq_matrix.get_ncol()-this->n_shift+1) ; for(size_t i=0; iseq_layer = new SequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; // overwrite last class as background class if(bckg_class) { this->seq_layer->set_class(this->n_class-1, bckg_motif) ; } } EMSequence::EMSequence(const Matrix2D& seq_matrix, const Matrix3D& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), motifs.get_dim()[0], n_iter, seq_matrix.get_ncol() - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly // getRandomGenerator(seed) ; // this->set_post_prob_random(seed) ; // data and models this->seq_layer = new SequenceLayer(seq_matrix, motifs, this->flip, bckg_class) ; // intialise the class prob uniformly this->set_state_prob_uniform() ; } EMSequence::EMSequence(Matrix2D&& seq_matrix, Matrix3D&& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), motifs.get_dim()[0], n_iter, seq_matrix.get_ncol() - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly // getRandomGenerator(seed) ; // this->set_post_prob_random(seed) ; // data and models this->seq_layer = new SequenceLayer(std::move(seq_matrix), std::move(motifs), this->flip, bckg_class) ; // initialise the class prob uniformly this->set_state_prob_uniform() ; } EMSequence::~EMSequence() { /* release the SequenceLayer allocated with new by the constructors ; the guard has to test for a NON-null pointer, with "== nullptr" the layer was never deleted */ if(this->seq_layer != nullptr) { delete this->seq_layer ; this->seq_layer = nullptr ; } } Matrix3D EMSequence::get_sequence_models() const { return seq_layer->get_model() ; } EMSequence::exit_codes EMSequence::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMSequence::exit_codes::ITER_MAX ; } void EMSequence::compute_loglikelihood() { // compute the loglikelihood this->seq_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMSequence::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop 
--------------------------- } } void EMSequence::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } void EMSequence::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMSequence::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) 
; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMSequence::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], SequenceLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMSequence::update_models() { this->seq_layer->update_model(this->post_prob, this->threads) ; } diff --git a/src/Clustering/ReadLayer.cpp b/src/Clustering/ReadLayer.cpp index d4156f6..367e2b4 100644 --- a/src/Clustering/ReadLayer.cpp +++ b/src/Clustering/ReadLayer.cpp @@ -1,513 +1,444 @@ #include #include // std::invalid_argument #include // numeric_limits #include // log(), exp(), pow() #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // beta_pmf(), poisson_pmf() #include // rand_real_uniform(), rand_int_uniform() #include #include #include #include #include typedef std::vector vector_d ; 
ReadLayer::ReadLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, ThreadPool* threads) : DataLayer(data, n_class, n_shift, flip), window_means(n_row, n_shift, 0.) { this->n_category = 1 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0) ; // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, ThreadPool* threads) : DataLayer(std::move(data), n_class, n_shift, flip), window_means(n_row, n_shift, 0.) { this->n_category = 1 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0) ; // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(const Matrix2D& data, const Matrix3D& model, bool flip, ThreadPool* threads) : DataLayer(data, model, flip), window_means(n_row, n_shift, 0.) { // check that the model only has one category if(this->n_category > 1) { char msg[4096] ; sprintf(msg, "Error! model is expected to have length 1 on " "3rd dimension, not %zu", this->n_category) ; throw std::invalid_argument(msg) ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(Matrix2D&& data, Matrix3D&& model, bool flip, ThreadPool* threads) : DataLayer(std::move(data), std::move(model), flip), window_means(n_row, n_shift, 0.) { // check that the model only has one category if(this->n_category > 1) { char msg[4096] ; sprintf(msg, "Error! 
model is expected to have length 1 on " "3rd dimension, not %zu", this->n_category) ; throw std::invalid_argument(msg) ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::~ReadLayer() {} void ReadLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_row, std::ref(loglikelihood), std::ref(loglikelihood_max), promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void ReadLayer::compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, std::promise& done) const { // normalize the models Matrix3D model_norm = this->model ; for(size_t i=0; in_class; i++) { double mean = 0. 
; for(size_t j=0; jl_model; j++) { mean += model_norm(i,j,0) ; } mean /= this->l_model ; for(size_t j=0; jl_model; j++) { model_norm(i,j,0) /= mean ; } } - // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s_fw=0, s_rev=this->n_shift-1; s_fwn_shift; s_fw++, s_rev--) { // slice is [from_fw,to) // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] // fw |---------->>>----------| // ----------------------------------> data // rev |----------<<<----------| [from_dat_rev, to_dat_rev] // to_dat_rev can be -1 -> int // to_dat_rev from_dat_rev // log likelihood double ll_fw = 0. ; double ll_rev = 0. ; // --------------- forward --------------- size_t from_dat_fw = s_fw ; size_t to_dat_fw = from_dat_fw + this->l_model - 1 ; // --------------- reverse --------------- size_t from_dat_rev = this->n_col - 1 - s_fw ; // size_t to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev; j_dat_fwdata(i,j_dat_fw), model_norm(j,j_ref_fw,0)* this->window_means(i,s_fw))) ; - ll_fw += ll ; - // ll_fw += std::max(ll, ReadLayer::p_min_log) ; + // ll_fw += ll ; + // p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf + ll_fw += std::max(ll, ReadLayer::p_min_log) ; // --------------- reverse --------------- if(this->flip) { ll = log(poisson_pmf(this->data(i,j_dat_rev), model_norm(j,j_ref_fw,0)* this->window_means(i,s_rev))) ; - ll_rev += ll ; - // ll_rev += std::max(ll, ReadLayer::p_min_log) ; + // ll_rev += ll ; + // p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf + ll_rev += std::max(ll, ReadLayer::p_min_log) ; } } loglikelihood(i,j,from_dat_fw,flip_states::FORWARD) = ll_fw ; // keep track of the max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } if(this->flip) { loglikelihood(i,j,from_dat_fw,flip_states::REVERSE) = ll_rev ; // keep track of the max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } 
} } } done.set_value(true) ; } void ReadLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) { // computing sum over the columns (classes) size_t n_row = posterior_prob.get_dim()[0] ; size_t n_class = posterior_prob.get_dim()[1] ; size_t n_shift = posterior_prob.get_dim()[2] ; size_t n_flip = posterior_prob.get_dim()[3] ; vector_d colsum(n_class, 0.) ; for(size_t i=0; iupdate_model(posterior_prob, colsum, threads) ; - /* - // don't parallelize - if(threads == nullptr) - { std::promise> promise ; - std::future> future = promise.get_future() ; - this->update_model_routine(0, - this->n_row, - posterior_prob, - colsum, - promise) ; - this->model = future.get() ; - } - // parallelize - else - { size_t n_threads = threads->getNThread() ; - // compute the slices on which each thread will work - std::vector> slices = - ThreadPool::split_range(0, this->n_row, n_threads) ; - - // get promises and futures - // the function run by the threads will simply fill the promise with - // "true" to indicate that they are done - std::vector>> promises(n_threads) ; - std::vector>> futures(n_threads) ; - for(size_t i=0; iaddJob(std::move( - std::bind(&ReadLayer::update_model_routine, - this, - slice.first, - slice.second, - posterior_prob, - colsum, - std::ref(promises[i])))) ; - } - // reinitialise the model - this->model = Matrix3D(this->n_class, - this->l_model, - this->n_category, - 0.) 
; - // wait until all threads are done working - // and update the model - for(auto& future : futures) - { Matrix3D model_part = future.get() ; - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_model; j++) - { for(size_t k=0; kn_category; k++) - { this->model(i,j,k) += - model_part(i,j,k) ; - } - } - } - } - // -------------------------- threads stop --------------------------- - } - // avoid 0's in the model to ensure that pmf_poisson() never - // return 0 - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_model; j++) - { for(size_t k=0; kn_category; k++) - { this->model(i,j,k) = - std::max(this->model(i,j,k), ReadLayer::p_min) ; - } - } - } - */ } void ReadLayer::update_model(const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise> promise ; std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, posterior_prob_colsum, promise) ; this->model = future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector>> promises(n_threads) ; std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(posterior_prob_colsum), std::ref(promises[i])))) ; } // reinitialise the model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) 
; // wait until all threads are done working // and update the mode for(auto& future : futures) { Matrix3D model_part = future.get() ; for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // avoid 0's in the model to ensure that pmf_poisson() never // return 0 for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = std::max(this->model(i,j,k), ReadLayer::p_min) ; } } } } void ReadLayer::update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; this->check_posterior_prob_colsum_dim(posterior_prob_colsum) ; // partial model Matrix3D model(this->n_class, this->l_model, this->n_category, 0.) ; for(size_t n_class=0; n_class < this->n_class; n_class++) { for(size_t i=from; in_shift; n_shift++) { // --------------- forward --------------- int from_dat_fw = n_shift ; int to_dat_fw = from_dat_fw + this->l_model - 1 ; for(int j_dat_fw=from_dat_fw, j_ref_fw=0; j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++) { model(n_class,j_ref_fw,0) += (posterior_prob(i,n_class,n_shift,flip_states::FORWARD) * this->data(i,j_dat_fw)) / posterior_prob_colsum[n_class] ; } // --------------- reverse --------------- if(this->flip) { int from_dat_rev = this->n_col - 1 - n_shift ; int to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(int j_dat_rev=from_dat_rev, j_ref_fw=0; j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) { model(n_class,j_ref_fw,0) += (posterior_prob(i,n_class,n_shift,flip_states::REVERSE) * this->data(i,j_dat_rev)) / posterior_prob_colsum[n_class] ; } } } } } promise.set_value(model) ; } void ReadLayer::compute_window_means(ThreadPool* threads) { // don't parallelize if(threads == 
nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_window_means_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_window_means_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void ReadLayer::compute_window_means_routine(size_t from, size_t to, std::promise& done) { double l_window = double(this->l_model) ; for(size_t i=from; in_shift; from++) { double sum = 0. ; // slice is [from,to) size_t to = from + this->l_model ; for(size_t j=from; jdata(i,j) ;} this->window_means(i,from) = sum / l_window ; } } done.set_value(true) ; } void ReadLayer::check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const { if(posterior_prob_colsum.size() != this->n_class) { char msg[4096] ; sprintf(msg, "Error! posterior_class_prob matrix size is not " "equal to model class number : %zu / %zu", posterior_prob_colsum.size(), this->n_class) ; throw std::invalid_argument(msg) ; } } diff --git a/src/Matrix/Matrix.hpp b/src/Matrix/Matrix.hpp index a769c96..6a0212a 100644 --- a/src/Matrix/Matrix.hpp +++ b/src/Matrix/Matrix.hpp @@ -1,684 +1,840 @@ #ifndef MATRIX_HPP #define MATRIX_HPP #include #include // accumulate() #include +#include #include // setw(), setprecision(), fixed #include // out_of_range, invalid_argument #include // swap()f /*! 
* \brief The Matrix class is a generic class to store data in a matrix. * The matrix dimensionality can be any value : 1 is a vector, 2 is a regular * 2D matrix, 3 is a 3D matrix, etc. * * In order to store the data properly and to perform all operations smoothly, the * internal representation format differs from the "usual format". That is : the user * provides coordinates as (x,y,z,...) where x refers to the row number, y to * the column number, z to the z slice, etc. * Internally however, x corresponds to the column number and y to the row number. * Every other dimension has the same meaning. * * Internal representation : * * Here is an example of a 2x3 matrix (2D) * * {0,1,2,3,4,5} vector is turned to * X * ----------> * 0 1 2 | * 3 4 5 | Y * \|/ * * dimensions are stored as {nx, ny} which corresponds to {ncol, nrow}. Coordinates * are given using the universal format coord=(x,y) which are interpreted as {row, col}. * Thus a simple swap(coord[0],coord[1]) should be performed to ensure that the user-given * coordinates can be used in this frame of reference. * * * Here is an example of a 2x3x2x2 matrix(4D) * {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23} is turned to * * X * -----------> | | * 0 1 2 | | | * 3 4 5 | Y | | * \|/ | Z | * 6 7 8 | | | * 9 10 11 | Y | | * \|/ \|/ | * | A * 12 13 14 | | | * 15 16 17 | Y | | * \|/ | Z | * 18 19 20 | | | * 21 22 23 | Y | | * \|/ \|/ \|/ * * dimensions are stored as {nx, ny, nz, na} which corresponds to {ncol, nrow, nz, na}. * Coordinates are given using the universal format coord=(x,y,z,a) which are interpreted * as {row, col, z, a}. Thus a simple swap(coord[0],coord[1]) should be performed to ensure * that the user-given coordinates can be used in this frame of reference. * * + * A format to save a Matrix object in a binary file is defined. The following values + * are written : + * 1x size_t : the number of dimensions of the Matrix stored. This is the value + * of the _dim_size field. 
+ * Nx size_t : the width of the matrix in each dimension. These values correspond + * to the content of the _dim vector and can be loaded inside this + * vector as they are. + * Dx values : the values contained in the matrix, in the _data vector. + * These values can be loaded directly in this vector. is equal to the + * product of the values stored right before. The type depends on + * the type of the data stored in the matrix. The 1st of the D values is + * also the first value of the _data vector. */ template class Matrix { public: // constructors Matrix() = default ; /*! * \brief Constructs an matrix with the given dimension with * 0 values. * \param dim the dimensions. */ Matrix(const std::vector& dim) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param dim the dimensions. * \param value the value to initialize the matrix content * with. */ Matrix(const std::vector& dim, T value) ; /*! * \brief Copy constructor. * \param other the matrix to copy. */ Matrix(const Matrix& other) ; /*! * \brief Move constructor. * \param other the matrix to use. */ Matrix(Matrix&& other) ; /*! * \brief Destructor. */ - virtual ~Matrix() = default ; + virtual ~Matrix() ; // methods + /*! + * \brief loads a matrix from the given binary + * file. + * \param path the path to the file to read. + * \param dim_n the expected number of dimensions + * of the matrix. + * \throw std::invalid_argument if the dimensionality + * of the matrix stored in the file is not equal to + * the expected number of dimensions and + * std::runtime_error if any reading error + * occures. + */ + virtual void load(const std::string& file_address, + size_t dim_n) ; + + /*! + * \brief writes to content of the matrix + * to a given binary file. + * \param path the path to the file. + */ + virtual void save(const std::string& file_address) ; + /*! * \brief Gets the element at the given offset. * \param offset the offset of the element to get. 
* \throw std::out_of_range exception if the offset * is out of range. * \return the element. */ T get(size_t offset) const ; /*! * \brief Gets the element at the given coordinates. * \param coord the coordinates of the element to get. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ T get(const std::vector& coord) const ; /*! * \brief Sets the element at the given offset * to the given value. * \param offset the offset of the element to set. * \param value the new value. * \throw std::out_of_range exception if the offset * is out of range. */ void set(size_t offset, T value) ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param coord the coordinates of the element to set. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ void set(const std::vector& coord, T value) ; /*! * \brief Gets the matrix dimensions. * \return the dimensions. */ std::vector get_dim() const ; /*! * \brief Gets the data vector. * \return a a vector containing the data. */ std::vector get_data() ; /*! * \brief Gets the number of dimensions (the length * of the dimension vector). * \return the number of dimensions */ size_t get_dim_size() const ; /*! * \brief Gets the number of elements contained in the * matrix. * \return the number of element contained in the * matrix. */ size_t get_data_size() const ; /*! * \brief Returns the partial products of the dimensions. * \return the partial products of the dimensions. */ std::vector get_dim_product() const ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stram, size_t precision=4, size_t width=8, char sep=' ') const ; // operator /*! * \brief Assignment operator. 
* \param other an other matrix to copy the values from. * \return a reference to the current instance. */ Matrix& operator = (const Matrix& other) ; /*! * \brief Move assignment operator. * \param other an other matrix to use the values from. * \return a reference to the current instance. */ Matrix& operator = (Matrix&& other) ; /*! * \brief Adds value to each element. * \param value the value to add. * \return a reference to the instance. */ Matrix& operator += (T value) ; /*! * \brief Substracts value to each element. * \param value the value to substract. * \return a reference to the instance. */ Matrix& operator -= (T value) ; /*! * \brief Multiplies each element by value. * \param value the value to multiply the elements by. * \return a reference to the instance. */ Matrix& operator *= (T value) ; /*! * \brief Divides each element by value. * \param value the value to multiply the elements by. * \throw std::invalid_argument if value is 0. * \return a reference to the instance. */ Matrix& operator /= (T value) ; /*! * \brief Comparison operator, returns true if * both matrices are identical, that is do not * have the same data and dimensions. * \param other an other matrix. * \return true if both matrices have the same * data and dimensions. */ bool operator == (const Matrix& other) const ; /*! * \brief Comparison operator, returns true if * both matrices are different, that is do not * have the same data and dimensions. * \param other an other matrix. * \return true if both matrices are different. */ bool operator != (const Matrix& other) const ; /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param coord coord the coordinates of the element to get. * \return a reference to this element. */ T& operator () (const std::vector& coord) ; /*! * \brief Returns a const reference to the corrresponding * element. This method does not perform any check on * the coordinates. 
* \param coord coord the coordinates of the element to get. * \return a const reference to this element. */ const T& operator () (const std::vector& coord) const ; protected: // methods /*! * \brief Computes the partial dimension products and fills * this->dim_prod according to the current values of * this->_dim and this->dim_size. */ void compute_dim_product() ; /*! * \brief Given a vector of at least 2 dimensional coordinates, * it simply swaps the elements at index 0 (row number) and 1 * (column number) to make them fit the x,y,... matrix * reprensetation (x:number of columns, y:number of rows). * \param coord a vector of coordinates (row, column, ...). * \return a vector of coordinates corresponding to (x,y,...). */ std::vector swap_coord(const std::vector& coord) const ; /*! * \brief Complementary function of convert_coord(). Given * a vector of coordinates in (x,y,...) format, it turns it * into (row,col,...) format. * \param coord a vector of coordinates (x,y, ...). * \return a vector of coordinates corresponding to (row,col,...). */ std::vector convert_coord_back(const std::vector& coord) const ; /*! * \brief Checks whether a given offset is a valid offset or * whether it is out of range. * \param offset the offset to check. * \return whether the offset is valid. */ bool is_valid(size_t offset) const ; /*! * \brief Checks whether coordinates in (x,y,...) format are * valid or whether they are out of range. * \param offset the offset to check. * \return whether the offset is valid. */ bool is_valid(const std::vector& coord) const ; /*! * \brief Converts a vector of VALID (x,y,...) coordinates to a * the corresponding offset allowing to get an element in the * data vector. * If the coordinate vector has a (row, column, ...) format, the * result will be wrong. * \param coord a vector of coordinates with (x,y,...) format. * \return the corresponding offset. */ size_t convert_to_offset(const std::vector& coord) const ; /*! 
* \brief Complementary function of convert_to_offset(). Given an * offset, this function returns the corresponding coordinate * vector in (x,y,...) format. * \param offset a given offset. * \return the corresponding vector of (x,y,..) coordinates. */ std::vector convert_to_coord(size_t offset) const ; // fields /*! * \brief The dimensions values. */ std::vector _dim ; /*! * \brief Pointer to the heap-allocated vector storing the data. * It is allocated with new by the constructors and released by * the destructor. It defaults to nullptr so that destroying a * default-constructed matrix never deletes an indeterminate * pointer. */ - std::vector _data ; + std::vector* _data = nullptr ; /*! * \brief The number of dimensions. */ size_t _dim_size ; /*! * \brief The number of data elements stored. */ size_t _data_size ; - /*! * \brief Contains the partial product of the dimensions. That is, * the ith element contains the product of all the i-1 precedent * dimensions : * element 0 : 1, element 1 : x, element 2 : x*y, element 3 : x*y*z, * and so one. * This is used for coordinates to offset and offset to coordinates * conversions. */ std::vector _dim_prod ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix operator + (Matrix m, T value) { Matrix other(m) ; other += value ; return other ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix operator - (Matrix m, T value) { Matrix other(m) ; other -= value ; return other ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix operator * (Matrix m, T value) { Matrix other(m) ; other *= value ; return other ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. 
*/ template const Matrix operator / (Matrix m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. */ template std::ostream& operator << (std::ostream& stream, const Matrix& m) { m.print(stream) ; return stream ; } // method implementation template Matrix::Matrix(const std::vector& dim) : Matrix(dim, 0) {} - template Matrix::Matrix(const std::vector& dim, T value) { this->_dim_size = dim.size() ; this->_dim = this->swap_coord(dim) ; this->_data_size = std::accumulate(dim.begin(), dim.end(), (size_t)1, std::multiplies()) ; - this->_data = std::vector(this->_data_size, value) ; + this->_data = new std::vector(this->_data_size, value) ; this->compute_dim_product() ; } template Matrix::Matrix(const Matrix& other) -{ *this = other ; } +{ this->_dim_size = other._dim_size ; + this->_dim = other._dim ; + this->_data_size = other._data_size ; + this->_data = new std::vector(*(other._data)) ; + this->_dim_prod = other._dim_prod ; +} template -Matrix::Matrix(Matrix&& other) -{ this->_dim = other._dim ; - this->_data = other._data ; - this->_dim_size = other._dim_size ; +Matrix::Matrix(Matrix&& other) +{ this->_dim_size = other._dim_size ; + this->_dim = other._dim ; this->_data_size = other._data_size ; - this->_dim_prod = other._dim_prod ; + this->_data = other._data ; + other._data = nullptr ; + this->_dim_prod = other._dim_prod ; +} + +template +Matrix::~Matrix() +{ if(this->_data != nullptr) + { delete this->_data ; + this->_data = nullptr ; + } +} + +template +void Matrix::load(const std::string& file_address, + size_t dim_n) +{ + // open + std::ifstream file(file_address, std::ifstream::in | std::ifstream::binary) ; + if(file.fail()) + { char msg[4096] ; + sprintf(msg, "error! 
cannot open %s", file_address.c_str()) ; + throw std::runtime_error(msg) ; + } + + // read number of dimensions + file.read((char*) &(this->_dim_size), sizeof(size_t)) ; + if(not file) + { file.close() ; + char msg[4096] ; + sprintf(msg, "Error! something occured while reading number of dimensions in %s", + file_address.c_str()) ; + throw std::invalid_argument(msg) ; + } + // this file does not store a matrix with the expected dimensions + if(this->_dim_size != dim_n) + { file.close() ; + char msg[4096] ; + sprintf(msg, "Error! Invalid number of dimensions (%zu) found in %s", + this->_dim_size, + file_address.c_str()) ; + throw std::runtime_error(msg) ; + } + + // read dimensions + this->_dim = std::vector(this->_dim_size) ; + file.read((char*) &(this->_dim[0]), this->_dim_size*sizeof(size_t)) ; + if(not file) + { file.close() ; + char msg[4096] ; + sprintf(msg, "Error! something occured while reading dimensions in %s", + file_address.c_str()) ; + throw std::runtime_error(msg) ; + } + + // read data + this->_data_size = std::accumulate(this->_dim.begin(), + this->_dim.end(), + (size_t)1, + std::multiplies()) ; + this->_data = new std::vector(this->_data_size) ; + file.read((char*) &((*this->_data)[0]), this->_data_size*sizeof(T)) ; + if(not file) + { file.close() ; + char msg[4096] ; + sprintf(msg, "Error! something occured while reading data in %s", + file_address.c_str()) ; + throw std::runtime_error(msg) ; + } + + file.close() ; + + this->compute_dim_product() ; +} + +template +void Matrix::save(const std::string &file_address) +{ + // open + std::ofstream file(file_address, std::ifstream::out | std::ifstream::binary) ; + if(file.fail()) + { char msg[4096] ; + sprintf(msg, "error! cannot open %s", file_address.c_str()) ; + throw std::runtime_error(msg) ; + } + + // write number of dimensions + file.write((char*) &this->_dim_size, sizeof(size_t)) ; + if(not file) + { char msg[4096] ; + sprintf(msg, "Error! 
something happened while writting dimension number to %s", + file_address.c_str()) ; + file.close() ; + throw std::runtime_error(msg) ; + } + + // write dimensions + for(auto x : this->_dim) + { file.write((char*) &x, sizeof(size_t)) ; + if(not file) + { char msg[4096] ; + sprintf(msg, "Error! something happened while writting dimensions to %s", + file_address.c_str()) ; + file.close() ; + throw std::runtime_error(msg) ; + } + } + + // write data + file.write((char*) &((*this->_data)[0]), this->_data_size*sizeof(T)) ; + if(not file) + { char msg[4096] ; + sprintf(msg, "Error! something happened while writting data to %s", + file_address.c_str()) ; + file.close() ; + throw std::runtime_error(msg) ; + } + + file.close() ; } template T Matrix::get(size_t offset) const { if(not this->is_valid(offset)) { throw std::out_of_range("offset is out of range!") ; } - return this->_data[offset] ; + return (*this->_data)[offset] ; } template T Matrix::get(const std::vector& coord) const { std::vector coord_new = this->swap_coord(coord) ; if(not this->is_valid(coord_new)) { throw std::out_of_range("coordinates are out of range!") ; } - return this->_data[this->convert_to_offset(coord_new)] ; + return (*this->_data)[this->convert_to_offset(coord_new)] ; } template void Matrix::set(size_t offset, T value) { if(not this->is_valid(offset)) { throw std::out_of_range("offset is out of range!") ; } - this->_data[offset] = value ; + (*this->_data)[offset] = value ; } template void Matrix::set(const std::vector& coord, T value) { std::vector coord_new = this->swap_coord(coord) ; if(not this->is_valid(coord_new)) { throw std::out_of_range("coordinates are out of range!") ; } - this->_data[this->convert_to_offset(coord_new)] = value ; + (*this->_data)[this->convert_to_offset(coord_new)] = value ; } template std::vector Matrix::get_dim() const { return this->swap_coord(this->_dim) ; } template std::vector Matrix::get_data() -{ return this->_data ; } +{ return (*this->_data) ; } template 
size_t Matrix::get_dim_size() const { return this->_dim_size ; } template size_t Matrix::get_data_size() const { return this->_data_size ; } template std::vector Matrix::get_dim_product() const { return this->_dim_prod ; } template void Matrix::print(std::ostream& stream, size_t precision, size_t width, char sep) const { stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; for(size_t i=0; iget_data_size(); i++) { stream << std::setw(width) << this->get(i) << sep ; } } template Matrix& Matrix::operator = (const Matrix& other) { this->_dim = other._dim ; this->_dim_size = other._dim_size ; - this->_data = other._data ; + this->_data = new std::vector(*other._data) ; this->_data_size = other._data_size ; this->_dim_prod = other._dim_prod ; return *this ; } template Matrix& Matrix::operator = (Matrix&& other) { this->_dim = other._dim ; this->_dim_size = other._dim_size ; this->_data = other._data ; + other._data = nullptr ; this->_data_size = other._data_size ; this->_dim_prod = other._dim_prod ; return *this ; } template Matrix& Matrix::operator += (T value) -{ for(auto& i : this->_data) +{ for(auto& i : (*this->_data)) { i += value ; } return *this ; } template Matrix& Matrix::operator -= (T value) -{ for(auto& i : this->_data) +{ for(auto& i : (*this->_data)) { i -= value ; } return *this ; } template Matrix& Matrix::operator *= (T value) -{ for(auto& i : this->_data) +{ for(auto& i : (*this->_data)) { i *= value ; } return *this ; } template Matrix& Matrix::operator /= (T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } - for(auto& i : this->_data) + for(auto& i : (*this->_data)) { i /= value ; } return *this ; } template bool Matrix::operator == (const Matrix& other) const { if(&other == this) { return true ; } // check dim if(this->_dim_size != other._dim_size) { return false ; } for(size_t i=0; i_dim_size; i++) { if(this->_dim[i] != other._dim[i]) { return false ; } } // check data 
if(this->_data_size != other._data_size) { return false ; } for(size_t i=0; i_data_size; i++) - { if(this->_data[i] != other._data[i]) + { if((*this->_data)[i] != (*other._data)[i]) { return false ; } } return true ; } template bool Matrix::operator !=(const Matrix& other) const { return not ((*this) == other) ;} template T& Matrix::operator () (const std::vector& coord) { std::vector coord_new = this->swap_coord(coord) ; - return this->_data[this->convert_to_offset(coord_new)] ; + return (*this->_data)[this->convert_to_offset(coord_new)] ; } template const T& Matrix::operator () (const std::vector& coord) const { std::vector coord_new = this->swap_coord(coord) ; - return this->_data[this->convert_to_offset(coord_new)] ; + return (*this->_data)[this->convert_to_offset(coord_new)] ; } template void Matrix::compute_dim_product() { this->_dim_prod = std::vector(this->_dim_size, 0) ; this->_dim_prod[0] = 1 ; if(this->_dim_size > 1) { this->_dim_prod[1] = this->_dim[0] ; } if(this->_dim_size > 2) { for(size_t i=2; i_dim_size; i++) { this->_dim_prod[i] = this->_dim_prod[i-1]*this->_dim[i-1] ; } } } template std::vector Matrix::swap_coord(const std::vector &coord) const { std::vector coord_new = coord ; // reformat coord = (row,col,...) = (y,y,...) into coord = (col,row,...) = (x,y,...) 
if(this->_dim_size > 1) { std::swap(coord_new[0], coord_new[1]) ; } return coord_new ; } template bool Matrix::is_valid(size_t offset) const { if(offset > this->_data_size-1) { return false ; } return true ; } template bool Matrix::is_valid(const std::vector& coord) const { if(coord.size() != this->_dim_size) { return false ; } for(size_t i=0; i this->_dim[i]) { return false ; } } return true ; } template size_t Matrix::convert_to_offset(const std::vector& coord) const { size_t offset = 0 ; for(size_t i=0; i_dim_size; i++) { offset += coord[i] * this->_dim_prod[i] ; } return offset ; } template std::vector Matrix::convert_to_coord(size_t offset) const { std::vector coord(this->_dim_size, 0) ; for(int i=this->_dim_size-1; i>=0; i--) { size_t c = offset / this->_dim_prod[i] ; coord[i] = c ; offset -= (this->_dim_prod[i]*c) ; } return coord ; } #endif // MATRIX_HPP diff --git a/src/Matrix/Matrix2D.hpp b/src/Matrix/Matrix2D.hpp index fac67be..286b884 100644 --- a/src/Matrix/Matrix2D.hpp +++ b/src/Matrix/Matrix2D.hpp @@ -1,609 +1,634 @@ #ifndef MATRIX2D_HPP #define MATRIX2D_HPP #include #include #include -#include // ifstream +#include // std::move() +#include // ifstream #include -#include // setw(), setprecision(), fixed -#include // istringstream -#include // runtime_error, out_of_range +#include // setw(), setprecision(), fixed +#include // istringstream +#include // runtime_error, out_of_range #define BUFFER_SIZE 4096 /*! The Matrix2D class is a specialisation of the Matrix * class to make work with 2D matrices easier. * + * + * A format to save a Matrix2D objects in a binary file is defined. The following values + * are written : + * 1x size_t : the number of dimensions of the Matrix stored. This is the value + * of _dim_size field. This value must be 2 otherwise this is not a 2D + * matrix. + * 2x size_t : the width of the matrix in each dimension. These values correspond + * to the content of the _dim vector and can be loaded inside this + * vector as they are. 
+ * Dx values : the values contained in the matrix, in the _data vector. + * These values can be loaded directly in this vector. is equal to the + * product of the 2 values stored right before. The type depends on + * the type of the data stored in the matrix. The 1st of the D values is + * also the first value of the _data vector. + * + * * A text format is defined to store such matrices. * In this format, each row is written on a single line * and the values should separated by any blank character * (tab, space, multiple spaces, ...). Empty lines are * not allowed. * * ---- start ---- * 1 2 3 * 4 5 6 * 7 8 9 * ----- end ----- * * Constructing a matrix from an empty file (0 bytes or only an EOL char) returns a null * matrix (0x0 dimensions). Writting a null matrix (that is with at least one null * dimension creates an empty file. * */ template class Matrix2D : public Matrix { public: // constructors Matrix2D() = default ; /*! * \brief Constructs a matrix with the given dimensions, * filled with 0 values. * \param nrow the number of rows. * \param ncol the number of columns. */ Matrix2D(size_t nrow, size_t ncol) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param nrow the number of rows. * \param ncol the number of columns. * \param value the value to initialize the matrix content * with. */ Matrix2D(size_t nrow, size_t ncol, T value) ; /*! * \brief Copy constructor * \param other the matrix to copy the values from. */ Matrix2D(const Matrix2D& other) ; /*! * \brief Move constructor * \param other the matrix to use the values from. */ Matrix2D(Matrix2D&& other) ; /*! * \brief Constructs a matrix from a text file. A matrix contructed * from an empty file (or a file containing only one EOL char) returns * an empty matrix (null dimensions). * \param file_address the address of the file containing the matrix. 
* \throw std::runtime_error if anything happen while reading the * file (format error, file not found, etc). */ Matrix2D(const std::string& file_address) ; /*! * \brief Destructor. */ - virtual ~Matrix2D() = default ; + virtual ~Matrix2D() ; // methods overloaded in Matrix using Matrix::get ; using Matrix::set ; // methods + /*! + * \brief loads a binary file containing + * a matrix. + * \param path the path to the file. + */ + void load(const std::string& file_address) ; + /*! * \brief Gets the element at the given coordinates. * \param row the row number of the element to set. * \param col the column number of the element to set. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ T get(size_t row, size_t col) const ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param row the row number of the element to set. * \param col the column number of the element to set. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ void set(size_t row, size_t col, T value) ; /*! * \brief Gets the number of rows. * \return the number of rows. */ size_t get_nrow() const ; /*! * \brief Gets the number of columns. * \return the number of columns. */ size_t get_ncol() const ; /*! * \brief Gets the values in the i-th row. * \param i the row of interest. * \throw std::out_of_range if i is out of range. * \return the values in this row. */ std::vector get_row(size_t i) const ; /*! * \brief Gets the values in the i-th column. * \param i the column of interest. * \throw std::out_of_range if i is out of range. * \return the values in this column. */ std::vector get_col(size_t i) const ; /*! * \brief Sets the values of a given rows with the values of a given * vector. * \param i the row of interest. * \param values the new values. * \throw std::out_of_range if i is out of range. 
* \throw std::invalid_argument if values does not have a length equal * to the number of columns of the matrix. */ void set_row(size_t i, const std::vector& values) ; /*! * \brief Sets the values of a given column with the values of a given * vector. * \param i the column of interest. * \param values the new values. * \throw std::out_of_range if i is out of range. * \throw std::invalid_argument if values does not have a length equal * to the number of rows of the matrix. */ void set_col(size_t i, const std::vector& values) ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stram, size_t precision=4, size_t width=8, char sep=' ') const override ; // operators /*! * Assignment operator. * \param other an other matrix to copy the values from. * \return a reference to the current the instance. */ Matrix2D& operator = (const Matrix2D& other) ; /*! * Move Assignment operator. * \param other an other matrix to use the values from. * \return a reference to the instance. */ Matrix2D& operator = (Matrix2D&& other) ; /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param row the row number of the element to set. * \param col the column number of the element to set. * \return a reference to this element. */ T& operator () (size_t row, size_t col) ; /*! * \brief Returns a const reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param row the row number of the element to set. * \param col the column number of the element to set. * \return a const reference to this element. */ const T& operator () (size_t row, size_t col) const ; private: /*! 
* \brief Converts a pair of VALID (x,y) coordinates to a * the corresponding offset allowing to get an element in the * data vector. * \param row the row index. * \param col the column index. * \return the corresponding offset. */ size_t convert_to_offset(size_t row, size_t col) const ; /*! * \brief Computes and stores the offsets at which * each row start. */ void compute_row_offsets() ; /*! * \brief Computes and stores the offsets at which * each row start. */ void compute_col_offsets() ; /*! * \brief Contains the offsets at which each row starts. * Each element corresponds to the corresponding rows * (1st element -> 1st row). */ std::vector _row_offsets ; /*! * \brief Contains the offsets at which each row starts. * Each element corresponds to the corresponding rows * (1st element -> 1st row). */ std::vector _col_offsets ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix2D operator + (Matrix2D m, T value) { Matrix2D other(m) ; m += value ; return m ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix2D operator - (Matrix2D m, T value) { Matrix2D other(m) ; m -= value ; return m ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix2D operator * (Matrix2D m, T value) { Matrix2D other(m) ; m *= value ; return m ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. 
*/ template const Matrix2D operator / (Matrix2D m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix2D other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. */ template std::ostream& operator << (std::ostream& stream, const Matrix2D& m) { m.print(stream) ; return stream ; } // other usefull functions /*! * \brief Produces a transpose of the given matrix. * \param m a matrix. */ template Matrix2D transpose(const Matrix2D& m) ; // method implementation template Matrix2D transpose(const Matrix2D& m) { std::vector dim = m.get_dim() ; size_t nrow = dim[0] ; size_t ncol = dim[1] ; Matrix2D m2(ncol, nrow, 0) ; for(size_t i=0; i Matrix2D::Matrix2D(size_t nrow, size_t ncol) : Matrix2D(nrow, ncol, static_cast(0)) {} template Matrix2D::Matrix2D(size_t nrow, size_t ncol, T value) : Matrix({nrow, ncol}, value), _row_offsets(nrow), _col_offsets(ncol) { this->compute_row_offsets() ; this->compute_col_offsets() ; } template Matrix2D::Matrix2D(const Matrix2D& other) : Matrix(other) { this->_row_offsets = other._row_offsets ; this->_col_offsets = other._col_offsets ; } template -Matrix2D::Matrix2D(Matrix2D&& other) - : Matrix(other) +Matrix2D::Matrix2D(Matrix2D&& other) + : Matrix(std::move(other)) { this->_row_offsets = other._row_offsets ; this->_col_offsets = other._col_offsets ; } - template Matrix2D::Matrix2D(const std::string &file_address) -// : Matrix({0,0}) { this->_dim = {0,0} ; - this->_data = std::vector() ; + this->_data = new std::vector() ; this->_dim_size = this->_dim.size() ; - this->_data_size = this->_data.size() ; + this->_data_size = this->_data->size() ; this->_dim_prod = std::vector(this->_dim_size, 0) ; std::ifstream file(file_address, std::ifstream::in) ; if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! 
cannot open %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } std::string buffer_str ; std::vector buffer_vec ; T buffer_T ; // read file size_t n_line = 0 ; size_t row_len = 0 ; while(getline(file, buffer_str)) { // check stream status and read content if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } if(buffer_str.size() == 0) { // this file only contains one eol char and should be considered as empty, // -> returns empty matrix not an error if(n_line == 0 and file.peek() == EOF and file.eof()) { break ; } file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // parse line buffer_vec.clear() ; std::istringstream buffer_ss(buffer_str) ; while(buffer_ss >> buffer_T) { buffer_vec.push_back(buffer_T) ; } // check for an error which likely indicates that a value could not be // casted into a type T (mixed data types in the file) if(buffer_ss.fail() and not buffer_ss.eof()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check that number of column is constant if(n_line == 0) { row_len = buffer_vec.size() ; } else if(buffer_vec.size() != row_len) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
variable number of columns in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // update matrix content for(auto i : buffer_vec) - { this->_data.push_back(i) ; + { this->_data->push_back(i) ; this->_data_size++ ; } this->_dim[1]++ ; n_line++ ; } file.close() ; this->_dim[0] = row_len ; this->compute_dim_product() ; this->_row_offsets = std::vector(this->_dim[1]) ; this->_col_offsets = std::vector(this->_dim[0]) ; this->compute_row_offsets() ; this->compute_col_offsets() ; } +template +Matrix2D::~Matrix2D() +{ if(this->_data != nullptr) + { delete this->_data ; + this->_data = nullptr ; + } +} + +template +void Matrix2D::load(const std::string& file_address) +{ + Matrix::load(file_address, 2) ; + this->_row_offsets = std::vector(this->_dim[1]) ; + this->_col_offsets = std::vector(this->_dim[0]) ; + + this->compute_dim_product() ; + this->compute_row_offsets() ; + this->compute_col_offsets() ; +} template T Matrix2D::get(size_t row, size_t col) const { try { return this->get({row, col}) ; } catch(std::out_of_range& e) { throw e ; } } template void Matrix2D::set(size_t row, size_t col, T value) { try { this->set({row, col}, value) ; } catch(std::out_of_range& e) { throw e ; } } template size_t Matrix2D::get_nrow() const { return this->_dim[1] ; } template size_t Matrix2D::get_ncol() const { return this->_dim[0] ; } template std::vector Matrix2D::get_row(size_t i) const { if(i>=this->get_nrow()) { throw std::out_of_range("row index is out of range!") ; } std::vector row(this->get_ncol()) ; for(size_t j=i*this->get_ncol(), n=0; nget_ncol(); j++, n++) - { row[n] = this->_data[j] ; } + { row[n] = (*this->_data)[j] ; } return row ; } template std::vector Matrix2D::get_col(size_t i) const { if(i>=this->get_ncol()) { throw std::out_of_range("column index is out of range!") ; } std::vector col(this->get_nrow()) ; for(size_t j=i, n=0; nget_nrow(); j+=this->get_ncol(), n++) - { col[n] = this->_data[j] ; } + { col[n] = (*this->_data)[j] ; } return col ; } template 
void Matrix2D::set_row(size_t i, const std::vector& values) { if(i>=this->get_nrow()) { throw std::out_of_range("row index is out of range!") ; } else if(values.size() != this->get_ncol()) { throw std::invalid_argument("the given vector length is not equal to the number of columns!") ; } for(size_t j=i*this->get_ncol(), n=0; nget_ncol(); j++, n++) - { this->_data[j] = values[n] ; } + { (*this->_data)[j] = values[n] ; } } template void Matrix2D::set_col(size_t i, const std::vector& values) { if(i>=this->get_ncol()) { throw std::out_of_range("row index is out of range!") ; } else if(values.size() != this->get_nrow()) { throw std::invalid_argument("the given vector length is not equal to the number of rows!") ; } for(size_t n=0, j=i; nget_nrow(); n++, j+=this->get_ncol()) - { this->_data[j] = values[n] ; } + { (*this->_data)[j] = values[n] ; } } template void Matrix2D::print(std::ostream& stream, size_t precision, size_t width, char sep) const { stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; size_t n = 0 ; size_t n_tot = this->get_nrow()*this->get_ncol() ; for(size_t i=0; iget_nrow(); i++) { for(size_t j=0; jget_ncol(); j++, n++) { stream << std::setw(width) << (*this)(i,j) << sep ; } if(n Matrix2D& Matrix2D::operator = (const Matrix2D& other) -{ std::cerr << "Matrix2D& Matrix2D::operator = (const Matrix2D& other)" << std::endl ; +{ /* this->_dim = other._dim ; this->_dim_size = other._dim_size ; - this->_data = other._data ; + this->_data = new std::vector(other._data) ; this->_data_size = other._data_size ; this->_dim_prod = other._dim_prod ; + */ + Matrix::operator=(other) ; this->_row_offsets = other._row_offsets ; this->_col_offsets = other._col_offsets ; return *this ; } template Matrix2D& Matrix2D::operator = (Matrix2D&& other) -{ std::cerr << "Matrix2D& Matrix2D::operator = (Matrix2D&& other)" << std::endl ; - this->_dim = other._dim ; - this->_dim_size = other._dim_size ; - this->_data = other._data ; - this->_data_size = 
other._data_size ; - this->_dim_prod = other._dim_prod ; +{ Matrix::operator=(std::move(other)) ; this->_row_offsets = other._row_offsets ; this->_col_offsets = other._col_offsets ; return *this ; } template T& Matrix2D::operator () (size_t row, size_t col) -{ // std::vector coord = {col, row} ; - // return this->_data[this->convert_to_offset(coord)] ; - return this->_data[this->convert_to_offset(row, col)] ; -} +{ return (*this->_data)[this->convert_to_offset(row, col)] ; } template const T& Matrix2D::operator () (size_t row, size_t col) const -{ // std::vector coord = {col, row} ; - // return this->_data[this->convert_to_offset(coord)] ; - return this->_data[this->convert_to_offset(row, col)] ; -} +{ return (*this->_data)[this->convert_to_offset(row, col)] ; } template void Matrix2D::compute_row_offsets() { for(size_t i=0; i_dim[1]; i++) { this->_row_offsets[i] = i * this->_dim_prod[1] ; } } template void Matrix2D::compute_col_offsets() { for(size_t i=0; i_dim[0]; i++) { this->_col_offsets[i] = i * this->_dim_prod[0] ; } } template size_t Matrix2D::convert_to_offset(size_t row, size_t col) const -{ /* - size_t offset = 0 ; - - for(size_t i=0; i_dim_size; i++) - { offset += coord[i] * this->_dim_prod[i] ; } - - return offset ; - */ +{ size_t offset = this->_row_offsets[row] + this->_col_offsets[col] ; return offset ; } #endif // MATRIX2D_HPP diff --git a/src/Matrix/Matrix3D.hpp b/src/Matrix/Matrix3D.hpp index 4ed6721..2bdd55a 100644 --- a/src/Matrix/Matrix3D.hpp +++ b/src/Matrix/Matrix3D.hpp @@ -1,599 +1,637 @@ #ifndef MATRIX3D_HPP #define MATRIX3D_HPP #include #include #include +#include // std::move() #include #include // setw(), setprecision(), fixed #include // ifstream #include // istringstream #include // runtime_error, out_of_range #include // equal() #define BUFFER_SIZE 4096 /*! * The Matrix3D class is a specialisation of the Matrix * class to make work with 3D matrices more easily. * - * A text file format is defined to store such matrices. 
The specifications are as + * A format to save a Matrix3D objects in a binary file is defined. The following values + * are written : + * 1x size_t : the number of dimensions of the Matrix stored. This is the value + * of _dim_size field. This value must be 3 otherwise this is not a 3D + * matrix. + * 3x size_t : the width of the matrix in each dimension. These values correspond + * to the content of the _dim vector and can be loaded inside this + * vector as they are. + * Dx values : the values contained in the matrix, in the _data vector. + * These values can be loaded directly in this vector. is equal to the + * product of the 3 values stored right before. The type depends on + * the type of the data stored in the matrix. The 1st of the D values is + * also the first value of the _data vector. + * + * + * A text file format is defined to store Matrix3D objects. The specifications are as * follows : * Absolutely NO empty lines are allowed! * The following lines should contain : * * 1st line : a slice header, ',,0' indicates that a slice of the 3rd dimension * is beginning (this is a z slice). * 2nd - Nth line : the firt slice, as a 2d matrix (the exemple below has dimensions 3x4). * N+1th line : a slice header, ',,1' indicates that the 2nd slice is beginning. * N+1th - ... : the second slice * and so on... * * Example of a 3x4x2 3D matrix * ---- start ---- * ,,0 * 1 2 3 4 * 5 6 7 8 * 8 9 10 11 *,,1 * 12 13 14 15 * 16 17 18 19 * 20 21 22 23 * ----- end ----- * * Constructing a matrix from an empty file (0 bytes or only an EOL char) returns a null * matrix (0x0x0 dimensions). Writting a null matrix (that is with at least one null * dimension creates an empty file. * */ template class Matrix3D : public Matrix { public: // constructors /*! * Default constructor. */ Matrix3D() = default ; /*! * \brief Constructs a matrix with the given dimensions, * filled with 0 values. * \param dim1 the first dimension. * \param dim2 the second dimension. 
* \param dim3 the third dimension. */ Matrix3D(size_t dim1, size_t dim2, size_t dim3) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. * \param value the value to initialize the matrix content * with. */ Matrix3D(size_t dim1, size_t dim2, size_t dim3, T value) ; /*! * \brief Copy constructor * \param other the matrix to copy the values from. */ Matrix3D(const Matrix3D& other) ; /*! * \brief Move constructor * \param other the matrix to use the values from. */ Matrix3D(Matrix3D&& other) ; /*! * \brief Constructs a matrix from a text file. A matrix contructed * from an empty file (or a file containing only one EOL char) returns * an empty matrix (null dimensions). * \param file_address the address of the file containing the matrix. * \throw std::runtime_error if anything happen while reading the * file (format error, file not found, etc). */ Matrix3D(const std::string& file_address) ; /*! * \brief Destructor. */ - virtual ~Matrix3D() = default ; + virtual ~Matrix3D() ; // methods overloaded from Matrix using Matrix::get ; using Matrix::set ; // methods + /*! + * \brief loads a binary file containing + * a matrix. + * \param path the path to the file. + */ + void load(const std::string& file_address) ; + /*! * \brief Gets the element at the given coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ T get(size_t dim1, size_t dim2, size_t dim3) const ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param value the new value. 
* \throw std::out_of_range exception if the coordinates * are out of range. */ void set(size_t dim1, size_t dim2, size_t dim3, T value) ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stream, size_t precision=4 ,size_t width=8, char sep=' ') const override ; // operators /*! * Assignment operator. * \param other an other matrix to copy the values from. * \return a reference to the current the instance. */ Matrix3D& operator = (const Matrix3D& other) ; /*! * Move Assignment operator. * \param other an other matrix to use the values from. * \return a reference to the instance. */ Matrix3D& operator = (Matrix3D&& other) ; /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \return a reference to this element. */ T& operator () (size_t dim1, size_t dim2, size_t dim3) ; /*! * \brief Returns a constant reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \return a constant reference to this element. */ const T& operator () (size_t dim1, size_t dim2, size_t dim3) const ; private: // methods /*! * \brief Checks whether a given string is a slice header * (such as ",,0"), as found in files storing Matrix3D. * \param str the string to check. * \return whether the string is a slice header. */ bool is_header(const std::string& str) const ; /*! 
* \brief Converts a triplet of VALID (dim1, dim2, dim3) coordinates * to a the corresponding offset allowing to get an element in the * data vector. * \param dim1 the index of the 1st dimension slice (row). * \param dim2 the index of the 2nd dimension slice (column). * \param dim3 the index of the 3rd dimension slice. * \return the corresponding offset. */ size_t convert_to_offset(size_t dim1, size_t dim2, size_t dim3) const ; /*! * \brief Computes and stores the offsets at which * each slice on the 1st dimension (row) starts. */ void compute_dim1_offsets() ; /*! * \brief Computes and stores the offsets at which * each slice on the 2nd dimension (column) starts. */ void compute_dim2_offsets() ; /*! * \brief Computes and stores the offsets at which * each slice on the 3rd dimension (3rd dimension * slice) starts. */ void compute_dim3_offsets() ; /*! * \brief Contains the offsets at which each x slice * starts. Each element corresponds to the corresponding * x slice (1st element -> 1st x slice (row)). */ std::vector _dim1_offsets ; /*! * \brief Contains the offsets at which each y slice * starts. Each element corresponds to the corresponding * y slice (1st element -> 1st y slice (column)). */ std::vector _dim2_offsets ; /*! * \brief Contains the offsets at which each x slice * starts. Each element corresponds to the corresponding * x slice (1st element -> 1st z slice). */ std::vector _dim3_offsets ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix3D operator + (Matrix3D m, T value) { Matrix3D other(m) ; m += value ; return m ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix3D operator - (Matrix3D m, T value) { Matrix3D other(m) ; m -= value ; return m ; } /*! * \brief Multiplication operator. 
* \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix3D operator * (Matrix3D m, T value) { Matrix3D other(m) ; m *= value ; return m ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template const Matrix3D operator / (Matrix3D m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix3D other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. */ template std::ostream& operator << (std::ostream& stream, const Matrix3D& m) { m.print(stream) ; return stream ; } // method implementation template Matrix3D::Matrix3D(size_t dim1, size_t dim2, size_t dim3) : Matrix3D(dim1, dim2, dim3, 0) {} template Matrix3D::Matrix3D(size_t dim1, size_t dim2, size_t dim3, T value) : Matrix({dim1, dim2, dim3}, value), _dim1_offsets(dim1), _dim2_offsets(dim2), _dim3_offsets(dim3) { this->compute_dim1_offsets() ; this->compute_dim2_offsets() ; this->compute_dim3_offsets() ; } template Matrix3D::Matrix3D(const Matrix3D& other) : Matrix(other) { this->_dim1_offsets = other._dim1_offsets ; this->_dim2_offsets = other._dim2_offsets ; this->_dim3_offsets = other._dim3_offsets ; } template -Matrix3D::Matrix3D(Matrix3D&& other) - : Matrix(other) +Matrix3D::Matrix3D(Matrix3D&& other) + : Matrix(std::move(other)) { this->_dim1_offsets = other._dim1_offsets ; this->_dim2_offsets = other._dim2_offsets ; this->_dim3_offsets = other._dim3_offsets ; } template Matrix3D::Matrix3D(const std::string &file_address) { this->_dim = {0,0,0} ; - this->_data = std::vector() ; + this->_data = new std::vector() ; this->_dim_size = this->_dim.size() ; - this->_data_size 
= this->_data.size() ; + this->_data_size = this->_data->size() ; this->_dim_prod = std::vector(this->_dim_size, 0) ; std::ifstream file(file_address, std::ifstream::in) ; if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! cannot open %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } std::string buffer_str ; std::vector buffer_vec ; T buffer_T ; // read file size_t n_line = 0, n_line_data = 0 ; // number of line and of data line read size_t row_len = 0, col_len = 0 ; // length of row and column in nber of values size_t row_len_cur = 0, col_len_cur = 0 ; // current number of values read in row and col while(getline(file, buffer_str)) { if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check empty line if(buffer_str.size() == 0) { // this file only contains one eol char and should be considered as empty, // -> returns empty matrix not an error if(n_line == 0 and file.peek() == EOF and file.eof()) { break ; } file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check whether it is the beginning of a slice // 1st line in file should be one like this if(this->is_header(buffer_str)) { // check that slice have a constant number of rows if(this->_dim[2] == 1) { col_len = col_len_cur ; // this->_dim[0] = row_len ; // this->_dim[1] = col_len ; } else if(col_len_cur != col_len) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions 1 in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } this->_dim[2]++ ; col_len_cur = 0 ; n_line++ ; continue ; } // 1st line in file should be a header and entering // this block is forbidden if(n_line == 0) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
first line is not a slice header in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // parse line row_len_cur = 0 ; buffer_vec.clear() ; std::istringstream buffer_ss(buffer_str) ; while(buffer_ss >> buffer_T) { buffer_vec.push_back(buffer_T) ; row_len_cur++ ; } // check for an error which likely indicates that a value could not be // casted into a type T (mixed data types in the file) if(buffer_ss.fail() and not buffer_ss.eof()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check that number of column is constant if(n_line_data == 0) { row_len = row_len_cur ; } else if(row_len_cur != row_len) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions 2 in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // update matrix content for(auto i : buffer_vec) - { this->_data.push_back(i) ; + { this->_data->push_back(i) ; this->_data_size++ ; } col_len_cur++ ; n_line_data++ ; n_line++ ; // update matrix dimensions this->_dim[0] = row_len_cur ; this->_dim[1] = col_len_cur ; } // check dimensions of last slice if(col_len_cur != this->_dim[1]) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
slice have variable dimensions in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } file.close() ; this->compute_dim_product() ; this->_dim1_offsets = std::vector(this->_dim[1]) ; this->_dim2_offsets = std::vector(this->_dim[0]) ; this->_dim3_offsets = std::vector(this->_dim[2]) ; this->compute_dim1_offsets() ; this->compute_dim2_offsets() ; this->compute_dim3_offsets() ; } +template +Matrix3D::~Matrix3D() +{ if(this->_data != nullptr) + { delete this->_data ; + this->_data = nullptr ; + } +} + +template +void Matrix3D::load(const std::string& file_address) +{ + Matrix::load(file_address, 3) ; + + this->_dim1_offsets = std::vector(this->_dim[1]) ; + this->_dim2_offsets = std::vector(this->_dim[0]) ; + this->_dim3_offsets = std::vector(this->_dim[2]) ; + this->compute_dim1_offsets() ; + this->compute_dim2_offsets() ; + this->compute_dim3_offsets() ; +} template T Matrix3D::get(size_t dim1, size_t dim2, size_t dim3) const { try { return this->get({dim1, dim2, dim3}) ; } catch(std::out_of_range& e) { throw e ; } } template void Matrix3D::set(size_t dim1, size_t dim2, size_t dim3, T value) { try { return this->set({dim1, dim2, dim3}, value) ; } catch(std::out_of_range& e) { throw e ; } } template Matrix3D& Matrix3D::operator = (const Matrix3D& other) -{ std::cerr << "Matrix3D& Matrix3D::operator = (const Matrix3D& other)" << std::endl ; +{ /* this->_dim = other._dim ; this->_dim_size = other._dim_size ; - this->_data = other._data ; + this->_data = new std::vector(*(other._data)) ; this->_data_size = other._data_size ; this->_dim_prod = other._dim_prod ; + */ + Matrix::operator=(other) ; this->_dim1_offsets = other._dim1_offsets ; - this->_dim2_offsets = other._dim1_offsets ; + this->_dim2_offsets = other._dim2_offsets ; this->_dim3_offsets = other._dim3_offsets ; return *this ; } template Matrix3D& Matrix3D::operator = (Matrix3D&& other) -{ std::cerr << "Matrix3D& Matrix3D::operator = (const Matrix3D&& other)" << std::endl ; - this->_dim = other._dim ; - 
this->_dim_size = other._dim_size ; - this->_data = other._data ; - this->_data_size = other._data_size ; - this->_dim_prod = other._dim_prod ; - this->_dim1_offsets = other._dim1_offsets ; - this->_dim2_offsets = other._dim1_offsets ; - this->_dim3_offsets = other._dim3_offsets ; +{ Matrix::operator=(std::move(other)) ; + this->_dim1_offsets = other._dim1_offsets ; + this->_dim2_offsets = other._dim2_offsets ; + this->_dim3_offsets = other._dim3_offsets ; return *this ; } template T& Matrix3D::operator () (size_t dim1, size_t dim2, size_t dim3) -{ return this->_data[this->convert_to_offset(dim1, dim2, dim3)] ; } +{ return (*this->_data)[this->convert_to_offset(dim1, dim2, dim3)] ; } template const T& Matrix3D::operator () (size_t dim1, size_t dim2, size_t dim3) const -{ return this->_data[this->convert_to_offset(dim1, dim2, dim3)] ; } - +{ return (*this->_data)[this->convert_to_offset(dim1, dim2, dim3)] ; } template void Matrix3D::print(std::ostream& stream, size_t precision, size_t width, char sep) const { // if the matrix has at least one 0 dimension (no data), don't do anything if(this->_dim[0]==0 or this->_dim[1]==0 or this->_dim[2]==0) { return ; } - stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; std::vector dim = this->get_dim() ; size_t n = 0 ; - size_t n_tot = std::accumulate(dim.begin(), dim.end(), 1, std::multiplies()) ; + size_t n_tot = std::accumulate(dim.begin(), dim.end(), (size_t)1, std::multiplies()) ; for(size_t z=0; z bool Matrix3D::is_header(const std::string& str) const { if(str[0] == ',' and str[1] == ',' and str.find(',', 2) == std::string::npos) { return true ; } return false ; } template void Matrix3D::compute_dim1_offsets() { for(size_t i=0; i_dim[1]; i++) { this->_dim1_offsets[i] = i * this->_dim_prod[1] ; } } template void Matrix3D::compute_dim2_offsets() { for(size_t i=0; i_dim[0]; i++) { this->_dim2_offsets[i] = i * this->_dim_prod[0] ; } } template void Matrix3D::compute_dim3_offsets() { 
for(size_t i=0; i_dim[2]; i++) { this->_dim3_offsets[i] = i * this->_dim_prod[2] ; } } template size_t Matrix3D::convert_to_offset(size_t dim1, size_t dim2, size_t dim3) const { /* size_t offset = 0 ; for(size_t i=0; i_dim_size; i++) { offset += coord[i] * this->_dim_prod[i] ; } return offset ; */ size_t offset = this->_dim1_offsets[dim1] + this->_dim2_offsets[dim2] + this->_dim3_offsets[dim3] ; return offset ; } #endif // MATRIX3D_HPP diff --git a/src/Matrix/Matrix4D.hpp b/src/Matrix/Matrix4D.hpp index 4ef5cfb..3e95264 100644 --- a/src/Matrix/Matrix4D.hpp +++ b/src/Matrix/Matrix4D.hpp @@ -1,792 +1,839 @@ #ifndef MATRIX4D_HPP #define MATRIX4D_HPP #include #include #include +#include // std::move() #include // runtime_error, out_of_range #include #include // setw(), setprecision(), fixed #include // ifstream #include // sstream #define BUFFER_SIZE 4096 /*! * The Matrix4D class is a specialisation of the Matrix * class to make work with 4D matrices more easily. * + * A format to save a Matrix4D objects in a binary file is defined. The following values + * are written : + * 1x size_t : the number of dimensions of the Matrix stored. This is the value + * of _dim_size field. This value must be 4 otherwise this is not a 4D + * matrix. + * 4x size_t : the width of the matrix in each dimension. These values correspond + * to the content of the _dim vector and can be loaded inside this + * vector as they are. + * Dx values : the values contained in the matrix, in the _data vector. + * These values can be loaded directly in this vector. is equal to the + * product of the 4 values stored right before. The type depends on + * the type of the data stored in the matrix. The 1st of the D values is + * also the first value of the _data vector. + * + * * A text file format is defined to store such matrices. The specifications are as * follows : * Absolutely NO empty lines are allowed! 
* The following lines should contain : * * 1st line : a slice header ',,,0' indicating that a slice of the 4th dimension * is beginning. * 3nd - Nth line : the slice of the 4th dimension. It contains slice in the 3rd dimension * which are 2D matrices separated by headers (',,0' and ',,1', in the * example below, they have 2x3 dimensions). * N+1th line : ',,,1' indicating that the 2nd slice of the 4th dimension is beginning. * and so on... * Example * ---- start ---- * ,,,0 * ,,0 * 1 2 3 * 4 5 6 * ,,1 * 7 8 9 * 10 11 12 * ,,,1 * ,,0 * 21 22 23 * 24 25 26 * ,,1 * 27 28 29 * 30 31 32 * ----- end ----- * * Constructing a matrix from an empty file (0 bytes or only an EOL char) returns a null * matrix (0x0x0x0 dimensions). Writting a null matrix (that is with at least one null * dimension creates an empty file. * */ template class Matrix4D : public Matrix { public: static size_t n_instance ; public: // constructors /*! * Default constructor. */ Matrix4D() = default ; /*! * \brief Constructs a matrix with the given dimensions, * filled with 0 values. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. * \param dim4 the fourth dimension. */ Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. * \param dim4 the fourth dimension. * \param value the value to initialize the matrix content * with. */ Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) ; /*! * \brief Copy constructor * \param other the matrix to copy the content from. */ Matrix4D(const Matrix4D& other) ; /*! * \brief Mover constructor * \param other the matrix to copy the content from. */ Matrix4D(Matrix4D&& other) ; /*! * \brief Constructs a matrix from a text file. 
A matrix contructed * from an empty file (or a file containing only one EOL char) returns * an empty matrix (null dimensions). * \param file_address the address of the file containing the matrix. * \throw std::runtime_error if anything happen while reading the * file (format error, file not found, etc). */ Matrix4D(const std::string& file_address) ; /*! * \brief Destructor. */ - virtual ~Matrix4D() = default ; + virtual ~Matrix4D() ; // methods overloaded from Matrix using Matrix::get ; using Matrix::set ; - // methods OK + // methods + /*! + * \brief loads a matrix from the given binary + * file. + * \param path the path to the file to read. + * \param dim_n the expected number of dimensions + * of the matrix. + * \throw std::invalid_argument if the dimensionality + * of the matrix stored in the file is not equal to + * the expected number of dimensions and + * std::runtime_error if any reading error + * occures. + */ + void load(const std::string& file_address) ; + /*! * \brief Gets the element at the given coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the fourth dimension coordinate. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ T get(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the fourth dimension coordinate. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ void set(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. 
* \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stream, size_t precision=4 ,size_t width=8, char sep=' ') const override ; // operators /*! * Assignment operator. * \param other an other matrix to copy the values from. * \return a reference to the current the instance. */ Matrix4D& operator = (const Matrix4D& other) ; /*! * Move Assignment operator. * \param other an other matrix to use the values from. * \return a reference to the instance. */ Matrix4D& operator = (Matrix4D&& other) ; /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the third dimension coordinate. * \return a reference to this element. */ T& operator() (size_t dim1, size_t dim2, size_t dim3, size_t dim4) ; /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the third dimension coordinate. * \return a reference to this element. */ const T& operator() (size_t dim1, size_t dim2, size_t dim3, size_t dim4) const ; private: // methods /*! * \brief Checks whether a given string is a 3D header * (such as ",,0"), as found in files storing Matrix4D. * \param str the string to check. * \return whether the string is such a slice header. */ bool is_header_3d(const std::string& str) const ; /*! * \brief Checks whether a given string is a 4D header * (such as ",,,0"), as found in files storing Matrix4D. * \param str the string to check. * \return whether the string is such a slice header. */ bool is_header_4d(const std::string& str) const ; /*! 
* \brief Routine to load 4D matrices from files. * This method reads from a std::ifstream object, * from the current pointer location until i) a 4D * header line is found (such as ',,,1') or ii) until * it cannot read anymore from the stream. All * data are pushed back into the data vector and * the dimensions of the data read are stored into * the dim vector (these data are actually a 3D * matrix). If the method returned because it * found another 4D header, it returns true, false * otherwise. * To read an entire 4D matrix from a file, simply * use this scheme : i) read the 1st 4D header * ii) call this function while it returns true. * \param file_name a reference to a string containing * the address of the file currently read (for exception * messages). * \param file a reference to the std::ifstream to read * from. Obviously, the stream state will be modified as * the method reads from it. However, it will never be * closed by the method. * \param data a reference to an empty vector where the * read data will be pushed back. * \param dim a reference to an empty vector where the * dimensions of the read data will be stored. * \return whether the last piece of data read from the * stream was a 4D header. */ bool get_3d_slice(const std::string& file_name, std::ifstream& file, std::vector& data, std::vector& dim) const ; /*! * \brief Converts a quadruplet of VALID (dim1, dim2, dim3, dim4) * coordinates to a the corresponding offset allowing to get an * element in the data vector. * \param dim1 the index of the 1st dimension slice. * \param dim2 the index of the 2nd dimension slice. * \param dim3 the index of the 3rd dimension slice. * \param dim4 the index of the 4th dimension slice. * \return the corresponding offset. */ size_t convert_to_offset(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const ; /*! * \brief Computes and stores the offsets at which * each slice on the 1st dimension starts. */ void compute_dim1_offsets() ; /*! 
* \brief Computes and stores the offsets at which * each slice on the 2nd dimension starts. */ void compute_dim2_offsets() ; /*! * \brief Computes and stores the offsets at which * each slice on the 3rd dimension starts. */ void compute_dim3_offsets() ; /*! * \brief Computes and stores the offsets at which * each slice on the 4th dimension starts. */ void compute_dim4_offsets() ; /*! * \brief Contains the offsets at which each dim1 slice * starts. Each element corresponds to the corresponding * dim1 slice (1st element -> 1st dim1 slice). */ std::vector _dim1_offsets ; /*! * \brief Contains the offsets at which each dim2 slice * starts. Each element corresponds to the corresponding * y slice (1st element -> 1st dim2 slice). */ std::vector _dim2_offsets ; /*! * \brief Contains the offsets at which each dim3 slice * starts. Each element corresponds to the corresponding * x slice (1st element -> 1st dim3 slice). */ std::vector _dim3_offsets ; /*! * \brief Contains the offsets at which each dim4 slice * starts. Each element corresponds to the corresponding * x slice (1st element -> 1st dim4 slice). */ std::vector _dim4_offsets ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix4D operator + (Matrix4D m, T value) { Matrix4D other(m) ; m += value ; return m ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix4D operator - (Matrix4D m, T value) { Matrix4D other(m) ; m -= value ; return m ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix4D operator * (Matrix4D m, T value) { Matrix4D other(m) ; m *= value ; return m ; } /*! * \brief Division operator. 
* \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template const Matrix4D operator / (Matrix4D m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix4D other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. */ template std::ostream& operator << (std::ostream& stream, const Matrix4D& m) { m.print(stream) ; return stream ; } // method implementation template Matrix4D::Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4) : Matrix4D(dim1, dim2, dim3, dim4, 0) -{ std::cerr << "Matrix4D::Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4)" << std::endl ; } +{ ; } template Matrix4D::Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) : Matrix({dim1, dim2, dim3, dim4}, value), _dim1_offsets(dim1), _dim2_offsets(dim2), _dim3_offsets(dim3), _dim4_offsets(dim4) -{ std::cerr << "Matrix4D::Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value)" << std::endl ; - this->compute_dim1_offsets() ; +{ this->compute_dim1_offsets() ; this->compute_dim2_offsets() ; this->compute_dim3_offsets() ; this->compute_dim4_offsets() ; } template Matrix4D::Matrix4D(const Matrix4D &other) : Matrix(other) -{ std::cerr << "Matrix4D::Matrix4D(const Matrix4D &other)" << std::endl ; - this->_dim1_offsets = other._dim1_offsets ; +{ this->_dim1_offsets = other._dim1_offsets ; this->_dim2_offsets = other._dim2_offsets ; this->_dim3_offsets = other._dim3_offsets ; this->_dim4_offsets = other._dim4_offsets ; } template -Matrix4D::Matrix4D(Matrix4D&& other) - : Matrix(other) -{ std::cerr << "Matrix4D::Matrix4D(const Matrix4D &other)" << std::endl ; - this->_dim1_offsets = other._dim1_offsets ; +Matrix4D::Matrix4D(Matrix4D &&other) + : 
Matrix(std::move(other)) +{ this->_dim1_offsets = other._dim1_offsets ; this->_dim2_offsets = other._dim2_offsets ; this->_dim3_offsets = other._dim3_offsets ; this->_dim4_offsets = other._dim4_offsets ; } template Matrix4D::Matrix4D(const std::string &file_address) -{ std::cerr << "Matrix4D::Matrix4D(const std::string &file_address)" << std::endl ; - this->_dim = {0,0,0,0} ; - this->_data = std::vector() ; +{ this->_dim = {0,0,0,0} ; + this->_data = new std::vector() ; this->_dim_size = this->_dim.size() ; - this->_data_size = this->_data.size() ; + this->_data_size = this->_data->size() ; this->_dim_prod = std::vector(this->_dim_size, 0) ; std::ifstream file(file_address, std::ifstream::in) ; if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! cannot open %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } std::string buffer_str ; std::vector buffer_t ; std::vector dim ; // read 1st line getline(file, buffer_str) ; // empty line if(buffer_str.size() == 0) { // this file only contains one eol char and should be considered as empty, // -> returns empty matrix not an error if(file.peek() == EOF and file.eof()) { file.close() ; return ; } file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } bool found_4d_header = this->is_header_4d(buffer_str) ; do { if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check empty line if(buffer_str.size() == 0) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! 
while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // this is the beginning of a 3D slice -> get it using routine if(found_4d_header) { try { // get slice buffer_t.clear() ; dim.clear() ; found_4d_header = this->get_3d_slice(file_address, file, buffer_t, dim); // update data for(const auto& i : buffer_t) - { this->_data.push_back(i) ; + { this->_data->push_back(i) ; this->_data_size++ ; } // update dim only for the 1st slice (the 1st slice set the dimensions) if(this->_dim[3] == 0) { this->_dim[0] = dim[0] ; this->_dim[1] = dim[1] ; this->_dim[2] = dim[2] ; } // check dimensions of the slice else { if(dim[0] != this->_dim[0] or dim[1] != this->_dim[1] or dim[2] != this->_dim[2]) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } } this->_dim[3]++ ; } catch(std::runtime_error& e) { file.close() ; throw e ; } } // this is an error, everything between two ',,,N' header // should be read at once. The only way out of the loop // is that no more header has been read because of eof else if(not found_4d_header and not file.eof()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! 
while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } } while(found_4d_header) ; file.close() ; this->compute_dim_product() ; this->_dim1_offsets = std::vector(this->_dim[1]) ; this->_dim2_offsets = std::vector(this->_dim[0]) ; this->_dim3_offsets = std::vector(this->_dim[2]) ; this->_dim4_offsets = std::vector(this->_dim[3]) ; this->compute_dim1_offsets() ; this->compute_dim2_offsets() ; this->compute_dim3_offsets() ; this->compute_dim4_offsets() ; } +template +Matrix4D::~Matrix4D() +{ if(this->_data != nullptr) + { delete this->_data ; + this->_data = nullptr ; + } +} + +template +void Matrix4D::load(const std::string& file_address) +{ + Matrix::load(file_address, 4) ; + + this->_dim1_offsets = std::vector(this->_dim[1]) ; + this->_dim2_offsets = std::vector(this->_dim[0]) ; + this->_dim3_offsets = std::vector(this->_dim[2]) ; + this->_dim4_offsets = std::vector(this->_dim[3]) ; + + this->compute_dim1_offsets() ; + this->compute_dim2_offsets() ; + this->compute_dim3_offsets() ; + this->compute_dim4_offsets() ; +} + template T Matrix4D::get(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const { try { return this->get({dim1, dim2, dim3, dim4}) ; } catch(std::out_of_range& e) { throw e ; } } template void Matrix4D::set(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) { try { this->set({dim1, dim2, dim3, dim4}, value) ; } catch(std::out_of_range& e) { throw e ; } } template void Matrix4D::print(std::ostream &stream, size_t precision, size_t width, char sep) const { // if the matrix has at least one 0 dimension (no data), don't do anything if(this->_dim[0]==0 or this->_dim[1]==0 or this->_dim[2]==0 or this->_dim[3]==0) { return ; } stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; std::vector dim = this->get_dim() ; size_t n = 0 ; - size_t n_tot = std::accumulate(dim.begin(), dim.end(), 1, std::multiplies()) ; + size_t n_tot = std::accumulate(dim.begin(), dim.end(), (size_t)1, 
std::multiplies()) ; for(size_t dim4=0; dim4 Matrix4D& Matrix4D::operator = (const Matrix4D& other) -{ std::cerr << "Matrix4D& Matrix4D::operator = (const Matrix4D& other)" << std::endl ; +{ /* this->_dim = other._dim ; this->_dim_size = other._dim_size ; - this->_data = other._data ; + this->_data = new std::vector(*(other._data)) ; this->_data_size = other._data_size ; this->_dim_prod = other._dim_prod ; + */ + Matrix::operator=(other) ; this->_dim1_offsets = other._dim1_offsets ; this->_dim2_offsets = other._dim2_offsets ; this->_dim3_offsets = other._dim3_offsets ; this->_dim4_offsets = other._dim4_offsets ; return *this ; } template Matrix4D& Matrix4D::operator = (Matrix4D&& other) -{ std::cerr << "Matrix4D& Matrix4D::operator = (Matrix4D&& other)" << std::endl ; - this->_dim = other._dim ; - this->_dim_size = other._dim_size ; - this->_data = other._data ; - this->_data_size = other._data_size ; - this->_dim_prod = other._dim_prod ; +{ Matrix::operator=(std::move(other)) ; this->_dim1_offsets = other._dim1_offsets ; this->_dim2_offsets = other._dim2_offsets ; this->_dim3_offsets = other._dim3_offsets ; this->_dim4_offsets = other._dim4_offsets ; return *this ; } template T& Matrix4D::operator () (size_t dim1, size_t dim2, size_t dim3, size_t dim4) -{ return this->_data[this->convert_to_offset(dim1, dim2, dim3, dim4)] ; } +{ return (*this->_data)[this->convert_to_offset(dim1, dim2, dim3, dim4)] ; } template const T& Matrix4D::operator () (size_t dim1, size_t dim2, size_t dim3, size_t dim4) const -{ return this->_data[this->convert_to_offset(dim1, dim2, dim3, dim4)] ; } +{ return (*this->_data)[this->convert_to_offset(dim1, dim2, dim3, dim4)] ; } template bool Matrix4D::is_header_3d(const std::string &str) const { if(str[0] == ',' and str[1] == ',' and str.find(',', 2) == std::string::npos) { return true ; } return false ; } template bool Matrix4D::is_header_4d(const std::string &str) const { if(str[0] == ',' and str[1] == ',' and str[2] == ',' and 
str.find(',', 3) == std::string::npos) { return true ; } return false ; } template bool Matrix4D::get_3d_slice(const std::string& file_name, std::ifstream& file, std::vector &data, std::vector &dim) const { bool found_4d_header = false ; // the flag to return dim = {0,0,0} ; std::string buffer_str ; std::vector buffer_vec ; T buffer_T ; size_t n_line = 0, n_line_data = 0 ; // number of line and of data line read size_t row_len = 0, col_len = 0 ; // length of row and column in nber of values size_t row_len_cur = 0, col_len_cur = 0 ; // current number of values read in row and col while(getline(file, buffer_str)) { if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } // check empty line if(buffer_str.size() == 0) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s (empty line)", file_name.c_str()) ; throw std::runtime_error(msg) ; } // check whether this is the beginning of a 4D slice header, if so // break if(this->is_header_4d(buffer_str)) { found_4d_header = true ; break ; } // check whether it is the beginning of a slice // 1st line in file should be if(this->is_header_3d(buffer_str)) { // check that slice have a constant number of rows if(dim[2] == 1) { col_len = col_len_cur ; // dim[0] = row_len ; // dim[1] = col_len ; } else if(col_len_cur != col_len) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } dim[2]++ ; col_len_cur = 0 ; n_line++ ; continue ; } // 1st line in file should be a header and entering // this block is forbidden if(n_line == 0) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
first line is not a slice header in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } // parse line row_len_cur = 0 ; buffer_vec.clear() ; std::istringstream buffer_ss(buffer_str) ; while(buffer_ss >> buffer_T) { buffer_vec.push_back(buffer_T) ; row_len_cur++ ; } // check for an error which likely indicates that a value could not be // casted into a type T (mixed data types in the file) if(buffer_ss.fail() and not buffer_ss.eof()) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_name.c_str()) ; throw std::runtime_error(msg) ; } // check that number of column is constant if(n_line_data == 0) { row_len = row_len_cur ; } else if(row_len_cur != row_len) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } // update matrix content for(auto i : buffer_vec) { data.push_back(i) ; } col_len_cur++ ; n_line_data++ ; n_line++ ; // update dimension dim[0] = row_len_cur ; dim[1] = col_len_cur ; } // check dimensions of last slice if(col_len_cur != dim[1]) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
slice have variable dimensions 333 in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } return found_4d_header ; } template void Matrix4D::compute_dim1_offsets() { for(size_t i=0; i_dim[1]; i++) { this->_dim1_offsets[i] = i * this->_dim_prod[1] ; } } template void Matrix4D::compute_dim2_offsets() { for(size_t i=0; i_dim[0]; i++) { this->_dim2_offsets[i] = i * this->_dim_prod[0] ; } } template void Matrix4D::compute_dim3_offsets() { for(size_t i=0; i_dim[2]; i++) { this->_dim3_offsets[i] = i * this->_dim_prod[2] ; } } template void Matrix4D::compute_dim4_offsets() { for(size_t i=0; i_dim[3]; i++) { this->_dim4_offsets[i] = i * this->_dim_prod[3] ; } } template size_t Matrix4D::convert_to_offset(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const { /* size_t offset = 0 ; for(size_t i=0; i_dim_size; i++) { offset += coord[i] * this->_dim_prod[i] ; } return offset ; */ size_t offset = this->_dim1_offsets[dim1] + this->_dim2_offsets[dim2] + this->_dim3_offsets[dim3] + this->_dim4_offsets[dim4] ; return offset ; } #endif // MATRIX4D_HPP diff --git a/src/main_cormat.cpp b/src/main_cormat.cpp index de674b5..b46169c 100644 --- a/src/main_cormat.cpp +++ b/src/main_cormat.cpp @@ -1,271 +1,311 @@ #include #include #include #include #include #include #include #include #include #include class TestTimer { public: TestTimer(const std::string & name) : name(name), start(boost::date_time::microsec_clock::local_time()) { } ~TestTimer() { using namespace std; using namespace boost; posix_time::ptime now(date_time::microsec_clock::local_time()); posix_time::time_duration d = now - start; cout << name << " completed in " << d.total_milliseconds() / 1000.0 << " seconds" << endl; } private: std::string name; boost::posix_time::ptime start; }; void f_vector2d(size_t nrow, size_t ncol) { std::vector> m ; { TestTimer timer("f_vector2d init") ; m = std::vector>(nrow, std::vector(ncol, 0.)) ; for(size_t i=0; i m ; { TestTimer timer("f_matrix2d init") ; m = Matrix2D(nrow, ncol, 
0.) ; } { TestTimer timer("f_matrix2d writting") ; for(size_t i=0; i>> m ; { TestTimer timer("f_vector3d init") ; m = std::vector>>(dim1, std::vector>(dim2, std::vector(dim3,0.))) ; for(size_t i=0; i m ; { TestTimer timer("f_matrix3d init") ; m = Matrix3D(dim1, dim2, dim3) ; for(size_t i=0; i>>> m ; { TestTimer timer("f_vector4d init") ; m = std::vector>>>(dim1, std::vector>>(dim2, std::vector>(dim3, std::vector(dim4, 0)))) ; for(size_t i=0; i m ; { TestTimer timer("f_matrix4d init") ; m = Matrix4D(dim1, dim2, dim3,dim4, 0) ; for(size_t i=0; i m ; + // move assignment operator + m = Matrix2D(2, 3, 9) ; + int b = 1 ; + for(size_t i=0; i m2(m) ; + std::cout << m2 << std::endl ; + // move constructor + Matrix2D m3(std::move(m)) ; + std::cout << m3 << std::endl ; + + std::cout << (m2 == m3) << std::endl ; + */ - f_vector3d(dim1, dim2, dim3) ; - f_matrix3d(dim1, dim2, dim3) ; + /* + Matrix3D m ; + // move assignment operator + m = Matrix3D(2, 3, 4, 9) ; + int b = 1 ; + for(size_t i=0; i m2(m) ; + std::cout << m2 << std::endl ; + // move constructor + Matrix3D m3(std::move(m)) ; + std::cout << m3 << std::endl ; + + std::cout << (m2 == m3) << std::endl ; + */ - f_vector4d(dim1, dim2, dim3, dim4) ; - f_matrix4d(dim1, dim2, dim3, dim4) ; + /* + Matrix4D m ; + // move assignment operator + m = Matrix4D(2, 3, 4, 2, 9) ; + int b = 1 ; + for(size_t i=0; i m2(m) ; + std::cout << m2 << std::endl ; + // move constructor + Matrix4D m3(std::move(m)) ; + std::cout << m3 << std::endl ; + + std::cout << (m2 == m3) << std::endl ; */ - std::cout << "main START" << std::endl ; - Matrix2D m2 ; - m2 = Matrix2D(10, 10) ; - Matrix3D m3 ; - m3 = Matrix3D(10, 10, 10) ; - Matrix4D m4 ; - m4 = Matrix4D(10, 10, 10, 10) ; - std::cout << "main END" << std::endl ; + return 0; } diff --git a/src/main_em.cpp b/src/main_em.cpp deleted file mode 100644 index 954a16c..0000000 --- a/src/main_em.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -using 
namespace std ; - -void get_size(const vector>>>>& m) -{ size_t size_d = 0 ; - size_t size_m4 = 0 ; - size_t size_m3 = 0 ; - size_t size_m2 = 0 ; - size_t size_m = 0 ; - - std::cout << "sizeof m : " << sizeof(m) << std::endl ; - std::cout << "sizeof m[0] : " << sizeof(m[0]) << std::endl ; - std::cout << "sizeof m[0][0] : " << sizeof(m[0][0]) << std::endl ; - std::cout << "sizeof m[0][0][0] : " << sizeof(m[0][0][0]) << std::endl ; - std::cout << "sizeof m[0][0][0][0] : " << sizeof(m[0][0][0][0]) << std::endl ; - - for(const auto& m4 : m) - { size_m4 += sizeof(m4) ; - for(const auto& m3 : m4) - { size_m3 += sizeof(m3) ; - for(const auto& m2 : m3) - { size_m2 += sizeof(m2) ; - for(const auto& m : m2) - { size_m += sizeof(m) ; - size_d += m.capacity() * sizeof(int) ; - } - } - } - } - std::cout << "size of matrix" << std::endl - << "size of m4 : " << size_m4 << std::endl - << "size of m3 : " << size_m3 << std::endl - << "size of m2 : " << size_m2 << std::endl - << "size of m : " << size_m << std::endl - << "size of data : " << size_d << std::endl ; -} - - -void get_size(const vector>>>& m) -{ size_t size_d = 0 ; - size_t size_m3 = 0 ; - size_t size_m2 = 0 ; - size_t size_m = 0 ; - - std::cout << "sizeof m : " << sizeof(m) << std::endl ; - std::cout << "sizeof m[0] : " << sizeof(m[0]) << std::endl ; - std::cout << "sizeof m[0][0] : " << sizeof(m[0][0]) << std::endl ; - std::cout << "sizeof m[0][0][0] : " << sizeof(m[0][0][0]) << std::endl ; - - for(const auto& m3 : m) - { size_m3 += sizeof(m3) ; - for(const auto& m2 : m3) - { size_m2 += sizeof(m2) ; - for(const auto& m : m2) - { size_m += sizeof(m) ; - size_d += m.capacity() * sizeof(int) ; - } - } - } - std::cout << "size of matrix" << std::endl - << "size of m3 : " << size_m3 << std::endl - << "size of m2 : " << size_m2 << std::endl - << "size of m : " << size_m << std::endl - << "size of data : " << size_d << std::endl ; -} - -void get_size(const vector& v) -{ std::cout << "sizeof v : " << sizeof(v) << std::endl ; - 
std::cout << "size of data : " << v.size() * sizeof(int) << std::endl ; -} - -int main() -{ - vector>>>> - m(2, - vector>>>(97998, - vector>>(5, - vector>(201, - vector(2))))) ; - - get_size(m) ; - vector v(97998*5*201*2) ; - get_size(v) ; - return EXIT_SUCCESS ; -} diff --git a/src/main_em2.cpp b/src/main_em2.cpp deleted file mode 100644 index 5e7e185..0000000 --- a/src/main_em2.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include - -#include -#include -#include - - -int main() -{ - - std::string data_path1 = "/local/groux/scATAC-seq/results/10xgenomics_PBMC_5k/" - "ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" ; - std::string data_path2 = "/local/groux/scATAC-seq/results/10xgenomics_PBMC_5k/" - "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" ; - - size_t n_class = 10 ; - size_t n_shift = 21 ; - bool flip = false ; - size_t n_iter = 20 ; - std::string seed = "08july2019" ; - - EMEngine em_new(std::vector>{Matrix2D(data_path1), Matrix2D(data_path2)}, - std::vector>{}, - n_class, - n_iter, - n_shift, - flip, - EMEngine::seeding_codes::RANDOM, - seed) ; - - em_new.classify() ; - em_new.get_read_models() ; - - return EXIT_SUCCESS ; -} diff --git a/src/main_seqan.cpp b/src/main_seqan.cpp deleted file mode 100644 index bf75184..0000000 --- a/src/main_seqan.cpp +++ /dev/null @@ -1,446 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include - -using namespace seqan; - -template -std::ostream& operator << (std::ostream& o, const std::unordered_map& map) -{ for(const auto& pair : map) - { o << "< " << pair.first << " " - << pair.second << " >" - << std::endl ; - } - return o ; -} - - -void bam_stat(const std::string& path_bam) -{ // CharString bamFileInName = path_bam.c_str() ; - // Open input BAM file. - BamFileIn bamFileIn; - if (!open(bamFileIn, path_bam.c_str())) - { - char msg[1024] ; - sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; - throw std::runtime_error(msg); - } - - // read header. 
- BamHeader header; - try - { readHeader(header, bamFileIn); - } - catch (ParseError const & e) - { char msg[1024] ; - sprintf(msg, "ERROR: input header is badly formatted. %s", e.what()) ; - throw std::runtime_error(msg); - } - - // counters - int n_frag = 0 ; - int n_frag_bad = 0 ; - int n_read_fw_1 = 0 ; - int n_read_fw_2 = 0 ; - int n_read_rv_1 = 0 ; - int n_read_rv_2 = 0 ; - - BamAlignmentRecord record; - while (!atEnd(bamFileIn)) - { readRecord(record, bamFileIn) ; - n_frag++ ; - if(not seqan::hasFlagAllProper(record)) - { n_frag_bad++ ; - continue ; - } - else if((seqan::hasFlagRC(record) and - seqan::hasFlagNextRC(record)) or - (not seqan::hasFlagRC(record) and - not seqan::hasFlagNextRC(record))) - { n_frag_bad++ ; - continue ; - } - - // read - if(seqan::hasFlagFirst(record) and not seqan::hasFlagRC(record)) - { n_read_fw_1++ ; } - - if(not seqan::hasFlagFirst(record) and not seqan::hasFlagRC(record)) - { n_read_fw_2++ ; } - - if(seqan::hasFlagFirst(record) and seqan::hasFlagRC(record)) - { n_read_rv_1++ ; } - - if(not seqan::hasFlagFirst(record) and seqan::hasFlagRC(record)) - { n_read_rv_2++ ; } - } - close(bamFileIn) ; - std::cout << path_bam << std::endl ; - std::cout << "n frag : " << n_frag << std::endl ; - std::cout << "n frag bad qual : " << n_frag_bad << std::endl ; - std::cout << "n read fw 1st : " << n_read_fw_1 << std::endl ; - std::cout << "n read fw 2nd : " << n_read_fw_2 << std::endl ; - std::cout << "n read rv 1st : " << n_read_rv_1 << std::endl ; - std::cout << "n read rv 2nd : " << n_read_rv_2 << std::endl << std::endl ; -} - -bool is_good_read(const seqan::BamAlignmentRecord& record) -{ - if((seqan::hasFlagUnmapped(record)) or // read unmapped flag - seqan::hasFlagQCNoPass(record) or // not passing QC flag - seqan::hasFlagDuplicate(record)) // PCR duplicate flag - { return false ; } - return true ; -} - -bool is_good_pair(const seqan::BamAlignmentRecord& record) -{ - if((not seqan::hasFlagMultiple(record)) or // is paired flag - (not 
seqan::hasFlagAllProper(record))) // each read properly aligned flag - { return false ; } - - if((not seqan::hasFlagFirst(record)) or // read 1st in pair flag - seqan::hasFlagLast(record)) // mate 1st in pair flag - { return false ; } - - // read info - bool read_is_rev = seqan::hasFlagRC(record) ; // read is rev flag - int read_start = record.beginPos ; - // mate info - bool mate_is_rev = seqan::hasFlagNextRC(record) ; // mate is rev flag - int mate_start = record.pNext ; - - // qc - if((not is_good_read(record)) or - // --> --> - (not read_is_rev and not mate_is_rev) or - // <-- <-- - (read_is_rev and mate_is_rev) or - // <-- --> 1/2 - ((read_is_rev and not mate_is_rev) and (read_start < mate_start)) or - // <-- --> 2/2 - ((not read_is_rev and mate_is_rev) and (read_start > mate_start))) - { return false ; } - return true ; -} - -void read_bam(const std::string& path_bam) -{ - BamFileIn bamFileIn; - if (!open(bamFileIn, path_bam.c_str())) - { - char msg[1024] ; - sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; - throw std::runtime_error(msg); - } - - // read header. - BamHeader header; - try - { readHeader(header, bamFileIn); - } - catch (ParseError const & e) - { char msg[1024] ; - sprintf(msg, "ERROR: input header is badly formatted. 
%s", e.what()) ; - throw std::runtime_error(msg); - } - - BamAlignmentRecord record; - while (!atEnd(bamFileIn)) - { - readRecord(record, bamFileIn) ; - - bool read_rev = seqan::hasFlagRC(record) ; - bool mate_rev = seqan::hasFlagNextRC(record) ; - int read_start = record.beginPos ; - int mate_start = record.pNext ; - std::string chrom = seqan::toCString(seqan::getContigName(record, bamFileIn)) ; - - if(not is_good_pair(record)) - { continue ; } - - char msg[1024] ; - if(not read_rev and mate_rev) - { sprintf(msg, "[fw %s %d] [rv %s %d]", - chrom.c_str(), read_start, chrom.c_str(), mate_start) ; - } - else if(read_rev and not mate_rev) - { sprintf(msg, "[rv %s %d] [fw %s %d]", - chrom.c_str(), read_start, chrom.c_str(), mate_start) ; - } - std::cout << msg << std::endl ; - } - close(bamFileIn) ; -} - -void count_record_bam(const std::string& path_bam) -{ - BamFileIn bamFileIn; - if (!open(bamFileIn, path_bam.c_str())) - { - char msg[1024] ; - sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; - throw std::runtime_error(msg); - } - - // read header. - BamHeader header; - try - { readHeader(header, bamFileIn); - } - catch (ParseError const & e) - { char msg[1024] ; - sprintf(msg, "ERROR: input header is badly formatted. %s", e.what()) ; - throw std::runtime_error(msg); - } - - size_t n_rec = 0 ; - BamAlignmentRecord record; - while (!atEnd(bamFileIn)) - { - readRecord(record, bamFileIn) ; - n_rec++ ; - } - close(bamFileIn) ; - std::cout << "nber record : " << n_rec << std::endl ; -} - -void check_chromosomes_bam(const std::string& path_bam) -{ - BamFileIn bamFileIn; - if (!open(bamFileIn, path_bam.c_str())) - { - char msg[1024] ; - sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; - throw std::runtime_error(msg); - } - - // read header. - BamHeader header; - try - { readHeader(header, bamFileIn); - } - catch (ParseError const & e) - { char msg[1024] ; - sprintf(msg, "ERROR: input header is badly formatted. 
%s", e.what()) ; - throw std::runtime_error(msg); - } - - int chrom_n = 0 ; - std::unordered_map map ; - - BamAlignmentRecord record; - size_t i=0 ; - while (!atEnd(bamFileIn)) - { - readRecord(record, bamFileIn) ; - std::string chrom = seqan::toCString( - seqan::getContigName( - record, bamFileIn)) ; - if(map.find(chrom) == map.end()) - { map[chrom] = chrom_n ; - chrom_n++ ; - } - /* - else if(map.find(chrom) != map.end() and - chrom_n-1 != map.find(chrom)->second) - { auto chrom_tmp = map.find(chrom)->first ; - auto chrom_n_tmp = map.find(chrom)->second ; - std::cout << "sorting issue with " - << chrom_tmp << " " << chrom_n_tmp << "/" << chrom_n - << std::endl ; - } - */ - std::cout << i << std::endl ; - i++ ; - } - close(bamFileIn) ; - std::cout << "chromosomes :" << std::endl << map << std::endl ; -} - -void test(const std::string& path_bam) -{ - BamFileIn bamFileIn; - if (!open(bamFileIn, path_bam.c_str())) - { - char msg[1024] ; - sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; - throw std::runtime_error(msg); - } - - // Open output SAM which is the standard output. - BamFileOut samFileOut(context(bamFileIn), std::cout, Sam()); - - // read header. - BamHeader header; - try - { readHeader(header, bamFileIn) ; } - catch (ParseError const & e) - { char msg[1024] ; - sprintf(msg, "ERROR: input header is badly formatted. 
%s", e.what()) ; - throw std::runtime_error(msg); - } - - size_t n_rec = 0 ; - BamAlignmentRecord record; - while (!atEnd(bamFileIn)) - { - readRecord(record, bamFileIn) ; - if(n_rec == 165421293) - { writeRecord(samFileOut, record) ; - std::cout << is_good_read(record) << std::endl ; - } - else if(n_rec == 366090419) - { writeRecord(samFileOut, record) ; - std::cout << is_good_read(record) << std::endl ; - // should bug - /* - std::string chrom = seqan::toCString( - seqan::getContigName( - record, bamFileIn)) ; - */ - } - n_rec++ ; - } - close(bamFileIn) ; - std::cout << "nber record : " << n_rec << std::endl ; -} - - -int copy_top_bam(const std::string& bam_path_in, - const std::string& bam_path_out, - int n_lines) -{ - - // Open input file, BamFileIn can read SAM and BAM files. - BamFileIn bamFileIn; - if (!open(bamFileIn, bam_path_in.c_str())) - { - std::cerr << "ERROR: Could not open " << bam_path_in << std::endl; - return 1; - } - // Open output file, BamFileOut accepts also an ostream and a format tag. - BamFileOut bamFileOut(bam_path_out.c_str()); - - try - { - // Copy header. - BamHeader header; - readHeader(header, bamFileIn); - writeHeader(bamFileOut, header); - - // Copy records. 
- BamAlignmentRecord record; - int n=0 ; - while (not atEnd(bamFileIn) and - n +#include + +#include +#include +#include +# + +using namespace std ; + +template +std::ostream& operator << (std::ostream& stream, const std::vector& v) +{ for(const auto x : v) + { stream << x << " " ; } + return stream ; +} + +int main() +{ // path + std::string bed_file = "/local/groux/scATAC-seq/data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks_rmsk_sampled.bed" ; + std::string bam_file = "/local/groux/scATAC-seq/data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" ; + std::string bai_file = "/local/groux/scATAC-seq/data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" ; + std::string prob_file = "/local/groux/scATAC-seq/results/10xgenomics_PBMC_5k_peaks_classification_6/peaks_rmsk_sampled_sequences_1kb_23class_prob.mat4d" ; + + // posterior prob + std::cerr << "loading posterior prob" << std::endl ; + Matrix4D prob(prob_file) ; + size_t n_row = prob.get_dim()[0] ; + size_t n_class = prob.get_dim()[1] ; + size_t n_shift = prob.get_dim()[2] ; + size_t n_flip = prob.get_dim()[3] ; + bool flip = n_flip == 2 ; + std::cerr << "posterior prob " << prob.get_dim() << std::endl ; + + // class prob + std::cerr << "computing class probabilities" << std::endl ; + std::vector prob_colsum(n_class, 0.) ; + double tot = 0. ; + for(size_t i=0; i data2 = mc.create_matrix() ; + size_t n_col2 = data2.get_ncol() ; + std::cerr << "data2 matrix " << data2.get_dim() << std::endl ; + + // realign matrix + std::cerr << "computing data3 matrix" << std::endl ; + size_t n_col3 = to1 - from1 + 1 ; + Matrix2D data3(n_row, + n_col3, + 0.) 
; + std::cerr << "data3 matrix " << data3.get_dim() << std::endl ; + for(size_t i=0; i= to_dat2_rev; + j_dat2_rev--, j_dat3_fw++) + { data3(i,j_dat3_fw) += + (prob(i,class_k,s,1) * + data2(i,j_dat2_rev)) / + prob_colsum[class_k] ; + } + } + } + } + // clean memory + prob = Matrix4D() ; + + // convert to integer + Matrix2D data4(data3.get_nrow(), + data3.get_ncol()) ; + for(size_t i=0; i< data4.get_nrow(); i++) + { for(size_t j=0; j