diff --git a/res/A.png.save b/res/A.png.save deleted file mode 100644 index 5d41180..0000000 Binary files a/res/A.png.save and /dev/null differ diff --git a/res/C.png.save b/res/C.png.save deleted file mode 100644 index 7a818e5..0000000 Binary files a/res/C.png.save and /dev/null differ diff --git a/res/G.png.save b/res/G.png.save deleted file mode 100644 index 355b078..0000000 Binary files a/res/G.png.save and /dev/null differ diff --git a/res/T.png.save b/res/T.png.save deleted file mode 100644 index f69fa19..0000000 Binary files a/res/T.png.save and /dev/null differ diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.R b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.R deleted file mode 100644 index 711629c..0000000 --- a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.R +++ /dev/null @@ -1,223 +0,0 @@ -setwd(file.path("/", "local", "groux", "scATAC-seq")) - -# libraries -library(RColorBrewer) - -# functions -source(file.path("scripts", "functions.R")) - - -################## aggregations around CTCF motifs ################## - -# data -# open chromatin -data.open.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin1bp_fragment.mat"))) -data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin2bp_fragment.mat"))) -data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin10bp_fragment.mat"))) - -data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin1bp_read.mat"))) -data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin2bp_read.mat"))) -data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin10bp_read.mat"))) - -data.open.1.atac = as.matrix(read.table(file.path("results", 
"10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin1bp_read_atac.mat"))) -data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin2bp_read_atac.mat"))) -data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin10bp_read_atac.mat"))) - -# mono-nucleosomes -data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin1bp_fragment.mat"))) -data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin2bp_fragment.mat"))) -data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin10bp_fragment.mat"))) - -data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin1bp_read.mat"))) -data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin2bp_read.mat"))) -data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin10bp_read.mat"))) - -data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin1bp_fragment_center.mat"))) -data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin2bp_fragment_center.mat"))) -data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin10bp_fragment_center.mat"))) - -# di-nucleosomes -data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin1bp_fragment.mat"))) -data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin2bp_fragment.mat"))) -data.2nucl.10.frag = 
as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin10bp_fragment.mat"))) - -data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin1bp_read.mat"))) -data.2nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin2bp_read.mat"))) -data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin10bp_read.mat"))) - -data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin1bp_fragment_center.mat"))) -data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin2bp_fragment_center.mat"))) -data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin10bp_fragment_center.mat"))) - -# mono-nucleosomes from di-nucleosome data -data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin1bp_fragment.mat"))) -data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin2bp_fragment.mat"))) -data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin10bp_fragment.mat"))) - -data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin1bp_read.mat"))) -data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin2bp_read.mat"))) -data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin10bp_read.mat"))) - -data.nucls.1.cent = as.matrix(read.table(file.path("results", 
"10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin1bp_fragment_center.mat"))) -data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin2bp_fragment_center.mat"))) -data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin10bp_fragment_center.mat"))) - - -# colors -col = brewer.pal(4, "Set1") - -# x-axis -axis.at.1 = seq(0, ncol(data.open.1.frag), length.out =5) -axis.lab.1 = seq(-400, 400, by=200) -axis.at.2 = seq(0, ncol(data.open.2.frag), length.out =5) -axis.lab.2 = seq(-400, 400, by=200) -axis.at.10 = seq(0, ncol(data.open.10.frag), length.out=5) -axis.lab.10 = seq(-1000, 1000, by=500) - -# X11(width=12, height=12) -png(filename=file.path("results/10xgenomics_PBMC_5k/ctcf_motifs_10e-6_chr1_aggregations.png"), - units="in", res=720, width=12, height=9) - m = matrix(nrow=4, ncol=4, - data=c(16,13,14,15, - 10, 1, 4, 7, - 11, 2, 5, 8, - 12, 3, 6, 9), byrow=T) - l = layout(mat=m, widths=c(0.2, 1, 1, 1), heights=c(0.2, 1, 1, 1)) - layout.show(l) - - p = par(mar=c(5.1, 5.1, 4.1, 2.1)) - - # 1bp resolution - ## entire fragments - plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l', - main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', - cex.axis=2, cex.lab=2) - lines(colMeans(data.open.1.frag), col=col[1], lwd=3) - lines(colMeans(data.1nucl.1.frag), col=col[2], lwd=3) - lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3) - lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3) - axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) - ## entire reads - plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l', - main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', - cex.axis=2, cex.lab=2) - lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3) - lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3) - lines(colMeans(data.nucls.1.read), col=col[4], lwd=3) - axis(side=1, at=axis.at.1, 
labels=axis.lab.1, cex.axis=1.8) - ## atac reads and centers - plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)), - col=col[1], lwd=3, type='l', xaxt='n', - main="", xlab="pos[bp]", ylab="Prop max signal", - cex.axis=2, cex.lab=2) - lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)), - col=col[2], lwd=3) - lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)), - col=col[3], lwd=3) - lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)), - col=col[4], lwd=3) - axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) - - # 2bp resolution - ## entire fragments - plot(colMeans(data.open.2.frag), col=col[1], lwd=3, type='l', - main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', - cex.axis=2, cex.lab=2) - lines(colMeans(data.1nucl.2.frag), col=col[2], lwd=3) - lines(colMeans(data.2nucl.2.frag), col=col[3], lwd=3) - lines(colMeans(data.nucls.2.frag), col=col[4], lwd=3) - axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) - ## entire reads - plot(colMeans(data.open.2.read), col=col[1], lwd=3, type='l', - main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', - cex.axis=2, cex.lab=2) - lines(colMeans(data.1nucl.2.read), col=col[2], lwd=3) - lines(colMeans(data.2nucl.2.read), col=col[3], lwd=3) - lines(colMeans(data.nucls.2.read), col=col[4], lwd=3) - axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) - ## atac reads and centers - plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)), - col=col[1], lwd=3, type='l', xaxt='n', - main="", xlab="pos[bp]", ylab="Prop max signal", - cex.axis=2, cex.lab=2) - lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)), - col=col[2], lwd=3) - lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)), - col=col[3], lwd=3) - lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)), - col=col[4], lwd=3) - axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) - - # 10bp resolution - ## entire fragments - 
plot(colMeans(data.open.10.frag), col=col[1], lwd=3, type='l', - main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', - cex.axis=2, cex.lab=2) - lines(colMeans(data.1nucl.10.frag), col=col[2], lwd=3) - lines(colMeans(data.2nucl.10.frag), col=col[3], lwd=3) - lines(colMeans(data.nucls.10.frag), col=col[4], lwd=3) - axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) - ## entire reads - plot(colMeans(data.open.10.read), col=col[1], lwd=3, type='l', - main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', - cex.axis=2, cex.lab=2) - lines(colMeans(data.1nucl.10.read), col=col[2], lwd=3) - lines(colMeans(data.2nucl.10.read), col=col[3], lwd=3) - lines(colMeans(data.nucls.10.read), col=col[4], lwd=3) - axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) - ## atac reads and centers - plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)), - col=col[1], lwd=3, type='l', xaxt='n', - main="", xlab="pos[bp]", ylab="Prop max signal", - cex.axis=2, cex.lab=2) - lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)), - col=col[2], lwd=3) - lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)), - col=col[3], lwd=3) - lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)), - col=col[4], lwd=3) - axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) - - # some legends over the rows and columns - p = par(mar=c(0,0,0,0)) - plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') - text(0, 0, labels="FRAGMENTS", cex=2, srt=90) - - plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') - text(0, 0, labels="READS", cex=2, srt=90) - - plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') - text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90) - - plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') - text(0, 0, labels="+/-400bp by 1bp", cex=2) - - plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') - text(0, 0, labels="+/-400bp by 2bp", cex=2) - - plot(0, 
0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') - text(0, 0, labels="+/-1kp by 10bp", cex=2) - - par(p) -dev.off() - - - -# footprint -# x-axis -axis.at.fp = seq(0, 200, length.out=3) -axis.lab.fp = seq(-100, 100, by=100) - - -# X11(width=8, height=4) -png(filename=file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_footprint.png"), - units="in", res=720, width=8, height=4) - p = par(mar=c(5.1, 5.1, 4.1, 2.1)) - plot(colMeans(data.open.1.atac[,300:500]), type='l', lwd=3, col=col[1], - main="CTCF motif", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', - cex.axis=2, cex.lab=2) - abline(v=90, lwd=3, lty=2) - abline(v=110, lwd=3, lty=2) - axis(side=1, at=axis.at.fp, labels=axis.lab.fp, cex.axis=1.8) - par(p) -dev.off() diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.sh b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.sh deleted file mode 100755 index 6c21352..0000000 --- a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.sh +++ /dev/null @@ -1,67 +0,0 @@ -# some paths -## directories -results_dir='results/10xgenomics_PBMC_5k' -data_dir='data/10xgenomics_PBMC_5k/' -## input -file_bed=$data_dir'/ctcf_motifs_10e-6_chr1.bed' -file_bam_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" -file_bai_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" -file_bam_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam" -file_bai_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai" -file_bam_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam" -file_bai_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai" -file_bam_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam" -file_bai_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai" - -mkdir -p $results_dir - -# matrix creation -## open chromatin around CTCF motif -for method in 'read' 'read_atac' 'fragment' -do - 
file_mat_open_1="$results_dir/ctcf_motifs_10e-6_chr1_open_bin1bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1 - file_mat_open_2="$results_dir/ctcf_motifs_10e-6_chr1_open_bin2bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2 - file_mat_open_10="$results_dir/ctcf_motifs_10e-6_chr1_open_bin10bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10 -done - -## mono around CTCF motif -for method in 'read' 'fragment' 'fragment_center' -do - ### mono nucleosomes - file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_chr1_1nucl_bin1bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 - file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_chr1_1nucl_bin2bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 - file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_chr1_1nucl_bin10bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 -done - - -## di nucleosomes around CTCF motif -for method in 'read' 'fragment' 'fragment_center' -do - ### di nucleosomes - file_mat_2nucl_1="$results_dir/ctcf_motifs_10e-6_chr1_2nucl_bin1bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1 - 
file_mat_2nucl_2="$results_dir/ctcf_motifs_10e-6_chr1_2nucl_bin2bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2 - file_mat_2nucl_10="$results_dir/ctcf_motifs_10e-6_chr1_2nucl_bin10bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10 -done - - -## mono nucleosomes from processed di-nucleosome data around CTCF motif -for method in 'read' 'fragment' 'fragment_center' -do - ### mono nucleosomes - file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin1bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 - file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin2bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 - file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin10bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 -done - diff --git a/scripts/10xgenomics_PBMC_5k/analysis_peaks.sh b/scripts/10xgenomics_PBMC_5k/analysis_peaks.sh index 4979f3a..2c95a12 100755 --- a/scripts/10xgenomics_PBMC_5k/analysis_peaks.sh +++ b/scripts/10xgenomics_PBMC_5k/analysis_peaks.sh @@ -1,38 +1,68 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k' data_dir='data' read_dir="$data_dir/10xgenomics_PBMC_5k" seq_dir="$data_dir/genomes" -## input1 +## input file_bed=$read_dir'/atac_v1_pbmc_5k_peaks.bed' +file_bed_rmsk=$read_dir'/atac_v1_pbmc_5k_peaks_rmsk.bed' 
file_bam_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" file_bai_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" file_bam_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam" file_bai_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai" file_hg19="$seq_dir/hg19.fasta" +file_rmsk="$seq_dir/hg19_rmsk.bed" mkdir -p $results_dir +# repeat mask +# remove any peak that has at least 50% of its length overlapping a repeated region (its +# center is inside the region, this is somewhat equivalent to what is done on ccg webinterface +# when checking the repeatMask on option) +bin/bedtools/subtractBed -f 0.5 -A -a data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed -b data/genomes/hg19_rmsk.bed > $file_bed_rmsk + + +# sampled from bed +file_bed_rmsk_2=$read_dir'/atac_v1_pbmc_5k_peaks_rmsk_sampled.bed' +shuf $file_bed_rmsk | head -n 10000 > $file_bed_rmsk_2 + # matrix creation -## sequences -file_mat_seq="$results_dir/peaks_sequences.mat" -bin/SequenceMatrixCreator --bed $file_bed --fasta $file_hg19 --from -500 --to 500 > $file_mat_seq +## 1kb sequences +file_mat_seq_1kb_1="$results_dir/peaks_rmsk_sequences_1kb.mat" +file_mat_seq_1kb_2="$results_dir/peaks_rmsk_sampled_sequences_1kb.mat" +bin/SequenceMatrixCreator --bed $file_bed_rmsk --fasta $file_hg19 --from -500 --to 500 > $file_mat_seq_1kb_1 +bin/SequenceMatrixCreator --bed $file_bed_rmsk_2 --fasta $file_hg19 --from -500 --to 500 > $file_mat_seq_1kb_2 +## 2kb sequences +file_mat_seq_2kb_1="$results_dir/peaks_rmsk_sequences_2kb.mat" +file_mat_seq_2kb_2="$results_dir/peaks_rmsk_sampled_sequences_2kb.mat" +bin/SequenceMatrixCreator --bed $file_bed_rmsk --fasta $file_hg19 --from -1000 --to 1000 > $file_mat_seq_2kb_1 +bin/SequenceMatrixCreator --bed $file_bed_rmsk_2 --fasta $file_hg19 --from -1000 --to 1000 > $file_mat_seq_2kb_2 ## open chromatin around peaks for method in 'read_atac' do - file_mat_open_1="$results_dir/peaks_open_bin1bp_$method.mat" - 
bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -500 --to 500 --binSize 1 --method $method > $file_mat_open_1 + file_mat_open_1kb_1="$results_dir/peaks_rmsk_open_bin1bp_1kb_$method.mat" + file_mat_open_1kb_2="$results_dir/peaks_rmsk_sampled_open_bin1bp_1kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_open --bai $file_bai_open --from -500 --to 500 --binSize 1 --method $method > $file_mat_open_1kb_1 + bin/CorrelationMatrixCreator --bed $file_bed_rmsk_2 --bam $file_bam_open --bai $file_bai_open --from -500 --to 500 --binSize 1 --method $method > $file_mat_open_1kb_2 + file_mat_open_2kb_1="$results_dir/peaks_rmsk_open_bin1bp_2kb_$method.mat" + file_mat_open_2kb_2="$results_dir/peaks_rmsk_sampled_open_bin1bp_2kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_open_2kb_1 + bin/CorrelationMatrixCreator --bed $file_bed_rmsk_2 --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_open_2kb_2 done - ## all nucleosomes around peaks for method in 'fragment_center' do - ### mono nucleosomes - file_mat_nucl_1="$results_dir/peaks_nucleosomes_bin1bp_$method.mat" - bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -500 --to 500 --binSize 1 --method $method > $file_mat_nucl_1 + file_mat_nucl_1kb_1="$results_dir/peaks_rmsk_nucleosomes_bin1bp_1kb_$method.mat" + file_mat_nucl_1kb_2="$results_dir/peaks_rmsk_sampled_nucleosomes_bin1bp_1kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_nucl --bai $file_bai_nucl --from -500 --to 500 --binSize 1 --method $method > $file_mat_nucl_1kb_1 + bin/CorrelationMatrixCreator --bed $file_bed_rmsk_2 --bam $file_bam_nucl --bai $file_bai_nucl --from -500 --to 500 --binSize 1 --method $method > $file_mat_nucl_1kb_2 + 
file_mat_nucl_2kb_1="$results_dir/peaks_rmsk_nucleosomes_bin1bp_2kb_$method.mat" + file_mat_nucl_2kb_2="$results_dir/peaks_rmsk_sampled_nucleosomes_bin1bp_2kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_nucl_2kb_1 + bin/CorrelationMatrixCreator --bed $file_bed_rmsk_2 --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_nucl_2kb_2 done diff --git a/scripts/10xgenomics_PBMC_5k/process_data.sh b/scripts/10xgenomics_PBMC_5k/process_data.sh index 755fe66..2f11be8 100755 --- a/scripts/10xgenomics_PBMC_5k/process_data.sh +++ b/scripts/10xgenomics_PBMC_5k/process_data.sh @@ -1,19 +1,17 @@ mkdir -p data/10xgenomics_PBMC_5k # download 10xGenomics 5k PBMC ss-ATAC-seq dataset wget -O data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted.bam http://s3-us-west-2.amazonaws.com/10x.files/samples/cell-atac/1.0.1/atac_v1_pbmc_5k/atac_v1_pbmc_5k_possorted_bam.bam # download some barecode informations wget -O data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_singlecell.csv http://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_5k/atac_v1_pbmc_5k_singlecell.csv # download their peaks wget -O data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed http://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_5k/atac_v1_pbmc_5k_peaks.bed sed -E s/^\([0-9XY]+\)/chr\\1/ data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed | grep -E ^chr | sort -k 1,1V -k2,2n -k3,3n > data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks_sort.bed mv data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks_sort.bed data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed -# get only peaks on chr1 -grep -E '^chr1[[:space:]]' data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed > data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks_chr1.bed # extract the barecodes corresponding to cells, based on 10XGenomics analysis grep -E _cell_[0-9]+ 
data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_singlecell.csv | cut -d ',' -f 1 > data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_barcodes.txt # filter out reads which do not have a proper barcode python3.6 scripts/bam_tools/filter_bam.py -i data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted.bam --tag CB --values data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_barcodes.txt -o data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered.bam diff --git a/scripts/10xgenomics_PBMC_5k/run_all.sh b/scripts/10xgenomics_PBMC_5k/run_all.sh index e961146..0d10fa5 100755 --- a/scripts/10xgenomics_PBMC_5k/run_all.sh +++ b/scripts/10xgenomics_PBMC_5k/run_all.sh @@ -1,11 +1,6 @@ # download the data, filter them and split by fragment size mkdir -p data/10xgenomics_PBMC_5k scripts/10xgenomics_PBMC_5k/process_data.sh scripts/10xgenomics_PBMC_5k/split_by_size.sh - -# analyse chromosome 1 -scripts/10xgenomics_PBMC_5k/analysis_chr1.sh -Rscript scripts scripts/10xgenomics_PBMC_5k/analysis_chr1.R - diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R index 1685922..8df2587 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R +++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R @@ -1,96 +1,172 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") -################## sequence patterns around ctcf motifs ################## +################## open chromatin patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin data = 
read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", sprintf("ctcf_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 
plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } + +################## nucleosomes chromatin patterns around ctcf motifs ################## + +for(k in k.min:k.max) +{ + # open chromatin + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_open_read_atac_model.mat", k))) + model.open = data$models + model.prob = data$prob + data = NULL + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models + # sequence + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_1", + sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_sequences_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", + sprintf("ctcf_motifs_10e-6_classification_1nucl_bin1bp_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) 
+ # x-axis + x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) + x.at = (x.lab + ncol(ref.open)) / 2 + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + row_n = 1 # row counter + col_n = 1 # column counter + for(i in 1:nrow(ref.open)) + { # plot logo center + right = 0.5*col_n - 0.01 + left = right - 0.2 + bottom = 1-(row_n*(0.2))+0.05 + top = bottom + 0.15 + par(fig=c(left, right, bottom, top), new=T) + idx = 380:420 + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = 1:length(idx) + axis(1, at=x.at, labels=x.at) + # yaxis + x.at = seq(0, 2, by=1) + axis(2, at=x.at, labels=x.at) + row_n = row_n + 1 + if(i %% 5 == 0) + { col_n = col_n + 1 + row_n = 1 + } + } + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh index 6b59c2f..da45b83 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh @@ -1,53 +1,52 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_1' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p 
$results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=3 +n_core=8 # open chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done # 1nucl chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat' 
file_mod3=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh index 5f7872e..931e907 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh @@ -1,53 +1,52 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_1' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=3 +n_core=8 # open chromatin for k in 1 2 3 4 5 6 7 8 9 10 do 
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d' file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat' file_mod3=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat' file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done # 1nucl chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat' file_mod3=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat' file_aic=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt' echo "$file_prob 
$seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh index 801a070..e95bb7e 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh @@ -1,53 +1,53 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_1' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/myc_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=3 +n_core=8 # open chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d' 
file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat' file_mod3=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat' file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done # 1nucl chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat' file_mod3=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat' file_aic=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread 
$n_core > $file_prob - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --seq $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done + diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh index 8e34470..c7ca927 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh @@ -1,53 +1,52 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_1' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" ## file with seeds file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=3 +n_core=8 # open chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat' 
file_mod3=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_sequences_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done # 1nucl chromatin for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat' file_mod3=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> 
$file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R index a8f1f35..193ff0b 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R +++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R @@ -1,96 +1,98 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("ctcf_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot 
classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("ctcf_motifs_10e-6_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis - x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) - x.at = (x.lab + ncol(ref.open)) / 2 + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal - x.at = seq(0, 1, 0.5) - axis(2, at=x.at, labels=x.at) + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } + # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) - idx = 380:420 + idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis - x.at = 1:length(idx) - axis(1, at=x.at, labels=x.at) + x.at = seq(1, length(idx), 
length.out = 3) + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + axis(1, at=x.at, labels=x.lab) # yaxis - x.at = seq(0, 2, by=1) - axis(2, at=x.at, labels=x.at) + axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh index 5cc89c7..341fe6a 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh @@ -1,37 +1,36 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_2' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=3 +n_core=12 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_open --seq $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - 
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R index cb92556..e3efefd 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R @@ -1,96 +1,98 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ebf1 motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", 
k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("ebf1_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("ebf1_motifs_10e-6_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis - x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) - x.at = (x.lab + ncol(ref.open)) / 2 + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal - x.at = seq(0, 1, 0.5) - axis(2, at=x.at, labels=x.at) + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } + # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) - idx = 380:420 + idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / 
max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis - x.at = 1:length(idx) - axis(1, at=x.at, labels=x.at) + x.at = seq(1, length(idx), length.out = 3) + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + axis(1, at=x.at, labels=x.lab) # yaxis - x.at = seq(0, 2, by=1) - axis(2, at=x.at, labels=x.at) + axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh index d9fbe7d..ce5cdc0 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh @@ -1,37 +1,36 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_2' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=3 +n_core=12 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ebf1_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> 
$file_seed - bin/ChIPPartitioning --read $file_mat_open --seq $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R b/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R index 902b5d9..c79b248 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R +++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R @@ -1,96 +1,98 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around myc motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # 
nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("myc_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("myc_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("myc_motifs_10e-6_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis - x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) - x.at = (x.lab + ncol(ref.open)) / 2 + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal - x.at = seq(0, 1, 0.5) - axis(2, at=x.at, labels=x.at) + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } + # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) - idx = 380:420 + idx = 
(391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis - x.at = 1:length(idx) - axis(1, at=x.at, labels=x.at) + x.at = seq(1, length(idx), length.out = 3) + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + axis(1, at=x.at, labels=x.lab) # yaxis - x.at = seq(0, 2, by=1) - axis(2, at=x.at, labels=x.at) + axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 - row_n = 1 + row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh index 0b1d83a..231485b 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh @@ -1,37 +1,36 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_2' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/myc_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=3 +n_core=12 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' 
file_mod3=$results_dir/'myc_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_open --seq $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R index f53e34b..24d95e0 100644 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R @@ -1,96 +1,98 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around sp1 motifs ################## for(k in k.min:k.max) { # open chromatin data = 
read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("sp1_motifs_10e-7_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", sprintf("sp1_motifs_10e-7_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis - x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) - x.at = (x.lab + ncol(ref.open)) / 2 + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal - x.at = seq(0, 1, 0.5) - axis(2, at=x.at, labels=x.at) + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } + # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 
1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) - idx = 380:420 + idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis - x.at = 1:length(idx) - axis(1, at=x.at, labels=x.at) + x.at = seq(1, length(idx), length.out = 3) + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + axis(1, at=x.at, labels=x.lab) # yaxis - x.at = seq(0, 2, by=1) - axis(2, at=x.at, labels=x.at) + axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh index 2a18d68..7ba3cf0 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh @@ -1,37 +1,35 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_2' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" ## file with seeds file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=3 +n_core=12 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) 
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_open --seq $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 - + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.sh index f47c2de..452d48c 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_3/classification_ctcf_motif.sh @@ -1,39 +1,38 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_3' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" 
file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=8 +n_core=12 -# open chromatin and nucleosomes +# sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - # bin/ChIPPartitioning --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - # bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - # bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 - # bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread 
$n_core 1> $file_mod3 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh index 0779c14..a37b533 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh @@ -1,39 +1,38 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_3' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat" file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" ## file with seeds file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -seeding='random' -n_core=8 +n_core=12 -# open chromatin and nucleosomes +# sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob 
--thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.sh index 621241a..e91d7fb 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_4/classification_ctcf_motif.sh @@ -1,39 +1,38 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_classification_4' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='1' -seeding='random' -n_core=8 +n_core=12 -# open chromatin and nucleosomes +# sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d' 
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks.R b/scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.R similarity index 84% copy from scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks.R copy to scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.R index 93d8eae..3dc0ab1 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks.R +++ 
b/scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.R @@ -1,96 +1,96 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") -################## sequence patterns around ctcf motifs ################## +################## sequence patterns around sp1 motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", - sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) + sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", - sprintf("ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_%dclass_model.mat", k)))$models + sprintf("sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", - sprintf("ctcf_motifs_10e-6_sequences_%dclass_model.mat", k)))$models + sprintf("sp1_motifs_10e-7_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_4", - sprintf("ctcf_motifs_10e-6_classification_sequences_%dclass.png", k)), + sprintf("sp1_motifs_10e-7_classification_sequences_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, 
decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.sh similarity index 63% copy from scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh copy to scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.sh index 0779c14..5381f5c 100755 --- a/scripts/10xgenomics_PBMC_5k_classification_3/classification_sp1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_classification_4/classification_sp1_motif.sh @@ -1,39 
+1,38 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k_classification_3' +results_dir='results/10xgenomics_PBMC_5k_classification_4' data_dir='results/10xgenomics_PBMC_5k' ## input file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat" file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" ## file with seeds file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' -n_shift='21' -seeding='random' -n_core=8 +n_shift='1' +n_core=12 -# open chromatin and nucleosomes +# sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d' file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat' file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed 
--thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks/analysis_peaks.sh b/scripts/10xgenomics_PBMC_5k_peaks/analysis_peaks.sh new file mode 100755 index 0000000..8869f9e --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks/analysis_peaks.sh @@ -0,0 +1,35 @@ +# some paths +## directories +data_dir='data/10xgenomics_PBMC_5k' +seq_dir='data/genomes' +results_dir='data/10xgenomics_PBMC_5k_peaks' +## input +file_bed_rmsk=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk.bed' +file_bam_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" +file_bai_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" +file_bam_nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam" +file_bai_nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai" +file_seq="$seq_dir/hg19.fasta" + +mkdir -p $results_dir + +# matrix creation +## 1kb sequences +file_mat_seq_1kb="$results_dir/peaks_rmsk_sequences_1kb.mat" +bin/SequenceMatrixCreator --bed $file_bed_rmsk --fasta $file_seq --from -500 --to 500 > $file_mat_seq_1kb + +## open chromatin around peaks +for method in 'read_atac' +do + file_mat_open_1kb="$results_dir/peaks_rmsk_openchromatin_1kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_open --bai $file_bai_open --from -500 --to 500 --binSize 1 --method $method > $file_mat_open_1kb +done + +## all nucleosomes around peaks +for method in 'fragment_center' +do + file_mat_nucl_1kb="$results_dir/peaks_rmsk_nucleosomes_1kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk --bam $file_bam_nucl --bai $file_bai_nucl --from -500 --to 500 
--binSize 1 --method $method > $file_mat_nucl_1kb +done + + diff --git a/scripts/10xgenomics_PBMC_5k_peaks/analysis_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks/analysis_peaks_sampled.sh new file mode 100755 index 0000000..1712862 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks/analysis_peaks_sampled.sh @@ -0,0 +1,51 @@ +# some paths +## directories +data_dir='data/10xgenomics_PBMC_5k' +seq_dir='data/genomes' +results_dir='data/10xgenomics_PBMC_5k_peaks' +## input +file_bed=$data_dir'/atac_v1_pbmc_5k_peaks.bed' +file_bam_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" +file_bai_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" +file_bam_nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam" +file_bai_nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai" +file_seq="$seq_dir/hg19.fasta" +file_rmsk="$seq_dir/hg19_rmsk.bed" + +mkdir -p $results_dir + +# filter out peaks with >=30% repeated region inside +file_bed_rmsk=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk.bed' +bin/bedtools/bedtools subtract -A -f 0.3 -a $file_bed -b $file_rmsk > $file_bed_rmsk + +# sampled from the repeat-filtered bed +file_bed_rmsk_sample=$data_dir'/atac_v1_pbmc_5k_peaks_rmsk_sampled.bed' +shuf $file_bed_rmsk | head -n 10000 > $file_bed_rmsk_sample + +# matrix creation +## 1kb sequences +file_mat_seq_1kb="$results_dir/peaks_rmsk_sampled_sequences_1kb.mat" +bin/SequenceMatrixCreator --bed $file_bed_rmsk_sample --fasta $file_seq --from -500 --to 500 > $file_mat_seq_1kb +## 2kb sequences +file_mat_seq_2kb="$results_dir/peaks_rmsk_sampled_sequences_2kb.mat" +bin/SequenceMatrixCreator --bed $file_bed_rmsk_sample --fasta $file_seq --from -1000 --to 1000 > $file_mat_seq_2kb + +## open chromatin around peaks +for method in 'read_atac' +do + file_mat_open_1kb="$results_dir/peaks_rmsk_sampled_openchromatin_1kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk_sample --bam $file_bam_open --bai $file_bai_open --from -500 --to 500 --binSize 1 
--method $method > $file_mat_open_1kb + file_mat_open_2kb="$results_dir/peaks_rmsk_sampled_openchromatin_2kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk_sample --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_open_2kb +done + +## all nucleosomes around peaks +for method in 'fragment_center' +do + file_mat_nucl_1kb="$results_dir/peaks_rmsk_sampled_nucleosomes_1kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk_sample --bam $file_bam_nucl --bai $file_bai_nucl --from -500 --to 500 --binSize 1 --method $method > $file_mat_nucl_1kb + file_mat_nucl_2kb="$results_dir/peaks_rmsk_sampled_nucleosomes_2kb_$method.mat" + bin/CorrelationMatrixCreator --bed $file_bed_rmsk_sample --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 1 --method $method > $file_mat_nucl_2kb +done + + diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R new file mode 100644 index 0000000..4e371a7 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R @@ -0,0 +1,96 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the number of classes searched +n.classes = c(10, 20, 30) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around ctcf motifs ################## + +for(k in n.classes) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_1", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + data = NULL + + # open chromatin + model.open = 
read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_1", + sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model.mat", k)))$models + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_1", + sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=26, height=12) + png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_1", + sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:30, nrow=6, ncol=5, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + # inlets with center + # row_n = 1 # row counter + # col_n = 1 # column counter + # for(i in 1:nrow(ref.open)) + # { # plot logo center + # right = 0.5*col_n - 0.01 + # left = right - 0.2 + # bottom = 1-(row_n*(0.2))+0.05 + # top = bottom + 0.15 + # par(fig=c(left, right, bottom, top), new=T) + # idx = (391-1-20):(391+1+20) + # plot.logo(ref.seq[,idx,i], path.a, 
path.c, path.g, path.t) + # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # axis(1, at=x.at, labels=x.lab) + # # yaxis + # axis(2, at=y.at, labels=y.lab) + # row_n = row_n + 1 + # if(i %% 5 == 0) + # { col_n = col_n + 1 + # row_n = 1 + # } + # } + dev.off() +} + diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks.sh deleted file mode 100755 index 0ea6153..0000000 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks.sh +++ /dev/null @@ -1,50 +0,0 @@ -# some paths -## directories -results_dir='results/10xgenomics_PBMC_5k_peaks_classification_1' -data_dir='results/10xgenomics_PBMC_5k' -## input -file_mat_open="$data_dir/peaks_open_bin1bp_read_atac.mat" -file_mat_nucl="$data_dir/peaks_nucleosomes_bin1bp_read_atac.mat" -file_mat_seq="$data_dir/peaks_sequences.mat" - -## file with seeds -file_seed=$results_dir'/peaks_seed.txt' - -mkdir -p $results_dir -touch $file_seed - -# parameters -n_iter='20' -n_shift='201' -seeding='random' -n_core=8 - -# open chromatin and nucleosomes -# for k in 10 20 30 -# do -# seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) -# file_prob=$results_dir/'peaks_sequences_'$k'class_prob.mat4d' -# file_mod1=$results_dir/'peaks_openchromatin_bin1bp_'$k'class_model.mat' -# file_mod2=$results_dir/'peaks_nucleosomes_bin1bp_'$k'class_model.mat' -# file_mod3=$results_dir/'peaks_sequences_'$k'class_model.mat' -# echo "$file_prob $seed" >> $file_seed -# bin/ChIPPartitioning --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob -# 
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 -# bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 -# bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 -# bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 -# done - -k=5 -seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) -file_prob=$results_dir/'peaks_sequences_'$k'class_prob.mat4d' -file_mod1=$results_dir/'peaks_openchromatin_bin1bp_'$k'class_model.mat' -file_mod2=$results_dir/'peaks_nucleosomes_bin1bp_'$k'class_model.mat' -file_mod3=$results_dir/'peaks_sequences_'$k'class_model.mat' -echo "$file_prob $seed" >> $file_seed -bin/ChIPPartitioning --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob -bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 -bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 -bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 -bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 - diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh new file mode 100755 index 0000000..618a604 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh @@ -0,0 +1,35 @@ + +# paths +## dir +data_dir="data/10xgenomics_PBMC_5k_peaks" +results_dir="results/10xgenomics_PBMC_5k_peaks_classification_1" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' +## file with seeds 
+file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='981' +n_core=24 + +# classify +for k in 10 20 30 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --bgclass --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R new file mode 100644 index 0000000..fba2a15 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R @@ -0,0 +1,96 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the number of classes searched +n.classes = c(10, 20, 30) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around ctcf motifs ################## + +for(k in n.classes) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_2", 
+ sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + data = NULL + + # open chromatin + model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_2", + sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model.mat", k)))$models + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_2", + sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=26, height=12) + png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_2", + sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:30, nrow=6, ncol=5, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + # inlets with center + # row_n = 1 # row counter + # col_n = 1 # column counter + # for(i in 1:nrow(ref.open)) + # { # plot logo center + # right = 0.5*col_n - 0.01 + # left = right - 0.2 + # bottom = 
1-(row_n*(0.2))+0.05 + # top = bottom + 0.15 + # par(fig=c(left, right, bottom, top), new=T) + # idx = (391-1-20):(391+1+20) + # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # axis(1, at=x.at, labels=x.lab) + # # yaxis + # axis(2, at=y.at, labels=y.lab) + # row_n = row_n + 1 + # if(i %% 5 == 0) + # { col_n = col_n + 1 + # row_n = 1 + # } + # } + dev.off() +} + diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh new file mode 100755 index 0000000..03c84de --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh @@ -0,0 +1,35 @@ + +# paths +## dir +data_dir="data/10xgenomics_PBMC_5k_peaks" +results_dir="results/10xgenomics_PBMC_5k_peaks_classification_2" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='981' +n_core=24 + +# classify +for k in 10 20 30 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' + 
file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R new file mode 100644 index 0000000..d0bf0d6 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R @@ -0,0 +1,96 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the number of classes searched +n.classes = c(10, 20, 30) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around ctcf motifs ################## + +for(k in n.classes) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_3", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + data = NULL + + # open chromatin + model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_3", + sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model.mat", k)))$models + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_3", + 
sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=26, height=12) + png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_3", + sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:30, nrow=6, ncol=5, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + # inlets with center + # row_n = 1 # row counter + # col_n = 1 # column counter + # for(i in 1:nrow(ref.open)) + # { # plot logo center + # right = 0.5*col_n - 0.01 + # left = right - 0.2 + # bottom = 1-(row_n*(0.2))+0.05 + # top = bottom + 0.15 + # par(fig=c(left, right, bottom, top), new=T) + # idx = (391-1-20):(391+1+20) + # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # 
x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # axis(1, at=x.at, labels=x.lab) + # # yaxis + # axis(2, at=y.at, labels=y.lab) + # row_n = row_n + 1 + # if(i %% 5 == 0) + # { col_n = col_n + 1 + # row_n = 1 + # } + # } + dev.off() +} + diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh new file mode 100755 index 0000000..a396e60 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh @@ -0,0 +1,35 @@ + +# paths +## dir +data_dir="data/10xgenomics_PBMC_5k_peaks" +results_dir="results/10xgenomics_PBMC_5k_peaks_classification_3" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='981' +n_core=24 + +# classify +for k in 10 20 30 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_openchromatin-sequences_1kb_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> 
$file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R new file mode 100644 index 0000000..df6959c --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R @@ -0,0 +1,96 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the number of classes searched +n.classes = c(17, 20, 30) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around ctcf motifs ################## + +for(k in n.classes) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_4", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + data = NULL + + # open chromatin + model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_4", + sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model.mat", k)))$models + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_4", + sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=26, height=12) + png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_4", + sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:30, nrow=6, ncol=5, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = 
model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + # inlets with center + # row_n = 1 # row counter + # col_n = 1 # column counter + # for(i in 1:nrow(ref.open)) + # { # plot logo center + # right = 0.5*col_n - 0.01 + # left = right - 0.2 + # bottom = 1-(row_n*(0.2))+0.05 + # top = bottom + 0.15 + # par(fig=c(left, right, bottom, top), new=T) + # idx = (391-1-20):(391+1+20) + # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # axis(1, at=x.at, labels=x.lab) + # # yaxis + # axis(2, at=y.at, labels=y.lab) + # row_n = row_n + 1 + # if(i %% 5 == 0) + # { col_n = col_n + 1 + # row_n = 1 + # } + # } + dev.off() +} + diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh new file mode 100755 index 0000000..d87ff4e --- /dev/null +++ 
b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh @@ -0,0 +1,55 @@ + +# paths +## dir +data_dir="data/10xgenomics_PBMC_5k_peaks" +pwm_dir="data/pwm/jaspar_2018_clustering/" +results_dir="results/10xgenomics_PBMC_5k_peaks_classification_4" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='971' +n_core=30 +## PWM files +jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" +hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" +myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" +pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" +cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" +irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" +irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" +lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" +foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" +sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" +mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" +elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" +stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" +nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" +ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" +e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" +ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" + + +# classify +for k in 30 20 17 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' + 
file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R new file mode 100644 index 0000000..41ffc20 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R @@ -0,0 +1,96 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the number of classes searched +n.classes = c(20, 30, 40) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around ctcf motifs ################## + +for(k in n.classes) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_5", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + data = NULL + + # open chromatin + model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_5", + sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model.mat", k)))$models + # nucleosomes + model.nucl = 
read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_5", + sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=26, height=12) + png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_5", + sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:42, nrow=6, ncol=7, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + # inlets with center + # row_n = 1 # row counter + # col_n = 1 # column counter + # for(i in 1:nrow(ref.open)) + # { # plot logo center + # right = 0.5*col_n - 0.01 + # left = right - 0.2 + # bottom = 1-(row_n*(0.2))+0.05 + # top = bottom + 0.15 + # par(fig=c(left, right, bottom, top), new=T) + # idx = (391-1-20):(391+1+20) + # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + # lines(2*(ref.nucl[i,idx] / 
max(ref.nucl[i,])), lwd=1, col=col[2]) + # # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # axis(1, at=x.at, labels=x.lab) + # # yaxis + # axis(2, at=y.at, labels=y.lab) + # row_n = row_n + 1 + # if(i %% 5 == 0) + # { col_n = col_n + 1 + # row_n = 1 + # } + # } + dev.off() +} + diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh new file mode 100755 index 0000000..5f54d1d --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh @@ -0,0 +1,35 @@ + +# paths +## dir +data_dir="data/10xgenomics_PBMC_5k_peaks" +results_dir="results/10xgenomics_PBMC_5k_peaks_classification_5" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='991' +n_core=24 + +# classify +for k in 20 30 40 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel 
--read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R new file mode 100644 index 0000000..d4fc044 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R @@ -0,0 +1,95 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the number of classes searched +n.classes = c(23) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around ctcf motifs ################## + +for(k in n.classes) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k))) + model.seq = data$models + model.prob = data$prob + data = NULL + + # open chromatin + model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", + sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", + sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + X11(width=26, height=12) + # png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", + # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + # units="in", res=720, width=18, height=12) + m = matrix(1:24, nrow=6, ncol=4, byrow=F) + layout(m) + # order from most 
to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F][,316:716] + ref.nucl = model.nucl[ord,, drop=F][,316:716] + ref.seq = model.seq[,,ord, drop=F][,316:716,] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + # inlets with center + # row_n = 1 # row counter + # col_n = 1 # column counter + # for(i in 1:nrow(ref.open)) + # { # plot logo center + # right = 0.25*col_n + 0.03 + # left = right - 0.15 + # bottom = 1-(row_n*(0.2))+0.05 + # top = bottom + 0.15 + # par(fig=c(left, right, bottom, top), new=T) + # idx = (516-1-10):(516+1+10) + # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # axis(1, at=x.at, labels=x.lab) + # # yaxis + # axis(2, at=y.at, labels=y.lab) + # row_n = row_n + 1 + # if(i %% 5 == 0) + # { col_n = col_n + 1 + # row_n = 1 + # } + # } + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh 
b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh new file mode 100755 index 0000000..1efedd4 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh @@ -0,0 +1,76 @@ + +# paths +## dir +data_dir_p="data/10xgenomics_PBMC_5k_peaks" +data_dir="data/10xgenomics_PBMC_5k" +pwm_dir="data/pwm/jaspar_2018_clustering/" +hg19_dir="data/genomes" +results_dir="results/10xgenomics_PBMC_5k_peaks_classification_6" +## matrix files +file_mat_open=$data_dir_p/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' +file_mat_nucl=$data_dir_p/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' +file_mat_seq=$data_dir_p/'peaks_rmsk_sampled_sequences_1kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='1' +n_shift='971' +n_core=8 +## PWM files +jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" +hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" +myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" +pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" +cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" +irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" +irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" +lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" +foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" +sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" +mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" +elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" +stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" +nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" +ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" +e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" +ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" +klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat" +nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat" +egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat" 
+gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat" +nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat" +runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat" + +# classify +for k in 23 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + + # extend models + file_mod1_ext=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model_extended.mat' + file_mod2_ext=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat' + file_mod3_ext=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model_extended.mat' + file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk_sampled.bed' + file_fasta=$hg19_dir/'hg19.fasta' + file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam' + file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai' + file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam' + file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai' + bin/ReadModelExtender --bed $file_bed 
--bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext + bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext + bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext +done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R similarity index 54% rename from scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks.R rename to scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R index 93d8eae..33dcd04 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R @@ -1,96 +1,103 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) -library(seqLogo) # functions source(file.path("scripts", "functions.R")) -# the minimum number of classes searched -k.min = 1 -# the maximum number of classes searched -k.max = 10 +# the number of classes searched +n.classes = c(23) # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ctcf motifs ################## -for(k in k.min:k.max) -{ - # open chromatin - data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", - sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) - model.open = data$models +for(k in n.classes) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", + 
sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k))) + model.seq = data$models model.prob = data$prob data = NULL + + # open chromatin + model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", + sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", - sprintf("ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_%dclass_model.mat", k)))$models - # sequence - model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_classification_4", - sprintf("ctcf_motifs_10e-6_sequences_%dclass_model.mat", k)))$models + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", + sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") - # X11(width=17, height=10) - png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_4", - sprintf("ctcf_motifs_10e-6_classification_sequences_%dclass.png", k)), - units="in", res=720, width=18, height=12) - m = matrix(1:10, nrow=5, ncol=2, byrow=F) + X11(width=26, height=12) + # png(filename=file.path("results", "test_1kb", + # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + # units="in", res=720, width=18, height=12) + m = matrix(1:24, nrow=6, ncol=4, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis - x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) - x.at = (x.lab + ncol(ref.open)) / 2 + x.lab = 
seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal - x.at = seq(0, 1, 0.5) - axis(2, at=x.at, labels=x.at) + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } + # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) - idx = 380:420 + idx = (516-1-20):(516+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis - x.at = 1:length(idx) - axis(1, at=x.at, labels=x.at) + x.at = seq(1, length(idx), length.out = 3) + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + axis(1, at=x.at, labels=x.lab) # yaxis - x.at = seq(0, 2, by=1) - axis(2, at=x.at, labels=x.at) + axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } - dev.off() + # dev.off() } +m = matrix(1:24, nrow=6, ncol=4, byrow=F) +layout(m) +col=brewer.pal(3,"Set1") +for(i in 1:nrow(model.open)) +{ plot(model.open[i,]/max(model.open[i,]), type='l', lwd=2, col=col[1]) + lines(model.nucl[i,]/max(model.nucl[i,]), type='l', lwd=2, col=col[2]) +} diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh new file mode 100755 index 0000000..9850d7a --- /dev/null +++ 
b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh @@ -0,0 +1,76 @@ + +# paths +## dir +data_dir_p="data/10xgenomics_PBMC_5k_peaks" +data_dir="data/10xgenomics_PBMC_5k" +pwm_dir="data/pwm/jaspar_2018_clustering/" +hg19_dir="data/genomes" +results_dir="results/10xgenomics_PBMC_5k_peaks_classification_7" +## matrix files +file_mat_open=$data_dir_p/'peaks_rmsk_openchromatin_1kb_read_atac.mat' +file_mat_nucl=$data_dir_p/'peaks_rmsk_nucleosomes_1kb_fragment_center.mat' +file_mat_seq=$data_dir_p/'peaks_rmsk_sequences_1kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='1' +n_shift='971' +n_core=24 +## PWM files +jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" +hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" +myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" +pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" +cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" +irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" +irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" +lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" +foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" +sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" +mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" +elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" +stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" +nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" +ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" +e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" +ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" +klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat" +nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat" +egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat" +gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat" +nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat" +runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat" + +# classify +for k in 23 +do + ## results files + 
file_prob=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + + # extend models + file_mod1_ext=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model_extended.mat' + file_mod2_ext=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat' + file_mod3_ext=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model_extended.mat' + file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk.bed' + file_fasta=$hg19_dir/'hg19.fasta' + file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam' + file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai' + file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam' + file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai' + bin/ReadModelExtender --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext + bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 
500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext + bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext +done diff --git a/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R b/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R index 7377bed..6bf6201 100755 --- a/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R +++ b/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R @@ -1,138 +1,110 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) # data -data.1 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_1class_ref.mat")) -ref.1 = data.1$references +data.1 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_1class_ref.mat")) +ref.1 = data.1$models prob.1 = data.1$prob -aic.1 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_1class_aic.txt"))) +# aic.1 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_1class_aic.txt"))) data.1 = NULL -data.2 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_2class_ref.mat")) -ref.2 = data.2$references +data.2 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_2class_ref.mat")) +ref.2 = data.2$models prob.2 = data.2$prob -aic.2 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_2class_aic.txt"))) +# aic.2 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_2class_aic.txt"))) data.2 = NULL -data.3 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_3class_ref.mat")) -ref.3 = data.3$references +data.3 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_3class_ref.mat")) +ref.3 = data.3$models prob.3 = data.3$prob -aic.3 = 
as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_3class_aic.txt"))) +# aic.3 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_3class_aic.txt"))) data.3 = NULL -data.4 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_4class_ref.mat")) -ref.4 = data.4$references +data.4 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_4class_ref.mat")) +ref.4 = data.4$models prob.4 = data.4$prob -aic.4 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_4class_aic.txt"))) +# aic.4 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_4class_aic.txt"))) data.4 = NULL -data.5 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_5class_ref.mat")) -ref.5 = data.5$references +data.5 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_5class_ref.mat")) +ref.5 = data.5$models prob.5 = data.5$prob -aic.5 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_5class_aic.txt"))) +# aic.5 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_5class_aic.txt"))) data.5 = NULL -data.6 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_6class_ref.mat")) -ref.6 = data.6$references +data.6 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_6class_ref.mat")) +ref.6 = data.6$models prob.6 = data.6$prob -aic.6 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_6class_aic.txt"))) +# aic.6 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_6class_aic.txt"))) data.6 = NULL -data.7 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_7class_ref.mat")) -ref.7 = data.7$references +data.7 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_7class_ref.mat")) +ref.7 = data.7$models prob.7 = data.7$prob 
-aic.7 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_7class_aic.txt"))) +# aic.7 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_7class_aic.txt"))) data.7 = NULL -data.8 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_8class_ref.mat")) -ref.8 = data.8$references +data.8 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_8class_ref.mat")) +ref.8 = data.8$models prob.8 = data.8$prob -aic.8 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_8class_aic.txt"))) +# aic.8 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_8class_aic.txt"))) data.8 = NULL -data.9 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_9class_ref.mat")) -ref.9 = data.9$references +data.9 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_9class_ref.mat")) +ref.9 = data.9$models prob.9 = data.9$prob -aic.9 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_9class_aic.txt"))) +# aic.9 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_9class_aic.txt"))) data.9 = NULL -data.10 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_10class_ref.mat")) -ref.10 = data.10$references +data.10 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_10class_ref.mat")) +ref.10 = data.10$models prob.10 = data.10$prob -aic.10 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_10class_aic.txt"))) +# aic.10 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_10class_aic.txt"))) data.10 = NULL -data.11 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_11class_ref.mat")) -ref.11 = data.11$references -prob.11 = data.11$prob -aic.11 = as.matrix(read.table(file.path("results", "bulk_sequencing", 
"ctcf_dnase_k562_11class_aic.txt"))) -data.11 = NULL - -data.12 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_12class_ref.mat")) -ref.12 = data.12$references -prob.12 = data.12$prob -aic.12 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_12class_aic.txt"))) -data.12 = NULL - -data.13 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_13class_ref.mat")) -ref.13 = data.13$references -prob.13 = data.13$prob -aic.13 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_13class_aic.txt"))) -data.13 = NULL - -data.14 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_14class_ref.mat")) -ref.14 = data.14$references -prob.14 = data.14$prob -aic.14 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_14class_aic.txt"))) -data.14 = NULL - -data.15 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_15class_ref.mat")) -ref.15 = data.15$references -prob.15 = data.15$prob -aic.15 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_15class_aic.txt"))) -data.15 = NULL - -ref = list(ref.15, ref.14, ref.13, ref.12, ref.11, ref.10, ref.9, ref.8, ref.7, ref.6, ref.5, ref.4, ref.3, ref.2, ref.1) -prob = list(prob.15, prob.14, prob.13, prob.12, prob.11, prob.10, prob.9, prob.8, prob.7, prob.6, prob.5, prob.4, prob.3, prob.2,prob.1) -aic = c(aic.15, aic.14, aic.13, aic.12, aic.11, aic.10, aic.9, aic.8, aic.7, aic.6, aic.5, aic.4, aic.3, aic.2, aic.1) +ref = list(ref.10, ref.9, ref.8, ref.7, ref.6, ref.5, ref.4, ref.3, ref.2, ref.1) +prob = list(prob.10, prob.9, prob.8, prob.7, prob.6, prob.5, prob.4, prob.3, prob.2,prob.1) +# aic = c(aic.15, aic.14, aic.13, aic.12, aic.11, aic.10, aic.9, aic.8, aic.7, aic.6, aic.5, aic.4, aic.3, aic.2, aic.1) +aic = rep(0, length(ref)) # number of runs n_run = length(ref) # number of different classes overall n_class_tot = 
sum(unlist(lapply(ref, nrow))) # max value of K n_class_max = max(unlist(lapply(ref, nrow))) # some colors colors = rep(brewer.pal(9, "Set1")[1], n_class_max) # construct a matrix with all discovered references on the rows references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]])) run_value = vector(length=n_class_tot) k_value = vector(length=n_class_tot) probabilities = vector(length=n_class_tot) k = 1 for(i in 1:n_run) { for(j in 1:nrow(ref[[i]])) { references[k,] = ref[[i]][j,] probabilities[k] = prob[[i]][j] run_value[k] = i k_value[k] = j k = k + 1 } } # distance matrix between all references -distances = distance.ref(references) +distances = distance.model(references) rownames(distances) = 1:nrow(distances) colnames(distances) = 1:ncol(distances) + plot.references(file.path("results","bulk_sequencing", "ctcf_dnase.png"), references, probabilities, colors, aic, distances, n_run, run_value, n_class_max) diff --git a/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R b/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R index 20bc1dd..8c2c613 100755 --- a/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R +++ b/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R @@ -1,138 +1,108 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) # data -data.1 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_1class_ref.mat")) -ref.1 = data.1$references +data.1 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_1class_ref.mat")) +ref.1 = data.1$models prob.1 = data.1$prob -aic.1 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_1class_aic.txt"))) +# aic.1 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_1class_aic.txt"))) data.1 = NULL -data.2 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_2class_ref.mat")) -ref.2 = 
data.2$references +data.2 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_2class_ref.mat")) +ref.2 = data.2$models prob.2 = data.2$prob -aic.2 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_2class_aic.txt"))) +# aic.2 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_2class_aic.txt"))) data.2 = NULL -data.3 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_3class_ref.mat")) -ref.3 = data.3$references +data.3 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_3class_ref.mat")) +ref.3 = data.3$models prob.3 = data.3$prob -aic.3 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_3class_aic.txt"))) +# aic.3 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_3class_aic.txt"))) data.3 = NULL -data.4 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_4class_ref.mat")) -ref.4 = data.4$references +data.4 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_4class_ref.mat")) +ref.4 = data.4$models prob.4 = data.4$prob -aic.4 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_4class_aic.txt"))) +# aic.4 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_4class_aic.txt"))) data.4 = NULL -data.5 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_5class_ref.mat")) -ref.5 = data.5$references +data.5 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_5class_ref.mat")) +ref.5 = data.5$models prob.5 = data.5$prob -aic.5 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_5class_aic.txt"))) +# aic.5 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_5class_aic.txt"))) data.5 = NULL -data.6 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_6class_ref.mat")) 
-ref.6 = data.6$references +data.6 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_6class_ref.mat")) +ref.6 = data.6$models prob.6 = data.6$prob -aic.6 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_6class_aic.txt"))) +# aic.6 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_6class_aic.txt"))) data.6 = NULL -data.7 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_7class_ref.mat")) -ref.7 = data.7$references +data.7 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_7class_ref.mat")) +ref.7 = data.7$models prob.7 = data.7$prob -aic.7 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_7class_aic.txt"))) +# aic.7 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_7class_aic.txt"))) data.7 = NULL -data.8 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_8class_ref.mat")) -ref.8 = data.8$references +data.8 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_8class_ref.mat")) +ref.8 = data.8$models prob.8 = data.8$prob -aic.8 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_8class_aic.txt"))) +# aic.8 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_8class_aic.txt"))) data.8 = NULL -data.9 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_9class_ref.mat")) -ref.9 = data.9$references +data.9 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_9class_ref.mat")) +ref.9 = data.9$models prob.9 = data.9$prob -aic.9 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_9class_aic.txt"))) +# aic.9 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_9class_aic.txt"))) data.9 = NULL -data.10 = read.references(file.path("results", "bulk_sequencing", 
"ctcf_mnase_k562_10class_ref.mat")) -ref.10 = data.10$references +data.10 = read.read.models(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_10class_ref.mat")) +ref.10 = data.10$models prob.10 = data.10$prob -aic.10 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_10class_aic.txt"))) +# aic.10 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_10class_aic.txt"))) data.10 = NULL -data.11 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_11class_ref.mat")) -ref.11 = data.11$references -prob.11 = data.11$prob -aic.11 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_11class_aic.txt"))) -data.11 = NULL - -data.12 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_12class_ref.mat")) -ref.12 = data.12$references -prob.12 = data.12$prob -aic.12 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_12class_aic.txt"))) -data.12 = NULL - -data.13 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_13class_ref.mat")) -ref.13 = data.13$references -prob.13 = data.13$prob -aic.13 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_13class_aic.txt"))) -data.13 = NULL - -data.14 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_14class_ref.mat")) -ref.14 = data.14$references -prob.14 = data.14$prob -aic.14 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_14class_aic.txt"))) -data.14 = NULL - -data.15 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_15class_ref.mat")) -ref.15 = data.15$references -prob.15 = data.15$prob -aic.15 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_15class_aic.txt"))) -data.15 = NULL - -ref = list(ref.15, ref.14, ref.13, ref.12, ref.11, ref.10, ref.9, ref.8, ref.7, ref.6, ref.5, ref.4, ref.3, ref.2, ref.1) -prob = 
list(prob.15, prob.14, prob.13, prob.12, prob.11, prob.10, prob.9, prob.8, prob.7, prob.6, prob.5, prob.4, prob.3, prob.2,prob.1) -aic = c(aic.15, aic.14, aic.13, aic.12, aic.11, aic.10, aic.9, aic.8, aic.7, aic.6, aic.5, aic.4, aic.3, aic.2, aic.1) +ref = list(ref.10, ref.9, ref.8, ref.7, ref.6, ref.5, ref.4, ref.3, ref.2, ref.1) +prob = list(prob.10, prob.9, prob.8, prob.7, prob.6, prob.5, prob.4, prob.3, prob.2, prob.1) +# aic = c(aic.15, aic.14, aic.13, aic.12, aic.11, aic.10, aic.9, aic.8, aic.7, aic.6, aic.5, aic.4, aic.3, aic.2, aic.1) +aic = rep(0, length(ref)) # number of runs n_run = length(ref) # number of different classes overall n_class_tot = sum(unlist(lapply(ref, nrow))) # max value of K n_class_max = max(unlist(lapply(ref, nrow))) # some colors colors = rep(brewer.pal(9, "Set1")[2], n_class_max) # construct a matrix with all discovered references on the rows references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]])) run_value = vector(length=n_class_tot) k_value = vector(length=n_class_tot) probabilities = vector(length=n_class_tot) k = 1 for(i in 1:n_run) { for(j in 1:nrow(ref[[i]])) { references[k,] = ref[[i]][j,] probabilities[k] = prob[[i]][j] run_value[k] = i k_value[k] = j k = k + 1 } } # distance matrix between all references -distances = distance.ref(references) +distances = distance.model(references) rownames(distances) = 1:nrow(distances) colnames(distances) = 1:ncol(distances) - plot.references(file.path("results","bulk_sequencing", "ctcf_mnase.png"), references, probabilities, colors, aic, distances, n_run, run_value, n_class_max) diff --git a/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh b/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh index 4414100..8d5d94b 100755 --- a/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh +++ b/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh @@ -1,23 +1,22 @@ results_dir='results/bulk_sequencing' data_dir='data/bulk_sequencing/' mkdir -p $results_dir 
-file_mnase=$data_dir'/ctcf_dnase_k562.mat' +file_dnase=$data_dir'/ctcf_dnase_k562.mat' file_seed=$results_dir'/ctcf_dnase_k562_seed.txt' n_iter='20' n_shift='21' -seeding='random' -n_core=5 +n_core=6 -for k in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_dnase_k562_'$k'class_prob.mat4d' file_ref=$results_dir/'ctcf_dnase_k562_'$k'class_ref.mat' file_aic=$results_dir/'ctcf_dnase_k562_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --data $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob - bin/probToRef --data $file_mnase --prob $file_prob --parallel $n_core 1> $file_ref 2> $file_aic + bin/EMRead --read $file_dnase --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_dnase --prob $file_prob --thread $n_core 1> $file_ref 2> $file_aic done diff --git a/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh b/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh index 29779c0..1c355a7 100755 --- a/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh +++ b/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh @@ -1,23 +1,22 @@ results_dir='results/bulk_sequencing' data_dir='data/bulk_sequencing/' mkdir -p $results_dir file_mnase=$data_dir'/ctcf_mnase_k562.mat' file_seed=$results_dir'/ctcf_mnase_k562_seed.txt' n_iter='20' n_shift='21' -seeding='random' -n_core=5 +n_core=6 -for k in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_mnase_k562_'$k'class_prob.mat4d' file_ref=$results_dir/'ctcf_mnase_k562_'$k'class_ref.mat' file_aic=$results_dir/'ctcf_mnase_k562_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --data $file_mnase --class $k --shift 
$n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob - bin/probToRef --data $file_mnase --prob $file_prob --parallel $n_core 1> $file_ref 2> $file_aic + bin/EMRead --read $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mnase --prob $file_prob --thread $n_core 1> $file_ref 2> $file_aic done diff --git a/scripts/functions.R b/scripts/functions.R index acfa474..0345e93 100644 --- a/scripts/functions.R +++ b/scripts/functions.R @@ -1,696 +1,422 @@ #' Reads a read density model file and returns a list #' with the class models and the associated #' class probabilities. #' \param file the path to the file of interest. #' \return a list of two elements : "models" #' a matrix with the class models on each row #' and "prob" the associated class probabilities. #' read.read.models = function(file) { mod = as.matrix(read.table(file), drop=F) prob = mod[,1] mod = mod[,-1, drop=F] rownames(mod) = paste("class", 1:nrow(mod)) colnames(mod) = 1:ncol(mod) return(list(models=mod, prob=prob)) } #' Reads a sequence model file and returns a list #' with the class models and the associated #' class probabilities. #' \param file the path to the file of interest. #' \return a list of two elements : "models" #' an array containing the models as probability #' matrices with the following dimensions : #' 1) 4 for A,C,G,T #' 2) the model length #' 3) the numbler of classes #' and "prob" the associated class probabilities. 
#' read.sequence.models = function(file) { data = as.matrix(read.table(file.path(file))) - prob = unique(data[,1]) + # prob = unique(data[,1]) + prob = data[,1][rep(c(T,F,F,F), rep=nrow(data)/4)] n_class = length(prob) l_model = ncol(data) - 1 n_row = 4 models = array(dim=c(n_row, l_model, n_class)) dimnames(models)[[1]] = c('A', 'C', 'G', 'T') dimnames(models)[[2]] = 1:l_model dimnames(models)[[3]] = paste("class" , 1:n_class) i_from = 1 i_to = i_from + n_row - 1 for(k in 1:n_class) { models[,,k] = data[i_from:i_to,-1] i_from = i_to + 1 i_to = i_from + n_row - 1 } return(list(models=models, prob=prob)) } #' Computes the reverse complement of a #' DNA motif. #' \param the motif of interest with #' A,C,G,T on the rows and the positions #' on the columns. #' \return the reverse complement motif. #' \author Romain Groux reverse.complement = function(motif) { n.row = nrow(motif) n.col = ncol(motif) motif.rev = matrix(nrow=n.row, ncol=n.col) for(i in 1:n.row) { for(j in 1:n.col) { i_rev = n.row - i + 1 j_rev = n.col - j + 1 motif.rev[i_rev,j_rev] = motif[i,j] } } return(motif.rev) } #' Computes the Kullback-Leibler #' divergence of a given distristribution #' x to its corresponding uniform #' counterpart. #' For instance c(0.7, 0.1, 0.1, 0.1) #' will be compared to #' c(0.25, 0.25, 0.25, 0.25) #' \param x a vector containing the #' probability mass function values of #' the distribution for all possible #' values. #' \return the Kullback-Leibler #' divergence kl.divergence = function(x) { kl = 0 p0 = 1 / length(x) for(i in x) { kl = kl + (i * log(i/p0)) } return(kl) } #' A function to plot a DNA logo of a letter probability #' matrix (pwm). In essence, it does exactly the same #' as seqLogo::seqLogo except that it does not need #' a new display device on its own. #' \param pwm the letter probability matrix. #' \param path.a the path to a file containing #' the image to display for the A character, #' in PNG format. 
#' \param path.c the path to a file containing #' the image to display for the C character, #' in PNG format. #' \param path.g the path to a file containing #' the image to display for the G character, #' in PNG format. #' \param path.t the path to a file containing #' the image to display for the T character, #' in PNG format. #' \param pseudocounts a pseudocounts to add to #' the probabilities to avoid 0's. #' \param ... additional plotting parameters for #' plot(). #' \author Romain Groux plot.logo = function(pwm, path.a, path.c, path.g, path.t, pseudocounts=10e-10, ...) { n.row = 4 n.col = ncol(pwm) if(nrow(pwm) != n.row) { stop("Error! pwm should have 4 rows!") } if(length(dim(pwm)) != 2) { stop("Error! pwm should be a matrix!") } # images for nucleotides require(png) image.a = readPNG(path.a) image.c = readPNG(path.c) image.g = readPNG(path.g) image.t = readPNG(path.t) # add pseudo-counts to avoid 0's pwm = pwm + pseudocounts for(j in 1:n.col) { pwm[,j] = pwm[,j] / sum(pwm[,j]) } # entropy h = rep(0, n.col) for(j in 1:n.col) { for(i in 1:n.row) { h[j] = h[j] - pwm[i,j] * log2(pwm[i,j]) } } # information content r = -h + log2(4) # height heights = matrix(nrow=n.row, ncol=n.col, data=0) for(i in 1:n.row) { for(j in 1:n.col) { heights[i,j] = pwm[i,j] * r[j] } } # compute coordinates x.coord = matrix(nrow=2, ncol=n.col, data=0) rownames(x.coord) = c("from", "to") for(i in 1:n.col) { x.coord[1,i] = i - 0.5 x.coord[2,i] = i + 0.5 } # plot x.lim = c(1,n.col) y.lim = c(0,2) x.at = 1:n.col plot(0, 0, col=0, xlim=x.lim, ylim=y.lim, bty='n', xaxt='n', yaxt='n', xlab="", ylab="", ...) 
# axis(1, at=x.at, labels=x.at) for(j in 1:n.col) { # highest at top ord = order(heights[,j], decreasing=F) x_left = x.coord[1,j] x_right = x.coord[2,j] y_curr = 0 for(i in ord) { height = heights[i,j] y_bottom = y_curr y_top = y_bottom + height if(i == 1) { rasterImage(image.a, x_left, y_bottom, x_right, y_top) } if(i == 2) { rasterImage(image.c, x_left, y_bottom, x_right, y_top) } if(i == 3) { rasterImage(image.g, x_left, y_bottom, x_right, y_top) } if(i == 4) { rasterImage(image.t, x_left, y_bottom, x_right, y_top) } y_curr = y_curr + height } } } #' Compute the euclidean distance between two models. #' It also check if a reference is in reverse orientation #' and returns the smallest distance value. #' \param ref1 a vector containing the first reference. #' \param ref2 a vector containing the second reference. #' \return the euclidean distance. eucl.dist.models = function(mod1, mod2) { return(min(sqrt(sum(((mod1 - mod2 ) ^ 2))), sqrt(sum(((mod1 - rev(mod2)) ^ 2))))) } #' Compute the correlation distance between two models. #' It also check if a reference is in reverse orientation #' and returns the smallest distance value. #' \param ref1 a vector containing the first reference. #' \param ref2 a vector containing the second reference. #' \return the euclidean distance. cor.dist.models= function(mod1, mod2) { return(1 - min(cor(mod1, mod2 ), cor(mod1, rev(mod2)))) } #' Computes the (eucliden) distance matrix for all the given #' the models As some models may be in reverse #' orientation compared to others, the distance in both #' orientation is computed, for each pair, and the best is #' returned. #' \param models a matrix with the models on each row. #' \return a matrix containing the distances between each reference. 
distance.model = function(models) { n = nrow(models) d = matrix(nrow=n, ncol=n, data=0) for(i in 1:n) { for(j in 1:i) { x = eucl.dist.models(models[i,], models[j,]) d[i,j] = x d[j,i] = x } } return(d) } get_matches = function(distances, run_value) { matches = matrix(nrow=0, ncol=4) # references of run i on the row -> y coord # references of run j on the col -> x coord # run labels run_i = 1 # run_j = 2 for(run_j in setdiff(unique(run_value), run_i)) { # number of references in each run n_i = length(which(run_value == run_i)) n_j = length(which(run_value == run_j)) index_i = which(run_value == run_i) # rows of run i index_j = which(run_value == run_j) # columns of run j i_taken = c() # classes of i already matched -> rows to ignore j_taken = c() # classes of j already matched -> columns to ignore # while not all classes in j have been assigned a best match row_n = 1 while(length(j_taken) < n_j) { if(length(i_taken) == 0 && length(j_taken) == 0) { distances_tmp = distances[index_i, index_j, drop=F] coord = which(distances_tmp == min(distances_tmp), arr.ind=T) coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) coord = c(coord_i, coord_j) } else { rows = setdiff(index_i, i_taken) cols = setdiff(index_j, j_taken) distances_tmp = distances[rows, cols, drop=F] coord = which(distances_tmp == min(distances_tmp), arr.ind=T) coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) coord = c(coord_i, coord_j) } coord = c(coord, row_n, run_j) i_taken = c(i_taken, coord[1]) j_taken = c(j_taken, coord[2]) matches = rbind(matches, coord) row_n = row_n + 1 } } return(matches) } #'Creates a composite figure in which several class references from #'several partitions, with different numbers of classes, are plotted. #'The figure is composed of a matrix of rows and #'columns where is the highest number of classes in all #'partitions and the number of different partition. 
T #'The first column will contain the references of the #'partition with classes. The next columns will contain the #'references of the partition with the second biggest number of #'classes (and so on). In a given column, except the 1st one, #'the references are ordered (over the rows) such that the #'overall similarity (euclidean distance) with the 1st column #'references are maximized. #'\param file the file name where the image will be saved. #'\param references a matrix with the different references to draw on #'each row. #'\param references a vector containing the class probability (or weight) associated #'to each corresponding reference (row) in matrix. #'\param probabilities a vector of values that will be displayed atop of each #'column of plots. #'\param colors a vector of colors to draw the class profiles. There should #'be colors, they can be the same. #'\param distances a distance matrix containing the distance between all #'references. The row and column labels have to be the row and column #'number (1, 2, 3, ...)! #'\param n_run the total number of different partitions to which all #'references belong. #'\param run_value a vector indicating to which partition each reference #'(row of references) belong to. 
It should be a simple vector of integers, #'for instance 1,1,1,1,2,2,2,3,3 #'\param n_class_max, the highest number of classes searches in all partitions () plot.references = function(file, references, probabilities, colors, col.titles, distances, n_run, run_value, n_class_max, width=15, height=18) { # compute the best matches between all references to 1st run references matches = get_matches(distances, run_value) # make a matrix for layout with good plot numbers plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles) plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab) plots.lab[-1,1] = 1:n_class_max # for run with max number of classes z = n_class_max + 1 for(i in 1:nrow(matches)) { coord = matches[i,] # plots.lab[coord[3], coord[4]] = z plots.lab[coord[1]+1, coord[4]] = z z = z + 1 } # these will be the empty plots for(i in 1:nrow(plots.lab)) { for(j in 1:ncol(plots.lab)) { if(is.na(plots.lab[i,j])) { plots.lab[i,j] = z z = z + 1 } } } # plot - png(filename=file, width=width, height=height, unit="in", res=720) + if(!is.null(file)) + { png(filename=file, width=width, height=height, unit="in", res=720) } + else + { X11(width=width, height=height) } # a grid m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) ) layout.show(m) x = 1:ncol(references) # plot references of partition with highest number of classes for(i in 1:n_class_max) { plot(x=x, y=references[i,], lwd=2, type='l', ylim=c(0, 1.2*max(references[i,])), col=colors[i], main="", xlab="pos [bp]", ylab="Nb reads") # prob x_ = 0.85*length(references[i,]) y_ = max(references[i,]) lab = round(probabilities[i],3) text(x=x_, y=y_, labels=lab, cex=1.2) } # plot others for(i in 1:nrow(matches)) { ref_index = matches[i,2] col_index = matches[i,3] plot(x=x, y=references[ref_index,], lwd=2, type='l', ylim=c(0, 1.2*max(references[ref_index,])), col=colors[col_index], main="", xlab="pos [bp]", ylab="Nb reads") # prob x_ = 
0.85*length(references[ref_index,]) y_ = max(references[ref_index,]) lab = round(probabilities[ref_index],3) text(x=x_, y=y_, labels=lab, cex=1.2) } # empty plots for(i in (length(run_value)+1):(n_run*n_class_max)) { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") } # col titles p = par(mar=c(0,0,0,0)) for(i in 1:length(col.titles)) { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") text(1,1, labels=col.titles[i], cex=2) } par(p) - dev.off() -} - - - - -plot.references.2 = function(file, - references, - probabilities, - colors, - col.titles, - distances, - n_run, - run_value, - n_class_max, - width=15, - height=18) -{ - # compute the best matches between all references to 1st run references - matches = get_matches(distances, run_value) - - # make a matrix for layout with good plot numbers - plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles) - plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab) - plots.lab[-1,1] = 1:n_class_max # for run with max number of classes - z = n_class_max + 1 - for(i in 1:nrow(matches)) - { coord = matches[i,] - # plots.lab[coord[3], coord[4]] = z - plots.lab[coord[1]+1, coord[4]] = z - z = z + 1 - } - # these will be the empty plots - for(i in 1:nrow(plots.lab)) - { for(j in 1:ncol(plots.lab)) - { if(is.na(plots.lab[i,j])) - { plots.lab[i,j] = z - z = z + 1 - } - } - } - - # plot - if(is.null(file)) - { X11(width=width, height=height) } - else - { png(filename=file, width=width, height=height, unit="in", res=720) } - # a grid - m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) ) - # layout.show(m) - x = 1:ncol(references[[1]]) - - # plot references of partition with highest number of classes - for(i in 1:n_class_max) - { for(j in 1:length(references)) - { - ylim = c(0, 1.2) - if(j == 1) - { plot(x=x, y=references[[j]][i,]/max(references[[j]][i,]), - lwd=2, type='l', ylim=ylim, - 
col=colors[j], main="", xlab="pos [bp]", ylab="Nb reads") - } - else - { lines(x=x, y=references[[j]][i,]/max(references[[j]][i,]), - lwd=2, type='l', col=colors[j]) - } - } - - # prob - x_ = 0.85*length(references[[1]][i,]) - # y_ = max(references[[1]][i,]) - y_ = 0.85 - lab = round(probabilities[i],3) - text(x=x_, y=y_, labels=lab, cex=1.2) - } - - # plot others - for(i in 1:nrow(matches)) - { ref_index = matches[i,2] - col_index = matches[i,3] - for(j in 1:length(references)) - { ylim = c(0, 1.2) - if(j == 1) - { plot(x=x, y=references[[j]][ref_index,]/max(references[[j]][ref_index,]), - lwd=2, type='l', ylim=ylim, - col=colors[j], main="", xlab="pos [bp]", ylab="Nb reads") - } - else - { lines(x=x, y=references[[j]][ref_index,]/max(references[[j]][ref_index,]), - lwd=2, col=colors[j]) - } - } - # prob - x_ = 0.85*length(references[[1]][ref_index,]) - # y_ = max(references[[1]][ref_index,]) - y_ = 0.85 - lab = round(probabilities[ref_index],3) - text(x=x_, y=y_, labels=lab, cex=1.2) - } - - # empty plots - for(i in (length(run_value)+1):(n_run*n_class_max)) - { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") } - - # col titles - p = par(mar=c(0,0,0,0)) - for(i in 1:length(col.titles)) - { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") - text(1,1, labels=col.titles[i], cex=2) - } - par(p) - if(!is.null(file)) - { dev.off() } -} - - -plot.references.3 = function(file, - references, - probabilities, - colors, - col.titles, - distances, - n_run, - run_value, - n_class_max, - width=15, - height=18) -{ - # compute the best matches between all references to 1st run references - matches = get_matches(distances, run_value) - - # make a matrix for layout with good plot numbers - plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles) - plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab) - plots.lab[-1,1] = 1:n_class_max # for run with max 
number of classes - z = n_class_max + 1 - for(i in 1:nrow(matches)) - { coord = matches[i,] - # plots.lab[coord[3], coord[4]] = z - plots.lab[coord[1]+1, coord[4]] = z - z = z + 1 - } - # these will be the empty plots - for(i in 1:nrow(plots.lab)) - { for(j in 1:ncol(plots.lab)) - { if(is.na(plots.lab[i,j])) - { plots.lab[i,j] = z - z = z + 1 - } - } - } - - # plot - if(is.null(file)) - { X11(width=width, height=height) } - else - { png(filename=file, width=width, height=height, unit="in", res=720) } - - p = par(mar=c(0,0,0,0)) - - # a grid - m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) ) - # layout.show(m) - x = 1:ncol(references[[1]]) - - # plot references of partition with highest number of classes - for(i in 1:n_class_max) - { for(j in 1:length(references)) - { - ylim = c(0, 1.2) - if(j == 1) - { plot(x=x, y=references[[j]][i,]/max(references[[j]][i,]), - lwd=2, type='l', ylim=ylim, - col=colors[j], main='', xlab='', ylab='', - xaxt='n', yaxt='n') - } - else - { lines(x=x, y=references[[j]][i,]/max(references[[j]][i,]), - lwd=2, type='l', col=colors[j]) - } - } - - # prob - x_ = 0.85*length(references[[1]][i,]) - # y_ = max(references[[1]][i,]) - y_ = 0.85 - lab = round(probabilities[i],3) - text(x=x_, y=y_, labels=lab, cex=1.2) - } - - # plot others - for(i in 1:nrow(matches)) - { ref_index = matches[i,2] - col_index = matches[i,3] - for(j in 1:length(references)) - { ylim = c(0, 1.2) - if(j == 1) - { plot(x=x, y=references[[j]][ref_index,]/max(references[[j]][ref_index,]), - lwd=2, type='l', ylim=ylim, - col=colors[j], main='', xlab='', ylab='', - xaxt='n', yaxt='n') - } - else - { lines(x=x, y=references[[j]][ref_index,]/max(references[[j]][ref_index,]), - lwd=2, col=colors[j]) - } - } - # prob - x_ = 0.85*length(references[[1]][ref_index,]) - # y_ = max(references[[1]][ref_index,]) - y_ = 0.85 - lab = round(probabilities[ref_index],3) - text(x=x_, y=y_, labels=lab, cex=1.2) - } - - # empty plots - for(i in 
(length(run_value)+1):(n_run*n_class_max)) - { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") } - - # col titles - for(i in 1:length(col.titles)) - { plot(1,1, xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") - text(1,1, labels=col.titles[i], cex=2) - } - par(p) if(!is.null(file)) { dev.off() } } - - -plot.references.4 = function(file, - references, - probabilities, - colors, - width=15, - height=18) -{ - n_class = nrow(references[[1]]) - n_col = ncol(references[[1]]) - mat = matrix(nrow=n_class, ncol=1, data=1:n_class) - - # plot - if(is.null(file)) - { X11(width=width, height=height) } - else - { png(filename=file, width=width, height=height, unit="in", res=720) } - - p = par(mar=c(0,0,0,0)) - - # a grid - m = layout(mat = mat) - # layout.show(m) - x = 1:n_col - - for(i in 1:n_class) - { for(j in 1:length(references)) - { - ylim = c(0, 1.2) - if(j == 1) - { plot(x=x, y=references[[j]][i,]/max(references[[j]][i,]), - lwd=2, type='l', ylim=ylim, - col=colors[j], main='', xlab='', ylab='', - xaxt='n', yaxt='n') - } - else - { lines(x=x, y=references[[j]][i,]/max(references[[j]][i,]), - lwd=2, type='l', col=colors[j]) - } - } - # prob - x_ = 0.85*length(references[[1]][i,]) - # y_ = max(references[[1]][i,]) - y_ = 0.85 - lab = round(probabilities[i],3) - text(x=x_, y=y_, labels=lab, cex=1.2) - } - - if(!is.null(file)) - { dev.off() } -} - diff --git a/scripts/genomes/hg19.sh b/scripts/genomes/hg19.sh new file mode 100644 index 0000000..3510350 --- /dev/null +++ b/scripts/genomes/hg19.sh @@ -0,0 +1,64 @@ +data_dir=data/genomes + +mkdir $data_dir + + +# hg19 genome from Ensembl +## NOTE the hg19 genome was downloaded on the 13 of August 2019. 
+ +## download all chromosomes +file_fa=$data_dir/'hg19.fasta' +touch $file_fa +for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y +do + file_gz=$data_dir/'hg19_chr_'$chr'.fasta.gz' + wget -O $file_gz ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.$chr.fa.gz + gzip -d -c $file_gz >> $file_fa + rm $file_gz +done + +## format sequence header to fit 'chrN' format +file_tmp=$data_dir/tmp.fasta +sed -E 's/ dna.+//' $file_fa | sed 's/>/>chr/' > $file_tmp +mv $file_tmp $file_fa + + + +# repeat masked hg19 genome (repeated elements are 'N') from Ensembl +## NOTE the hg19 repeated masked genome was downloaded on the 12 of August 2019. + +## download all chromosomes +file_fa_rm=$data_dir/'hg19_rmsk.fasta' +touch $file_fa_rm +for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y +do + file_gz=$data_dir/'hg19_chr_'$chr'_rmsk.fasta.gz' + wget -O $file_gz ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna_rm.chromosome.$chr.fa.gz + gzip -d -c $file_gz >> $file_fa_rm + rm $file_gz +done + +## format sequence header to fit 'chrN' format +file_tmp=$data_dir/tmp.fasta +sed -E 's/ dna.+//' $file_fa_rm | sed 's/>/>chr/' > $file_tmp +mv $file_tmp $file_fa_rm + + + +# repeated elements annotation from USCS +## NOTE +## data/genomes/hg19_rmsk_original.bed was downloaded from http://genome.ucsc.edu/cgi-bin/hgTables +## on the 9th of August 2019 with the following options : +## clade: mammal genome: Human assembly: hg19 +## group: repeat track: repeatMasker +## output file: hg19_rmsk_original.bed +## file type returned: gzip compressed +## "get output" button +## "get BED" button + +## sort by chromosome and position +sort -k 1,1V -k2,2n -k3,3n $data_dir/hg19_rmsk_original.bed > $data_dir/hg19_rmsk_original_sorted.bed +mv $data_dir/hg19_rmsk_original_sorted.bed $data_dir/hg19_rmsk_original.bed + +## only keep chr1/2.../M/X/Y +grep -E '^chr[0-9XYM]+\s' 
$data_dir/hg19_rmsk_original.bed > $data_dir/hg19_rmsk.bed diff --git a/scripts/install_libraries/install_libUnitTest++.sh b/scripts/install_libraries/install_libUnitTest++.sh new file mode 100644 index 0000000..dd39af5 --- /dev/null +++ b/scripts/install_libraries/install_libUnitTest++.sh @@ -0,0 +1,23 @@ +# install the boost library +library_dir='lib/UnitTest++' +lib_dir="$library_dir/lib" +include_dir="$library_dir/include" + +# download src +git clone https://github.com/unittest-cpp/unittest-cpp.git + +mkdir -p $library_dir +mkdir -p $lib_dir +mkdir -p $include_dir + +cd unittest-cpp/ + +# install +cmake3 . && make +find UnitTest++/ -name "*.cpp" -type f -delete +mv ./libUnitTest++.a ../$lib_dir/ +mv UnitTest++/* ../$include_dir/ + +# clean +cd .. +rm -rf unittest-cpp diff --git a/scripts/install_libraries/install_libboost.sh b/scripts/install_libraries/install_libboost.sh new file mode 100644 index 0000000..e8acfca --- /dev/null +++ b/scripts/install_libraries/install_libboost.sh @@ -0,0 +1,17 @@ +# install the boost library + +# download src +wget https://dl.bintray.com/boostorg/release/1.70.0/source/boost_1_70_0.tar.gz +tar -xzvf boost_1_70_0.tar.gz + +cd boost_1_70_0/ + +# build and install +mkdir -p $library_dir +./bootstrap.sh --prefix=$(pwd)/lib/boost +./b2 install link=static # program_options + +# clean +cd .. 
+rm -r boost_1_70_0 +rm boost_1_70_0.tar.gz diff --git a/scripts/install_libraries/run_all.sh b/scripts/install_libraries/run_all.sh index 6251f3d..46813b5 100644 --- a/scripts/install_libraries/run_all.sh +++ b/scripts/install_libraries/run_all.sh @@ -1,5 +1,17 @@ +# install libraries + +## C++ libraries mkdir lib/ -mkdir lib/include +scripts/install_libraries/install_libboost.sh +scripts/install_libraries/install_libSeqAn.sh +scripts/install_libraries/install_libUnitTest++.sh + +## python libraries +### make sure that pip is installed for python3.6 +# curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py +# sudo python3.6 get-pip.py --force-reinstall + +sudo pip3.6 install intervaltree +sudo pip3.6 install pysam -scripts/install_libraries/install_libStatGen.sh diff --git a/scripts/install_programs/install_bedtools.sh b/scripts/install_programs/install_bedtools.sh new file mode 100644 index 0000000..b06ab32 --- /dev/null +++ b/scripts/install_programs/install_bedtools.sh @@ -0,0 +1,15 @@ +# download +wget https://github.com/arq5x/bedtools2/releases/download/v2.28.0/bedtools-2.28.0.tar.gz +tar -zxvf bedtools-2.28.0.tar.gz +cd bedtools2 + +# compile +make + +# install +mkdir ../bin/bedtools +mv bin/* ../bin/bedtools + +# clean +cd .. 
+rm -r bedtools2 diff --git a/scripts/install_programs/install_deeptools.sh b/scripts/install_programs/install_deeptools.sh deleted file mode 100644 index 9df2c69..0000000 --- a/scripts/install_programs/install_deeptools.sh +++ /dev/null @@ -1,8 +0,0 @@ - -# make sure that pip is installed for python3.6 -# curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py -# sudo python3.6 get-pip.py --force-reinstall - -# install deeptools for python3.6 -sudo pip3.6 install deeptools - diff --git a/scripts/install_programs/run_all.sh b/scripts/install_programs/run_all.sh index fd23b47..686706e 100644 --- a/scripts/install_programs/run_all.sh +++ b/scripts/install_programs/run_all.sh @@ -1 +1,4 @@ -scripts/install_programs/install_deeptools.sh +# install programs + +scripts/install_programs/install_bedtools.sh + diff --git a/scripts/pwm/reformat_jaspar_2018.sh b/scripts/pwm/reformat_jaspar_2018.sh new file mode 100755 index 0000000..5101c91 --- /dev/null +++ b/scripts/pwm/reformat_jaspar_2018.sh @@ -0,0 +1,7 @@ +# NOTE +# +pwmtool_dir='scripts/pwm_tools' +data_dir="data/pwm/jaspar_2018" + +wget -O $data_dir/'JASPAR2018_CORE_vertebrates_non-redundant_pfms_jaspar.zip' http://jaspar.genereg.net/download/CORE/JASPAR2018_CORE_vertebrates_non-redundant_pfms_jaspar.txt +unzip -d $data_dir $data_dir/'JASPAR2018_CORE_vertebrates_non-redundant_pfms_jaspar.zip' diff --git a/scripts/pwm/reformat_jaspar_2018_clustering.sh b/scripts/pwm/reformat_jaspar_2018_clustering.sh new file mode 100755 index 0000000..efbf3c3 --- /dev/null +++ b/scripts/pwm/reformat_jaspar_2018_clustering.sh @@ -0,0 +1,9 @@ +pwmtool_dir='scripts/pwm_tools' +data_dir="data/pwm/jaspar_2018_clustering" + + +for file_tf in $(ls $data_dir/*tf) +do + file_prob=$(basename $file_tf | sed s/.tf/_prob.mat/) + python3.6 $pwmtool_dir/extract_transfac_pwm.py -i $file_tf --norm > $data_dir/$file_prob +done diff --git a/scripts/pwm_tools/extract_transfac_pwm.py b/scripts/pwm_tools/extract_transfac_pwm.py new file mode 100644 index 
0000000..2c7a189 --- /dev/null +++ b/scripts/pwm_tools/extract_transfac_pwm.py @@ -0,0 +1,70 @@ +import optparse +import os +import re + +def parse_file(file_in, norm, pseudocount): + + # matrix in vertical format + matrix_v = list() + with open(file_in) as f_in: + + # if currently reading the matrix + matrix = False + + for line in f_in: + line = line.rstrip() + # before matrix + if matrix is False: + if line.startswith("P0") or line.startswith("PO"): + matrix = True + # inside matrix + else: + # end of matrix + if re.match(r"^\d+", line) is None: + # if line.startswith("XX"): + matrix = False + break + # inside matrix + else: + values = re.split(r"\s+", line)[1:] + values = [float(x)+pseudocount for x in values] + if norm: + tot = sum(values) + values = [x/tot for x in values] + matrix_v.append(values) + # matrix in vertical format + return matrix_v + +if __name__ == "__main__": + + # parse options + usage = "usage: %s [options]" % os.path.basename(__file__) + epilog = "This program reads a transfac PWM file returns the PWM." 
\ + "Written by Romain Groux, August 2019" + parser = optparse.OptionParser(usage=usage, epilog=epilog) + parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store", + help="The addresse of the transfac PWM file.") + parser.add_option("--norm", dest="norm", action="store_true", + help="Whether the values should be normalized to probabilities.") + (options, args) = parser.parse_args() + + file_in = options.file_in + norm = options.norm + + # matrix in vertical format + matrix_v = parse_file(file_in, norm, 1) + + # matrix in horizontal format + nrow = 4 + ncol = len(matrix_v) + matrix_h = ["" for _ in range(0, nrow, 1)] + for i in range(0, nrow, 1): + for j in range(0, ncol, 1): + matrix_h[i] += "%.4f\t" % matrix_v[j][i] + matrix_h[i] = matrix_h[i][:-1] + + # print matrix in horizontal format + print('\n'.join(matrix_h)) + + + diff --git a/scripts/run_all.sh b/scripts/run_all.sh index a698e65..539db86 100755 --- a/scripts/run_all.sh +++ b/scripts/run_all.sh @@ -1,11 +1,13 @@ -# install programs +# setup environment +## install programs scripts/install_programs/run_all.sh - -# install libraries +## install libraries scripts/install_libraries/run_all.sh + + # simulate data for testing purposes scripts/generate_toy_data/run_all.sh diff --git a/scripts/test.R b/scripts/test.R new file mode 100644 index 0000000..68eb808 --- /dev/null +++ b/scripts/test.R @@ -0,0 +1,284 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +#' Converts a sequence in character format +#' to integer format A->0, C->1, N->2, G->3 +#' T->4. +#' \param seq a vector containing the sequence +#' in character format. +#' \return a vector containing the sequence +#' in integer format. 
+#' \author Romain Groux +char.to.int = function(seq) +{ seq_int = vector(length=length(seq)) + for(i in 1:length(seq)) + { if(seq[i] == 'A') { seq_int[i] = 0 } + if(seq[i] == 'C') { seq_int[i] = 1 } + if(seq[i] == 'N') { seq_int[i] = 2 } + if(seq[i] == 'G') { seq_int[i] = 3 } + if(seq[i] == 'T') { seq_int[i] = 4 } + } + return(seq_int) +} + +#' Generates the reverse complement of a kmer. +#' \param kmer a vector containing the kmer in +#' integer format. +#' \return a vector containing the reverse +#' complement kmer +#' \author Romain Groux +get_rev_compl = function(kmer) +{ kmer_rv = vector(length=length(kmer), mode="numeric") + i_rv = length(kmer) + for(i in 1:length(kmer)) + { if(kmer[i] == 0) { kmer_rv[i_rv] = 4 } # A + if(kmer[i] == 1) { kmer_rv[i_rv] = 3 } # C + if(kmer[i] == 2) { kmer_rv[i_rv] = 2 } # N + if(kmer[i] == 3) { kmer_rv[i_rv] = 1 } # G + if(kmer[i] == 4) { kmer_rv[i_rv] = 0 } # T + } + return(kmer_rv) +} + +#' Generates a hash given a kmer. +#' Kmers with a same length are guaranteed +#' to have different hashes. +#' AA..AA will generate a hash of 1, +#' AA..AC will generate a hash of 2, +#' AA..AN will generate a hash of 3, +#' AA..AG will generate a hash of 4, +#' AA..AT will generate a hash of 5, +#' TT..TG will generate a hash of 5**k - 1, +#' TT..TT will generate a hash of 5**k +#' \param seq a vector containing the kmer +#' in integer format : A->0, C->1, N->2, G->3, +#' T->4. +#' \return the kmer hash +#' \author Romain Groux +hash = function(seq) +{ k = length(seq) ; z = 5 + h = 0 + for(i in 0:(length(seq)-1)) + { if(seq[i+1] == 0) { h = h + (0*(z**(k-i-1))) } # A + if(seq[i+1] == 1) { h = h + (1*(z**(k-i-1))) } # C + if(seq[i+1] == 2) { h = h + (2*(z**(k-i-1))) } # N + if(seq[i+1] == 3) { h = h + (3*(z**(k-i-1))) } # G + if(seq[i+1] == 4) { h = h + (4*(z**(k-i-1))) } # T + } + return(h+1) +} + +#' Computes the hash of a sequence and of +#' its reverse complement and returns the +#' smallest one. 
+#' \param seq a vector containing the +#' sequence in integer format : : A->0, +#' C->1, N->2, G->3, T->4. +#' \author Romain Groux +hash.min(seq) +{ seq_r = get_rev_compl(seq) + return(min(hash(seq), hash(seq_r))) +} + +#' Generates all kmers for a given value of K +#' and return them in lexicographic order. +#' \param k the kmer length. +#' \return a matrix with the different kmers +#' on the rows and k columns. The kmers are +#' in integer format : A->0, C->1, N->2, G->3, +#' T->4. +#' \author Romain Groux +generate_all_kmers = function(k) +{ kmers = matrix(nrow=5**k, ncol=k, data=-1) + n = k + currentWord = rep(1, n) + i = 1 + while(n > 0) + { kmers[i,] = currentWord + i = i + 1 + while(n>0 && currentWord[n+1-1] == 5) + { currentWord[n] = 1 + n = n - 1 + } + if(n > 0) + { currentWord[n] = currentWord[n] + 1 + n = k + } + } + return(kmers - 1) +} + + +data = as.matrix(read.table(file.path("data", + "10xgenomics_PBMC_5k_peaks", + "peaks_rmsk_sampled_sequences_1kb.mat"))) + +data = as.matrix(read.table(file.path("data/toy_data/simulated_sequences_2class_flip.mat"))) +data = apply(data, 1, char.to.int) + +k = 5 +n_kmer = 5**k +hmax = ceiling(n_kmer / 2) +n_shift = ncol(data) - k + 1 + +# transitions and counts +counts = vector(length=n_kmer, mode="numeric") +kmers = generate_all_kmers(k) +counts = vector(length=n_kmer, mode="numeric") +t_out = matrix(nrow=n_kmer, ncol=n_kmer, data=0) +t_in = t_out +t_all = t_out +for(i in 1:nrow(data)) +{ for(j in 1:n_shift) + { # no in transition (1st kmer) + if(j == 1) + { # kmer1 < kmer2 + from1 = j ; to1 = from1 + k - 1 ; kmer1 = data[i,from1:to1] ; + from2 = j+1 ; to2 = from2 + k - 1 ; kmer2 = data[i,from2:to2] ; + kmer1r = get_rev_compl(kmer2) ; kmer2r = get_rev_compl(kmer1) ; + idx1 = hash(kmer1) ; idx1r = hash(kmer1r) ; + idx2 = hash(kmer2) ; idx2r = hash(kmer2r) ; + # out transition kmer1 -> kmer2 + t_out[idx1,idx2] = t_out[idx1,idx2] + 1 + t_out[idx1r,idx2r] = t_out[idx1r,idx2r] + 1 + # number of edges + t_all[idx1,idx2] 
= t_all[idx1,idx2] + 1 + t_all[idx2,idx1] = t_all[idx2,idx1] + 1 + t_all[idx1r,idx2r] = t_all[idx1r,idx2r] + 1 + t_all[idx2r,idx1r] = t_all[idx2r,idx1r] + 1 + # counts + counts[idx1] = counts[idx1] + 1 + counts[idx1r] = counts[idx1r] + 1 + } + # no out transition (last kmer) + else if(j == n_shift) + { # kmer1 < kmer2 + from1 = j-1 ; to1 = from1 + k - 1 ; kmer1 = data[i,from1:to1] ; + from2 = j ; to2 = from2 + k - 1 ; kmer2 = data[i,from2:to2] ; + kmer1r = get_rev_compl(kmer2) ; kmer2r = get_rev_compl(kmer1) ; + idx1 = hash(kmer1) ; idx1r = hash(kmer1r) ; + idx2 = hash(kmer2) ; idx2r = hash(kmer2r) ; + # in transition kmer1 <- kmer2 + t_in[idx1,idx2] = t_in[idx1,idx2] + 1 + t_in[idx1r,idx2r] = t_in[idx1r,idx2r] + 1 + # number of edges + t_all[idx1,idx2] = t_all[idx1,idx2] + 1 + t_all[idx2,idx1] = t_all[idx2,idx1] + 1 + t_all[idx1r,idx2r] = t_all[idx1r,idx2r] + 1 + t_all[idx2r,idx1r] = t_all[idx2r,idx1r] + 1 + # counts + # no need, kmer2 was counted at last iteration as kmer2 + } + # both out and in transitions (middle) + else + { # kmer0 < kmer1 < kmer2 + from0 = j ; to0 = from0 + k - 1 ; kmer0 = data[i,from0:to0] ; + from1 = j ; to1 = from1 + k - 1 ; kmer1 = data[i,from1:to1] ; + from2 = j+1 ; to2 = from2 + k - 1 ; kmer2 = data[i,from2:to2] ; + kmer0r = get_rev_compl(kmer2) ; kmer1r = get_rev_compl(kmer1) ; kmer2r = get_rev_compl(kmer0) ; + idx0 = hash(kmer0) ; idx0r = hash(kmer0r) ; + idx1 = hash(kmer1) ; idx1r = hash(kmer1r) ; + idx2 = hash(kmer2) ; idx2r = hash(kmer2r) ; + # out transition kmer1 -> kmer2 + t_out[idx1,idx2] = t_out[idx1,idx2] + 1 + t_out[idx1r,idx2r] = t_out[idx1r,idx2r] + 1 + # in transition kmer0 -> kmer1 + t_in[idx1,idx0] = t_in[idx1,idx0] + 1 + t_in[idx1r,idx0r] = t_in[idx1r,idx0r] + 1 + # number of edges + t_all[idx0,idx1] = t_all[idx0,idx1] + 1 + t_all[idx1,idx0] = t_all[idx1,idx0] + 1 + t_all[idx1,idx2] = t_all[idx1,idx2] + 1 + t_all[idx2,idx1] = t_all[idx2,idx1] + 1 + t_all[idx0r,idx1r] = t_all[idx0r,idx1r] + 1 + t_all[idx1r,idx0r] = 
t_all[idx1r,idx0r] + 1 + t_all[idx1r,idx2r] = t_all[idx1r,idx2r] + 1 + t_all[idx2r,idx1r] = t_all[idx2r,idx1r] + 1 + # counts + counts[idx1] = counts[idx1] + 1 + counts[idx1r] = counts[idx1r] + 1 + } + } +} + +# spectral clustering +# t_all is the affinity matrix +# compute the degree matrix +d = diag(apply(t_in, 1, sum)) # sum rows +# unormalized laplacian +u = d - t_in +# get eigen values and vectors +evL = eigen(u, symmetric=TRUE) +# plot eigen values +plot(1:20, rev(evL$values)[1:20], type='b') +# partition +partitions = list() +for(n_clust in 2:20) +{ print(n_clust) + # get K biggest eigen values and vectors -> embedding space + z = evL$vectors[,(ncol(evL$vectors)-n_clust+1):ncol(evL$vectors)] + partitions[[n_clust]] = kmeans(z, centers=n_clust, iter.max=100, nstart=10) +} + + +plot(evL$vectors[,3124:3125]) + +# motif 1 is ACGTTGCA +kmers_motif1 = matrix(ncol=k, + data=c(0,1,2,3,3, + 1,2,3,3,2, + 2,3,3,2,1, + 3,3,2,1,0), + byrow=T) +# motif 2 is GCGAATTT +kmers_motif2 = matrix(ncol=k, + data=c(2,1,2,0,0, + 1,3,0,0,3, + 3,0,0,3,3, + 0,0,3,3,3), + byrow=T) +idx1 = apply(kmers_motif1, 1, hash) +idx2 = apply(kmers_motif2, 1, hash) + +partitions[[2]]$size + +partitions[[2]]$cluster[idx1] +partitions[[2]]$cluster[idx2] + + + + +c1 = which(partitions[[2]]$cluster == 1) +c2 = which(partitions[[2]]$cluster == 2) + +plot(evL$vectors[,3124:3125], col=partitions[[2]]$cluster+1, cex=0.1) + +points(evL$vectors[idx1,3124:3125], col=2) +points(evL$vectors[idx2,3124:3125], col=3) + +par(mfrow=c(3,1)) +plot(t_all[idx1[1],], type='l', ylim=c(0,50)) ; abline(v=idx1, col="red", lwd=0.2) ; abline(v=idx1[1], col="blue") +plot(t_all[idx1[2],], type='l', ylim=c(0,50)) ; abline(v=idx1, col="red", lwd=0.2) +plot(t_all[idx1[3],], type='l', ylim=c(0,50)) ; abline(v=idx1, col="red", lwd=0.2) + + +boxplot(counts, counts[idx1], counts[idx2], outline=F) + + + + +# reconstruct kmers +best.k = 2 +partition = partitions[[best.k]] +clusters = partition$cluster +c1 = which(clusters == 1) +c2 = 
which(clusters == 2) +best1 = which.max(counts[c1]) +best2 = which.max(counts[c2]) + + diff --git a/scripts/test/analysis_test_sampled.R b/scripts/test/analysis_test_sampled.R new file mode 100644 index 0000000..afcf023 --- /dev/null +++ b/scripts/test/analysis_test_sampled.R @@ -0,0 +1,97 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(seqLogo) + +# functions +source(file.path("scripts", "functions.R")) + +# the number of classes searched +n.classes = c(10, 20, 30) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around ctcf motifs ################## + +for(k in n.classes) +{ + # sequence + data = read.sequence.models(file.path("results", "test_1kb", + sprintf("peaks_rmsk_sampled_sequences_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + data = NULL + + # open chromatin + model.open = read.read.models(file.path("results", "test_1kb", + sprintf("peaks_rmsk_sampled_openchromatin_%dclass_model.mat", k)))$models + # nucleosomes + model.nucl = read.read.models(file.path("results", "test_1kb", + sprintf("peaks_rmsk_sampled_nucleosomes_%dclass_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=26, height=12) + png(filename=file.path("results", "test_1kb", + sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:30, nrow=6, ncol=5, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d 
(p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + # inlets with center + # row_n = 1 # row counter + # col_n = 1 # column counter + # for(i in 1:nrow(ref.open)) + # { # plot logo center + # right = 0.5*col_n - 0.01 + # left = right - 0.2 + # bottom = 1-(row_n*(0.2))+0.05 + # top = bottom + 0.15 + # par(fig=c(left, right, bottom, top), new=T) + # idx = (391-1-20):(391+1+20) + # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # axis(1, at=x.at, labels=x.lab) + # # yaxis + # axis(2, at=y.at, labels=y.lab) + # row_n = row_n + 1 + # if(i %% 5 == 0) + # { col_n = col_n + 1 + # row_n = 1 + # } + # } + dev.off() +} + diff --git a/scripts/test/test.sh b/scripts/test/test.sh deleted file mode 100644 index c46b711..0000000 --- a/scripts/test/test.sh +++ /dev/null @@ -1,32 +0,0 @@ -# some paths -## directories -results_dir='results/test' -data_dir='results/10xgenomics_PBMC_5k' -## input -file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" -file_mat_seq="test.mat" -file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" -## file with seeds -file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' - -mkdir -p 
$results_dir -touch $file_seed - -# parameters -n_iter='20' -n_shift='21' -seeding='random' -n_core=1 - -# open chromatin and sequence -for k in 10 -do - seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) - file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_sequence_'$k'class_prob.mat4d' - file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' - file_mod2=$results_dir/'ctcf_motifs_10e-6_sequence_'$k'class_model.mat' - echo "$file_prob $seed" >> $file_seed - bin/ChIPPartitioning --read $file_mat_open,$file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --thread $n_core > $file_prob - bin/probToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/probToModel --read $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod2 -done diff --git a/scripts/test/test_1kb.sh b/scripts/test/test_1kb.sh new file mode 100755 index 0000000..3bc3541 --- /dev/null +++ b/scripts/test/test_1kb.sh @@ -0,0 +1,35 @@ + +# paths +## dir +data_dir="results/10xgenomics_PBMC_5k" +results_dir="results/test_1kb" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_open_bin1bp_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_nucleosomes_bin1bp_1kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sequences_1kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='951' +n_core=12 + +# classify +for k in 10 20 30 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sequences_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_openchromatin_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_nucleosomes_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sequences_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq 
--class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/test/test_1kb_pwms.sh b/scripts/test/test_1kb_pwms.sh new file mode 100755 index 0000000..f7c51f8 --- /dev/null +++ b/scripts/test/test_1kb_pwms.sh @@ -0,0 +1,49 @@ + +# paths +## dir +pwm_dir="data/pwm/jaspar_2018_clustering/" +data_dir="results/10xgenomics_PBMC_5k" +results_dir="results/test_1kb_pwms" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_open_bin1bp_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_nucleosomes_1kb_bin1bp_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sequences_1kb.mat' +## PWM files +jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" +hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" +myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" +pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" +cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" +irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" +irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" +lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" +foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" +sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" +mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" +elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" +stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" +nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" +ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" +elf2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" +ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" + +mkdir -p $results_dir + +# EM param +n_iter='100' +n_shift='951' +n_core=12 + +# classify +## results files 
+file_prob=$results_dir/'peaks_rmsk_sequences_1kb_15class_prob.mat4d' +file_mod1=$results_dir/'peaks_rmsk_openchromatin_1kb_15class_model.mat' +file_mod2=$results_dir/'peaks_rmsk_nucleosomes_1kb_15class_model.mat' +file_mod3=$results_dir/'peaks_rmsk_sequences_1kb_15class_model.mat' + +bin/EMSequence --seq $file_mat_seq --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$nfe2,$ahr,$elf2 --shift $n_shift --flip --iter $n_iter --thread $n_core > $file_prob + +bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 +bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 +bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + diff --git a/scripts/test/test_1kb_sampled.sh b/scripts/test/test_1kb_sampled.sh new file mode 100755 index 0000000..edd22fe --- /dev/null +++ b/scripts/test/test_1kb_sampled.sh @@ -0,0 +1,35 @@ + +# paths +## dir +data_dir="results/10xgenomics_PBMC_5k" +results_dir="results/test_1kb" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_bin1bp_1kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='951' +n_core=12 + +# classify +for k in 10 20 30 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift 
--flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/test/test_1kb_sampled_pwms.sh b/scripts/test/test_1kb_sampled_pwms.sh new file mode 100755 index 0000000..454cae9 --- /dev/null +++ b/scripts/test/test_1kb_sampled_pwms.sh @@ -0,0 +1,49 @@ + +# paths +## dir +pwm_dir="data/pwm/jaspar_2018_clustering/" +data_dir="results/10xgenomics_PBMC_5k" +results_dir="results/test_1kb_pwms" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_bin1bp_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' +## PWM files +jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" +hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" +myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" +pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" +cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" +irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" +irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" +lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" +foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" +sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" +mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" +elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" +# stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" +nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" +ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" +elf2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" +# ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" + +mkdir -p $results_dir + +# EM param +n_iter='100' +n_shift='951' +n_core=12 + +# classify +## results files 
+file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_15class_prob.mat4d' +file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_15class_model.mat' +file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_15class_model.mat' +file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_15class_model.mat' + +bin/EMSequence --seq $file_mat_seq --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$nfe2,$ahr,$elf2 --shift $n_shift --flip --iter $n_iter --thread $n_core > $file_prob + +bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 +bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 +bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + diff --git a/scripts/test/test_2kb.sh b/scripts/test/test_2kb.sh new file mode 100755 index 0000000..dbabf08 --- /dev/null +++ b/scripts/test/test_2kb.sh @@ -0,0 +1,36 @@ + +# paths +## dir +data_dir="results/10xgenomics_PBMC_5k" +results_dir="results/test_2kb" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_open_bin1bp_2kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_nucleosomes_bin1bp_2kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sequences_2kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='201' +n_core=12 + +# classify +for k in 10 20 30 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sequences_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_openchromatin_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_nucleosomes_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sequences_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > 
$file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done + diff --git a/scripts/test/test_2kb_sampled.sh b/scripts/test/test_2kb_sampled.sh new file mode 100755 index 0000000..2f3e309 --- /dev/null +++ b/scripts/test/test_2kb_sampled.sh @@ -0,0 +1,36 @@ + +# paths +## dir +data_dir="results/10xgenomics_PBMC_5k" +results_dir="results/test_2kb" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_2kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_bin1bp_2kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_2kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='201' +n_core=12 + +# classify +for k in 10 20 30 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done + diff --git a/scripts/test_2/analysis_test_2_sampled.R b/scripts/test_2/analysis_test_2_sampled.R new file mode 100644 index 
0000000..afcf023 --- /dev/null +++ b/scripts/test_2/analysis_test_2_sampled.R @@ -0,0 +1,97 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(seqLogo) + +# functions +source(file.path("scripts", "functions.R")) + +# the number of classes searched +n.classes = c(10, 20, 30) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around ctcf motifs ################## + +for(k in n.classes) +{ + # sequence + data = read.sequence.models(file.path("results", "test_1kb", + sprintf("peaks_rmsk_sampled_sequences_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + data = NULL + + # open chromatin + model.open = read.read.models(file.path("results", "test_1kb", + sprintf("peaks_rmsk_sampled_openchromatin_%dclass_model.mat", k)))$models + # nucleosomes + model.nucl = read.read.models(file.path("results", "test_1kb", + sprintf("peaks_rmsk_sampled_nucleosomes_%dclass_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=26, height=12) + png(filename=file.path("results", "test_1kb", + sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:30, nrow=6, ncol=5, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, 
labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + # inlets with center + # row_n = 1 # row counter + # col_n = 1 # column counter + # for(i in 1:nrow(ref.open)) + # { # plot logo center + # right = 0.5*col_n - 0.01 + # left = right - 0.2 + # bottom = 1-(row_n*(0.2))+0.05 + # top = bottom + 0.15 + # par(fig=c(left, right, bottom, top), new=T) + # idx = (391-1-20):(391+1+20) + # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # axis(1, at=x.at, labels=x.lab) + # # yaxis + # axis(2, at=y.at, labels=y.lab) + # row_n = row_n + 1 + # if(i %% 5 == 0) + # { col_n = col_n + 1 + # row_n = 1 + # } + # } + dev.off() +} + diff --git a/scripts/test_2/test_1kb_sampled.sh b/scripts/test_2/test_1kb_sampled.sh new file mode 100755 index 0000000..2b9e096 --- /dev/null +++ b/scripts/test_2/test_1kb_sampled.sh @@ -0,0 +1,35 @@ + +# paths +## dir +data_dir="results/10xgenomics_PBMC_5k" +results_dir="results/test_1kb_2" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_1kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_bin1bp_1kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='971' +n_core=14 + +# classify +for 
k in 10 20 30 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/test_2/test_2kb_sampled.sh b/scripts/test_2/test_2kb_sampled.sh new file mode 100755 index 0000000..02af0c9 --- /dev/null +++ b/scripts/test_2/test_2kb_sampled.sh @@ -0,0 +1,36 @@ + +# paths +## dir +data_dir="results/10xgenomics_PBMC_5k" +results_dir="results/test_2kb_2" +## matrix files +file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_2kb_read_atac.mat' +file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_bin1bp_2kb_fragment_center.mat' +file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_2kb.mat' +## file with seeds +file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# EM param +n_iter='100' +n_shift='971' +n_core=14 + +# classify +for k in 10 20 30 +do + ## results files + file_prob=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_prob.mat4d' + file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_'$k'class_model.mat' + file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_'$k'class_model.mat' + file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_model.mat' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head 
-c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done + diff --git a/scripts/test_dendrogram.R b/scripts/test_dendrogram.R new file mode 100644 index 0000000..708c704 --- /dev/null +++ b/scripts/test_dendrogram.R @@ -0,0 +1,105 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(motifStack) +library(TFBSTools) +library(MotifDb) + +# functions +source(file.path("scripts", "functions.R")) + +get_pfm_list = function(motifs, prefix_name) +{ pfm_list = list() + + for(i in 1:dim(motifs)[3]) + { pfm_list[[i]] = new("pfm", + mat=motifs[,,i], + name=sprintf("%s class %d", prefix_name, i)) + } + return(pfm_list) +} + + +# number of classes searched in the data +n_classes = c(17, 20, 30) + +# load motifs from JASPAR clustering used to initialise the classes +motifs_jaspar_paths = c("data/pwm/jaspar_2018_clustering/cluster_3_node_23_20_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_4_node_31_3_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_4_node_22_4_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_7_node_13_2_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_5_node_20_5_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_31_node_4_5_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_31_node_5_2_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_1_node_74_2_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_66_1_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_33_node_1_2_motifs_prob.mat", + 
"data/pwm/jaspar_2018_clustering/cluster_20_4_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_7_node_17_5_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_32_node_STAT6_1_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_3_node_24_4_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_4_node_30_2_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_39_node_1_2_motifs_prob.mat", + "data/pwm/jaspar_2018_clustering/cluster_48_node_ctcf_1_motifs_prob.mat") +motifs_jaspar = lapply(motifs_jaspar_paths, read.table) +motifs_jaspar = lapply(motifs_jaspar, as.matrix) +motifs_jaspar_names = c("jun", + "HIF1A", + "myc", + "PU.1", + "CEBPb", + "Irf4", + "Irf2", + "LHX3", + "Fox1H", + "Sox3", + "Mef2c", + "Elf5", + "STAT6", + "NFE2", + "AHR", + "E2F2", + "CTCF") +tmp = list() +for(i in 1:length(motifs_jaspar)) +{ rownames(motifs_jaspar[[i]]) = c("A", "C", "G", "T") + tmp[[i]] = new("pfm", + mat=motifs_jaspar[[i]], + name=motifs_jaspar_names[i]) +} +motifs_jaspar = tmp +rm(tmp) + +for(n_class in n_classes) +{ # load classes found + motifs_found = get_pfm_list(read.sequence.models(file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_4", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, + "") + # colors + red = brewer.pal(3, "Set1")[1] + blue = brewer.pal(3, "Set1")[2] + color = c(rep(blue, length(motifs_jaspar)), + rep(red, length(motifs_found))) + # plot logo stack with radial style + # X11(height=12, width=12) + png(filename=file.path(sprintf("test_%dclass.png", n_class)), + units="in", res=720, width=14, height=14) + motifStack(c(motifs_jaspar, + motifs_found), + layout="radialPhylog", + circle=0.3, + cleaves = 0.2, + clabel.leaves = 0.5, + col.bg=color, + col.bg.alpha=0.3, + col.leaves=color, + col.inner.label.circle=color, + inner.label.circle.width=0.05, + col.outer.label.circle=color, + outer.label.circle.width=0.02, + circle.motif=.5, + angle=350) + dev.off() +} diff --git 
a/src/Applications/ChIPPartitioningApplication.cpp b/src/Applications/ChIPPartitioningApplication.cpp index 943afc7..fabdad0 100644 --- a/src/Applications/ChIPPartitioningApplication.cpp +++ b/src/Applications/ChIPPartitioningApplication.cpp @@ -1,174 +1,176 @@ #include #include #include #include #include // std::invalid_argument #include #include // +#include + namespace po = boost::program_options ; ChIPPartitioningApplication::ChIPPartitioningApplication(int argn, char** argv) : file_read(""), file_sequence(""), n_class(0), n_iter(0), n_shift(0), flip(false), n_threads(0), seeding(EMEngine::seeding_codes::RANDOM), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int ChIPPartitioningApplication::run() { if(this->runnable) { // read data std::vector read_paths ; boost::split(read_paths, this->file_read, [](char c){return c == ',';}); - std::vector data_read ; + std::vector> data_read ; for(const auto& path : read_paths) { if(path == "") { continue ; } - data_read.push_back(read_matrix2d_i(path)) ; + data_read.push_back(Matrix2D(path)) ; } // sequence data - std::vector data_seq ; + std::vector> data_seq ; if(this->file_sequence != "") - { data_seq.push_back(read_matrix2d_i(this->file_sequence)) ; } + { data_seq.push_back(Matrix2D(this->file_sequence)) ; } EMEngine em(data_read, data_seq, this->n_class, this->n_iter, this->n_shift, this->flip, this->seeding, this->seed, this->n_threads) ; em.classify() ; std::cout << em.get_post_prob() << std::endl ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void ChIPPartitioningApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" 
; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "ChIPPartitioning is a probabilistic partitioning algorithm that \n" "sofetly assigns genomic regions to classes given their shape \n" "of the signal over the region. The assignment probabilities \n" "are returned through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " "by default 0 (no parallelization)." ; std::string opt_read_msg = "A coma separated list of paths to the file containing the " "read density data" ; std::string opt_seq_msg = "The path to the file containing the sequence data" ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting " "freedom. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping."; std::string opt_seeding_msg = "Specify which method should be used to initialise the " "cluster references." 
; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string seeding_tmp ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("read", po::value(&(this->file_read)), opt_read_msg.c_str()) ("seq", po::value(&(this->file_sequence)), opt_read_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("seeding", po::value(&(seeding_tmp)), opt_seeding_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_read == "" and this->file_sequence == "" and (not help)) { std::string msg("Error! No data were given (--read and/or --seq)!") ; throw std::invalid_argument(msg) ; } else if((seeding_tmp != "random") and (seeding_tmp != "sampling") and (seeding_tmp != "toy") and (not help)) { std::string msg("Error! 
Unrecognized seeding method (--seeding)!") ; throw std::invalid_argument(msg) ; } // no class given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set seeding if(seeding_tmp == "random") { this->seeding = EMEngine::seeding_codes::RANDOM ; } else if(seeding_tmp == "sampling") { this->seeding = EMEngine::seeding_codes::SAMPLING ; } else if(seeding_tmp == "toy") { this->seeding = EMEngine::seeding_codes::TOY ; } // set flip if(vm.count("flip")) { this->flip = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { ChIPPartitioningApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ChIPPartitioningApplication.hpp b/src/Applications/ChIPPartitioningApplication.hpp index 912b9aa..2b281d2 100644 --- a/src/Applications/ChIPPartitioningApplication.hpp +++ b/src/Applications/ChIPPartitioningApplication.hpp @@ -1,106 +1,105 @@ #ifndef CHIPPPARTITIONINGAPPLICATION_HPP #define CHIPPPARTITIONINGAPPLICATION_HPP #include #include -#include #include /*! * \brief The ChIPPartitioningApplication class is a wrapper around an EMEngine * instance creating an autonomous application to classify data by directly * passing all the options and parameters from the command line. */ class ChIPPartitioningApplication: public ApplicationInterface { public: ChIPPartitioningApplication() = delete ; ChIPPartitioningApplication(const ChIPPartitioningApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. 
*/ ChIPPartitioningApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the paths to the files containing the read * density data. */ std::string file_read ; /*! * \brief the path to the file containing the * sequence data. */ std::string file_sequence ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief the seeding method to use. */ EMEngine::seeding_codes seeding ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. 
*/ bool runnable ; } ; #endif // CHIPPPARTITIONINGAPPLICATION_HPP diff --git a/src/Applications/CorrelationMatrixCreatorApplication.cpp b/src/Applications/CorrelationMatrixCreatorApplication.cpp index 3bc1842..4fe08f4 100644 --- a/src/Applications/CorrelationMatrixCreatorApplication.cpp +++ b/src/Applications/CorrelationMatrixCreatorApplication.cpp @@ -1,190 +1,190 @@ #include #include #include #include #include #include // std::invalid_argument namespace po = boost::program_options ; // the valid values for --method option std::string method_read = "read" ; std::string method_read_atac = "read_atac" ; std::string method_fragment = "fragment" ; std::string method_fragment_center = "fragment_center" ; CorrelationMatrixCreatorApplication::CorrelationMatrixCreatorApplication(int argn, char** argv) - : file_bed(""), file_bam(""), from(0), to(0), bin_size(0), + : file_bed(""), file_bam(""), file_bai(""), from(0), to(0), bin_size(0), method(CorrelationMatrixCreator::FRAGMENT), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int CorrelationMatrixCreatorApplication::run() { if(this->runnable) { CorrelationMatrixCreator mc(this->file_bed, this->file_bam, this->file_bai, this->from, this->to, this->bin_size, this->method) ; std::cout << mc.create_matrix() << std::endl ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void CorrelationMatrixCreatorApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "CorrelationMatrixCreator is an application that creates a\n" "count matrix from a BED file and a BAM file and returnes it\n" "through stdout.\n" "The matrix contains one row per region (reference region)\n" "present in the BED file. 
For each region, its center is\n" "computed and a set of equally sized, non-overlapping bins,\n" "centered on the region center and covering the interval [from,to]\n" "is build. Then, each bin is assigned the number of read/fragment\n" "positions (targets) present in the BAM file that are mapped at\n" "that position.\n" "The matrix is a 2D matrix which dimensions are :\n" "1) number of regions\n" "2) length of region (to - from + 1) / bin_size\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_bed_msg = "The path to the BED file containing the references."; std::string opt_bam_msg = "The path to the BAM file containing the targets."; - std::string opt_bai_msg = "The path to the BAI file containing the index BAM file."; - std::string opt_from_msg = "The upstream limit - in relative coordinate - of the region to build " + std::string opt_bai_msg = "The path to the BAI file containing the BAM file index."; + std::string opt_from_msg = "The upstream limit - in relative coordinate - of the region to build\n" "around each reference center." ; - std::string opt_to_msg = "The downstream limit - in relative coordinate - of the region to build " + std::string opt_to_msg = "The downstream limit - in relative coordinate - of the region to build\n" "around each reference center." ; std::string opt_binsize_msg = "The size of the bins." 
; char tmp[4096] ; sprintf(tmp, - "How the data in the BAM file should be handled when computing " + "How the data in the BAM file should be handled when computing\n" "the number of counts in each bin.\n" "\t\"%s\" uses each position within the reads (by default)\n" "\t\"%s\" uses only the insertion site for ATAC-seq data\n" "\t\"%s\" uses each position within the fragments\n" "\t\"%s\" uses only the fragment central positions\n", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; std::string opt_method_msg = tmp ; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string method(method_read) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ("binSize", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) ("method", po::value(&(method)), opt_method_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bam == "" and (not help)) { std::string msg("Error! No BAM file was given (--bam)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bai == "" and (not help)) { std::string msg("Error! 
No BAM index file was given (--bai)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->bin_size <= 0 and (not help)) { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; throw std::invalid_argument(msg) ; } else if(method != method_read and method != method_read_atac and method != method_fragment and method != method_fragment_center) { char msg[4096] ; sprintf(msg, "Error! method should be %s, %s, %s or %s (--method)", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; throw std::invalid_argument(msg) ; } // set method if(method == method_read) { this->method = CorrelationMatrixCreator::READ ; } else if(method == method_read_atac) { this->method = CorrelationMatrixCreator::READ_ATAC ; } else if(method == method_fragment) { this->method = CorrelationMatrixCreator::FRAGMENT ; } else if(method == method_fragment_center) { this->method = CorrelationMatrixCreator::FRAGMENT_CENTER ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { CorrelationMatrixCreatorApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ChIPPartitioningApplication.cpp b/src/Applications/EMJointApplication.cpp similarity index 59% copy from src/Applications/ChIPPartitioningApplication.cpp copy to src/Applications/EMJointApplication.cpp index 943afc7..b188fec 100644 --- a/src/Applications/ChIPPartitioningApplication.cpp +++ b/src/Applications/EMJointApplication.cpp @@ -1,174 +1,174 @@ 
-#include -#include +#include +#include #include #include #include // std::invalid_argument #include -#include // +#include // boost::split() + +#include namespace po = boost::program_options ; -ChIPPartitioningApplication::ChIPPartitioningApplication(int argn, char** argv) - : file_read(""), file_sequence(""), n_class(0), n_iter(0), n_shift(0), flip(false), - n_threads(0), seeding(EMEngine::seeding_codes::RANDOM), - seed(""), runnable(true) +EMJointApplication::EMJointApplication(int argn, char** argv) + : files_read(""), file_sequence(""), n_class(0), n_iter(0), n_shift(0), flip(false), + n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } -int ChIPPartitioningApplication::run() +int EMJointApplication::run() { if(this->runnable) { // read data std::vector read_paths ; - boost::split(read_paths, this->file_read, [](char c){return c == ',';}); - std::vector data_read ; + boost::split(read_paths, this->files_read, [](char c){return c == ',';}) ; + std::vector> data_read ; for(const auto& path : read_paths) { if(path == "") { continue ; } - data_read.push_back(read_matrix2d_i(path)) ; + data_read.push_back(Matrix2D(path)) ; } // sequence data - std::vector data_seq ; - if(this->file_sequence != "") - { data_seq.push_back(read_matrix2d_i(this->file_sequence)) ; } - - EMEngine em(data_read, - data_seq, - this->n_class, - this->n_iter, - this->n_shift, - this->flip, - this->seeding, - this->seed, - this->n_threads) ; - em.classify() ; - std::cout << em.get_post_prob() << std::endl ; + EMJoint* em = nullptr ; + if(this->file_sequence == "") + { em = new EMJoint(data_read, + this->n_class, + this->n_iter, + this->n_shift, + this->flip, + this->seed, + this->n_threads) ; + } + else + { Matrix2D data_seq(this->file_sequence) ; + em = new EMJoint(data_read, + data_seq, + this->n_class, + this->n_iter, + this->n_shift, + this->flip, + this->seed, + this->n_threads) ; + } + em->classify() ; + std::cout 
<< em->get_post_prob() << std::endl ; + delete em ; + em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } -void ChIPPartitioningApplication::parseOptions(int argn, char** argv) +void EMJointApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" - "ChIPPartitioning is a probabilistic partitioning algorithm that \n" - "sofetly assigns genomic regions to classes given their shape \n" - "of the signal over the region. The assignment probabilities \n" - "are returned through stdout.\n\n" ; + "EMJoint is a probabilistic partitioning algorithm that \n" + "sofetly assigns genomic regions to classes given 1) the shapes \n" + "of the read densities over the regions and 2) the region sequence \n" + "motif contents. \n " + "The assignment probabilitiesare returned through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; - std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " + std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations, \n" "by default 0 (no parallelization)." ; - std::string opt_read_msg = "A coma separated list of paths to the file containing the " - "read density data" ; - std::string opt_seq_msg = "The path to the file containing the sequence data" ; + std::string opt_read_msg = "A coma separated list of paths to the file containing the \n" + "read density data. At least one path is needed." ; + std::string opt_seq_msg = "The path to the file containing the sequence data. If no path is \n" + "given, the classification is only cares about the read density \n" + "shapes." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." 
; std::string opt_shift_msg = "Enables this number of column of shifting " "freedom. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping."; - std::string opt_seeding_msg = "Specify which method should be used to initialise the " - "cluster references." ; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; - std::string seeding_tmp ; - desc.add_options() ("help,h", opt_help_msg.c_str()) - ("read", po::value(&(this->file_read)), opt_read_msg.c_str()) + ("read", po::value(&(this->files_read)), opt_read_msg.c_str()) ("seq", po::value(&(this->file_sequence)), opt_read_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) - ("seeding", po::value(&(seeding_tmp)), opt_seeding_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings - if(this->file_read == "" and + if(this->files_read == "" and this->file_sequence == "" and (not help)) - { std::string msg("Error! No data were given (--read and/or --seq)!") ; + { std::string msg("Error! 
No data were given (--read and --seq)!") ; throw std::invalid_argument(msg) ; } - else if((seeding_tmp != "random") and - (seeding_tmp != "sampling") and - (seeding_tmp != "toy") and - (not help)) - { std::string msg("Error! Unrecognized seeding method (--seeding)!") ; + if(this->files_read == "" and + (not help)) + { std::string msg("Error! No read density data were given (--read)!") ; throw std::invalid_argument(msg) ; } - // no class given -> 1 iter + // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } - // set seeding - if(seeding_tmp == "random") - { this->seeding = EMEngine::seeding_codes::RANDOM ; } - else if(seeding_tmp == "sampling") - { this->seeding = EMEngine::seeding_codes::SAMPLING ; } - else if(seeding_tmp == "toy") - { this->seeding = EMEngine::seeding_codes::TOY ; } // set flip if(vm.count("flip")) { this->flip = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) -{ ChIPPartitioningApplication app(argn, argv) ; +{ EMJointApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ChIPPartitioningApplication.hpp b/src/Applications/EMJointApplication.hpp similarity index 76% copy from src/Applications/ChIPPartitioningApplication.hpp copy to src/Applications/EMJointApplication.hpp index 912b9aa..4fa806c 100644 --- a/src/Applications/ChIPPartitioningApplication.hpp +++ b/src/Applications/EMJointApplication.hpp @@ -1,106 +1,101 @@ -#ifndef CHIPPPARTITIONINGAPPLICATION_HPP -#define CHIPPPARTITIONINGAPPLICATION_HPP +#ifndef EMJOINTAPPLICATION_HPP +#define EMJOINTAPPLICATION_HPP #include -#include -#include +#include #include /*! 
- * \brief The ChIPPartitioningApplication class is a wrapper around an EMEngine + * \brief The EMJointApplication class is a wrapper around an EMJoint * instance creating an autonomous application to classify data by directly * passing all the options and parameters from the command line. */ -class ChIPPartitioningApplication: public ApplicationInterface +class EMJointApplication: public ApplicationInterface { public: - ChIPPartitioningApplication() = delete ; - ChIPPartitioningApplication(const ChIPPartitioningApplication& app) = delete ; + EMJointApplication() = delete ; + EMJointApplication(const EMJointApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ - ChIPPartitioningApplication(int argn, char** argv) ; + EMJointApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! - * \brief the paths to the files containing the read - * density data. 
+ * \brief a coma separated list of paths to the files + * containing the read density data */ - std::string file_read ; + std::string files_read ; /*! * \brief the path to the file containing the * sequence data. */ std::string file_sequence ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! * \brief the number of threads. */ size_t n_threads ; - /*! - * \brief the seeding method to use. - */ - EMEngine::seeding_codes seeding ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; -#endif // CHIPPPARTITIONINGAPPLICATION_HPP +#endif // EMJOINTAPPLICATION_HPP diff --git a/src/Applications/ChIPPartitioningApplication.cpp b/src/Applications/EMReadApplication.cpp similarity index 52% copy from src/Applications/ChIPPartitioningApplication.cpp copy to src/Applications/EMReadApplication.cpp index 943afc7..5b8e842 100644 --- a/src/Applications/ChIPPartitioningApplication.cpp +++ b/src/Applications/EMReadApplication.cpp @@ -1,174 +1,136 @@ -#include -#include +#include +#include #include #include #include // std::invalid_argument #include -#include // + +#include namespace po = boost::program_options ; -ChIPPartitioningApplication::ChIPPartitioningApplication(int argn, char** argv) - : file_read(""), file_sequence(""), n_class(0), n_iter(0), n_shift(0), flip(false), - n_threads(0), seeding(EMEngine::seeding_codes::RANDOM), - seed(""), runnable(true) +EMReadApplication::EMReadApplication(int argn, char** argv) + : file_read(""), n_class(0), n_iter(0), n_shift(0), flip(false), + n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } -int 
ChIPPartitioningApplication::run() +int EMReadApplication::run() { if(this->runnable) - { // read data - std::vector read_paths ; - boost::split(read_paths, this->file_read, [](char c){return c == ',';}); - std::vector data_read ; - for(const auto& path : read_paths) - { if(path == "") - { continue ; } - data_read.push_back(read_matrix2d_i(path)) ; - } - // sequence data - std::vector data_seq ; - if(this->file_sequence != "") - { data_seq.push_back(read_matrix2d_i(this->file_sequence)) ; } - - EMEngine em(data_read, - data_seq, - this->n_class, - this->n_iter, - this->n_shift, - this->flip, - this->seeding, - this->seed, - this->n_threads) ; + { EMRead em(Matrix2D(this->file_read), + this->n_class, + this->n_iter, + this->n_shift, + this->flip, + this->seed, + this->n_threads) ; em.classify() ; std::cout << em.get_post_prob() << std::endl ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } -void ChIPPartitioningApplication::parseOptions(int argn, char** argv) +void EMReadApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" - "ChIPPartitioning is a probabilistic partitioning algorithm that \n" - "sofetly assigns genomic regions to classes given their shape \n" - "of the signal over the region. The assignment probabilities \n" - "are returned through stdout.\n\n" ; + "EMRead is a probabilistic partitioning algorithm that \n" + "sofetly assigns genomic regions to classes given the shape \n" + "of the read density over the region. The assignment \n" + "probabilities are returned through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " "by default 0 (no parallelization)." 
; - std::string opt_read_msg = "A coma separated list of paths to the file containing the " - "read density data" ; - std::string opt_seq_msg = "The path to the file containing the sequence data" ; + std::string opt_read_msg = "The path to the file containing the read density data" ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting " - "freedom. By default, shifting is " + "freedom to realign the data. By default, shifting is " "disabled (equivalent to --shift 1)." ; - std::string opt_flip_msg = "Enables flipping."; - std::string opt_seeding_msg = "Specify which method should be used to initialise the " - "cluster references." ; + std::string opt_flip_msg = "Enables flipping to realign the data."; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string seeding_tmp ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("read", po::value(&(this->file_read)), opt_read_msg.c_str()) - ("seq", po::value(&(this->file_sequence)), opt_read_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) - ("seeding", po::value(&(seeding_tmp)), opt_seeding_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) 
{ throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_read == "" and - this->file_sequence == "" and (not help)) - { std::string msg("Error! No data were given (--read and/or --seq)!") ; - throw std::invalid_argument(msg) ; - } - else if((seeding_tmp != "random") and - (seeding_tmp != "sampling") and - (seeding_tmp != "toy") and - (not help)) - { std::string msg("Error! Unrecognized seeding method (--seeding)!") ; + { std::string msg("Error! No data were given (--read)!") ; throw std::invalid_argument(msg) ; } - // no class given -> 1 iter + // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } - // set seeding - if(seeding_tmp == "random") - { this->seeding = EMEngine::seeding_codes::RANDOM ; } - else if(seeding_tmp == "sampling") - { this->seeding = EMEngine::seeding_codes::SAMPLING ; } - else if(seeding_tmp == "toy") - { this->seeding = EMEngine::seeding_codes::TOY ; } // set flip if(vm.count("flip")) { this->flip = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } - int main(int argn, char** argv) -{ ChIPPartitioningApplication app(argn, argv) ; +{ EMReadApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ChIPPartitioningApplication.hpp b/src/Applications/EMReadApplication.hpp similarity index 74% copy from src/Applications/ChIPPartitioningApplication.hpp copy to src/Applications/EMReadApplication.hpp index 912b9aa..66cb1be 100644 --- a/src/Applications/ChIPPartitioningApplication.hpp +++ b/src/Applications/EMReadApplication.hpp @@ -1,106 +1,91 @@ -#ifndef 
CHIPPPARTITIONINGAPPLICATION_HPP -#define CHIPPPARTITIONINGAPPLICATION_HPP +#ifndef EMREADAPPLICATION_HPP +#define EMREADAPPLICATION_HPP #include -#include -#include #include /*! - * \brief The ChIPPartitioningApplication class is a wrapper around an EMEngine + * \brief The EMReadApplication class is a wrapper around an EMRead * instance creating an autonomous application to classify data by directly * passing all the options and parameters from the command line. */ -class ChIPPartitioningApplication: public ApplicationInterface +class EMReadApplication: public ApplicationInterface { public: - ChIPPartitioningApplication() = delete ; - ChIPPartitioningApplication(const ChIPPartitioningApplication& app) = delete ; + EMReadApplication() = delete ; + EMReadApplication(const EMReadApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ - ChIPPartitioningApplication(int argn, char** argv) ; + EMReadApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! 
- * \brief the paths to the files containing the read + * \brief the paths to the file containing the read * density data. */ std::string file_read ; - /*! - * \brief the path to the file containing the - * sequence data. - */ - std::string file_sequence ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; - /*! * \brief the number of threads. */ size_t n_threads ; - - /*! - * \brief the seeding method to use. - */ - EMEngine::seeding_codes seeding ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; - /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; - -#endif // CHIPPPARTITIONINGAPPLICATION_HPP +#endif // EMREADAPPLICATION_HPP diff --git a/src/Applications/EMSequenceApplication.cpp b/src/Applications/EMSequenceApplication.cpp new file mode 100644 index 0000000..1d10824 --- /dev/null +++ b/src/Applications/EMSequenceApplication.cpp @@ -0,0 +1,278 @@ + +#include +#include + +#include +#include +#include // std::invalid_argument +#include +#include // boost::split() + +#include +#include + +namespace po = boost::program_options ; + +template +std::ostream& operator << (std::ostream& stream, + const std::vector& v) +{ for(const auto& x : v) + { stream << x << " " ; } + return stream ; +} + +EMSequenceApplication::EMSequenceApplication(int argn, char** argv) + : file_seq(""), files_motif(""), + n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), + n_threads(0), seed(""), runnable(true) +{ + // parse command line options and set the fields + this->parseOptions(argn, argv) ; +} + +int EMSequenceApplication::run() +{ if(this->runnable) + { EMSequence* em(nullptr) ; + + // data + Matrix2D data(this->file_seq) ; + + // seeds motifs randomly + 
if(this->files_motif == "") + { em = new EMSequence(data, + this->n_class, + this->n_iter, + this->n_shift, + this->flip, + this->bckg_class, + this->seed, + this->n_threads) ; + } + // seeds motifs with the given matrices + else + { // model + std::vector motif_paths ; + boost::split(motif_paths, this->files_motif, [](char c){return c == ',';}) ; + // this->n_class = motif_paths.size() + this->bckg_class ; + size_t model_ncol = data.get_ncol() - this->n_shift + 1 ; + + // add the given motif, random motifs (if needed) and + // background class (if needed) + Matrix3D model = this->init_model(model_ncol, + data, + motif_paths) ; + + em = new EMSequence(data, + model, + this->n_iter, + this->flip, + this->n_threads) ; + } + + // classify + em->classify() ; + std::cout << em->get_post_prob() << std::endl ; + + // clean + delete em ; + em = nullptr ; + + return EXIT_SUCCESS ; + } + else + { return EXIT_FAILURE ; } +} + +void EMSequenceApplication::parseOptions(int argn, char** argv) +{ + // no option to parse + if(argv == nullptr) + { std::string message = "no options to parse!" ; + throw std::invalid_argument(message) ; + } + + // help messages + std::string desc_msg = "\n" + "EMSequence is a probabilistic partitioning algorithm that \n" + "sofetly assigns sequences to classes given their motif content \n" + "The assignment probabilities are returned through stdout.\n\n" ; + std::string opt_help_msg = "Produces this help message." ; + std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " + "by default 0 (no parallelization)." ; + std::string opt_seq_msg = "The path to the file containing the sequences" ; + std::string opt_motifs_msg = "A coma separated list of path to files containing the initial motifs\n" + "values. The motifs should be probability matrices in horizontal format.\n" + "If the motifs are too short after accounting for shifting, extra\n" + "columns with uniform probabilities will be added on each side. 
The\n" + "given number of classes (--class) should at least be the number of\n" + "initial motifs. If the number of classes is bigger than the number of" + "given motifs, the remaining classes are initialised randomly\n." ; + std::string opt_iter_msg = "The number of iterations." ; + std::string opt_class_msg = "The number of classes to find." ; + std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n" + "the data. By default, shifting is disabled (equivalent to\n" + "--shift 1)." ; + std::string opt_flip_msg = "Enables flipping to realign the data."; + std::string opt_bckg_msg = "Adds a class to model the sequence background. This class\n" + "contains the sequence background probabilities at each position\n" + "and is never updated." ; + std::string opt_seed_msg = "A value to seed the random number generator."; + + // option parser + boost::program_options::variables_map vm ; + boost::program_options::options_description desc(desc_msg) ; + + std::string seeding_tmp ; + + desc.add_options() + ("help,h", opt_help_msg.c_str()) + + ("seq", po::value(&(this->file_seq)), opt_seq_msg.c_str()) + + ("motifs", po::value(&(this->files_motif)), opt_motifs_msg.c_str()) + + ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) + ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) + ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) + ("flip", opt_flip_msg.c_str()) + ("bgclass", opt_bckg_msg.c_str()) + + ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) + ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + + // parse + try + { po::store(po::parse_command_line(argn, argv, desc), vm) ; + po::notify(vm) ; + } + catch(std::invalid_argument& e) + { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; + throw std::invalid_argument(msg) ; + } + catch(...) 
+ { throw std::invalid_argument("An unknown error occured while parsing the options") ; } + + bool help = vm.count("help") ; + + // checks unproper option settings + if(this->file_seq == "" and + (not help)) + { std::string msg("Error! No data were given (--seq)!") ; + throw std::invalid_argument(msg) ; + } + + // no iter given -> 1 iter + if(this->n_iter == 0) + { this->n_iter = 1 ; } + // no shift class given -> 1 class + if(this->n_class == 0) + { this->n_class = 1 ; } + // no shift given, value of 1 -> no shift + if(this->n_shift == 0) + { this->n_shift = 1 ; } + // set flip + if(vm.count("flip")) + { this->flip = true ; } + // set background class + if(vm.count("bgclass")) + { this->bckg_class = true ; } + // help invoked, run() cannot be invoked + if(help) + { std::cout << desc << std::endl ; + this->runnable = false ; + return ; + } + // everything fine, run() can be called + else + { this->runnable = true ; + return ; + } +} + +Matrix3D EMSequenceApplication::init_model(size_t model_len, + const Matrix2D& data, + const std::vector& motif_paths) const +{ + int n_class_given = motif_paths.size() ; + int n_class_bckg = this->bckg_class ; + int n_class_rand = this->n_class - n_class_given - n_class_bckg ; + + // number of classes should at least be number of motifs + if(n_class_given > (int)this->n_class) + { char msg[4096] ; + sprintf(msg, "Error! number of class given (--class %zu) should at " + "least be equal to number of motifs (--motifs %d)", + this->n_class, n_class_given) ; + throw std::invalid_argument(msg) ; + } + // check if there is room for a background class + if((int)this->n_class < n_class_given+this->bckg_class) + { char msg[4096] ; + sprintf(msg, "Error! 
no class left to add a background " + "class (--bgclass) with the given motifs (--motifs) (--class %zu)", + this->n_class) ; + throw std::invalid_argument(msg) ; + } + + // init empty model + Matrix3D model(this->n_class, + model_len, + 4, + 0.25) ; + // add given motifs + for(size_t i=0; i matrix(motif_paths[i]) ; + // motif is too big for this shift + if(matrix.get_ncol() > model_len) + { char msg[4096] ; + sprintf(msg, + "Error! In %s, motif column number is bigger " + "than data column number - shift + 1 " + "(%zu > %zu - %zu + 1)", + motif_paths[i].c_str(), + matrix.get_ncol(), + data.get_ncol(), + this->n_shift) ; + throw std::invalid_argument(msg) ; + } + // insert motif in middle of matrix + else + { // size_t j_model = this->n_shift / 2 ; + size_t j_model = (model_len - matrix.get_ncol()) / 2 ; + for(size_t j_mat=0, j_mod=j_model; j_mat 0) + { // initialise randomly + EMSequence em(data, + n_class_rand, + this->n_iter, + this->n_shift, + this->flip, + this->bckg_class, + this->seed, + this->n_threads) ; + Matrix3D model_rand = em.get_sequence_models() ; + // copy them into model + for(int i_rand=0, i_mod=n_class_given; i_rand -#include -#include #include +#include + +#include +#include /*! - * \brief The ChIPPartitioningApplication class is a wrapper around an EMEngine - * instance creating an autonomous application to classify data by directly + * \brief The EMSequenceApplication class is a wrapper around an EMSequence + * instance creating an autonomous application to classify sequences by directly * passing all the options and parameters from the command line. */ -class ChIPPartitioningApplication: public ApplicationInterface +class EMSequenceApplication: public ApplicationInterface { public: - ChIPPartitioningApplication() = delete ; - ChIPPartitioningApplication(const ChIPPartitioningApplication& app) = delete ; + EMSequenceApplication() = delete ; + EMSequenceApplication(const EMSequenceApplication& app) = delete ; /*! 
* \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ - ChIPPartitioningApplication(int argn, char** argv) ; + EMSequenceApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! - * \brief the paths to the files containing the read - * density data. + * \brief Initialise the class models if matrices + * are given as initial class motifs. + * If the given class motifs are shorter than the + * model after accounting for shifting, extra columns + * with uniform probabilities will be added on each + * side. + * If the number of classes is higher than the + * number of given motifs, extra classes will be + * initialised randomly.A background class is included + * if needed. + * \param model_len the number of positions (columns) + * of the model to initialise. + * \param data the sequence matrix, in integer format. + * \param motif_paths the paths to the files containing + * the probability matrices to use to initialise the + * class motifs. 
+ * \return */ - std::string file_read ; + Matrix3D init_model(size_t model_len, + const Matrix2D& data, + const std::vector& motif_paths) const ; + /*! - * \brief the path to the file containing the - * sequence data. + * \brief the paths to the file containing the sequence + * data. */ - std::string file_sequence ; + std::string file_seq ; + + /*! + * \brief a coma separated list of files containing the + * initial motif matrices. + */ + std::string files_motif ; + /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; - /*! - * \brief the number of threads. + * \brief whether a constant class to model the + * sequence background should be added. This + * class has the sequence background probabilities + * at each position. */ - size_t n_threads ; - + bool bckg_class ; /*! - * \brief the seeding method to use. + * \brief the number of threads. */ - EMEngine::seeding_codes seeding ; + size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; - /*! * \brief a flag indicating whether the core of run() can be * run or not. 
*/ bool runnable ; } ; - -#endif // CHIPPPARTITIONINGAPPLICATION_HPP +#endif // EMSEQUENCEAPPLICATION_HPP diff --git a/src/Applications/ProbToModelApplication.cpp b/src/Applications/ProbToModelApplication.cpp index f9e84aa..babc0ef 100644 --- a/src/Applications/ProbToModelApplication.cpp +++ b/src/Applications/ProbToModelApplication.cpp @@ -1,203 +1,206 @@ #include #include #include #include #include // std::invalid_argument, std::runtime_error #include #include #include -#include +#include +#include namespace po = boost::program_options ; +typedef std::vector vector_d ; + ProbToModelApplication::ProbToModelApplication(int argn, char** argv) : file_read(""), file_seq(""), file_prob(""), n_threads(0), runnable(false) { this->parseOptions(argn, argv) ; } ProbToModelApplication::~ProbToModelApplication() {} int ProbToModelApplication::run() { if(this->runnable) { // load data std::string file_data ; bool read_data = false ; bool seq_data = false ; if(this->file_read != "") { file_data = this->file_read ; read_data = true ; seq_data = false ; } else if(this->file_seq != "") { file_data = this->file_seq ; read_data = false ; seq_data = true ; } else { std::string msg("Error! Could not determine the type of the data!") ; throw std::runtime_error(msg) ; } - matrix2d_i data = read_matrix2d_i(file_data) ; - matrix4d_d prob = read_matrix4d_d(this->file_prob) ; - if(data.size() != prob.size()) + Matrix2D data(file_data) ; + Matrix4D prob(this->file_prob) ; + if(data.get_nrow() != prob.get_dim()[0]) { char msg[4096] ; sprintf(msg, "Error! data and prob matrices have unequal " "row numbers (%zu / %zu)!", - data.size(), prob.size()) ; + data.get_nrow(), prob.get_dim()[0]) ; throw std::runtime_error(msg) ; } - else if(data[0].size() < prob[0][0].size()) + else if(data.get_ncol() < prob.get_dim()[2]) { char msg[4096] ; sprintf(msg, "Error! too many shift states for the data!" 
"%zu shift states and %zu columns in data)!", - prob[0][0].size(), data[0].size()) ; + prob.get_dim()[2], data.get_ncol()) ; throw std::runtime_error(msg) ; } // get the data model ModelComputer* ptr = nullptr ; if(read_data) { ptr = new ReadModelComputer(data, prob, this->n_threads) ; } else if(seq_data) { ptr = new SequenceModelComputer(data, prob, this->n_threads) ; } - matrix2d_d model = ptr->get_model() ; + Matrix2D model = ptr->get_model() ; delete ptr ; ptr = nullptr ; // compute the class prob - size_t n_row = prob.size() ; - size_t n_class = prob[0].size() ; - size_t n_shift = prob[0][0].size() ; - size_t n_flip = prob[0][0][0].size() ; + size_t n_row = prob.get_dim()[0] ; + size_t n_class = prob.get_dim()[1] ; + size_t n_shift = prob.get_dim()[2] ; + size_t n_flip = prob.get_dim()[3] ; vector_d class_prob(n_class, 0.) ; double p_tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), + model.get_ncol() + 1) ; // 1st column contain the class prob if(read_data) - { for(size_t i=0; i(&(this->file_read)), opt_read_msg.c_str()) ("seq,", po::value(&(this->file_seq)), opt_seq_msg.c_str()) ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if((this->file_read == "") and (this->file_seq == "") and (not help)) { std::string msg("Error! No data file was given (--read or --seq)!") ; throw std::invalid_argument(msg) ; } else if((this->file_read != "") and (this->file_seq != "") and (not help)) { std::string msg("Error! 
--read and --seq are mutually exclusive!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No posterior probabily file was given (--prob)!") ; throw std::invalid_argument(msg) ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { ProbToModelApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ReadModelExtenderApplication.cpp b/src/Applications/ReadModelExtenderApplication.cpp new file mode 100644 index 0000000..c4b0545 --- /dev/null +++ b/src/Applications/ReadModelExtenderApplication.cpp @@ -0,0 +1,269 @@ +#include + +#include +#include +#include +#include // std::invalid_argument, std::runtime_error + +#include +#include +#include +#include +#include + +namespace po = boost::program_options ; + + +// the valid values for --method option +std::string method_read = "read" ; +std::string method_read_atac = "read_atac" ; +std::string method_fragment = "fragment" ; +std::string method_fragment_center = "fragment_center" ; + + +ReadModelExtenderApplication::ReadModelExtenderApplication(int argn, char** argv) + : file_bed(""), file_bam(""), file_bai(""), file_prob(""), + from(0), to(0), ext(0), bin_size(0), + method(CorrelationMatrixCreator::FRAGMENT), + n_threads(0), runnable(false) +{ this->parseOptions(argn, argv) ; } + +ReadModelExtenderApplication::~ReadModelExtenderApplication() +{} + +int ReadModelExtenderApplication::run() +{ if(this->runnable) + { // extend limits + int ext_right = this->ext/2 ; + int ext_left = this->ext - ext_right ; + this->from -= ext_left ; + this->to += ext_right ; + + // create extended matrix + CorrelationMatrixCreator mc(this->file_bed, + this->file_bam, + this->file_bai, + this->from, + this->to, + this->bin_size, + this->method) ; + Matrix2D data = 
mc.create_matrix() ; + + // compute model + Matrix4D prob(this->file_prob) ; + if(prob.get_dim()[0] != data.get_nrow()) + { char msg[4096] ; + sprintf(msg, + "Error! data matrix and probability matrix have " + "unequal row numbers (%zu and %zu)", + prob.get_dim()[0], + data.get_nrow()) ; + throw std::invalid_argument(msg) ; + } + size_t n_row = prob.get_dim()[0] ; + size_t n_class = prob.get_dim()[1] ; + size_t n_shift = prob.get_dim()[2] ; + size_t n_flip = prob.get_dim()[3] ; + + ReadModelComputer model_cp(data, prob, this->n_threads) ; + Matrix2D model = model_cp.get_model() ; + + // compute class prob + vector_d class_prob(n_class, 0.) ; + double p_tot = 0. ; + for(size_t i=0; i model_final(model.get_nrow(), + model.get_ncol() + 1) ; + // 1st column contain the class prob + for(size_t i=0; i(&(this->file_bed)), opt_bed_msg.c_str()) + ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) + ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) + ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) + + ("from", po::value(&(this->from)), opt_from_msg.c_str()) + ("to", po::value(&(this->to)), opt_to_msg.c_str()) + ("ext", po::value(&(this->ext)), opt_ext_msg.c_str()) + ("binSize", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) + ("method", po::value(&(method)), opt_method_msg.c_str()) + + ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + + // parse + try + { po::store(po::parse_command_line(argn, argv, desc), vm) ; + po::notify(vm) ; + } + catch(std::invalid_argument& e) + { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; + throw std::invalid_argument(msg) ; + } + catch(...) + { throw std::invalid_argument("An unknown error occured while parsing the options") ; } + + bool help = vm.count("help") ; + + // checks unproper option settings + if(this->file_bed == "" and (not help)) + { std::string msg("Error! 
No BED file was given (--bed)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_bam == "" and (not help)) + { std::string msg("Error! No BAM file was given (--bam)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_bai == "" and (not help)) + { std::string msg("Error! No BAM index file was given (--bai)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_prob == "" and (not help)) + { std::string msg("Error! No posterior probability file was given (--prob)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from == 0 and this->to == 0 and (not help)) + { std::string msg("Error! No range given (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from >= this->to and (not help)) + { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(ext <= 0 and (not help)) + { std::string msg("Error! the number of columns to add should be > 0 (--ext)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->bin_size <= 0 and (not help)) + { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; + throw std::invalid_argument(msg) ; + } + else if(method != method_read and + method != method_read_atac and + method != method_fragment and + method != method_fragment_center) + { char msg[4096] ; + sprintf(msg, "Error! 
method should be %s, %s, %s or %s (--method)", + method_read.c_str(), + method_read_atac.c_str(), + method_fragment.c_str(), + method_fragment_center.c_str()) ; + throw std::invalid_argument(msg) ; + } + + // set method + if(method == method_read) + { this->method = CorrelationMatrixCreator::READ ; } + else if(method == method_read_atac) + { this->method = CorrelationMatrixCreator::READ_ATAC ; } + else if(method == method_fragment) + { this->method = CorrelationMatrixCreator::FRAGMENT ; } + else if(method == method_fragment_center) + { this->method = CorrelationMatrixCreator::FRAGMENT_CENTER ; } + + // help invoked, run() cannot be invoked + if(help) + { std::cout << desc << std::endl ; + this->runnable = false ; + return ; + } + // everything fine, run() can be called + else + { this->runnable = true ; + return ; + } +} + +int main(int argn, char** argv) +{ ReadModelExtenderApplication app(argn, argv) ; + return app.run() ; +} diff --git a/src/Applications/ReadModelExtenderApplication.hpp b/src/Applications/ReadModelExtenderApplication.hpp new file mode 100644 index 0000000..5fc8379 --- /dev/null +++ b/src/Applications/ReadModelExtenderApplication.hpp @@ -0,0 +1,122 @@ +#ifndef READMODELEXTENDERAPPLICATION_HPP +#define READMODELEXTENDERAPPLICATION_HPP + +#include + +#include +#include + +#include + +/*! + * \brief The ReadModelExtenderApplication class is a class implementing an + * application to extend a read model of length L' (L' = L - S + 1 + * where L is the number of column of the data matrix and S the + * shifting freedom allowed during the classification) to a new model + * length L'' = L' + E (E is the number of columns to add to the + * model) given the data matrix and the results of the classification + * (posterior probability matrix). + * To do this, the read count matrix from which the original model + * was computed is extended (0.5*E columns on each side) and a model + * is computed using the new matrix and the given posterior probabities. 
+ * The extended model is returned through the stdout. + */ +class ReadModelExtenderApplication : public ApplicationInterface +{ + public: + ReadModelExtenderApplication() = delete ; + ReadModelExtenderApplication(const ReadModelExtenderApplication& app) = delete ; + /*! + * \brief Constructs an object from the command line + * options. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + */ + ReadModelExtenderApplication(int argn, char** argv) ; + /*! + * \brief Destructor. + */ + virtual ~ReadModelExtenderApplication() override ; + /*! + * \brief Runs the application. The data new model + * is computed and displayed through the + * stdout. + * \return the exit code. + */ + virtual int run() override ; + + private: + /*! + * \brief Parses the program command line options and + * sets the object field accordingly. + * If the help option is detected, the "runnable" + * field is set to false and subsequent calls to + * run() will produce nothing. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + * \throw std::invalid_argument if an error is found + * in the program options. + */ + void parseOptions(int argn, char** argv) ; + + /*! + * \brief the path to the bed file. + */ + std::string file_bed ; + /*! + * \brief the path to the bam file. + */ + std::string file_bam ; + /*! + * \brief the path to the bam index file. + */ + std::string file_bai ; + /*! + * \brief the path to the file containing the + * classification posterior probabilities. + */ + std::string file_prob ; + /*! + * \brief a relative coordinate indicating the + * most downstream position to consider around + * each region in the bed file. + */ + int from ; + /*! + * \brief a relative coordinate indicating the + * most upstream position to consider around + * each region in the bed file. + */ + int to ; + /*! 
+ * \brief the number of columns to add to the + * matrix (half of this value on each side). + */ + int ext ; + /*! + * \brief the size of the bin that will be used + * to bin the signal in the regions [from,to] around + * each region in the bed file. + */ + int bin_size ; + /*! + * \brief How to consider the sequenced fragments when computing + * the bin values. + */ + CorrelationMatrixCreator::methods method ; + + /*! + * \brief the number of threads. + */ + size_t n_threads ; + /*! + * \brief whether run() can be called. + */ + bool runnable ; +} ; + +#endif // READMODELEXTENDERAPPLICATION_HPP diff --git a/src/Applications/SequenceModelExtenderApplication.cpp b/src/Applications/SequenceModelExtenderApplication.cpp new file mode 100644 index 0000000..03ba059 --- /dev/null +++ b/src/Applications/SequenceModelExtenderApplication.cpp @@ -0,0 +1,212 @@ +#include + +#include +#include +#include +#include // std::invalid_argument, std::runtime_error + +#include +#include +#include +#include +#include + +namespace po = boost::program_options ; + + +SequenceModelExtenderApplication::SequenceModelExtenderApplication(int argn, char** argv) + : file_bed(""), file_fasta(""), file_prob(""), + from(0), to(0), ext(0), + n_threads(0), runnable(false) +{ this->parseOptions(argn, argv) ; } + +SequenceModelExtenderApplication::~SequenceModelExtenderApplication() +{} + +int SequenceModelExtenderApplication::run() +{ if(this->runnable) + { // extend limits + int ext_right = this->ext/2 ; + int ext_left = this->ext - ext_right ; + this->from -= ext_left ; + this->to += ext_right ; + + // create extended matrix + SequenceMatrixCreator mc(this->file_bed, + this->file_fasta, + this->from, + this->to) ; + Matrix2D data = mc.create_matrix() ; + + // compute model + Matrix4D prob(this->file_prob) ; + if(prob.get_dim()[0] != data.get_nrow()) + { char msg[4096] ; + sprintf(msg, + "Error! 
data matrix and probability matrix have " + "unequal row numbers (%zu and %zu)", + prob.get_dim()[0], + data.get_nrow()) ; + throw std::invalid_argument(msg) ; + } + size_t n_row = prob.get_dim()[0] ; + size_t n_class = prob.get_dim()[1] ; + size_t n_shift = prob.get_dim()[2] ; + size_t n_flip = prob.get_dim()[3] ; + + SequenceModelComputer model_cp(data, prob, this->n_threads) ; + Matrix2D model = model_cp.get_model() ; + + // compute class prob + vector_d class_prob(n_class, 0.) ; + double p_tot = 0. ; + for(size_t i=0; i model_final(model.get_nrow(), + model.get_ncol() + 1) ; + // 1st column contain the class prob + size_t i_class = 0 ; + for(size_t i=0; i(&(this->file_bed)), opt_bed_msg.c_str()) + ("fasta", po::value(&(this->file_fasta)), opt_fasta_msg.c_str()) + ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) + + ("from", po::value(&(this->from)), opt_from_msg.c_str()) + ("to", po::value(&(this->to)), opt_to_msg.c_str()) + ("ext", po::value(&(this->ext)), opt_ext_msg.c_str()) + + ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + + // parse + try + { po::store(po::parse_command_line(argn, argv, desc), vm) ; + po::notify(vm) ; + } + catch(std::invalid_argument& e) + { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; + throw std::invalid_argument(msg) ; + } + catch(...) + { throw std::invalid_argument("An unknown error occured while parsing the options") ; } + + bool help = vm.count("help") ; + + // checks unproper option settings + if(this->file_bed == "" and (not help)) + { std::string msg("Error! No BED file was given (--bed)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_fasta == "" and (not help)) + { std::string msg("Error! No fasta file was given (--fasta)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_prob == "" and (not help)) + { std::string msg("Error! 
No posterior probability file was given (--prob)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from == 0 and this->to == 0 and (not help)) + { std::string msg("Error! No range given (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from >= this->to and (not help)) + { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(ext <= 0 and (not help)) + { std::string msg("Error! the number of columns to add should be > 0 (--ext)!") ; + throw std::invalid_argument(msg) ; + } + + // help invoked, run() cannot be invoked + if(help) + { std::cout << desc << std::endl ; + this->runnable = false ; + return ; + } + // everything fine, run() can be called + else + { this->runnable = true ; + return ; + } +} + +int main(int argn, char** argv) +{ SequenceModelExtenderApplication app(argn, argv) ; + return app.run() ; +} diff --git a/src/Applications/SequenceModelExtenderApplication.hpp b/src/Applications/SequenceModelExtenderApplication.hpp new file mode 100644 index 0000000..6bc00d7 --- /dev/null +++ b/src/Applications/SequenceModelExtenderApplication.hpp @@ -0,0 +1,107 @@ +#ifndef SEQUENCEMODELEXTENDERAPPLICATION_HPP +#define SEQUENCEMODELEXTENDERAPPLICATION_HPP + +#include + +#include +#include + +#include + +/*! + * \brief The SequenceModelExtenderApplication class is a class implementing an + * application to extend a sequence model of length L' (L' = L - S + 1 + * where L is the number of column of the sequence matrix and S the + * shifting freedom allowed during the classification) to a new model + * length L'' = L' + E (E is the number of columns to add to the + * model) given the data matrix and the results of the classification + * (posterior probability matrix). 
+ * To do this, the sequence count matrix from which the original model + * was computed is extended (0.5*E columns on each side) and a model + * is computed using the new matrix and the given posterior probabities. + * The extended model is returned through the stdout. + */ +class SequenceModelExtenderApplication : public ApplicationInterface +{ + public: + SequenceModelExtenderApplication() = delete ; + SequenceModelExtenderApplication(const SequenceModelExtenderApplication& app) = delete ; + /*! + * \brief Constructs an object from the command line + * options. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + */ + SequenceModelExtenderApplication(int argn, char** argv) ; + /*! + * \brief Destructor. + */ + virtual ~SequenceModelExtenderApplication() override ; + /*! + * \brief Runs the application. The data new model + * is computed and displayed through the + * stdout. + * \return the exit code. + */ + virtual int run() override ; + + private: + /*! + * \brief Parses the program command line options and + * sets the object field accordingly. + * If the help option is detected, the "runnable" + * field is set to false and subsequent calls to + * run() will produce nothing. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + * \throw std::invalid_argument if an error is found + * in the program options. + */ + void parseOptions(int argn, char** argv) ; + + /*! + * \brief the path to the bed file. + */ + std::string file_bed ; + /*! + * \brief the path to the fasta file + * containing the sequences. + */ + std::string file_fasta ; + /*! + * \brief the path to the file containing the + * classification posterior probabilities. + */ + std::string file_prob ; + /*! 
+ * \brief a relative coordinate indicating the + * most downstream position to consider around + * each region in the bed file. + */ + int from ; + /*! + * \brief a relative coordinate indicating the + * most upstream position to consider around + * each region in the bed file. + */ + int to ; + /*! + * \brief the number of columns to add to the + * matrix (half of this value on each side). + */ + int ext ; + /*! + * \brief the number of threads. + */ + size_t n_threads ; + /*! + * \brief whether run() can be called. + */ + bool runnable ; +} ; + +#endif // SEQUENCEMODELEXTENDERAPPLICATION_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9c38267..6926729 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,98 +1,122 @@ # compiler options add_compile_options(-std=c++14) add_compile_options(-O3) add_compile_options(-Wall) add_compile_options(-Wextra) add_compile_options(-Werror) add_compile_options(-Wfatal-errors) add_compile_options(-pedantic) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}") add_definitions (${SEQAN_DEFINITIONS}) # include file location -include_directories (${SEQAN_INCLUDE_DIRS}) +include_directories(${Boost_INCLUDE_DIRS}) +include_directories(${SEQAN_INCLUDE_DIRS}) include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/Clustering") include_directories("${scATACseq_SOURCE_DIR}/src/Random") include_directories("${scATACseq_SOURCE_DIR}/src/Parallel") include_directories("${scATACseq_SOURCE_DIR}/src/Statistics") include_directories("${scATACseq_SOURCE_DIR}/src/GUI") include_directories("${scATACseq_SOURCE_DIR}/src/Applications") include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools") include_directories("${scATACseq_SOURCE_DIR}/src/Utility") - # compile modules into static libraries ## set output directory set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib") ## build instructions 
add_library(Clustering "Clustering/DataLayer.cpp" "Clustering/ReadLayer.cpp" "Clustering/SequenceLayer.cpp" - "Clustering/EMEngine.cpp" "Clustering/ModelComputer.cpp" "Clustering/ReadModelComputer.cpp" - "Clustering/SequenceModelComputer.cpp") + "Clustering/SequenceModelComputer.cpp" + "Clustering/EMBase.cpp" + "Clustering/EMRead.cpp" + "Clustering/EMSequence.cpp" + "Clustering/EMJoint.cpp") add_library(Random "Random/Random.cpp" "Random/RandomNumberGenerator.cpp") add_library(Parallel "Parallel/ThreadPool.cpp") add_library(Statistics "Statistics/Statistics.cpp") add_library(GUI "GUI/ConsoleProgressBar.cpp" "GUI/Diplayable.cpp" "GUI/Updatable.cpp") add_library(GenomicTools "GenomicTools/MatrixCreator.cpp" + "GenomicTools/ReadMatrixCreator.cpp" "GenomicTools/CorrelationMatrixCreator.cpp" + "GenomicTools/SequenceMatrixCreator.cpp" "GenomicTools/GenomeRegion.cpp") -add_library(Utility "Utility/matrices.cpp") +add_library(Utility "Utility/matrices.cpp" + "Utility/dna_utility.cpp") ## resolve dependencies -target_link_libraries(Clustering Random Statistics GUI Parallel ${SEQAN_LIBRARIES}) -target_link_libraries(Parallel Threads::Threads) -target_link_libraries(GenomicTools ${SEQAN_LIBRARIES}) +target_link_libraries(Utility ${SEQAN_LIBRARIES}) +target_link_libraries(Clustering Utility Random Statistics GUI Parallel ${SEQAN_LIBRARIES}) +target_link_libraries(Parallel Threads::Threads) +target_link_libraries(GenomicTools Utility ${SEQAN_LIBRARIES}) # executables ## a toy for seqan set(EXE_MAIN_SEQAN "main_seqan") add_executable(${EXE_MAIN_SEQAN} "main_seqan.cpp") target_link_libraries(${EXE_MAIN_SEQAN} ${SEQAN_LIBRARIES} GenomicTools Clustering) set_target_properties(${EXE_MAIN_SEQAN} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## a toy for correlation matrix set(EXE_MAIN_CORMAT "main_cormat") add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp") -target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} GenomicTools) 
+target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} Utility GenomicTools Random) set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") -## a toy for EM usage -set(EXE_MAIN_EM "main_em") -add_executable(${EXE_MAIN_EM} "main_em.cpp") -target_link_libraries(${EXE_MAIN_EM} Clustering Utility) -set_target_properties(${EXE_MAIN_EM} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") -## a 2nd toy for EM usage -set(EXE_MAIN_EM2 "main_em2") -add_executable(${EXE_MAIN_EM2} "main_em2.cpp") -target_link_libraries(${EXE_MAIN_EM2} Clustering Utility) -set_target_properties(${EXE_MAIN_EM2} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an application to create a matrix from BED and a BAM file set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator") add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") -target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Boost::program_options) +target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Utility Boost::program_options) set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") -## an ChIPPartitioning standalone -set(EXE_CHIPPART "ChIPPartitioning") -add_executable(${EXE_CHIPPART} "Applications/ChIPPartitioningApplication.cpp" "Applications/ApplicationInterface.cpp") -target_link_libraries(${EXE_CHIPPART} Clustering Utility Boost::program_options) -set_target_properties(${EXE_CHIPPART} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") -## an executable to compute classes references from the data and the post prob of ChIPPartitioning -set(EXE_PROB2REF "probToModel") +## an application to create a sequence matrix from BED and a fasta file +set(EXE_MAIN_SEQMATRIX "SequenceMatrixCreator") +add_executable(${EXE_MAIN_SEQMATRIX} "Applications/SequenceMatrixCreatorApplication.cpp" 
"Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_MAIN_SEQMATRIX} GenomicTools Utility Boost::program_options) +set_target_properties(${EXE_MAIN_SEQMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an EMRead standalone +set(EXE_EMREAD "EMRead") +add_executable(${EXE_EMREAD} "Applications/EMReadApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_EMREAD} Clustering Utility Boost::program_options) +set_target_properties(${EXE_EMREAD} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an EMSequence standalone +set(EXE_EMSEQ "EMSequence") +add_executable(${EXE_EMSEQ} "Applications/EMSequenceApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_EMSEQ} Clustering Utility Boost::program_options) +set_target_properties(${EXE_EMSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an EMJoint standalone +set(EXE_EMJOINT "EMJoint") +add_executable(${EXE_EMJOINT} "Applications/EMJointApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_EMJOINT} Clustering Utility Boost::program_options) +set_target_properties(${EXE_EMJOINT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an executable to compute data models from the data and the post prob of an EM classification +set(EXE_PROB2REF "ProbToModel") add_executable(${EXE_PROB2REF} "Applications/ProbToModelApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_PROB2REF} Clustering Utility Boost::program_options) set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an executable to extend read models from an EM classification +set(EXE_READMODELEXTENDER "ReadModelExtender") +add_executable(${EXE_READMODELEXTENDER} "Applications/ReadModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") 
+target_link_libraries(${EXE_READMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options) +set_target_properties(${EXE_READMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") + +## an executable to extend read models from an EM classification +set(EXE_SEQUENCEMODELEXTENDER "SequenceModelExtender") +add_executable(${EXE_SEQUENCEMODELEXTENDER} "Applications/SequenceModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_SEQUENCEMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options) +set_target_properties(${EXE_SEQUENCEMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") + ## a test suite set(EXE_TESTS "unittests") add_executable(${EXE_TESTS} "unittests.cpp" "Unittests/unittests_matrix.cpp" "Unittests/unittests_genomictools.cpp") target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools) set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") diff --git a/src/Clustering.old/ClusteringEngine.cpp b/src/Clustering.old/ClusteringEngine.cpp deleted file mode 100644 index fe69e87..0000000 --- a/src/Clustering.old/ClusteringEngine.cpp +++ /dev/null @@ -1,6 +0,0 @@ -#include -#include - -ClusteringEngine::~ClusteringEngine() -{} - diff --git a/src/Clustering.old/ClusteringEngine.hpp b/src/Clustering.old/ClusteringEngine.hpp deleted file mode 100644 index ecfc47b..0000000 --- a/src/Clustering.old/ClusteringEngine.hpp +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef CLUSTERINGENGINE_HPP -#define CLUSTERINGENGINE_HPP - -#include - -/*! - * \brief The ClusteringEngine class is an abstract class providing an interface - * to other classes implementing data clustering methods. - */ -class ClusteringEngine -{ - public: - /*! - * \brief The possible exit codes for the cluster method. 
- * 0 the clustering procedure converged, 1 the clustering - * procedure succeeded without converging, 2 the clustering - * failed. - */ - enum exit_codes {CONVERGENCE=0, SUCCESS, FAILURE, NCODE=3} ; - - public: - /*! - * \brief Destructor. - */ - virtual ~ClusteringEngine() ; - - /*! - * \brief Runs the clustering. - * \return an exit code indicating whether how the clustering - * ended. - */ - virtual exit_codes cluster() = 0 ; - -} ; - -#endif // CLUSTERINGENGINE_HPP diff --git a/src/Clustering.old/EMEngine.cpp b/src/Clustering.old/EMEngine.cpp deleted file mode 100644 index 3b574a1..0000000 --- a/src/Clustering.old/EMEngine.cpp +++ /dev/null @@ -1,807 +0,0 @@ -#include -#include -#include -#include -#include -#include // rand_real_uniform(), rand_int_uniform() -#include // getRandomNumberGenerator() -#include // beta_pmf(), poisson_pmf(), normal_pmf(), sd() -#include // ConsoleProgressBar -#include // ThreadPool -#include // log(), exp(), pow() -#include -#include // numeric_limits -#include // uniform_real, variate_generator -#include // future, promise -#include // move() -#include // bind(), ref() - -#include - -EMEngine::EMEngine(const Matrix2D& data, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - EMEngine::seeding_codes seeding, - const std::string& seed, - size_t n_threads) - : flip(flip), n_iter(n_iter), n_shift(n_shift), n_flip(flip+1), n_class(n_class), - n_row(data.get_nrow()), n_col(data.get_ncol()), l_slice(n_col - n_shift + 1), - seeding_method(seeding), n_threads(n_threads), threads(n_threads) -{ - // initialise random number generator - getRandomGenerator(seed) ; - - // copy the data - this->data = matrix2d_i(this->n_row, v_i(this->n_col)) ; - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_col; j++) - { this->data[i][j] = data(i,j) ; } - } - -} - -EMEngine::~EMEngine() -{ this->threads.join() ; } - -Matrix2D EMEngine::get_references() const -{ - Matrix2D references(this->n_class, this->l_slice, 0.) 
; - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_slice; j++) - { references(i,j) = this->references[i][j] ; } - } - return references ; - -} - -Matrix4D EMEngine::get_posterior_prob() const -{ Matrix4D post_prob(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.) ; - for(size_t i=0; in_row; i++) - { for(size_t k=0; kn_class; k++) - { for(size_t s=0; sn_shift; s++) - { for(size_t f=0; fn_flip; f++) - { post_prob(i,k,s,f) = this->post_prob[i][k][s][f] ; } - } - } - } - return post_prob ; -} - -/* -// this is the naive way, it is exact but results in Nan, -Nan, -Inf, +Inf -// sometimes... -double EMEngine::get_loglikelihood0() const -{ - double ll = 0 ; - for(size_t i=0; in_row; i++) - { double p_tmp = 0. ; - for(size_t j=0; jn_class; j++) - { for(size_t s=0; sn_shift; s++) - { // slice is [from_fw,to) - // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] - // fw |---------->>>----------| - // ----------------------------------> data - // rev |----------<<<----------| [from_dat_rev, to_dat_rev] - // to_dat_rev can be -1 -> int - // to_dat_rev from_dat_rev - - // log likelihood - // --------------- forward --------------- - double lp_fw = 0. ; - int from_dat_fw = s ; - int to_dat_fw = from_dat_fw + this->l_slice - 1 ; - for(int j_dat_fw=from_dat_fw, j_ref_fw=0; - j_dat_fwdata[i][j_dat_fw], - this->references[j][j_ref_fw]* - this->window_mean[i][s])), - EMEngine::p_min_log) ; - lp_fw += lp ; - - p_tmp += exp(lp_fw) * this->class_prob[j][s][flip_states::FORWARD] ; - // --------------- reverse --------------- - if(this->flip) - { double lp_rev = 0. 
; - int from_dat_rev = this->n_col - 1 - s ; - int to_dat_rev = from_dat_rev - (this->l_slice - 1) ; - int shift_rev = this->n_shift - s - 1 ; - for(int j_dat_rev=from_dat_rev, j_ref_fw=0; - j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) - { double lp = std::max(log(poisson_pmf(this->data[i][j_dat_rev], - this->references[j][j_ref_fw]* - this->window_mean[i][shift_rev])), - EMEngine::p_min_log) ; - lp_rev += lp ; - } - - p_tmp += exp(lp_rev) * this->class_prob[j][s][flip_states::REVERSE] ; - } - } - } - ll += log(p_tmp) ; - } - return ll ; -} -*/ - -double EMEngine::get_loglikelihood() const -{ - double ll = 0. ; - - // compute all terms needed - for(size_t i=0; in_row; i++) - { double prob_tmp = 0 ; - for(size_t j=0; jn_class; j++) - { std::vector> v3 ; - for(size_t s=0; sn_shift; s++) - { - // slice is [from_fw,to) - // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] - // fw |---------->>>----------| - // ----------------------------------> data - // rev |----------<<<----------| [from_dat_rev, to_dat_rev] - // to_dat_rev can be -1 -> int - // to_dat_rev from_dat_rev - - // log likelihood - // --------------- forward --------------- - double lp_fw = 0. ; - int from_dat_fw = s ; - int to_dat_fw = from_dat_fw + this->l_slice - 1 ; - for(int j_dat_fw=from_dat_fw, j_ref_fw=0; - j_dat_fwdata[i][j_dat_fw], - this->references[j][j_ref_fw]* - this->window_mean[i][s]), - EMEngine::p_min)) ; - lp_fw += lp ; - } - double p_fw = this->class_prob[j][s][flip_states::FORWARD] ; - v3.push_back(std::make_pair(lp_fw, p_fw)) ; - - // --------------- reverse --------------- - if(this->flip) - { double lp_rev = 0. 
; - int from_dat_rev = this->n_col - 1 - s ; - int to_dat_rev = from_dat_rev - (this->l_slice - 1) ; - int shift_rev = this->n_shift - s - 1 ; - for(int j_dat_rev=from_dat_rev, j_ref_fw=0; - j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) - { double lp = log(std::max(poisson_pmf(this->data[i][j_dat_rev], - this->references[j][j_ref_fw]* - this->window_mean[i][shift_rev]), - EMEngine::p_min)) ; - lp_rev += lp ; - } - double p_rev = this->class_prob[j][s][flip_states::REVERSE] ; - v3.push_back(std::make_pair(lp_rev, p_rev)) ; - } - } - prob_tmp += sum_exp(v3) ; - } - ll += log(prob_tmp) ; - } - return ll ; -} - -double EMEngine::get_aic() const -{ double ll = this->get_loglikelihood() ; - double n_param = ((double) this->n_class * - (double)this->l_slice) + - ((double)this->n_shift * - (double)this->flip+1. * - (double)this->n_class) - 1. ; - // std::cerr << "AIC = " << (2.*n_param) << " - " << ll << std::endl ; - return (2.*n_param) - (2.*ll) ; -} - -ClusteringEngine::exit_codes EMEngine::cluster() -{ size_t bar_update_n = this->n_iter + 1 ; - ConsoleProgressBar bar(std::cerr, bar_update_n, 70, "clustering") ; - - // construct all other required data structures - // mean number of reads per window - this->window_mean = matrix2d_d(this->n_row, v_d(this->n_shift, 0.)) ; - this->compute_window_means() ; - - // the references - this->references = matrix2d_d(this->n_class, - v_d(this->l_slice, 0.)) ; - // log loglikelihood - this->loglikelihood = matrix4d_d(this->n_row, - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - v_d(this->n_flip, 9.)))) ; - this->loglikelihood_max = v_d(this->n_row, 0.) ; - - // posterior prob - this->post_prob = matrix4d_d(this->n_row, - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - v_d(this->n_flip, 0.)))) ; - this->class_prob = matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - v_d(this->n_flip, 0.))) ; - this->class_prob_tot = v_d(this->n_class, 0.) ; - this->post_prob_row = v_d(this->n_row, 0.) 
; - this->post_prob_class = v_d(this->n_class, 0.) ; - this->post_prob_tot = 0. ; - - // seeding - this->seeding(this->seeding_method) ; - bar.update() ; - - // optimize the partition - for(size_t n_iter=0; n_itern_iter; n_iter++) - { - // normalize the references such thjat the mean value, on each - // row, is 1 - this->normalize_references() ; - // E-step - this->compute_loglikelihood() ; - this->compute_post_prob() ; - // M-step - this->compute_class_prob() ; - this->compute_references() ; - this->center_shifts() ; - // bar.update() ; - } - bar.update() ; std::cerr << std::endl ; - return ClusteringEngine::exit_codes::SUCCESS ; -} - -void EMEngine::normalize_references() -{ - for(size_t i=0; in_class; i++) - { double mean = 0. ; - for(size_t j=0; jl_slice; j++) - { mean += this->references[i][j] ; } - mean /= this->l_slice ; - for(size_t j=0; jl_slice; j++) - { this->references[i][j] /= mean ; } - } -} - -void EMEngine::seeding(EMEngine::seeding_codes seeding) -{ - if(seeding == EMEngine::seeding_codes::RANDOM) - { this->seeding_random() ; } - else if(seeding == EMEngine::seeding_codes::SAMPLING) - { this->seeding_sampling() ; } - else if(seeding == EMEngine::seeding_codes::TOY) - { this->seeding_toy() ; } -} - -void EMEngine::seeding_random() -{ - // get random values from a beta distribution cannot be done using boost so - // i) generate random number [0,1] x - // ii) compute f(x) where f is beta distribution - - matrix2d_d prob(this->n_row, v_d(this->n_class, 0.)) ; - v_d prob_class(this->n_class, 0.) ; - double tot_sum = 0. ; - - // sample the prob - // beta distribution parameters - double alpha = pow(this->n_row, -0.5) ; - double beta = 1. ; - for(size_t i=0; in_row; i++) - { double row_sum = 0. 
; - for(size_t j=0; jn_class; j++) - { double x = rand_real_uniform(0., 1.0) ; - double p = std::max(EMEngine::p_min, beta_pmf(x, alpha, beta)) ; - prob[i][j] = p ; - prob_class[j] += p ; - tot_sum += p ; - row_sum += p ; - } - // normalize - for(size_t j=0; jn_class; j++) - { prob[i][j] /= row_sum ; } - } - - // class prob - for(auto& p : prob_class) - { p /= tot_sum ; } - - // compute the refererences - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_slice; j_ref++, j_dat++) - { this->references[j][j_ref] += (this->data[i][j_dat] * prob[i][j]) ; } - } - } - // normalize - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_slice; j++) - { this->references[i][j] ; } - } - - // set the class probabilities to a uniform distribution - double sum = this->n_class * this->n_shift * this->n_flip ; - for(size_t i=0; in_class; i++) - { for(size_t j=0; jn_shift; j++) - { for(size_t k=0; kn_flip; k++) - { this->class_prob[i][j][k] = 1./sum ; } - } - } -} - -void EMEngine::seeding_sampling() -{ - // sample data to initialise the references - std::vector choosen(this->n_row, false) ; - - for(size_t i=0; in_class; ) - { size_t index = rand_int_uniform(size_t(0), size_t(this->n_row-1)) ; - // already choose - if(choosen[index]) - { ; } - // not yet choosen as reference - else - { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_slice; j_ref++, j_dat++) - { this->references[i][j_ref] = this->data[index][j_dat] ; } - choosen[index] = true ; - i++ ; - } - } - - // set the class probabilities to a uniform distribution - double sum = this->n_class * this->n_shift * this->n_flip ; - for(size_t i=0; in_class; i++) - { for(size_t j=0; jn_shift; j++) - { for(size_t k=0; kn_flip; k++) - { this->class_prob[i][j][k] = 1. 
/ sum ; - } - } - } -} - -void EMEngine::seeding_toy() -{ - // sample data to initialise the references - std::vector choosen(this->n_row, false) ; - - for(size_t i=0; in_class; ) - { size_t index = i ; - // already choose - if(choosen[index]) - { ; } - // not yet choosen as reference - else - { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_slice; j_ref++, j_dat++) - { this->references[i][j_ref] = this->data[index][j_dat] ; } - choosen[index] = true ; - i++ ; - } - } - - // set the class probabilities to a uniform distribution - double sum = this->n_class * this->n_shift * this->n_flip ; - for(size_t i=0; in_class; i++) - { for(size_t j=0; jn_shift; j++) - { for(size_t k=0; kn_flip; k++) - { this->class_prob[i][j][k] = 1./sum ; } - } - } -} - -void EMEngine::compute_window_means() -{ // compute the slices on which each thread will work - std::vector> slices = - ThreadPool::split_range(0, this->n_row, this->n_threads) ; - - // get promises and futures - // the function run by the threads will simply fill the promise with - // "true" to indicate that they are done - std::vector> promises(this->n_threads) ; - std::vector> futures(this->n_threads) ; - for(size_t i=0; in_threads; i++) - { futures[i] = promises[i].get_future() ; } - - // distribute work to threads - // -------------------------- threads start -------------------------- - for(size_t i=0; in_threads; i++) - { auto slice = slices[i] ; - this->threads.addJob(std::move( - std::bind(&EMEngine::compute_window_means_routine, - this, - slice.first, - slice.second, - std::ref(promises[i])))) ; - } - // wait until all threads are done working - for(auto& future : futures) - { future.get() ; } - // -------------------------- threads stop --------------------------- -} - -void EMEngine::compute_window_means_routine(size_t from, - size_t to, - std::promise& done) -{ - double l_slice = double(this->l_slice) ; - for(size_t i=from; in_shift; from++) - { double sum = 0. 
; - // slice is [from,to) - size_t to = from + this->l_slice ; - for(size_t j=from; jdata[i][j] ;} - this->window_mean[i][from] = sum / l_slice ; - } - } - done.set_value(true) ; -} - -void EMEngine::compute_loglikelihood() -{ - // compute the slices on which each thread will work - std::vector> slices = - ThreadPool::split_range(0, this->n_row, this->n_threads) ; - - // get promises and futures - // the function run by the threads will simply fill the promise with - // "true" to indicate that they are done - std::vector> promises(this->n_threads) ; - std::vector> futures(this->n_threads) ; - for(size_t i=0; in_threads; i++) - { futures[i] = promises[i].get_future() ; } - - // distribute work to threads - // -------------------------- threads start -------------------------- - for(size_t i=0; in_threads; i++) - { auto slice = slices[i] ; - this->threads.addJob(std::move( - std::bind(&EMEngine::compute_loglikelihood_routine, - this, - slice.first, - slice.second, - std::ref(promises[i])))) ; - } - // wait until all threads are done working - for(auto& future : futures) - { future.get() ; } - // -------------------------- threads stop --------------------------- -} - -void EMEngine::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) -{ - // access in writing - // this->loglikelihood -> only access the i-th which belong [from,to) - // this->loglikelihood_max -> only access the i-th which belong [from,to) - - for(size_t i=from; iloglikelihood_max[i] = std::numeric_limits::lowest() ; - - for(size_t j=0; jn_class; j++) - { for(size_t s_fw=0, s_rev=this->n_shift-1; - s_fwn_shift; s_fw++, s_rev--) - { // slice is [from_fw,to) - // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] - // fw |---------->>>----------| - // ----------------------------------> data - // rev |----------<<<----------| [from_dat_rev, to_dat_rev] - // to_dat_rev can be -1 -> int - // to_dat_rev from_dat_rev - - // log likelihood - double ll_fw = 0. ; - double ll_rev = 0. 
; - // --------------- forward --------------- - size_t from_dat_fw = s_fw ; - size_t to_dat_fw = from_dat_fw + this->l_slice - 1 ; - // --------------- reverse --------------- - size_t from_dat_rev = this->n_col - 1 - s_fw ; - // size_t to_dat_rev = from_dat_rev - (this->l_slice - 1) ; - - for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev; - j_dat_fwdata[i][j_dat_fw], - this->references[j][j_ref_fw]* - this->window_mean[i][s_fw])) ; - ll_fw += std::max(ll, EMEngine::p_min_log) ; - // --------------- reverse --------------- - if(this->flip) - { ll = log(poisson_pmf(this->data[i][j_dat_rev], - this->references[j][j_ref_fw]* - this->window_mean[i][s_rev])) ; - ll_rev += std::max(ll, EMEngine::p_min_log) ; - } - } - this->loglikelihood[i][j][from_dat_fw][flip_states::FORWARD] = ll_fw ; - // keep track of the max per row - if(ll_fw > this->loglikelihood_max[i]) - { this->loglikelihood_max[i] = ll_fw ; } - - if(this->flip) - { this->loglikelihood[i][j][from_dat_fw][flip_states::REVERSE] = ll_rev ; - // keep track of the max per row - if(ll_rev > this->loglikelihood_max[i]) - { this->loglikelihood_max[i] = ll_rev ; } - } - } - } - } - // fill the promise to indicate that the function exited - done.set_value(true) ; -} - -void EMEngine::compute_post_prob() -{ - // compute the slices on which each thread will work - std::vector> slices = - ThreadPool::split_range(0, this->n_row, this->n_threads) ; - - // get promises and futures - // the function run by the threads will compute - // the partial sum per class of post_prob for the given slice - // this should be used to compute the complete sum of post_prob - // and the complete sum per class of post_prob - std::vector> promises(this->n_threads) ; - std::vector> futures(this->n_threads) ; - for(size_t i=0; in_threads; i++) - { futures[i] = promises[i].get_future() ; } - - // distribute work to threads - // -------------------------- threads start -------------------------- - for(size_t i=0; in_threads; i++) 
- { auto slice = slices[i] ; - this->threads.addJob(std::move( - std::bind(&EMEngine::compute_post_prob_routine, - this, - slice.first, - slice.second, - std::ref(promises[i])))) ; - } - // wait until all threads are done working - // compute the sum of post prob and the per class sum of post prob - // from the partial results computed on each slice - this->post_prob_tot = 0. ; - this->post_prob_class = v_d(this->n_class, 0.) ; - for(auto& future : futures) - { auto probs = future.get() ; - for(size_t i=0; in_class; i++) - { double prob = probs[i] ; - this->post_prob_class[i] += prob ; - this->post_prob_tot += prob ; - } - } - // -------------------------- threads stop --------------------------- -} - -void EMEngine::compute_post_prob_routine(size_t from, - size_t to, - std::promise& done) -{ - // this->post_prob_row -> only access the i-th which belong [from,to) - // this->post_prob -> only access the i-th which belong [from,to) - - // some values that needs to be returned - // the total of the posterior prob for this slice of the data - // the total per class of posterior prob for this slice of the data - v_d post_prob_class(this->n_class, 0.) ; - - for(size_t i=from; ipost_prob_row[i] = 0. 
; - - for(size_t n_class=0; n_classn_class; n_class++) - { for(size_t n_shift=0; n_shiftn_shift; n_shift++) - { for(size_t n_flip=0; n_flipn_flip; n_flip++) - { /* - double p = exp(this->loglikelihood[i][n_class][n_shift][n_flip] - - this->loglikelihood_max[i]) * - this->class_prob[n_class][n_shift][n_flip] ; - */ - double p = std::max(exp(this->loglikelihood[i][n_class][n_shift][n_flip] - - this->loglikelihood_max[i]) * - this->class_prob[n_class][n_shift][n_flip], - EMEngine::p_min) ; - this->post_prob[i][n_class][n_shift][n_flip] = p ; - this->post_prob_row[i] += p ; - } - } - } - // normalize - for(size_t n_class=0; n_classn_class; n_class++) - { for(size_t n_shift=0; n_shiftn_shift; n_shift++) - { for(size_t n_flip=0; n_flipn_flip; n_flip++) - { this->post_prob[i][n_class][n_shift][n_flip] /= - this->post_prob_row[i] ; - double p = this->post_prob[i][n_class][n_shift][n_flip] ; - post_prob_class[n_class] += p ; - } - } - } - } - - done.set_value(post_prob_class) ; -} - -void EMEngine::compute_class_prob() -{ - for(size_t n_class=0; n_classn_class; n_class++) - { // reset total - this->class_prob_tot[n_class] = 0. ; - for(size_t n_shift=0; n_shiftn_shift; n_shift++) - { for(size_t flip=0; flipn_flip; flip++) - { // sum - this->class_prob[n_class][n_shift][flip] = 0. 
; - for(size_t i=0; in_row; i++) - { this->class_prob[n_class][n_shift][flip] += - this->post_prob[i][n_class][n_shift][flip] ; - } - // normalize - this->class_prob[n_class][n_shift][flip] /= this->post_prob_tot ; - this->class_prob_tot[n_class] += this->class_prob[n_class][n_shift][flip] ; - } - } - } -} - -void EMEngine::compute_references() -{ - // compute the slices on which each thread will work - std::vector> slices = - ThreadPool::split_range(0, this->n_row, this->n_threads) ; - - // get promises and futures - // the function run by the threads will compute - // the reference from the given slice - std::vector> promises(this->n_threads) ; - std::vector> futures(this->n_threads) ; - for(size_t i=0; in_threads; i++) - { futures[i] = promises[i].get_future() ; } - - // distribute work to threads - // -------------------------- threads start -------------------------- - for(size_t i=0; in_threads; i++) - { auto& slice = slices[i] ; - this->threads.addJob(std::move( - std::bind(&EMEngine::compute_references_routine, - this, - slice.first, - slice.second, - std::ref(promises[i])))) ; - } - // while threads are working, reset the references - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_slice; j++) - { this->references[i][j] = 0. 
; } - } - // wait until all threads are done working - // sum the partial class references to get the complete ones - for(size_t n=0; nn_threads; n++) - { matrix2d_d reference = futures[n].get() ; - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_slice; j++) - { this->references[i][j] += reference[i][j] ; } - } - } - // -------------------------- threads stop --------------------------- -} - -void EMEngine::compute_references_routine(size_t from, size_t to, std::promise& references) -{ // the empty references - matrix2d_d ref(this->n_class, v_d(this->l_slice, 0.)) ; - - for(size_t n_class=0; n_class < this->n_class; n_class++) - { - for(size_t i=from; in_shift; n_shift++) - { // --------------- forward --------------- - int from_dat_fw = n_shift ; - int to_dat_fw = from_dat_fw + this->l_slice - 1 ; - for(int j_dat_fw=from_dat_fw, j_ref_fw=0; - j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++) - { ref[n_class][j_ref_fw] += - (this->post_prob[i][n_class][n_shift][flip_states::FORWARD] * this->data[i][j_dat_fw]) / - this->post_prob_class[n_class] ; - } - // --------------- reverse --------------- - if(this->flip) - { int from_dat_rev = this->n_col - 1 - n_shift ; - int to_dat_rev = from_dat_rev - (this->l_slice - 1) ; - for(int j_dat_rev=from_dat_rev, j_ref_fw=0; - j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) - { ref[n_class][j_ref_fw] += - (this->post_prob[i][n_class][n_shift][flip_states::REVERSE] * this->data[i][j_dat_rev]) / - this->post_prob_class[n_class] ; - } - } - } - } - } - references.set_value(ref) ; -} - -void EMEngine::center_shifts() -{ - if(this->n_shift == 1) - { return ; } - - // the possible shift states - std::vector shifts(this->n_shift) ; - std::iota(shifts.begin(), shifts.end(), 1.) ; - - // the shift probabilities and the class probabilies (no need to norm., class_prob sums to 1) - double shifts_prob_measured_tot = 0. 
; - std::vector shifts_prob_measured(this->n_shift) ; - for(size_t s=0; sn_shift; s++) - { for(size_t k=0; kn_class; k++) - { for(size_t f=0; fn_flip; f++) - { shifts_prob_measured[s] += this->class_prob[k][s][f] ; - shifts_prob_measured_tot += this->class_prob[k][s][f] ; - } - } - } - - - // the shift mean and (biased) standard deviation - double shifts_sd = sd(shifts, shifts_prob_measured, false) ; - - // the shift probabilities under the assumption that is distributed as a gaussian centered on - // the central shift state with sd and mean as in the data - // sd as the data - std::vector shifts_prob_centered(shifts.size(), 0.) ; - double shifts_prob_centered_tot = 0. ; - for(size_t i=0; in_shift/2)+1, shifts_sd) ; - shifts_prob_centered_tot += shifts_prob_centered[i] ; - } - - for(size_t k=0; kn_class; k++) - { for(size_t f=0; fn_flip; f++) - { for(size_t s=0; sn_shift; s++) - { this->class_prob[k][s][f] = this->class_prob_tot[k] * shifts_prob_centered[s] / - (this->n_flip * shifts_prob_centered_tot) ; - } - } - } - - // shifts_prob_measured_tot = 0. ; - shifts_prob_measured.clear() ; - shifts_prob_measured.resize(this->n_shift) ; - for(size_t s=0; sn_shift; s++) - { for(size_t k=0; kn_class; k++) - { for(size_t f=0; fn_flip; f++) - { shifts_prob_measured[s] += this->class_prob[k][s][f] ; - } - } - } -} - -const double EMEngine::p_min = 1e-100 ; -const double EMEngine::p_min_log = log(EMEngine::p_min) ; - -#include - -double sum_exp(const std::vector>& v) -{ - double result = 0. 
; - // double max = *std::max_element(lp.begin(), lp.end()) ; - - double max = std::numeric_limits::lowest() ; - for(const auto& i : v) - { if(i.first > max) - { max = i.first ; } - } - - // sum - for(const auto& i : v) - { result += (exp(i.first - max))*i.second ; } - result *= exp(max) ; - - return result ; -} diff --git a/src/Clustering.old/EMEngine.hpp b/src/Clustering.old/EMEngine.hpp deleted file mode 100644 index d4087cf..0000000 --- a/src/Clustering.old/EMEngine.hpp +++ /dev/null @@ -1,363 +0,0 @@ -#ifndef EMENGINE_HPP -#define EMENGINE_HPP - -#include -#include -#include -#include -#include -#include -#include // promise, future - -// some typdef -#include - - -/*! - * \brief This class implements the iterative expectation - * maximization classification procedure described in Nair - * et al. 2014, Bioinformatics. - * The classification procedure performs a probabilistic - * partitioning of genomic regions, based on the distribution - * of the reads over the regions. - * To mitigate a miss-alignment of the signal in the different - * regions - that is a same signal strech is present in two - * regions but at different offsets - the classification - * procedure can search protypic signals shorter than a whole - * region, at each possible offset over the region (named - * shift). - * To mitigate an inversion of the signal in the different regions - * - that is a same signal strech is present in two regions but in - * reverse orientation - the classification procedure can search - * protypic signals in both orientation. - */ -class EMEngine : public ClusteringEngine -{ - static const double p_min ; - static const double p_min_log ; - - public: - /*! - * \brief The possible seeding strategies. - */ - enum seeding_codes {RANDOM=0, SAMPLING, TOY} ; - - /*! - * \brief The possible flip states. - */ - enum flip_states{FORWARD=0, REVERSE} ; - - public: - /*! - * \brief Constructs an object. - * \param data the data to classify. 
- * \param n_class the number of signal classes to search. - * \param n_iter the number of iterations. - * \param n_shift the shifting freedom. 1 means no shift. - * \param flip whether flipping is allowed. - * \param n_threads the number of threads dedicated to the - * computations. - */ - EMEngine(const Matrix2D& data, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - seeding_codes seeding, - const std::string& seed=std::string(""), - size_t n_threads=1) ; - - /*! - * \brief Destructor. - */ - virtual ~EMEngine() override ; - - /*! - * \brief Returns a matrix with the class class references - * (protypic signal), on each row. - * \return a matrix containing the class references, on - * each row. - */ - virtual Matrix2D get_references() const ; - - /*! - * \brief Returns a matrix with the posterior probabilies - * with the dimensions representing the data, classes, shifts - * and flips respectively. - * \return a matrix containing the posterior probabilities. - */ - virtual Matrix4D get_posterior_prob() const ; - - /*! - * \brief Returns the likelihood of the partition. - * \return the likelihood of the partition. - */ - virtual double get_loglikelihood() const ; - - /*! - * \brief Returns the Akaike Information Criterion (AIC) - * for the given partition. - * The AIC is 2n - 2LL where is the number of - * free parameters in the model and LL the log - * likelihood of the partition. - * \return the partition AIC. - */ - virtual double get_aic() const ; - - /*! - * \brief Runs the data clustering. - * \return - */ - virtual ClusteringEngine::exit_codes cluster() override ; - - protected: - /*! - * \brief Default constructor. - */ - EMEngine() = default ; - - /*! - * \brief Sets each class protypic signal to 1 count, - * in average. - */ - virtual void normalize_references() ; - - /*! - * \brief Initialises the references using the corresponding - * method. - * \param seeding the method to use. 
- */ - virtual void seeding(seeding_codes seeding) ; - - /*! - * \brief Initialises the references randomly. - * Generates the initial references by randomly assigning - * the data to the classes using a beta distribution and - * all classes are set equally likely. - */ - virtual void seeding_random() ; - - /*! - * \brief Initialises the K references by randomly - * sampling K rows in the data. The class are set - * equally probable. - */ - virtual void seeding_sampling() ; - - /*! - * \brief Initialises the K references using the first K - * rows in data. The class are set equally probable. - */ - virtual void seeding_toy() ; - - /*! - * \brief Computes the mean number of reads present in - * each slice (of length ncol - shift + 1), in each row - * of the data and store them in this->window_mean. - */ - virtual void compute_window_means() ; - - /*! - * \brief The routine that effectively computes the mean - * number of reads present in each slice, for the range - * [from,to) of rows in the data. - * This function is thread safe only as long as different - * [from,to) slices are given to the different threads. - * \param from the index of the first row to treat. - * \param to the index of the past last row to treat. - * \param done a promise filled when the function is done - * working. This allows to synchronize threads. - */ - virtual void compute_window_means_routine(size_t from, - size_t to, - std::promise& done) ; - - /*! - * \brief Computes the data log likelihood given the - * current class protypic signals. - */ - virtual void compute_loglikelihood() ; - - /*! - * \brief The routine that effectively computes the - * log likelihoods for the range [from,to) of rows - * in the data. This function is used to distribute - * the log likelihood computations over several threads. - * This function is thread safe only as long as - * different [from,to) slices are given to the different - * threads. - * \param from the index of the first row to treat. 
- * \param to the index of the past last row to treat. - * \param done a promise filled when the function is - * done working. This allows to synchronize threads. - */ - virtual void compute_loglikelihood_routine(size_t from, - size_t to, - std::promise& done) ; - - /*! - * \brief Computes the data posterior probabilties. - */ - virtual void compute_post_prob() ; - - /*! - * \brief The routine that effectively computes the - * posterior probabilities for the range [from,to) of - * rows in the data. This function is used to distribute - * the posterior probability computations over several - * threads. This function is thread safe only as long - * as different [from,to) slices are given to the - * differentthreads. - * \param from the index of the first row to treat. - * \param to the index of the past last row to treat. - * \param probs a promise containing a vector with the - * sum of the posterior probability, for each class, - * computed for the given slice. - */ - virtual void compute_post_prob_routine(size_t from, - size_t to, - std::promise& probs) ; - - /*! - * \brief Computes the class probabilities from the - * posterior probabilities. - */ - virtual void compute_class_prob() ; - - /*! - * \brief Computes the class aggregations given the - * posterior probabilities. - */ - virtual void compute_references() ; - - /*! - * \brief A routine that computes the partial class - * references for the range [from,to) of rows in the - * data. To obtain the full class references, it is - * required to 1) run this routine on the whole data - * at once or 2) run it on different slices and - * sum up the partial references obtained. This function - * is used to distribute the posterior probability - * computations over several threads. This function is - * thread safe only as long as different [from,to) slices - * are given to the different threads. - * \param from the index of the first row to treat. - * \param to the index of the past last row to treat. 
- * \param class_ref a promise containing a matrix with the - * partial class references on each row. - */ - virtual void compute_references_routine(size_t from, - size_t to, - std::promise& class_ref) ; - - /*! - * \brief Modifies the class probabilities in such a - * way that the shift probabilities are then normaly - * distributed, centered on the middle shift state. - * However, the overall class probabilities remain - * unchanged. - */ - virtual void center_shifts() ; - - protected: - /*! - * \brief whether flip is enabled. - */ - bool flip ; - /*! - * \brief the number of iterations. - */ - size_t n_iter ; - /*! - * \brief the number of shift states. - */ - size_t n_shift ; - /*! - * \brief the number of flip states. - */ - size_t n_flip ; - /*! - * \brief the number of classes. - */ - size_t n_class ; - - /*! - * \brief the data. - */ - matrix2d_i data ; - /*! - * \brief the mean number of reads per window in the - * data. - */ - matrix2d_d window_mean ; - /*! - * \brief the class aggregation signal. - */ - matrix2d_d references ; - /*! - * \brief the log likelihoods. - */ - matrix4d_d loglikelihood ; - /*! - * \brief the max log likelihood value for each row. - */ - v_d loglikelihood_max ; - /*! - * \brief the posterior probabilities. - */ - matrix4d_d post_prob ; - /*! - * \brief the class probabilities. - */ - matrix3d_d class_prob ; - /*! - * \brief the total prob per class. - */ - v_d class_prob_tot ; - - /*! - * \brief the sum per row of post_prob. - */ - v_d post_prob_row ; - /*! - * \brief the sum per class of post_prob. - */ - v_d post_prob_class ; - /*! - * \brief the total of post_prob. - */ - double post_prob_tot ; - - /*! - * \brief the number of rows in data. - */ - size_t n_row ; - /*! - * \brief the number of columns in data. - */ - size_t n_col ; - /*! - * \brief the size of the pattern search and of - * the scanning window in the data. - */ - size_t l_slice ; - - /*! - * \brief the seeding method to use. 
- */ - EMEngine::seeding_codes seeding_method ; - - /*! - * \brief the number of threads. - */ - size_t n_threads ; - /*! - * \brief the threads. - */ - ThreadPool threads ; -} ; - - -double sum_exp(const std::vector>& v) ; - -#endif // EMENGINE_HPP diff --git a/src/Clustering.old/ReferenceComputer.cpp b/src/Clustering.old/ReferenceComputer.cpp deleted file mode 100644 index 352da39..0000000 --- a/src/Clustering.old/ReferenceComputer.cpp +++ /dev/null @@ -1,79 +0,0 @@ -#include - -#include -#include - -// some typdef -#include - - -ReferenceComputer::ReferenceComputer(const Matrix2D& data, - const Matrix4D& posterior_prob, - size_t n_threads) - : EMEngine(data, - posterior_prob.get_dim()[1], - 1, - posterior_prob.get_dim()[2], - posterior_prob.get_dim()[3] == 2, - EMEngine::seeding_codes::RANDOM, - "", - n_threads) -{ - // copy the data - this->data = matrix2d_i(this->n_row, v_i(this->n_col)) ; - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_col; j++) - { this->data[i][j] = data(i,j) ; } - } - - // compute window means - this->window_mean = matrix2d_d(this->n_row, v_d(this->n_shift, 0.)) ; - this->compute_window_means() ; - - // initialise, copy and compute probs - this->post_prob = matrix4d_d(this->n_row, - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - v_d(this->n_flip, 0.)))) ; - this->class_prob = matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - v_d(this->n_flip, 0.))) ; - this->class_prob_tot = v_d(this->n_class, 0.) ; - this->post_prob_class = v_d(this->n_class, 0.) 
; - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t s=0; sn_shift; s++) - { for(size_t f=0; fn_flip; f++) - { double p = posterior_prob(i,j,s,f) ; - this->post_prob[i][j][s][f] = p ; - this->post_prob_class[j] += p ; - this->post_prob_tot += p ; - } - } - } - } - this->compute_class_prob() ; - - // compute the references - this->references = matrix2d_d(this->n_class, - v_d(this->l_slice, 0.)) ; - this->compute_references() ; -} - -ReferenceComputer::~ReferenceComputer() -{ ; } - -Matrix2D ReferenceComputer::get_references() const -{ - // add a 1st column with the class probabilities - Matrix2D references(this->n_class, this->l_slice+1, 0.) ; - for(size_t i=0; in_class; i++) - { // class prob - references(i,0) = this->class_prob_tot[i] ; - // signal - for(size_t j=0; jl_slice; j++) - { references(i,j+1) = this->references[i][j] ; } - } - return references ; -} - diff --git a/src/Clustering.old/ReferenceComputer.hpp b/src/Clustering.old/ReferenceComputer.hpp deleted file mode 100644 index bfaaa85..0000000 --- a/src/Clustering.old/ReferenceComputer.hpp +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef REFERENCECOMPUTER_HPP -#define REFERENCECOMPUTER_HPP - -#include - -#include -#include - -/*! - * \brief The ReferenceComputer class is a wrapper around the - * EMEngine class that allows to compute the class references - * given the posterior probability matrix and the data without - * having to re-run the data classification. - * - * This class is typically made to be used in conjunction with - * an EMEngine instance, using the following pattern : - * - * Matrix2D data = ... ; - * EMEngine em(data, ...) ; - * em.cluster() ; - * auto prob = em.get_posterior_prob() ; - * auto obj = ReferenceComputer(data, prob, ...) ; - * auto ll = obj.get_loglikelihood() ; - * auto ref = obj.get_references() ; - */ -class ReferenceComputer : public EMEngine -{ - public: - - ReferenceComputer() = delete ; - - /*! 
- * \brief Constructs an obect and computes the references. - * \param the data for which the classification probabilities - * have been generated. - * \param the classification probabilities for the given data, as - * return by an EMEngine instance (see above). - * \param n_threads the number of threads dedicated to the - * computations. - */ - ReferenceComputer(const Matrix2D& data, - const Matrix4D& posterior_prob, - size_t n_threads) ; - - /*! - * \brief Destructor. - */ - virtual ~ReferenceComputer() override ; - - /*! - * \brief Returns a matrix with the class class references - * (protypic signal), on each row. - * The 1st column contains the class probability, the - * following ones the class signal. - * \return a matrix containing the class references and their - * probabalities, on each row. - */ - virtual Matrix2D get_references() const override ; - - // removes the following methods from the public interface to restrict it - private: - using EMEngine::cluster ; - -} ; - - -#endif // REFERENCECOMPUTER_HPP diff --git a/src/Clustering.old/typedef.hpp b/src/Clustering.old/typedef.hpp deleted file mode 100644 index 4d3e91a..0000000 --- a/src/Clustering.old/typedef.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef TYPEDEFCLUSTERING_HPP -#define TYPEDEFCLUSTERING_HPP - -#include // std::vector -#include // std::pair - -typedef std::vector v_i ; -typedef std::vector v_d ; -typedef std::vector matrix2d_i ; -typedef std::vector matrix2d_d ; -typedef std::vector matrix3d_d ; -typedef std::vector matrix4d_d ; - -typedef std::vector> v_pair ; - -#endif // TYPEDEFCLUSTERING_HPP diff --git a/src/Clustering/DataLayer.cpp b/src/Clustering/DataLayer.cpp index 21340b2..e8ea814 100644 --- a/src/Clustering/DataLayer.cpp +++ b/src/Clustering/DataLayer.cpp @@ -1,142 +1,144 @@ #include #include // std::invalid_argument #include // log() -#include +#include +#include +#include #include DataLayer::DataLayer() {} -DataLayer::DataLayer(const matrix2d_i& data, +DataLayer::DataLayer(const 
Matrix2D& data, size_t n_class, size_t n_shift, bool flip) :data(data), flip(flip), - n_row(data.size()), - n_col(data[0].size()), + n_row(data.get_nrow()), + n_col(data.get_ncol()), n_class(n_class), l_model(n_col - n_shift + 1), n_shift(n_shift), n_flip(flip + 1) { // models cannot be initialise here // as the number of categories depend // on the exact class } -DataLayer::DataLayer(const matrix2d_i& data, - const matrix3d_d& model, +DataLayer::DataLayer(const Matrix2D& data, + const Matrix3D& model, bool flip) : data(data), model(model), flip(flip), - n_row(data.size()), - n_col(data[0].size()), - n_class(model.size()), - l_model(model[0].size()), - n_category(model[0][0].size()), + n_row(data.get_nrow()), + n_col(data.get_ncol()), + n_class(model.get_dim()[0]), + l_model(model.get_dim()[1]), + n_category(model.get_dim()[2]), n_shift(n_col - l_model + 1), n_flip(flip + 1) { // check if model is not too long if(this->n_col < this->l_model) { char msg[4096] ; sprintf(msg, "Error! model is longer than data : %zu / %zu", this->l_model, this->n_col) ; throw std::invalid_argument(msg) ; } this->n_shift = this->n_col - this->l_model + 1 ; } DataLayer::~DataLayer() {} -matrix3d_d DataLayer::get_model() const +Matrix3D DataLayer::get_model() const { return this->model ; } -void DataLayer::check_loglikelihood_dim(const matrix4d_d& loglikelihood) const -{ if(loglikelihood.size() != this->n_row) +void DataLayer::check_loglikelihood_dim(const Matrix4D& loglikelihood) const +{ if(loglikelihood.get_dim()[0] != this->n_row) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 1st dimension is not " "equal to data row number : %zu / %zu", - loglikelihood.size(), this->n_row) ; + loglikelihood.get_dim()[0], this->n_row) ; throw std::invalid_argument(msg) ; } - else if(loglikelihood[0].size() != this->n_class) + else if(loglikelihood.get_dim()[1] != this->n_class) { char msg[4096] ; sprintf(msg, "Error! 
loglikelihood matrix 2nd dimension is not " "equal to model class number : %zu / %zu", - loglikelihood[0].size(), this->n_class) ; + loglikelihood.get_dim()[1], this->n_class) ; throw std::invalid_argument(msg) ; } - else if(loglikelihood[0][0].size() != this->n_shift) + else if(loglikelihood.get_dim()[2] != this->n_shift) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 3rd dimension is not " "equal to model shift state number : %zu / %zu", - loglikelihood[0][0].size(), this->n_shift) ; + loglikelihood.get_dim()[2], this->n_shift) ; throw std::invalid_argument(msg) ; } - else if(loglikelihood[0][0][0].size() != this->n_flip) + else if(loglikelihood.get_dim()[3] != this->n_flip) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 4th dimension is not " "equal to model flip state number : %zu / %zu", - loglikelihood[0][0][0].size(), this->n_flip) ; + loglikelihood.get_dim()[3], this->n_flip) ; throw std::invalid_argument(msg) ; } } void DataLayer::check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const { if(loglikelihood_max.size() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! loglikelihood_max length is not " "equal to data row number : %zu / %zu", loglikelihood_max.size(), this->n_flip) ; throw std::invalid_argument(msg) ; } } -void DataLayer::check_posterior_prob_dim(const matrix4d_d& posterior_prob) const -{ if(posterior_prob.size() != this->n_row) +void DataLayer::check_posterior_prob_dim(const Matrix4D& posterior_prob) const +{ if(posterior_prob.get_dim()[0] != this->n_row) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 1st dimension is not " "equal to data row number : %zu / %zu", - posterior_prob.size(), this->n_row) ; + posterior_prob.get_dim()[0], this->n_row) ; throw std::invalid_argument(msg) ; } - else if(posterior_prob[0].size() != this->n_class) + else if(posterior_prob.get_dim()[1] != this->n_class) { char msg[4096] ; sprintf(msg, "Error! 
posterior_prob matrix 2nd dimension is not " "equal to model class number : %zu / %zu", - posterior_prob[0].size(), this->n_class) ; + posterior_prob.get_dim()[1], this->n_class) ; throw std::invalid_argument(msg) ; } - else if(posterior_prob[0][0].size() != this->n_shift) + else if(posterior_prob.get_dim()[2] != this->n_shift) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 3rd dimension is not " "equal to model shift state number : %zu / %zu", - posterior_prob[0][0].size(), this->n_shift) ; + posterior_prob.get_dim()[2], this->n_shift) ; throw std::invalid_argument(msg) ; } - else if(posterior_prob[0][0][0].size() != this->n_flip) + else if(posterior_prob.get_dim()[3] != this->n_flip) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 4th dimension is not " "equal to model flip state number : %zu / %zu", - posterior_prob[0][0][0].size(), this->n_flip) ; + posterior_prob.get_dim()[3], this->n_flip) ; throw std::invalid_argument(msg) ; } } const double DataLayer::p_min = 1e-100 ; const double DataLayer::p_min_log = log(DataLayer::p_min) ; diff --git a/src/Clustering/DataLayer.hpp b/src/Clustering/DataLayer.hpp index cde2156..a2bb6a4 100644 --- a/src/Clustering/DataLayer.hpp +++ b/src/Clustering/DataLayer.hpp @@ -1,239 +1,224 @@ #ifndef DATALAYER_HPP #define DATALAYER_HPP #include #include // std::promise, std::future -#include +#include +#include +#include #include +typedef std::vector vector_d ; + /*! * \brief The DataLayer class define the basic design * to handle probabilistic models together with * their data. * A DataLayer is made of two parts : * 1) a data matrix * 2) a model * The model contains the parameters of a probabilistic * model with one or more classes that fits the data. * The data likelihood given the model can be computed * and the model can be updated given a set of * posterior probabilities representing the data * assignments to the different classes. */ class DataLayer { public: /*! 
* \brief the smallest acceptable probability * for computations. */ static const double p_min ; /*! * \brief the log of the smallest probability. */ static const double p_min_log ; /*! * \brief The possible flip states. */ enum flip_states{FORWARD=0, REVERSE} ; /*! * \brief Default constructor. */ DataLayer() ; /*! * \brief Constructs an object with the * given data. * An empty model is not initialised yet * as the model number of categories * depends on the final class. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. */ - DataLayer(const matrix2d_i& data, + DataLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip) ; /*! * \brief Constructs an object with the * given data and model. * The model dimensions set the number of * classes and the shifting freedom. * \param data the data. * \param the model. * \param flip whether flipping is allowed. */ - DataLayer(const matrix2d_i& data, - const matrix3d_d& model, + DataLayer(const Matrix2D& data, + const Matrix3D& model, bool flip) ; /*! * \brief Destructor. */ virtual ~DataLayer() ; - /*! - * \brief Sets the model values randomly. - */ - virtual void seed_model_randomly() = 0 ; - - /*! - * \brief Sets the model values by - * sampling rows in the data and - * assigning them as initial model - * values. - */ - virtual void seed_model_sampling() = 0 ; - - /*! - * \brief Sets the model values by - * using the first n_class rows in data. - */ - virtual void seed_model_toy() = 0 ; - /*! * \brief Computes the log likelihood of the data * given the current model parameters. * \param loglikelihood a matrix to store the * results. 
It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ - virtual void compute_loglikelihoods(matrix4d_d& loglikelihood, + virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const = 0 ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ - virtual void update_model(const matrix4d_d& posterior_prob, + virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) = 0 ; /*! * \brief Returns a copy of the current model. * \return the current model. * 1st dim : the number of classes * 2nd dim : the model length * 3rd dim : the number of value categories. */ - virtual matrix3d_d get_model() const ; + virtual Matrix3D get_model() const ; protected: /*! * \brief Checks the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param logliklihood a matrix to store the * results. 
It should have the following dimensions : * 1st : same as the data row number * 2nd : same as the model class number * 3rd : same as the shift state number * 4th : same as the flip state number * \throw std::invalid_argument if the dimensions are * incorrect. */ - void check_loglikelihood_dim(const matrix4d_d& loglikelihood) const ; + void check_loglikelihood_dim(const Matrix4D& loglikelihood) const ; /*! * \brief Checks that the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * It should have a length equal to the number of * the data row number. * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const ; /*! * \brief Checks the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param posterior_prob a matrix to store the * results. It should have the following dimensions : * 1st : same as the data row number * 2nd : same as the model class number * 3rd : same as the shift state number * 4th : same as the flip state number * \throw std::invalid_argument if the dimensions are * incorrect. */ - void check_posterior_prob_dim(const matrix4d_d& posterior_prob) const ; + void check_posterior_prob_dim(const Matrix4D& posterior_prob) const ; /*! * \brief the data. */ - matrix2d_i data ; + Matrix2D data ; /*! * \brief the data model. */ - matrix3d_d model ; + Matrix3D model ; /*! * \brief whether flip is enabled. */ bool flip ; /*! * \brief the number of row in the data. */ size_t n_row ; /*! * \brief the number of columns in the data. */ size_t n_col ; /*! * \brief the number of classes in the model. */ size_t n_class ; /*! * \brief the model length, its 2nd dimension. */ size_t l_model ; /*! 
* \brief the number of variable categories in * the data. This is also the model 3rd * dimension. * Read counts are quantitative values and * have a number of categories equal to one * whereas as DNA sequences are made of * A,C,G,T (at least) and have 4 different * categories. */ size_t n_category ; /*! * \brief the number of shift states. */ size_t n_shift ; /*! * \brief the number of flip states. */ size_t n_flip ; } ; #endif // DATALAYER_HPP diff --git a/src/Clustering/EMBase.cpp b/src/Clustering/EMBase.cpp new file mode 100644 index 0000000..3a45de9 --- /dev/null +++ b/src/Clustering/EMBase.cpp @@ -0,0 +1,298 @@ +#include + +#include +#include // std::invalid_argument +#include // std::promise, std::future +#include // std::pair, std::move() +#include // std::bind(), std::ref() +#include // std::iota() +#include // std::mt19937 + +#include +#include +#include +#include // beta_distribution() +#include // rand_string() +#include // getRandomNumberGenerator() +#include // sd(), normal_pmf() + + +EMBase::EMBase(size_t n_row, + size_t n_col, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + size_t n_threads=0) + : n_row(n_row), + n_col(n_col), + n_class(n_class), + n_shift(n_shift), + flip(flip), + n_flip(flip+1), + n_iter(n_iter), + l_model(n_col - n_shift + 1), + post_prob_tot(0.), + threads(nullptr) +{ // check n_shift value + if(this->n_col < this->n_shift) + { char msg[4096] ; + sprintf(msg, "Error! Shift is bigger than data column number " + "(%zu / %zu)!", + this->n_shift, this->n_col) ; + throw std::invalid_argument(msg) ; + } + + // data structures + this->loglikelihood = Matrix4D(this->n_row, + this->n_class, + this->n_shift, + this->n_flip, + 0.) ; + this->post_prob = Matrix4D(this->n_row, + this->n_class, + this->n_shift, + this->n_flip, + 0.) ; + this->post_state_prob = Matrix3D(this->n_class, + this->n_shift, + this->n_flip, + 0.) 
; + this->post_class_prob = vector_d(this->n_class, 0) ; + this->post_prob_rowsum = vector_d(this->n_row, 0) ; + this->post_prob_colsum = vector_d(this->n_class, 0) ; + this->post_prob_tot = 0 ; + // threads + if(n_threads) + { this->threads = new ThreadPool(n_threads) ; } + +} + +EMBase::~EMBase() +{ // threads + if(this->threads != nullptr) + { this->threads->join() ; + delete this->threads ; + this->threads = nullptr ; + } +} + +Matrix4D EMBase::get_post_prob() const +{ return this->post_prob ; } + +vector_d EMBase::get_post_class_prob() const +{ return this->post_class_prob ; } + +void EMBase::set_state_prob_uniform() +{ double sum = this->n_class * this->n_shift * this->n_flip ; + for(size_t i=0; in_class; i++) + { for(size_t j=0; jn_shift; j++) + { for(size_t k=0; kn_flip; k++) + { this->post_state_prob(i,j,k) = 1./sum ; } + } + } +} + +void EMBase::set_post_prob_random(const std::string& seed) +{ // set random number generator + // will be used to generate thread private seeds + getRandomGenerator(seed) ; + + // don't parallelize + if(this->threads == nullptr) + { std::promise promise ; + std::future future = promise.get_future() ; + this->set_post_prob_random_routine(0, this->n_row, seed, promise) ; + // compute the sum of post prob and the per class sum of post prob + // from the partial results computed on each slice + this->post_prob_tot = 0. 
; + this->post_prob_colsum = future.get() ; + for(const auto& prob : this->post_prob_colsum) + { this->post_prob_tot += prob ; } + } + // parallelize + else + { size_t n_threads = this->threads->getNThread() ; + + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, this->n_row,n_threads) ; + + // get promises and futures + // the function run by the threads will compute + // the partial sum per class of post_prob for the given slice + // this should be used to compute the complete sum of post_prob + // and the complete sum per class of post_prob + std::vector> promises(n_threads) ; + std::vector> futures(n_threads) ; + // private seeds + std::vector private_seeds(n_threads) ; + for(size_t i=0; ithreads->addJob(std::move( + std::bind(&EMBase::set_post_prob_random_routine, + this, + slice.first, + slice.second, + private_seeds[i], + std::ref(promises[i])))) ; + } + // wait until all threads are done working + // compute the sum of post prob and the per class sum of post prob + // from the partial results computed on each slice + this->post_prob_tot = 0. ; + this->post_prob_colsum = vector_d(this->n_class, 0.) ; + for(auto& future : futures) + { auto probs = future.get() ; + for(size_t i=0; in_class; i++) + { double prob = probs[i] ; + this->post_prob_colsum[i] += prob ; + this->post_prob_tot += prob ; + } + } + // -------------------------- threads stop --------------------------- + } + + // compute class and state probs + this->compute_class_prob() ; +} + +void EMBase::set_post_prob_random_routine(size_t from, + size_t to, + const std::string& seed, + std::promise& post_prob_colsum) +{ // random number generator + std::mt19937 generator ; + std::seed_seq seed_sequence(seed.begin(),seed.end()) ; + generator.seed(seed_sequence) ; + + // this->post_prob_tot = 0. ; + // this->post_prob_colsum = vector_d(this->n_class, 0.) ; + vector_d colsums = vector_d(this->n_class, 0.) 
; + + vector_d rowsums(this->n_row, 0) ; + + // random sampling + beta_distribution beta(1, this->n_row) ; + for(size_t i=from; in_class; j++) + { for(size_t k=0; kn_shift; k++) + { for(size_t l=0; ln_flip; l++) + { double p = beta(generator) ; + this->post_prob(i,j,k,l) = p ; + rowsums[i] += p ; + } + } + } + } + + // normalization + for(size_t i=from; in_class; j++) + { for(size_t k=0; kn_shift; k++) + { for(size_t l=0; ln_flip; l++) + { double p = this->post_prob(i,j,k,l) / rowsums[i] ; + this->post_prob(i,j,k,l) = p ; + // this->post_prob_tot += p ; + // this->post_prob_colsum[j] += p ; + colsums[j] += p ; + } + } + } + } + + // compute class and state probs + // this->compute_class_prob() ; + post_prob_colsum.set_value(colsums) ; +} + +void EMBase::compute_class_prob() +{ + for(size_t n_class=0; n_classn_class; n_class++) + { // reset total + this->post_class_prob[n_class] = 0. ; + for(size_t n_shift=0; n_shiftn_shift; n_shift++) + { for(size_t flip=0; flipn_flip; flip++) + { // sum + this->post_state_prob(n_class,n_shift,flip) = 0. ; + for(size_t i=0; in_row; i++) + { this->post_state_prob(n_class,n_shift,flip) += + this->post_prob(i,n_class,n_shift,flip) ; + } + // normalize + this->post_state_prob(n_class,n_shift,flip) /= this->post_prob_tot ; + this->post_class_prob[n_class] += this->post_state_prob(n_class,n_shift,flip) ; + } + } + } +} + +void EMBase::center_post_state_prob() +{ + if(this->n_shift == 1) + { return ; } + + // the possible shift states + vector_d shifts(this->n_shift) ; + std::iota(shifts.begin(), shifts.end(), 1.) ; + + // the shift probabilities and the class probabilies + // (no need to norm., class_prob sums to 1) + double shifts_prob_measured_tot = 0. 
; + vector_d shifts_prob_measured(this->n_shift) ; + for(size_t s=0; sn_shift; s++) + { for(size_t k=0; kn_class; k++) + { for(size_t f=0; fn_flip; f++) + { shifts_prob_measured[s] += this->post_state_prob(k,s,f) ; + shifts_prob_measured_tot += this->post_state_prob(k,s,f) ; + } + } + } + + + // the shift mean and (biased) standard deviation + double shifts_sd = sd(shifts, shifts_prob_measured, false) ; + + // the shift probabilities under the assumption that is + // distributed as a gaussian centered on + // the central shift state with sd and mean as in the data + // sd as the data + vector_d shifts_prob_centered(shifts.size(), 0.) ; + double shifts_prob_centered_tot = 0. ; + for(size_t i=0; in_shift/2)+1, shifts_sd) ; + shifts_prob_centered_tot += shifts_prob_centered[i] ; + } + + for(size_t k=0; kn_class; k++) + { for(size_t f=0; fn_flip; f++) + { for(size_t s=0; sn_shift; s++) + { this->post_state_prob(k,s,f) = this->post_class_prob[k] * + shifts_prob_centered[s] / + (this->n_flip * shifts_prob_centered_tot) ; + } + } + } + + // shifts_prob_measured_tot = 0. ; + shifts_prob_measured.clear() ; + shifts_prob_measured.resize(this->n_shift) ; + for(size_t s=0; sn_shift; s++) + { for(size_t k=0; kn_class; k++) + { for(size_t f=0; fn_flip; f++) + { shifts_prob_measured[s] += + this->post_state_prob(k,s,f) ; + } + } + } +} diff --git a/src/Clustering/EMBase.hpp b/src/Clustering/EMBase.hpp new file mode 100644 index 0000000..fb0f00c --- /dev/null +++ b/src/Clustering/EMBase.hpp @@ -0,0 +1,240 @@ +#ifndef EMBASE_HPP +#define EMBASE_HPP + +#include +#include +#include // std::promise + +#include +#include +#include +#include + + +typedef std::vector vector_d ; + + +/*! + * \brief The EMBase class is a base class + * providing the basic support for classes + * in implementing read density, sequence + * and both at the time classification + * procedures. + */ +class EMBase +{ public: + /*! + * \brief The possible exit codes for the classification + * method. 
+ * 0 the classification procedure converged, 1 the + * classification procedure ended by reaching the maximum + * number of iterations, 2 the classification procedure + * encountered an error. + */ + enum exit_codes {CONVERGENCE=0, ITER_MAX, FAILURE} ; + + public: + /*! + * \brief Constructs an EMBase object. + * \param n_row the number of rows in the data matrix. + * \param n_col the number of columns in the data matrix. + * \param n_iter the number of optimization iterations. + * \param n_shift the number of shift states allowed. + * \param flip whether flipping is allowed. + * \param n_threads the number of parallel threads + * to run the computations. 0 means no parallel + * computing, everything is run on the main thread. + * \throw std::invalid_argument if the shifting freedom + * is bigger than the number of columns. + */ + EMBase(size_t n_row, + size_t n_col, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + size_t n_threads) ; + + EMBase(const EMBase& other) = delete ; + + /*! + * \brief Destructor. + */ + virtual ~EMBase() ; + + /*! + * \brief Returns the posterior probability + * of each point belonging to each class, for + * each possible shift and flip state. + * \return the posterior probability matrix, + * with the following dimensions : + * 1st dim : the data points + * 2nd dim : the classes + * 3rd dim : the shift states + * 4th dim : the flip states + */ + virtual Matrix4D get_post_prob() const ; + + /*! + * \brief Returns the posterior class + * probabilities (the total class + * probability over all shift and + * flip states). + * \return the posterior class + * probabilities. + */ + virtual vector_d get_post_class_prob() const ; + + /*! + * \brief Runs the models optimization and the + * data classification. + * \return a code indicating how the optimization + * ended. + */ + virtual EMBase::exit_codes classify() = 0 ; + + protected: + + /*! 
+ * \brief Computes the data log likelihood given the + * current models, likelihood for each state. + */ + virtual void compute_loglikelihood() = 0 ; + + /*! + * \brief Computes the data posterior probabilties. + */ + virtual void compute_post_prob() = 0 ; + + /*! + * \brief Update the data models for all layers, given + * the current posterior and class probabilities. + */ + virtual void update_models() = 0; + + /*! + * \brief Sets all the state probabilities + * (all shift and flip states in all classes) + * to a uniform probability. + */ + void set_state_prob_uniform() ; + + /*! + * \brief Sets the posterior + * probabilities randomly (by + * sampling them from a beta + * distribution) and update all + * other probabilities accordingly. + * \param seed a seed to set the initial + * state of the random number generator. + */ + void set_post_prob_random(const std::string& seed) ; + + /*! + * \brief The routine that effectively + * sets the posterior probabilities randomly + * (by sampling them from a beta + * distribution). + * \param from the index of the first row + * in the data to consider. + * \param to the index of the past last row + * in the data to consider. + * \param done the partial column (over the classes) + * sum of posterior probabilities. If several routines + * are running together, the colsums are retrieved by + * summing up the vectors together. + * \param seed a seed to set the initial + * state of the random number generator. + */ + void set_post_prob_random_routine(size_t from, + size_t to, + const std::string& seed, + std::promise& post_prob_colsum) ; + + /*! + * \brief Computes the class/state probabilities from the + * posterior probabilities. + */ + void compute_class_prob() ; + + /*! + * \brief Modifies the state probabilities in such a + * way that the state probabilities are then normaly + * distributed, centered on the middle shift state. + * However, the overall class probabilities remain + * unchanged. 
+ */ + void center_post_state_prob() ; + + /*! + * \brief the number of rows in data. + */ + size_t n_row ; + /*! + * \brief the number of columns in data. + */ + size_t n_col ; + /*! + * \brief the number of classes. + */ + size_t n_class ; + /*! + * \brief the number of shift states. + */ + size_t n_shift ; + /*! + * \brief whther flip is allowed. + */ + bool flip ; + /*! + * \brief zhe number of flip states. + */ + size_t n_flip ; + /*! + * \brief the number of iterations. + */ + size_t n_iter ; + /*! + * \brief the length of the models. + */ + size_t l_model ; + + /*! + * \brief the joint loglikelihood for each data point, + * for each state (each class for each + * shift and flip state). + */ + Matrix4D loglikelihood ; + /*! + * \brief the posterior probabilities. + */ + Matrix4D post_prob ; + /*! + * \brief the states (shift and flip in each class) + * probabilities. + */ + Matrix3D post_state_prob ; + /*! + * \brief the total prob per class. + */ + vector_d post_class_prob ; + /*! + * \brief the sum per row (data point) of post_prob. + */ + vector_d post_prob_rowsum ; + /*! + * \brief the sum per column (class) of post_prob. + */ + vector_d post_prob_colsum ; + /*! + * \brief the total of post_prob. + */ + double post_prob_tot ; + /*! + * \brief the threads. 
+ */ + ThreadPool* threads ; +} ; + + +#endif // EMBASE_HPP diff --git a/src/Clustering/EMEngine.cpp b/src/Clustering/EMEngine.cpp deleted file mode 100644 index dc9bd11..0000000 --- a/src/Clustering/EMEngine.cpp +++ /dev/null @@ -1,586 +0,0 @@ -#include -#include // log(), exp(), pow() -#include // std::promise, std::future -#include // std::pair, std::move() -#include // std::bind(), std::ref() - -#include // rand_int_uniform() -#include // getRandomNumberGenerator() -#include // beta_distribution() -#include // poisson_pmf(), normal_pmf(), sd() -#include // ConsoleProgressBar -#include - - -EMEngine::EMEngine(const std::vector& read_matrices, - const std::vector& seq_matrices, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - EMEngine::seeding_codes seeding, - const std::string& seed, - size_t n_threads) - : read_layer_list(), - sequence_layer_list(), - threads(nullptr) - -{ std::cerr << "EMEngine::EMEngine START" << std::endl ; - // nb of layers - size_t n_layer_read = read_matrices.size() ; - size_t n_layer_seq = seq_matrices.size() ; - this->n_layer = n_layer_read + n_layer_seq ; - if(this->n_layer == 0) - { throw std::invalid_argument("Error! No data layer given!") ; } - - // matrices dimensions - size_t n_row = 0 ; - size_t n_col = 0 ; - if(n_layer_read) - { n_row = read_matrices[0].size() ; - n_col = read_matrices[0][0].size() ; - } - else - { n_row = seq_matrices[0].size() ; - n_col = seq_matrices[0][0].size() ; - } - for(const auto& matrix : read_matrices) - { if(matrix.size() != n_row) - { char msg[4096] ; - sprintf(msg, "Error! A read layer row number is invalid " - "(found %zu, expected %zu)!", - matrix.size(), n_row) ; - throw std::invalid_argument(msg) ; - } - else if(matrix[0].size() != n_col) - { char msg[4096] ; - sprintf(msg, "Error! 
A read layer column number is invalid " - "(found %zu, expected %zu)!", - matrix.size(), n_col) ; - throw std::invalid_argument(msg) ; - } - } - for(const auto& matrix : seq_matrices) - { if(matrix.size() != n_row) - { char msg[4096] ; - sprintf(msg, "Error! A sequence layer row number is invalid " - "(found %zu, expected %zu)!", - matrix.size(), n_row) ; - throw std::invalid_argument(msg) ; - } - else if(matrix[0].size() != n_col) - { char msg[4096] ; - sprintf(msg, "Error! A sequence layes column number is invalid " - "(found %zu, expected %zu)!", - matrix.size(), n_col) ; - throw std::invalid_argument(msg) ; - } - } - this->n_row = n_row ; - this->n_col = n_col ; - - // class, shift, flip, iter - this->n_class = n_class ; - this->n_shift = n_shift ; - this->n_flip = flip+1 ; - this->flip = flip ; - this->n_iter = n_iter ; - - // model length - if(this->n_col < this->n_shift) - { char msg[4096] ; - sprintf(msg, "Error! Shift is bigger than data column number " - "(%zu / %zu)!", - this->n_shift, this->n_col) ; - throw std::invalid_argument(msg) ; - } - this->l_model = n_col - n_shift + 1 ; - - std::cerr << "EMEngine::EMEngine " << std::endl - << " n_row : " << this->n_row << std::endl - << " n_col : " << this->n_col << std::endl - << " n_class : " << this->n_class << std::endl - << " n_shift : " << this->n_shift << std::endl - << " n_flip : " << this->n_flip << std::endl - << " n_layer : " << this->n_layer << std::endl ; - - - // data structures - this->loglikelihood = - std::vector(this->n_layer, - matrix4d_d(this->n_row, - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - vector_d(this->n_flip, 0))))) ; - this->loglikelihood_max = matrix2d_d(this->n_layer, - vector_d(this->n_row, 0)) ; - this->loglikelihood_joint = - matrix4d_d(this->n_row, - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - vector_d(this->n_flip, 0)))) ; - std::cerr << "EMEngine::EMEngine loglikelihood _joint created " << this->n_row*this->n_class*this->n_shift*this->n_flip << 
std::endl ; - /* - this->post_prob = - matrix4d_d(this->n_row, - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - vector_d(this->n_flip, 0)))) ; - */ - std::vector tmp(this->n_row*this->n_class*this->n_shift*this->n_flip) ; - std::cerr << "EMEngine::EMEngine post_prob created " << this->n_row*this->n_class*this->n_shift*this->n_flip << std::endl ; - this->post_state_prob = - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - vector_d(this->n_flip, 0))) ; - std::cerr << "EMEngine::EMEngine post_state_prob created " << this->n_class*this->n_shift*this->n_flip << std::endl ; - this->post_class_prob = vector_d(this->n_class, 0) ; - std::cerr << "EMEngine::EMEngine post_class_prob created " << this->n_class << std::endl ; - this->post_prob_rowsum = vector_d(this->n_row, 0) ; - std::cerr << "EMEngine::EMEngine post_prob_rowsum created " << this->n_row << std::endl ; - this->post_prob_colsum = vector_d(this->n_class, 0) ; - std::cerr << "EMEngine::EMEngine post_prob_colsum created" << this->n_class << std::endl ; - this->post_prob_tot = 0 ; - - // set random number generator seed - getRandomGenerator(seed) ; - - // threads - if(n_threads) - { this->threads = new ThreadPool(n_threads) ; } - - if(seeding == EMEngine::RANDOM) - {} - - // initialise post prob randomly - this->set_post_prob_random() ; - std::cerr << "EMEngine::EMEngine post prob set" << std::endl ; - // create read layer and compute the models from the post prob - for(const auto& matrix : read_matrices) - { std::cerr << "EMEngine::EMEngine creating ReadLayer" << std::endl ; - // create the layer - this->read_layer_list.push_back(new ReadLayer(matrix, - this->n_class, - this->n_shift, - flip, - this->threads)) ; - this->read_layer_list.back()->update_model(this->post_prob, - this->threads) ; - } - // create read layer and compute the models from the post prob - for(const auto& matrix : seq_matrices) - { std::cerr << "EMEngine::EMEngine creating SeqLayer" << std::endl ; - // create the layer - 
this->sequence_layer_list.push_back(new SequenceLayer(matrix, - this->n_class, - this->n_shift, - flip)) ; - this->sequence_layer_list.back()->update_model(this->post_prob, - this->threads) ; - } - std::cerr << "EMEngine::EMEngine END" << std::endl ; - - /* - // create read layers with initialised models - for(const auto& matrix : read_matrices) - { // create the layer - this->read_layer_list.push_back(new ReadLayer(matrix, - this->n_class, - this->n_shift, - flip, - this->threads)) ; - // seed the models - if(seeding == EMEngine::RANDOM) - { this->read_layer_list.back()->seed_model_randomly() ; } - else if(seeding == EMEngine::SAMPLING) - { this->read_layer_list.back()->seed_model_sampling() ; } - else if(seeding == EMEngine::TOY) - { this->read_layer_list.back()->seed_model_toy() ; } - } - // create read layers with initialised models - for(const auto& matrix : seq_matrices) - { // create the layer - this->sequence_layer_list.push_back(new SequenceLayer(matrix, - this->n_class, - this->n_shift, - flip)) ; - // seed the models - if(seeding == EMEngine::RANDOM) - { this->sequence_layer_list.back()->seed_model_randomly() ; } - else if(seeding == EMEngine::SAMPLING) - { this->sequence_layer_list.back()->seed_model_sampling() ; } - else if(seeding == EMEngine::TOY) - { this->sequence_layer_list.back()->seed_model_toy() ; } - } - // set the class probabilities to a uniform distribution - this->set_state_prob_uniform() - */ -} - -EMEngine::~EMEngine() -{ // threads - if(this->threads != nullptr) - { this->threads->join() ; - delete this->threads ; - this->threads = nullptr ; - } - // read data and models - for(auto& ptr : this->read_layer_list) - { if(ptr != nullptr) - { delete ptr ; - ptr = nullptr ; - } - } - // sequence data and models - for(auto& ptr : this->sequence_layer_list) - { if(ptr != nullptr) - { delete ptr ; - ptr = nullptr ; - } - } -} - -std::vector EMEngine::get_read_models() const -{ std::vector models ; - for(const auto& ptr : this->read_layer_list) - 
{ models.push_back(ptr->get_model()) ; } - return models ; -} - -std::vector EMEngine::get_sequence_models() const -{ std::vector models ; - for(const auto& ptr : this->sequence_layer_list) - { models.push_back(ptr->get_model()) ; } - return models ; -} - -matrix4d_d EMEngine::get_post_prob() const -{ return this->post_prob ; } - -vector_d EMEngine::get_post_class_prob() const -{ return this->post_class_prob ; } - -EMEngine::exit_codes EMEngine::classify() -{ - size_t bar_update_n = this->n_iter ; - ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; - - // optimize the partition - for(size_t n_iter=0; n_itern_iter; n_iter++) - { - // E-step - this->compute_loglikelihood() ; - this->compute_post_prob() ; - // M-step - this->compute_class_prob() ; - this->update_models() ; - this->center_post_state_prob() ; - - bar.update() ; - } - bar.update() ; std::cerr << std::endl ; - return EMEngine::exit_codes::SUCCESS ; -} - -void EMEngine::set_post_prob_random() -{ - this->post_prob_tot = 0. ; - this->post_prob_colsum = vector_d(this->n_class, 0.) 
; - - vector_d rowsums(this->n_row, 0) ; - - // random sampling - beta_distribution<> beta(1, this->n_row) ; - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t k=0; kn_shift; k++) - { for(size_t l=0; ln_flip; l++) - { double p = beta(getRandomGenerator()) ; - this->post_prob[i][j][k][l] = p ; - rowsums[i] += p ; - } - } - } - } - - // normalization - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t k=0; kn_shift; k++) - { for(size_t l=0; ln_flip; l++) - { double p = this->post_prob[i][j][k][l] / rowsums[i] ; - this->post_prob[i][j][k][l] = p ; - this->post_prob_tot += p ; - this->post_prob_colsum[j] += p ; - } - } - } - } - - // compute class and state probs - this->compute_class_prob() ; -} - -void EMEngine::set_state_prob_uniform() -{ double sum = this->n_class * this->n_shift * this->n_flip ; - for(size_t i=0; in_class; i++) - { for(size_t j=0; jn_shift; j++) - { for(size_t k=0; kn_flip; k++) - { this->post_state_prob[i][j][k] = 1./sum ; } - } - } -} - -void EMEngine::compute_loglikelihood() -{ // compute the loglikelihood for each layer - size_t i = 0 ; - for(auto& ptr : this->read_layer_list) - { ptr->compute_loglikelihoods(this->loglikelihood[i], - this->loglikelihood_max[i], - this->threads) ; - i++ ; - } - for(auto& ptr : this->sequence_layer_list) - { ptr->compute_loglikelihoods(this->loglikelihood[i], - this->loglikelihood_max[i], - this->threads) ; - i++ ; - } - // sum the likelihood for each state, over all layers - // this is the "joint likelihood" - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t k=0; kn_shift; k++) - { for(size_t l=0; ln_flip; l++) - { - // reset - this->loglikelihood_joint[i][j][k][l] = 0. ; - // sum - for(size_t m=0; mn_layer; m++) - { this->loglikelihood_joint[i][j][k][l] += - (this->loglikelihood[m][i][j][k][l] - - this->loglikelihood_max[m][i]) ; - } - /* - double ll_joint = 0. 
; - // sum - for(size_t m=0; mn_layer; m++) - { ll_joint += - (this->loglikelihood[m][i][j][k][l] - - this->loglikelihood_max[m][i]) ; - } - this->loglikelihood_joint[i][j][k][l] = std::max(ll_joint, SequenceLayer::p_min_log) ; - */ - } - } - } - } -} - -void EMEngine::compute_post_prob() -{ // don't parallelize - if(this->threads == nullptr) - { std::promise promise ; - std::future future = promise.get_future() ; - this->compute_post_prob_routine(0, this->n_row, promise) ; - // compute the sum of post prob and the per class sum of post prob - // from the partial results computed on each slice - this->post_prob_tot = 0. ; - this->post_prob_colsum = future.get() ; - for(const auto& prob : this->post_prob_colsum) - { this->post_prob_tot += prob ; } - } - // parallelize - else - { size_t n_threads = this->threads->getNThread() ; - - // compute the slices on which each thread will work - std::vector> slices = - ThreadPool::split_range(0, this->n_row,n_threads) ; - - // get promises and futures - // the function run by the threads will compute - // the partial sum per class of post_prob for the given slice - // this should be used to compute the complete sum of post_prob - // and the complete sum per class of post_prob - std::vector> promises(n_threads) ; - std::vector> futures(n_threads) ; - for(size_t i=0; ithreads->addJob(std::move( - std::bind(&EMEngine::compute_post_prob_routine, - this, - slice.first, - slice.second, - std::ref(promises[i])))) ; - } - // wait until all threads are done working - // compute the sum of post prob and the per class sum of post prob - // from the partial results computed on each slice - this->post_prob_tot = 0. ; - this->post_prob_colsum = vector_d(this->n_class, 0.) 
; - for(auto& future : futures) - { auto probs = future.get() ; - for(size_t i=0; in_class; i++) - { double prob = probs[i] ; - this->post_prob_colsum[i] += prob ; - this->post_prob_tot += prob ; - } - } - // -------------------------- threads stop --------------------------- - } -} - - -void EMEngine::compute_post_prob_routine(size_t from, - size_t to, - std::promise& post_prob_colsum) -{ vector_d colsums(this->n_class, 0.) ; - - // reset grand total - // this->post_prob_tot = 0 ; - // this->post_prob_colsum = vector_d(n_class, 0) ; - - // post prob - for(size_t i=from; ipost_prob_rowsum[i] = 0. ; - for(size_t n_class=0; n_classn_class; n_class++) - { for(size_t n_shift=0; n_shiftn_shift; n_shift++) - { for(size_t n_flip=0; n_flipn_flip; n_flip++) - { /* - double p = std::max(exp(this->loglikelihood_joint[i][n_class][n_shift][n_flip]) * - this->post_state_prob[n_class][n_shift][n_flip], - DataLayer::p_min) ; - */ - double p = exp(this->loglikelihood_joint[i][n_class][n_shift][n_flip]) * - this->post_state_prob[n_class][n_shift][n_flip] ; - this->post_prob[i][n_class][n_shift][n_flip] = p ; - this->post_prob_rowsum[i] += p ; - } - } - } - // normalize - for(size_t n_class=0; n_classn_class; n_class++) - { for(size_t n_shift=0; n_shiftn_shift; n_shift++) - { for(size_t n_flip=0; n_flipn_flip; n_flip++) - { /* - this->post_prob[i][n_class][n_shift][n_flip] /= - this->post_prob_rowsum[i] ; - double p = this->post_prob[i][n_class][n_shift][n_flip] ; - colsums[n_class] += p ; - */ - double p = std::max(this->post_prob[i][n_class][n_shift][n_flip] / - this->post_prob_rowsum[i], - ReadLayer::p_min) ; - this->post_prob[i][n_class][n_shift][n_flip] = p ; - colsums[n_class] += p ; - // this->post_prob_colsum[n_class] += p ; - // this->post_prob_tot += p ; - } - } - } - } - post_prob_colsum.set_value(colsums) ; -} - -void EMEngine::compute_class_prob() -{ - for(size_t n_class=0; n_classn_class; n_class++) - { // reset total - this->post_class_prob[n_class] = 0. 
; - for(size_t n_shift=0; n_shiftn_shift; n_shift++) - { for(size_t flip=0; flipn_flip; flip++) - { // sum - this->post_state_prob[n_class][n_shift][flip] = 0. ; - for(size_t i=0; in_row; i++) - { this->post_state_prob[n_class][n_shift][flip] += - this->post_prob[i][n_class][n_shift][flip] ; - } - // normalize - this->post_state_prob[n_class][n_shift][flip] /= this->post_prob_tot ; - this->post_class_prob[n_class] += this->post_state_prob[n_class][n_shift][flip] ; - } - } - } -} - -void EMEngine::update_models() -{ // read data and models - for(auto& ptr : this->read_layer_list) - { ptr->update_model(this->post_prob, - this->post_prob_colsum, - this->threads) ; - } - // sequence data and models - for(auto& ptr : this->sequence_layer_list) - { ptr->update_model(this->post_prob, - this->threads) ; - } -} - -void EMEngine::center_post_state_prob() -{ - if(this->n_shift == 1) - { return ; } - - // the possible shift states - vector_d shifts(this->n_shift) ; - std::iota(shifts.begin(), shifts.end(), 1.) ; - - // the shift probabilities and the class probabilies - // (no need to norm., class_prob sums to 1) - double shifts_prob_measured_tot = 0. ; - std::vector shifts_prob_measured(this->n_shift) ; - for(size_t s=0; sn_shift; s++) - { for(size_t k=0; kn_class; k++) - { for(size_t f=0; fn_flip; f++) - { shifts_prob_measured[s] += this->post_state_prob[k][s][f] ; - shifts_prob_measured_tot += this->post_state_prob[k][s][f] ; - } - } - } - - - // the shift mean and (biased) standard deviation - double shifts_sd = sd(shifts, shifts_prob_measured, false) ; - - // the shift probabilities under the assumption that is - // distributed as a gaussian centered on - // the central shift state with sd and mean as in the data - // sd as the data - vector_d shifts_prob_centered(shifts.size(), 0.) ; - double shifts_prob_centered_tot = 0. 
; - for(size_t i=0; in_shift/2)+1, shifts_sd) ; - shifts_prob_centered_tot += shifts_prob_centered[i] ; - } - - for(size_t k=0; kn_class; k++) - { for(size_t f=0; fn_flip; f++) - { for(size_t s=0; sn_shift; s++) - { this->post_state_prob[k][s][f] = this->post_class_prob[k] * - shifts_prob_centered[s] / - (this->n_flip * shifts_prob_centered_tot) ; - } - } - } - - // shifts_prob_measured_tot = 0. ; - shifts_prob_measured.clear() ; - shifts_prob_measured.resize(this->n_shift) ; - for(size_t s=0; sn_shift; s++) - { for(size_t k=0; kn_class; k++) - { for(size_t f=0; fn_flip; f++) - { shifts_prob_measured[s] += - this->post_state_prob[k][s][f] ; - } - } - } -} diff --git a/src/Clustering/EMEngine.cpp.save b/src/Clustering/EMEngine.cpp.save deleted file mode 100644 index 5ff245c..0000000 --- a/src/Clustering/EMEngine.cpp.save +++ /dev/null @@ -1,517 +0,0 @@ -#include -#include // log(), exp(), pow() -#include // std::promise, std::future -#include // std::pair, std::move() -#include // std::bind(), std::ref() - -#include // rand_int_uniform() -#include // getRandomNumberGenerator() -#include // poisson_pmf(), normal_pmf(), sd() -#include // ConsoleProgressBar -#include - - -EMEngine::EMEngine(const std::vector& read_matrices, - const std::vector& seq_matrices, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - EMEngine::seeding_codes seeding, - const std::string& seed, - size_t n_threads) - : read_layer_list(), - sequence_layer_list(), - threads(nullptr) - -{ // nb of layers - size_t n_layer_read = read_matrices.size() ; - size_t n_layer_seq = seq_matrices.size() ; - this->n_layer = n_layer_read + n_layer_seq ; - if(this->n_layer == 0) - { throw std::invalid_argument("Error! 
No data layer given!") ; } - - // matrices dimensions - size_t n_row = 0 ; - size_t n_col = 0 ; - if(n_layer_read) - { n_row = read_matrices[0].size() ; - n_col = read_matrices[0][0].size() ; - } - else - { n_row = seq_matrices[0].size() ; - n_col = seq_matrices[0][0].size() ; - } - for(const auto& matrix : read_matrices) - { if(matrix.size() != n_row) - { char msg[4096] ; - sprintf(msg, "Error! A read layer row number is invalid " - "(found %zu, expected %zu)!", - matrix.size(), n_row) ; - throw std::invalid_argument(msg) ; - } - else if(matrix[0].size() != n_col) - { char msg[4096] ; - sprintf(msg, "Error! A read layer column number is invalid " - "(found %zu, expected %zu)!", - matrix.size(), n_col) ; - throw std::invalid_argument(msg) ; - } - } - for(const auto& matrix : seq_matrices) - { if(matrix.size() != n_row) - { char msg[4096] ; - sprintf(msg, "Error! A sequence layer row number is invalid " - "(found %zu, expected %zu)!", - matrix.size(), n_row) ; - throw std::invalid_argument(msg) ; - } - else if(matrix[0].size() != n_col) - { char msg[4096] ; - sprintf(msg, "Error! A sequence layes column number is invalid " - "(found %zu, expected %zu)!", - matrix.size(), n_col) ; - throw std::invalid_argument(msg) ; - } - } - this->n_row = n_row ; - this->n_col = n_col ; - - // class, shift, flip, iter - this->n_class = n_class ; - this->n_shift = n_shift ; - this->n_flip = flip+1 ; - this->flip = flip ; - this->n_iter = n_iter ; - - // model length - if(this->n_col < this->n_shift) - { char msg[4096] ; - sprintf(msg, "Error! 
Shift is bigger than data column number " - "(%zu / %zu)!", - this->n_shift, this->n_col) ; - throw std::invalid_argument(msg) ; - } - this->l_model = n_col - n_shift + 1 ; - - // data structures - this->loglikelihood = - std::vector(this->n_layer, - matrix4d_d(n_row, - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - vector_d(this->n_flip, 0))))) ; - this->loglikelihood_max = matrix2d_d(this->n_layer, - vector_d(this->n_row, 0)) ; - this->loglikelihood_joint = - matrix4d_d(this->n_row, - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - vector_d(this->n_flip, 0)))) ; - this->post_prob = - matrix4d_d(this->n_row, - matrix3d_d(this->n_class, - matrix2d_d(this->n_shift, - vector_d(this->n_flip, 0)))) ; - this->post_state_prob = - matrix3d_d(n_class, - matrix2d_d(this->n_shift, - vector_d(this->n_flip, 0))) ; - this->post_class_prob = vector_d(n_class, 0) ; - this->post_prob_rowsum = vector_d(n_row, 0) ; - this->post_prob_colsum = vector_d(n_class, 0) ; - this->post_prob_tot = 0 ; - - // set random number generator seed - getRandomGenerator(seed) ; - - // threads - if(n_threads) - { this->threads = new ThreadPool(n_threads) ; } - - // create read layers with initialised models - for(const auto& matrix : read_matrices) - { // create the layer - this->read_layer_list.push_back(new ReadLayer(matrix, - this->n_class, - this->n_shift, - flip, - this->threads)) ; - // seed the models - if(seeding == EMEngine::RANDOM) - { this->read_layer_list.back()->seed_model_randomly() ; } - else if(seeding == EMEngine::SAMPLING) - { this->read_layer_list.back()->seed_model_sampling() ; } - else if(seeding == EMEngine::TOY) - { this->read_layer_list.back()->seed_model_toy() ; } - } - // create read layers with initialised models - for(const auto& matrix : seq_matrices) - { // create the layer - this->sequence_layer_list.push_back(new SequenceLayer(matrix, - this->n_class, - this->n_shift, - flip)) ; - // seed the models - if(seeding == EMEngine::RANDOM) - { 
this->sequence_layer_list.back()->seed_model_randomly() ; } - else if(seeding == EMEngine::SAMPLING) - { this->sequence_layer_list.back()->seed_model_sampling() ; } - else if(seeding == EMEngine::TOY) - { this->sequence_layer_list.back()->seed_model_toy() ; } - } - // set the class probabilities to a uniform distribution - this->set_state_prob_uniform() ; -} - -EMEngine::~EMEngine() -{ // threads - if(this->threads != nullptr) - { this->threads->join() ; - delete this->threads ; - this->threads = nullptr ; - } - // read data and models - for(auto& ptr : this->read_layer_list) - { if(ptr != nullptr) - { delete ptr ; - ptr = nullptr ; - } - } - // sequence data and models - for(auto& ptr : this->sequence_layer_list) - { if(ptr != nullptr) - { delete ptr ; - ptr = nullptr ; - } - } -} - -std::vector EMEngine::get_read_models() const -{ std::vector models ; - for(const auto& ptr : this->read_layer_list) - { models.push_back(ptr->get_model()) ; } - return models ; -} - -std::vector EMEngine::get_sequence_models() const -{ std::vector models ; - for(const auto& ptr : this->sequence_layer_list) - { models.push_back(ptr->get_model()) ; } - return models ; -} - -matrix4d_d EMEngine::get_post_prob() const -{ return this->post_prob ; } - -vector_d EMEngine::get_post_class_prob() const -{ return this->post_class_prob ; } - -EMEngine::exit_codes EMEngine::classify() -{ - size_t bar_update_n = this->n_iter ; - ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; - - std::cerr << "EM new" << std::endl ; - std::cerr << "log likelihood joint" << std::endl ; - std::cerr << this->loglikelihood_joint << std::endl << std::endl ; - std::cerr << "post prob" << std::endl ; - std::cerr << this->post_prob << std::endl << std::endl ; - std::cerr << "post state prob" << std::endl ; - std::cerr << this->post_state_prob << std::endl << std::endl ; - std::cerr << "post class prob" << std::endl ; - std::cerr << this->post_class_prob << std::endl << std::endl ; - std::cerr << 
"model" << std::endl ; - std::cerr << this->sequence_layer_list.front()->get_model()[0] << std::endl << std::endl ; - std::cerr << "--------------------------------------------" << std::endl << std::endl ; - - // optimize the partition - for(size_t n_iter=0; n_itern_iter; n_iter++) - { - std::cerr << "model" << std::endl ; - std::cerr << this->sequence_layer_list.front()->get_model()[0] << std::endl << std::endl ; - - // E-step - this->compute_loglikelihood() ; - this->compute_post_prob() ; - // M-step - this->compute_class_prob() ; - this->update_models() ; - this->center_post_state_prob() ; - - std::cerr << "EM new" << std::endl ; - std::cerr << "log likelihood" << std::endl ; - std::cerr << this->loglikelihood[0] << std::endl << std::endl ; - std::cerr << "log likelihood max" << std::endl ; - std::cerr << this->loglikelihood_max[0] << std::endl << std::endl ; - std::cerr << "log likelihood joint" << std::endl ; - std::cerr << this->loglikelihood_joint << std::endl << std::endl ; - std::cerr << "post prob" << std::endl ; - std::cerr << this->post_prob << std::endl << std::endl ; - std::cerr << "post state prob" << std::endl ; - std::cerr << this->post_state_prob << std::endl << std::endl ; - std::cerr << "post class prob" << std::endl ; - std::cerr << this->post_class_prob << std::endl << std::endl ; - std::cerr << "model" << std::endl ; - std::cerr << this->sequence_layer_list.front()->get_model()[0] << std::endl << std::endl ; - std::cerr << "--------------------------------------------" << std::endl << std::endl ; - - - bar.update() ; - } - bar.update() ; std::cerr << std::endl ; - return EMEngine::exit_codes::SUCCESS ; -} - -void EMEngine::set_state_prob_uniform() -{ double sum = this->n_class * this->n_shift * this->n_flip ; - for(size_t i=0; in_class; i++) - { for(size_t j=0; jn_shift; j++) - { for(size_t k=0; kn_flip; k++) - { this->post_state_prob[i][j][k] = 1./sum ; } - } - } -} - -void EMEngine::compute_loglikelihood() -{ // compute the loglikelihood 
for each layer - size_t i = 0 ; - for(auto& ptr : this->read_layer_list) - { ptr->compute_loglikelihoods(this->loglikelihood[i], - this->loglikelihood_max[i], - this->threads) ; - i++ ; - } - for(auto& ptr : this->sequence_layer_list) - { ptr->compute_loglikelihoods(this->loglikelihood[i], - this->loglikelihood_max[i], - this->threads) ; - i++ ; - } - // sum the likelihood for each state, over all layers - // this is the "joint likelihood" - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t k=0; kn_shift; k++) - { for(size_t l=0; ln_flip; l++) - { // reset - this->loglikelihood_joint[i][j][k][l] = 0. ; - std::cerr << "loglikelihood_joint[" << i << "][" - << j << "][" - << k << "][" - << l << " = " ; - // sum - for(size_t m=0; mn_layer; m++) - { this->loglikelihood_joint[i][j][k][l] += - (this->loglikelihood[m][i][j][k][l] - - this->loglikelihood_max[m][i]) ; - std::cerr << this->loglikelihood[m][i][j][k][l] - << "-" - << this->loglikelihood_max[m][i] - << " " ; - } - std::cerr << std::endl ; - } - } - } - } -} - -void EMEngine::compute_post_prob() -{ // don't parallelize - if(this->threads == nullptr) - { std::promise promise ; - std::future future = promise.get_future() ; - this->compute_post_prob_routine(0, this->n_row, promise) ; - // compute the sum of post prob and the per class sum of post prob - // from the partial results computed on each slice - this->post_prob_tot = 0. 
; - this->post_prob_colsum = future.get() ; - for(const auto& prob : this->post_prob_colsum) - { this->post_prob_tot += prob ; } - } - // parallelize - else - { size_t n_threads = this->threads->getNThread() ; - - // compute the slices on which each thread will work - std::vector> slices = - ThreadPool::split_range(0, this->n_row,n_threads) ; - - // get promises and futures - // the function run by the threads will compute - // the partial sum per class of post_prob for the given slice - // this should be used to compute the complete sum of post_prob - // and the complete sum per class of post_prob - std::vector> promises(n_threads) ; - std::vector> futures(n_threads) ; - for(size_t i=0; ithreads->addJob(std::move( - std::bind(&EMEngine::compute_post_prob_routine, - this, - slice.first, - slice.second, - std::ref(promises[i])))) ; - } - // wait until all threads are done working - // compute the sum of post prob and the per class sum of post prob - // from the partial results computed on each slice - this->post_prob_tot = 0. ; - this->post_prob_colsum = vector_d(this->n_class, 0.) ; - for(auto& future : futures) - { auto probs = future.get() ; - for(size_t i=0; in_class; i++) - { double prob = probs[i] ; - this->post_prob_colsum[i] += prob ; - this->post_prob_tot += prob ; - } - } - // -------------------------- threads stop --------------------------- - } -} - - -void EMEngine::compute_post_prob_routine(size_t from, - size_t to, - std::promise& post_prob_colsum) -{ vector_d colsums(this->n_class, 0.) ; - - // reset grand total - // this->post_prob_tot = 0 ; - // this->post_prob_colsum = vector_d(n_class, 0) ; - - // post prob - for(size_t i=from; ipost_prob_rowsum[i] = 0. 
; - - for(size_t n_class=0; n_classn_class; n_class++) - { for(size_t n_shift=0; n_shiftn_shift; n_shift++) - { for(size_t n_flip=0; n_flipn_flip; n_flip++) - { - double p = std::max(exp(this->loglikelihood_joint[i][n_class][n_shift][n_flip]) * - this->post_state_prob[n_class][n_shift][n_flip], - DataLayer::p_min) ; - this->post_prob[i][n_class][n_shift][n_flip] = p ; - this->post_prob_rowsum[i] += p ; - } - } - } - // normalize - for(size_t n_class=0; n_classn_class; n_class++) - { for(size_t n_shift=0; n_shiftn_shift; n_shift++) - { for(size_t n_flip=0; n_flipn_flip; n_flip++) - { this->post_prob[i][n_class][n_shift][n_flip] /= - this->post_prob_rowsum[i] ; - double p = this->post_prob[i][n_class][n_shift][n_flip] ; - colsums[n_class] += p ; - // this->post_prob_colsum[n_class] += p ; - // this->post_prob_tot += p ; - } - } - } - } - post_prob_colsum.set_value(colsums) ; -} - -void EMEngine::compute_class_prob() -{ - for(size_t n_class=0; n_classn_class; n_class++) - { // reset total - this->post_class_prob[n_class] = 0. ; - for(size_t n_shift=0; n_shiftn_shift; n_shift++) - { for(size_t flip=0; flipn_flip; flip++) - { // sum - this->post_state_prob[n_class][n_shift][flip] = 0. 
; - for(size_t i=0; in_row; i++) - { this->post_state_prob[n_class][n_shift][flip] += - this->post_prob[i][n_class][n_shift][flip] ; - } - // normalize - this->post_state_prob[n_class][n_shift][flip] /= this->post_prob_tot ; - this->post_class_prob[n_class] += this->post_state_prob[n_class][n_shift][flip] ; - } - } - } -} - -void EMEngine::update_models() -{ // read data and models - for(auto& ptr : this->read_layer_list) - { ptr->update_model(this->post_prob, - this->post_prob_colsum, - this->threads) ; - } - // sequence data and models - for(auto& ptr : this->sequence_layer_list) - { ptr->update_model(this->post_prob, - this->threads) ; - } -} - -void EMEngine::center_post_state_prob() -{ - if(this->n_shift == 1) - { return ; } - - // the possible shift states - vector_d shifts(this->n_shift) ; - std::iota(shifts.begin(), shifts.end(), 1.) ; - - // the shift probabilities and the class probabilies - // (no need to norm., class_prob sums to 1) - double shifts_prob_measured_tot = 0. ; - std::vector shifts_prob_measured(this->n_shift) ; - for(size_t s=0; sn_shift; s++) - { for(size_t k=0; kn_class; k++) - { for(size_t f=0; fn_flip; f++) - { shifts_prob_measured[s] += this->post_state_prob[k][s][f] ; - shifts_prob_measured_tot += this->post_state_prob[k][s][f] ; - } - } - } - - - // the shift mean and (biased) standard deviation - double shifts_sd = sd(shifts, shifts_prob_measured, false) ; - - // the shift probabilities under the assumption that is - // distributed as a gaussian centered on - // the central shift state with sd and mean as in the data - // sd as the data - vector_d shifts_prob_centered(shifts.size(), 0.) ; - double shifts_prob_centered_tot = 0. 
; - for(size_t i=0; in_shift/2)+1, shifts_sd) ; - shifts_prob_centered_tot += shifts_prob_centered[i] ; - } - - for(size_t k=0; kn_class; k++) - { for(size_t f=0; fn_flip; f++) - { for(size_t s=0; sn_shift; s++) - { this->post_state_prob[k][s][f] = this->post_class_prob[k] * - shifts_prob_centered[s] / - (this->n_flip * shifts_prob_centered_tot) ; - } - } - } - - // shifts_prob_measured_tot = 0. ; - shifts_prob_measured.clear() ; - shifts_prob_measured.resize(this->n_shift) ; - for(size_t s=0; sn_shift; s++) - { for(size_t k=0; kn_class; k++) - { for(size_t f=0; fn_flip; f++) - { shifts_prob_measured[s] += - this->post_state_prob[k][s][f] ; - } - } - } -} diff --git a/src/Clustering/EMEngine.hpp b/src/Clustering/EMEngine.hpp deleted file mode 100644 index 1b15e16..0000000 --- a/src/Clustering/EMEngine.hpp +++ /dev/null @@ -1,307 +0,0 @@ -#ifndef EMENGINE_HPP -#define EMENGINE_HPP - -#include -#include -#include -#include -#include // std::promise - -#include -#include -#include -#include -#include - - -/*! - * \brief This class implements the iterative an expectation - * maximization classification procedure to discover - * patterns in ChIP-seq (and related data) data, as described - * in Nair et al. 2014, Bioinformatics. - * However, the classification procedure has been generalized - * such that genomic regions can be partitioned according - * to several different signal at the same time, instead - * of just one as in the original paper. Additionally, it - * is possible to include the underlying DNA sequence such - * that the partitioning procedure will find i) ChIP-seq - * data signal patterns and ii) DNA sequence motifs at - * the same time. 
- * To mitigate a miss-alignment of the signal/sequences in - * the different regions - that is a same signal strech/sequence - * motif is present in two regions but at different offsets - - * the classification procedure can search protypic signals - * shorter than a whole region, at each possible offset over the - * region (named shift states). - * To mitigate an inversion of the signal/sequence in the different - * regions - that is a same signal strech/sequence motif is present - * in two regions but in reverse orientation - the classification - * procedure can search protypic signals in both orientation. - */ -class EMEngine -{ - public: - /*! - * \brief The possible seeding strategies. - */ - enum seeding_codes {RANDOM=0, SAMPLING, TOY} ; - - /*! - * \brief The possible exit codes for the cluster method. - * 0 the clustering procedure converged, 1 the clustering - * procedure succeeded without converging, 2 the clustering - * failed. - */ - enum exit_codes {CONVERGENCE=0, SUCCESS, FAILURE} ; - - public: - /*! - * \brief Constructs an object to partition the - * region according to all the givend data layers - * with the given shifting and flipping freedom. - * \param read_matrices a vector containing all - * the different different data densities (ChIP-seq - * or related signal) for the regions of interest. - * \param seq_matrices a vector containing the DNA - * sequences for the regions of interest. - * \param n_class the number of region classes - * to search. - * \param n_iter the number of optimization iterations. - * \param n_shift the number of shift states allowed. - * \param flip whether flipping is allowed. - * \param seeding how to initialise the signal/sequence - * models. - * \param seed a seed to initialise the random number - * generator. - * \param n_threads the number of parallel threads - * to run the computations. 0 means no parallel - * computing, everything is run on the main thread. 
- */ - EMEngine(const std::vector& read_matrices, - const std::vector& seq_matrices, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - EMEngine::seeding_codes seeding, - const std::string& seed="", - size_t n_threads=0) ; - /*! - * Destructor. - */ - ~EMEngine() ; - - /*! - * \brief Returns all read models. - * The models are in the same order - * as the data were given to the - * constructor. - * \return a vector containing the - * models. - */ - std::vector get_read_models() const ; - - /*! - * \brief Returns all sequence models. - * The models are in the same order - * as the data were given to the - * constructor. - * \return a vector containing the - * models. - */ - std::vector get_sequence_models() const ; - - /*! - * \brief Returns the posterior probability - * of each point belonging to each class, for - * each possible shift and flip state. - * \return the posterior probability matrix, - * with the following dimensions : - * 1st dim : the data points - * 2nd dim : the classes - * 3rd dim : the shift states - * 4th dim : the flip states - */ - matrix4d_d get_post_prob() const ; - - /*! - * \brief Returns the posterior class - * probabilities (the total class - * probability over all shift and - * flip states). - * \return the posterior class - * probabilities. - */ - vector_d get_post_class_prob() const ; - - /*! - * \brief Runs the models optimization and the - * data classification. - * \return a code indicating how the optimization - * ended. - */ - EMEngine::exit_codes classify() ; - - protected: - - /*! - * \brief Sets the posterior - * probabilities randomly (by - * sampling them from a beta - * distribution) and update all - * other probabilities accordingly.. - */ - void set_post_prob_random() ; - - /*! - * \brief Sets all the state probabilities - * (all shift and flip states in all classes) - * to a uniform probability. - */ - void set_state_prob_uniform() ; - - /*! 
- * \brief Computes the data log likelihood given the - * current models, for all layers and the joint - * likelihood for each state (the sum of the layer - * likelihoods for all layers, for a given state). - */ - void compute_loglikelihood() ; - - /*! - * \brief Computes the data posterior probabilties. - */ - void compute_post_prob() ; - - /*! - * \brief The routine that effectively computes - * the posterior probabilties. - * \param from the index of the first row - * in the data to consider. - * \param to the index of the past last row - * in the data to consider. - * \param done the partial column (over the classes) - * sum of posterior probabilities. If several routines - * are running together, the colsums are retrieved by - * summing up the vectors together. - */ - void compute_post_prob_routine(size_t from, - size_t to, - std::promise& post_prob_colsum) ; - - /*! - * \brief Computes the class/state probabilities from the - * posterior probabilities. - */ - void compute_class_prob() ; - - /*! - * \brief Update the data models for all layers, given - * the current posterior and class probabilities. - */ - void update_models() ; - - /*! - * \brief Modifies the state probabilities in such a - * way that the state probabilities are then normaly - * distributed, centered on the middle shift state. - * However, the overall class probabilities remain - * unchanged. - */ - void center_post_state_prob() ; - - /*! - * \brief the number of data layers. - */ - size_t n_layer ; - /*! - * \brief the number of rows in data. - */ - size_t n_row ; - /*! - * \brief the number of columns in data. - */ - size_t n_col ; - /*! - * \brief the number of classes. - */ - size_t n_class ; - /*! - * \brief the number of shift states. - */ - size_t n_shift ; - /*! - * \brief zhe number of flip states. - */ - size_t n_flip ; - /*! - * \brief the number of iterations. - */ - size_t n_iter ; - /*! - * \brief whther flip is allowed. - */ - bool flip ; - /*! 
- * \brief the length of the models. - */ - size_t l_model ; - - /*! - * \brief the log likelihoods. - * One per data layer. - */ - std::vector loglikelihood ; - /*! - * \brief the max log likelihood value for each row. - * One per data layer. - */ - std::vector loglikelihood_max ; - /*! - * \brief the joint loglikelihood, through all - * layers, for each state (each class for each - * shift and flip state). - */ - matrix4d_d loglikelihood_joint ; - /*! - * \brief the posterior probabilities. - */ - matrix4d_d post_prob ; - /*! - * \brief the states (shift and flip in each class) - * probabilities. - */ - matrix3d_d post_state_prob ; - /*! - * \brief the total prob per class. - */ - vector_d post_class_prob ; - /*! - * \brief the sum per row (data point) of post_prob. - */ - vector_d post_prob_rowsum ; - /*! - * \brief the sum per column (class) of post_prob. - */ - vector_d post_prob_colsum ; - /*! - * \brief the total of post_prob. - */ - double post_prob_tot ; - - /*! - * \brief the read data and their models. - */ - std::list read_layer_list ; - /*! - * \brief the sequence data and their models. - */ - std::list sequence_layer_list ; - /*! - * \brief the threads. - */ - ThreadPool* threads ; - -} ; - -#endif // EMENGINE_HPP diff --git a/src/Clustering/EMEngine.hpp.save b/src/Clustering/EMEngine.hpp.save deleted file mode 100644 index c3206b1..0000000 --- a/src/Clustering/EMEngine.hpp.save +++ /dev/null @@ -1,298 +0,0 @@ -#ifndef EMENGINE_HPP -#define EMENGINE_HPP - -#include -#include -#include -#include -#include // std::promise - -#include -#include -#include -#include -#include - - -/*! - * \brief This class implements the iterative an expectation - * maximization classification procedure to discover - * patterns in ChIP-seq (and related data) data, as described - * in Nair et al. 2014, Bioinformatics. 
- * However, the classification procedure has been generalized - * such that genomic regions can be partitioned according - * to several different signal at the same time, instead - * of just one as in the original paper. Additionally, it - * is possible to include the underlying DNA sequence such - * that the partitioning procedure will find i) ChIP-seq - * data signal patterns and ii) DNA sequence motifs at - * the same time. - * To mitigate a miss-alignment of the signal/sequences in - * the different regions - that is a same signal strech/sequence - * motif is present in two regions but at different offsets - - * the classification procedure can search protypic signals - * shorter than a whole region, at each possible offset over the - * region (named shift states). - * To mitigate an inversion of the signal/sequence in the different - * regions - that is a same signal strech/sequence motif is present - * in two regions but in reverse orientation - the classification - * procedure can search protypic signals in both orientation. - */ -class EMEngine -{ - public: - /*! - * \brief The possible seeding strategies. - */ - enum seeding_codes {RANDOM=0, SAMPLING, TOY} ; - - /*! - * \brief The possible exit codes for the cluster method. - * 0 the clustering procedure converged, 1 the clustering - * procedure succeeded without converging, 2 the clustering - * failed. - */ - enum exit_codes {CONVERGENCE=0, SUCCESS, FAILURE} ; - - public: - /*! - * \brief Constructs an object to partition the - * region according to all the givend data layers - * with the given shifting and flipping freedom. - * \param read_matrices a vector containing all - * the different different data densities (ChIP-seq - * or related signal) for the regions of interest. - * \param seq_matrices a vector containing the DNA - * sequences for the regions of interest. - * \param n_class the number of region classes - * to search. - * \param n_iter the number of optimization iterations. 
- * \param n_shift the number of shift states allowed. - * \param flip whether flipping is allowed. - * \param seeding how to initialise the signal/sequence - * models. - * \param seed a seed to initialise the random number - * generator. - * \param n_threads the number of parallel threads - * to run the computations. 0 means no parallel - * computing, everything is run on the main thread. - */ - EMEngine(const std::vector& read_matrices, - const std::vector& seq_matrices, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - EMEngine::seeding_codes seeding, - const std::string& seed="", - size_t n_threads=0) ; - /*! - * Destructor. - */ - ~EMEngine() ; - - /*! - * \brief Returns all read models. - * The models are in the same order - * as the data were given to the - * constructor. - * \return a vector containing the - * models. - */ - std::vector get_read_models() const ; - - /*! - * \brief Returns all sequence models. - * The models are in the same order - * as the data were given to the - * constructor. - * \return a vector containing the - * models. - */ - std::vector get_sequence_models() const ; - - /*! - * \brief Returns the posterior probability - * of each point belonging to each class, for - * each possible shift and flip state. - * \return the posterior probability matrix, - * with the following dimensions : - * 1st dim : the data points - * 2nd dim : the classes - * 3rd dim : the shift states - * 4th dim : the flip states - */ - matrix4d_d get_post_prob() const ; - - /*! - * \brief Returns the posterior class - * probabilities (the total class - * probability over all shift and - * flip states). - * \return the posterior class - * probabilities. - */ - vector_d get_post_class_prob() const ; - - /*! - * \brief Runs the models optimization and the - * data classification. - * \return a code indicating how the optimization - * ended. - */ - EMEngine::exit_codes classify() ; - - protected: - - /*! 
- * \brief Sets all the state probabilities - * (all shift and flip states in all classes) - * to a uniform probability. - */ - void set_state_prob_uniform() ; - - /*! - * \brief Computes the data log likelihood given the - * current models, for all layers and the joint - * likelihood for each state (the sum of the layer - * likelihoods for all layers, for a given state). - */ - void compute_loglikelihood() ; - - /*! - * \brief Computes the data posterior probabilties. - */ - void compute_post_prob() ; - - /*! - * \brief The routine that effectively computes - * the posterior probabilties. - * \param from the index of the first row - * in the data to consider. - * \param to the index of the past last row - * in the data to consider. - * \param done the partial column (over the classes) - * sum of posterior probabilities. If several routines - * are running together, the colsums are retrieved by - * summing up the vectors together. - */ - void compute_post_prob_routine(size_t from, - size_t to, - std::promise& post_prob_colsum) ; - - /*! - * \brief Computes the class/state probabilities from the - * posterior probabilities. - */ - void compute_class_prob() ; - - /*! - * \brief Update the data models for all layers, given - * the current posterior and class probabilities. - */ - void update_models() ; - - /*! - * \brief Modifies the state probabilities in such a - * way that the state probabilities are then normaly - * distributed, centered on the middle shift state. - * However, the overall class probabilities remain - * unchanged. - */ - void center_post_state_prob() ; - - /*! - * \brief the number of data layers. - */ - size_t n_layer ; - /*! - * \brief the number of rows in data. - */ - size_t n_row ; - /*! - * \brief the number of columns in data. - */ - size_t n_col ; - /*! - * \brief the number of classes. - */ - size_t n_class ; - /*! - * \brief the number of shift states. - */ - size_t n_shift ; - /*! - * \brief zhe number of flip states. 
- */ - size_t n_flip ; - /*! - * \brief the number of iterations. - */ - size_t n_iter ; - /*! - * \brief whther flip is allowed. - */ - bool flip ; - /*! - * \brief the length of the models. - */ - size_t l_model ; - - /*! - * \brief the log likelihoods. - * One per data layer. - */ - std::vector loglikelihood ; - /*! - * \brief the max log likelihood value for each row. - * One per data layer. - */ - std::vector loglikelihood_max ; - /*! - * \brief the joint loglikelihood, through all - * layers, for each state (each class for each - * shift and flip state). - */ - matrix4d_d loglikelihood_joint ; - /*! - * \brief the posterior probabilities. - */ - matrix4d_d post_prob ; - /*! - * \brief the states (shift and flip in each class) - * probabilities. - */ - matrix3d_d post_state_prob ; - /*! - * \brief the total prob per class. - */ - vector_d post_class_prob ; - /*! - * \brief the sum per row (data point) of post_prob. - */ - vector_d post_prob_rowsum ; - /*! - * \brief the sum per column (class) of post_prob. - */ - vector_d post_prob_colsum ; - /*! - * \brief the total of post_prob. - */ - double post_prob_tot ; - - /*! - * \brief the read data and their models. - */ - std::list read_layer_list ; - /*! - * \brief the sequence data and their models. - */ - std::list sequence_layer_list ; - /*! - * \brief the threads. 
- */ - ThreadPool* threads ; - -} ; - -#endif // EMENGINE_HPP diff --git a/src/Clustering/EMJoint.cpp b/src/Clustering/EMJoint.cpp new file mode 100644 index 0000000..d26385a --- /dev/null +++ b/src/Clustering/EMJoint.cpp @@ -0,0 +1,435 @@ + +#include + +#include +#include +#include // std::promise, std::future +#include // std::pair, std::move() +#include // std::bind(), std::ref() + +#include +#include +#include +#include +#include +#include +#include // getRandomNumberGenerator() +#include // ConsoleProgressBar + + +template +std::ostream& operator << (std::ostream& stream, + const std::vector& v) +{ for(const auto& t : v) + { stream << t << " " ; } + return stream ; +} + +EMJoint::EMJoint(const std::vector>& read_matrices, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + const std::string& seed, + size_t n_threads) + : EMBase(read_matrices[0].get_nrow(), + read_matrices[0].get_ncol(), + n_class, + n_iter, + n_shift, + flip, + n_threads), + n_layer(read_matrices.size()), + loglikelihood_layer(n_layer, + Matrix4D(this->n_row, + this->n_class, + this->n_shift, + this->n_flip, + 0.)), + loglikelihood_max(this->n_layer, + vector_d(this->n_row, 0.)), + read_layers(), + seq_layer(nullptr) + +{ + // check data matrices and their dimensions + if(this->n_layer == 0) + { throw std::invalid_argument("Error! No data layer given!") ; } + for(const auto& matrix : read_matrices) + { if(matrix.get_nrow() != this->n_row) + { char msg[4096] ; + sprintf(msg, "Error! Read layers have variable row numbers " + "(%zu and %zu)!", + matrix.get_nrow(), this->n_row) ; + throw std::invalid_argument(msg) ; + } + else if(matrix.get_ncol() != this->n_col) + { char msg[4096] ; + sprintf(msg, "Error! 
Read layers have variable column numbers " + "(%zu and %zu)!", + matrix.get_ncol(), this->n_col) ; + throw std::invalid_argument(msg) ; + } + } + + // initialise post prob randomly + // getRandomGenerator(seed) ; + this->set_post_prob_random(seed) ; + + // data and models + // create read layer and initialise the models from the post prob + for(const auto& matrix : read_matrices) + { // create the layer + this->read_layers.push_back(new ReadLayer(matrix, + this->n_class, + this->n_shift, + this->flip, + this->threads)) ; + this->read_layers.back()->update_model(this->post_prob, + this->threads) ; + } +} + +EMJoint::EMJoint(const std::vector>& read_matrices, + const Matrix2D& seq_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + const std::string& seed, + size_t n_threads) + : EMBase(read_matrices[0].get_nrow(), + read_matrices[0].get_ncol(), + n_class, + n_iter, + n_shift, + flip, + n_threads), + n_layer(read_matrices.size()+1), + loglikelihood_layer(this->n_layer, + Matrix4D(this->n_row, + this->n_class, + this->n_shift, + this->n_flip, + 0.)), + loglikelihood_max(this->n_layer, + vector_d(this->n_row, 0.)), + read_layers(), + seq_layer(nullptr) +{ // check data matrices and their dimensions + for(const auto& matrix : read_matrices) + { if(matrix.get_nrow() != this->n_row) + { char msg[4096] ; + sprintf(msg, "Error! A read matrix row number is different than expected " + "(%zu instead of %zu)!", + matrix.get_nrow(), this->n_row) ; + throw std::invalid_argument(msg) ; + } + else if(matrix.get_ncol() != this->n_col) + { char msg[4096] ; + sprintf(msg, "Error! A read matrix column number is different than expected " + "(%zu instead of %zu)!", + matrix.get_ncol(), this->n_col) ; + throw std::invalid_argument(msg) ; + } + } + if(seq_matrix.get_nrow() != this->n_row) + { char msg[4096] ; + sprintf(msg, "Error! 
A sequence matrix row number is different than expected " + "(%zu instead of %zu)!", + seq_matrix.get_nrow(), this->n_row) ; + throw std::invalid_argument(msg) ; + } + else if(seq_matrix.get_ncol() != this->n_col) + { char msg[4096] ; + sprintf(msg, "Error! A sequence matrix column number is different than expected " + "(%zu instead of %zu)!", + seq_matrix.get_ncol(), this->n_col) ; + throw std::invalid_argument(msg) ; + } + + // initialise post prob randomly + // getRandomGenerator(seed) ; + this->set_post_prob_random(seed) ; + + // data and models + // create read layer and initialise the models from the post prob + for(const auto& matrix : read_matrices) + { // create the layer + this->read_layers.push_back(new ReadLayer(matrix, + this->n_class, + this->n_shift, + this->flip, + this->threads)) ; + this->read_layers.back()->update_model(this->post_prob, + this->threads) ; + } + // create sequence layer and initialise the models from the post prob + this->seq_layer = new SequenceLayer(seq_matrix, + this->n_class, + this->n_shift, + this->flip, + false) ; + this->seq_layer->update_model(this->post_prob, + this->threads) ; +} + +EMJoint::~EMJoint() +{ // join the threads in case + // deleted by EMBase destructor + this->threads->join() ; + + // read data and models + for(auto& ptr : this->read_layers) + { if(ptr != nullptr) + { delete ptr ; + ptr = nullptr ; + } + } + // sequence data and models + if(seq_layer != nullptr) + { delete seq_layer ; + seq_layer = nullptr ; + } +} + +std::vector> EMJoint::get_read_models() const +{ std::vector> models ; + for(const auto& ptr : this->read_layers) + { models.push_back(ptr->get_model()) ; } + return models ; +} + +Matrix3D EMJoint::get_sequence_models() const +{ return this->seq_layer->get_model() ; } + +EMJoint::exit_codes EMJoint::classify() +{ + size_t bar_update_n = this->n_iter ; + ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; + + // optimize the partition + for(size_t n_iter=0; n_itern_iter; 
n_iter++) + { // E-step + this->compute_loglikelihood() ; + this->compute_post_prob() ; + // M-step + this->compute_class_prob() ; + this->update_models() ; + this->center_post_state_prob() ; + bar.update() ; + } + bar.update() ; std::cerr << std::endl ; + return EMJoint::exit_codes::ITER_MAX ; +} + +void EMJoint::compute_loglikelihood() +{ // compute the loglikelihood for each layer + size_t i = 0 ; + for(auto& ptr : this->read_layers) + { ptr->compute_loglikelihoods(this->loglikelihood_layer[i], + this->loglikelihood_max[i], + this->threads) ; + i++ ; + } + this->seq_layer->compute_loglikelihoods(this->loglikelihood_layer[i], + this->loglikelihood_max[i], + this->threads) ; + i++ ; + /* + // sum the likelihood for each state, over all layers + // this is the "joint likelihood" + for(size_t i=0; in_row; i++) + { for(size_t j=0; jn_class; j++) + { for(size_t k=0; kn_shift; k++) + { for(size_t l=0; ln_flip; l++) + { + // reset + this->loglikelihood(i,j,k,l) = 0. ; + // sum + for(size_t m=0; mn_layer; m++) + { this->loglikelihood(i,j,k,l) += + (this->loglikelihood_layer[m](i,j,k,l) - + this->loglikelihood_max[m][i]) ; + } + } + } + } + } + */ + + // sum the likelihood for each state, over all layers + // and rescale the values + // don't parallelize + if(this->threads == nullptr) + { std::promise promise ; + std::future future = promise.get_future() ; + this->compute_loglikelihood_routine(0, + this->n_row, + promise) ; + future.get() ; + } + // parallelize + else + { size_t n_threads = this->threads->getNThread() ; + + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, this->n_row,n_threads) ; + + // get promises and futures + std::vector> promises(n_threads) ; + std::vector> futures(n_threads) ; + for(size_t i=0; ithreads->addJob(std::move( + std::bind(&EMJoint::compute_loglikelihood_routine, + this, + slice.first, + slice.second, + std::ref(promises[i])))) ; + } + // wait until all threads are done working + 
for(auto& future : futures) + { future.get() ; } + // -------------------------- threads stop --------------------------- + } +} + +void EMJoint::compute_loglikelihood_routine(size_t from, + size_t to, + std::promise& done) +{ + // limite value range + for(size_t i=from; in_class; j++) + { for(size_t k=0; kn_shift; k++) + { for(size_t l=0; ln_flip; l++) + { + // reset + this->loglikelihood(i,j,k,l) = 0. ; + // sum + for(size_t m=0; mn_layer; m++) + { this->loglikelihood(i,j,k,l) += + (this->loglikelihood_layer[m](i,j,k,l) - + this->loglikelihood_max[m][i]) ; + } + } + } + } + } + done.set_value(true) ; +} + +void EMJoint::compute_post_prob() +{ // don't parallelize + if(this->threads == nullptr) + { std::promise promise ; + std::future future = promise.get_future() ; + this->compute_post_prob_routine(0, this->n_row, promise) ; + // compute the sum of post prob and the per class sum of post prob + // from the partial results computed on each slice + this->post_prob_tot = 0. ; + this->post_prob_colsum = future.get() ; + for(const auto& prob : this->post_prob_colsum) + { this->post_prob_tot += prob ; } + } + // parallelize + else + { size_t n_threads = this->threads->getNThread() ; + + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, this->n_row,n_threads) ; + + // get promises and futures + // the function run by the threads will compute + // the partial sum per class of post_prob for the given slice + // this should be used to compute the complete sum of post_prob + // and the complete sum per class of post_prob + std::vector> promises(n_threads) ; + std::vector> futures(n_threads) ; + for(size_t i=0; ithreads->addJob(std::move( + std::bind(&EMJoint::compute_post_prob_routine, + this, + slice.first, + slice.second, + std::ref(promises[i])))) ; + } + // wait until all threads are done working + // compute the sum of post prob and the per class sum of post prob + // from the partial results computed on each 
slice + this->post_prob_tot = 0. ; + this->post_prob_colsum = vector_d(this->n_class, 0.) ; + for(auto& future : futures) + { auto probs = future.get() ; + for(size_t i=0; in_class; i++) + { double prob = probs[i] ; + this->post_prob_colsum[i] += prob ; + this->post_prob_tot += prob ; + } + } + // -------------------------- threads stop --------------------------- + } +} + +void EMJoint::compute_post_prob_routine(size_t from, + size_t to, + std::promise& post_prob_colsum) +{ vector_d colsums(this->n_class, 0.) ; + + // post prob + for(size_t i=from; ipost_prob_rowsum[i] = 0. ; + for(size_t n_class=0; n_classn_class; n_class++) + { for(size_t n_shift=0; n_shiftn_shift; n_shift++) + { for(size_t n_flip=0; n_flipn_flip; n_flip++) + { + double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * + this->post_state_prob(n_class,n_shift,n_flip) ; + this->post_prob(i,n_class,n_shift,n_flip) = p ; + this->post_prob_rowsum[i] += p ; + } + } + } + // normalize + for(size_t n_class=0; n_classn_class; n_class++) + { for(size_t n_shift=0; n_shiftn_shift; n_shift++) + { for(size_t n_flip=0; n_flipn_flip; n_flip++) + { + double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / + this->post_prob_rowsum[i], + ReadLayer::p_min) ; + this->post_prob(i,n_class,n_shift,n_flip) = p ; + colsums[n_class] += p ; + } + } + } + } + post_prob_colsum.set_value(colsums) ; +} + +void EMJoint::update_models() +{ // read data and models + for(auto& ptr : this->read_layers) + { ptr->update_model(this->post_prob, + this->post_prob_colsum, + this->threads) ; + } + // sequence data and models + this->seq_layer->update_model(this->post_prob, + this->threads) ; +} diff --git a/src/Clustering/EMJoint.hpp b/src/Clustering/EMJoint.hpp new file mode 100644 index 0000000..adb1f0e --- /dev/null +++ b/src/Clustering/EMJoint.hpp @@ -0,0 +1,198 @@ +#ifndef EMJOINT_HPP +#define EMJOINT_HPP + + +#include + +#include +#include + +#include +#include +#include +#include +#include + + +typedef std::vector 
vector_d ; + +class EMJoint : public EMBase +{ + public: + + /*! + * \brief Constructs an object to partition the + * region according to all the given read densities + * with the given shifting and flipping freedom. + * \param read_matrices a vector containing all + * the different data densities (ChIP-seq or related + * signal) for the regions of interest. + * \param seq_matrix a matrix containing the DNA + * sequences for the regions of interest. + * \param n_class the number of region classes + * to search. + * \param n_iter the number of optimization iterations. + * \param n_shift the number of shift states allowed. + * \param flip whether flipping is allowed. + * \param seed a seed to initialise the random number + * generator. + * \param n_threads the number of parallel threads + * to run the computations. 0 means no parallel + * computing, everything is run on the main thread. + */ + EMJoint(const std::vector>& read_matrices, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + const std::string& seed="", + size_t n_threads=0) ; + + /*! + * \brief Constructs an object to partition the + * region according to all the given read densities + * and region sequences with the given shifting and + * flipping freedom. + * \param read_matrices a vector containing all + * the different data densities (ChIP-seq or related + * signal) for the regions of interest. + * \param seq_matrix a matrix containing the DNA + * sequences for the regions of interest. + * \param n_class the number of region classes + * to search. + * \param n_iter the number of optimization iterations. + * \param n_shift the number of shift states allowed. + * \param flip whether flipping is allowed. + * \param seed a seed to initialise the random number + * generator. + * \param n_threads the number of parallel threads + * to run the computations. 0 means no parallel + * computing, everything is run on the main thread. 
+ */ + EMJoint(const std::vector>& read_matrices, + const Matrix2D& seq_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + const std::string& seed="", + size_t n_threads=0) ; + + EMJoint(const EMJoint& other) = delete ; + + /*! + * \brief Destructor. + */ + virtual ~EMJoint() override ; + + /*! + * \brief Returns all layer read models. + * The models are in the same order + * as the data were given to the + * constructor. + * \return a vector containing the + * models. + */ + std::vector> get_read_models() const ; + + /*! + * \brief Returns the sequence models. + * \return a vector containing the + * models. + */ + Matrix3D get_sequence_models() const ; + + /*! + * \brief Runs the sequence model optimization and + * the data classification. + * \return a code indicating how the optimization + * ended. + */ + virtual EMJoint::exit_codes classify() override ; + + private: + + /*! + * \brief Computes the data log likelihood given the + * current models, for all layers and the joint + * likelihood for each state (the sum of the layer + * likelihoods for all layers, for a given state). + */ + virtual void compute_loglikelihood() override ; + + /*! + * \brief This is a routine of compute_loglikelihood() that + * computes the joint loglikelihood by summing the + * individual loglikelihood obtained from each data layer. + * At the same time, this method rescales the loglikelihood + * values by substacting to each value the maximum + * loglikelihood value found in the same data row, + * for each layer. + * \param from the index of the first row + * in the data to consider. + * \param to the index of the past last row + * in the data to consider. + * \param done a promise to fill when the method + * is done. + */ + void compute_loglikelihood_routine(size_t from, + size_t to, + std::promise& done) ; + + /*! + * \brief Computes the data posterior probabilties. + */ + virtual void compute_post_prob() override ; + + /*! 
+ * \brief The routine that effectively computes + * the posterior probabilties. + * \param from the index of the first row + * in the data to consider. + * \param to the index of the past last row + * in the data to consider. + * \param done the partial column (over the classes) + * sum of posterior probabilities. If several routines + * are running together, the colsums are retrieved by + * summing up the vectors together. + */ + void compute_post_prob_routine(size_t from, + size_t to, + std::promise& post_prob_colsum) ; + + /*! + * \brief Update the data models for all layers, given + * the current posterior and class probabilities. + */ + virtual void update_models() override ; + + /*! + * \brief the number of data layers. + */ + size_t n_layer ; + /*! + * \brief the log likelihood buffers for each individual + * layer (one element per layer). + */ + std::vector> loglikelihood_layer ; + /*! + * \brief the max loglikelihood value for + * each each data layer (1st dimension) + * and each data row of the given layer + * (2nd dimension). + */ + std::vector loglikelihood_max ; + /*! + * \brief A vector containing the pointers + * to the objects managing all the read + * layer data and models. + */ + std::vector read_layers ; + /*! + * \brief A pointer to the object managing + * the data and their model. 
+ */ + SequenceLayer* seq_layer ; +} ; + +#endif // EMJOINT_HPP diff --git a/src/Clustering/EMRead.cpp b/src/Clustering/EMRead.cpp new file mode 100644 index 0000000..872cd54 --- /dev/null +++ b/src/Clustering/EMRead.cpp @@ -0,0 +1,265 @@ +#include + +#include +#include +#include // std::promise, std::future +#include // std::pair, std::move() +#include // std::bind(), std::ref() +#include // exp() + +#include // ReadLayer +#include // getRandomNumberGenerator() +#include // ConsoleProgressBar +#include // ThreadPool + + + +EMRead::EMRead(const Matrix2D& read_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + const std::string& seed, + size_t n_threads) + : EMBase(read_matrix.get_nrow(), + read_matrix.get_ncol(), + n_class, + n_iter, + n_shift, + flip, + n_threads), + loglikelihood_max(n_row, 0.), + read_layer(nullptr) +{ this->loglikelihood_max = vector_d(n_row, 0.) ; + + // initialise post prob randomly + this->set_post_prob_random(seed) ; + // data and models + this->read_layer = new ReadLayer(read_matrix, + this->n_class, + this->n_shift, + flip, + this->threads) ; + // intialise the models with the post prob + this->read_layer->update_model(this->post_prob, + this->threads) ; +} + +EMRead::~EMRead() +{ if(this->read_layer == nullptr) + { delete this->read_layer ; + this->read_layer = nullptr ; + } +} + +Matrix3D EMRead::get_read_models() const +{ return read_layer->get_model() ; } + +EMRead::exit_codes EMRead::classify() +{ size_t bar_update_n = this->n_iter ; + ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; + + // optimize the partition + for(size_t n_iter=0; n_itern_iter; n_iter++) + { // E-step + this->compute_loglikelihood() ; + this->compute_post_prob() ; + // M-step + this->compute_class_prob() ; + this->update_models() ; + this->center_post_state_prob() ; + + bar.update() ; + } + bar.update() ; std::cerr << std::endl ; + return EMRead::exit_codes::ITER_MAX ; +} + +void EMRead::compute_loglikelihood() +{ // 
compute the loglikelihood + this->read_layer->compute_loglikelihoods(this->loglikelihood, + this->loglikelihood_max, + this->threads) ; + + /* + // rescale the values + for(size_t i=0; in_row; i++) + { for(size_t j=0; jn_class; j++) + { for(size_t k=0; kn_shift; k++) + { for(size_t l=0; ln_flip; l++) + { this->loglikelihood(i,j,k,l) = + (this->loglikelihood(i,j,k,l) - + this->loglikelihood_max[i]) ; + } + } + } + } + */ + + // rescale the values + // don't parallelize + if(this->threads == nullptr) + { std::promise promise ; + std::future future = promise.get_future() ; + this->compute_loglikelihood_routine(0, + this->n_row, + promise) ; + future.get() ; + } + // parallelize + else + { size_t n_threads = this->threads->getNThread() ; + + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, this->n_row,n_threads) ; + + // get promises and futures + std::vector> promises(n_threads) ; + std::vector> futures(n_threads) ; + for(size_t i=0; ithreads->addJob(std::move( + std::bind(&EMRead::compute_loglikelihood_routine, + this, + slice.first, + slice.second, + std::ref(promises[i])))) ; + } + // wait until all threads are done working + for(auto& future : futures) + { future.get() ; } + // -------------------------- threads stop --------------------------- + } +} + +void EMRead::compute_loglikelihood_routine(size_t from, + size_t to, + std::promise& done) +{ + // rescale the values + for(size_t i=from; in_class; j++) + { for(size_t k=0; kn_shift; k++) + { for(size_t l=0; ln_flip; l++) + { this->loglikelihood(i,j,k,l) = + (this->loglikelihood(i,j,k,l) - + this->loglikelihood_max[i]) ; + } + } + } + } + done.set_value(true) ; +} + +void EMRead::compute_post_prob() +{ // don't parallelize + if(this->threads == nullptr) + { std::promise promise ; + std::future future = promise.get_future() ; + this->compute_post_prob_routine(0, this->n_row, promise) ; + // compute the sum of post prob and the per class sum of post prob + // 
from the partial results computed on each slice + this->post_prob_tot = 0. ; + this->post_prob_colsum = future.get() ; + for(const auto& prob : this->post_prob_colsum) + { this->post_prob_tot += prob ; } + } + // parallelize + else + { size_t n_threads = this->threads->getNThread() ; + + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, this->n_row,n_threads) ; + + // get promises and futures + // the function run by the threads will compute + // the partial sum per class of post_prob for the given slice + // this should be used to compute the complete sum of post_prob + // and the complete sum per class of post_prob + std::vector> promises(n_threads) ; + std::vector> futures(n_threads) ; + for(size_t i=0; ithreads->addJob(std::move( + std::bind(&EMRead::compute_post_prob_routine, + this, + slice.first, + slice.second, + std::ref(promises[i])))) ; + } + // wait until all threads are done working + // compute the sum of post prob and the per class sum of post prob + // from the partial results computed on each slice + this->post_prob_tot = 0. ; + this->post_prob_colsum = vector_d(this->n_class, 0.) ; + for(auto& future : futures) + { auto probs = future.get() ; + for(size_t i=0; in_class; i++) + { double prob = probs[i] ; + this->post_prob_colsum[i] += prob ; + this->post_prob_tot += prob ; + } + } + // -------------------------- threads stop --------------------------- + } +} + + +void EMRead::compute_post_prob_routine(size_t from, + size_t to, + std::promise& post_prob_colsum) +{ vector_d colsums(this->n_class, 0.) ; + + // reset grand total + // this->post_prob_tot = 0 ; + // this->post_prob_colsum = vector_d(n_class, 0) ; + + // post prob + for(size_t i=from; ipost_prob_rowsum[i] = 0. 
; + for(size_t n_class=0; n_classn_class; n_class++) + { for(size_t n_shift=0; n_shiftn_shift; n_shift++) + { for(size_t n_flip=0; n_flipn_flip; n_flip++) + { + double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * + this->post_state_prob(n_class,n_shift,n_flip) ; + this->post_prob(i,n_class,n_shift,n_flip) = p ; + this->post_prob_rowsum[i] += p ; + } + } + } + // normalize + for(size_t n_class=0; n_classn_class; n_class++) + { for(size_t n_shift=0; n_shiftn_shift; n_shift++) + { for(size_t n_flip=0; n_flipn_flip; n_flip++) + { + double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / + this->post_prob_rowsum[i], + ReadLayer::p_min) ; + this->post_prob(i,n_class,n_shift,n_flip) = p ; + colsums[n_class] += p ; + } + } + } + } + post_prob_colsum.set_value(colsums) ; +} + +void EMRead::update_models() +{ this->read_layer->update_model(this->post_prob, + this->post_prob_colsum, + this->threads) ; +} diff --git a/src/Clustering/EMRead.hpp b/src/Clustering/EMRead.hpp new file mode 100644 index 0000000..da351b4 --- /dev/null +++ b/src/Clustering/EMRead.hpp @@ -0,0 +1,133 @@ +#ifndef EMREAD_HPP +#define EMREAD_HPP + +#include + +#include +#include +#include // std::promise + +#include +#include + + +typedef std::vector vector_d ; + + +class EMRead : public EMBase +{ public: + /*! + * \brief Constructs an object to partition the + * region (rows) according to the shape of the signal + * with the given shifting and flipping freedom. + * \param read_matrix a matrix containing the read + * densitiy (ChIP-seq or related signal) for the + * regions of interest. + * \param n_class the number of region classes + * to search. + * \param n_iter the number of optimization iterations. + * \param n_shift the number of shift states allowed. + * \param flip whether flipping is allowed. + * \param seed a seed to initialise the random number + * generator. + * \param n_threads the number of parallel threads + * to run the computations. 
0 means no parallel + * computing, everything is run on the main thread. + */ + EMRead(const Matrix2D& read_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + const std::string& seed="", + size_t n_threads=0) ; + + EMRead(const EMRead& other) = delete ; + + /*! + * \brief Destructor. + */ + virtual ~EMRead() override ; + + /*! + * \brief Returns the class read signal model. + * \return the class read signal model. + */ + Matrix3D get_read_models() const ; + + /*! + * \brief Runs the read signal model optimization and + * the data classification. + * \return a code indicating how the optimization + * ended. + */ + virtual EMRead::exit_codes classify() override ; + + private: + + /*! + * \brief Computes the data log likelihood given the + * current models, for all layers and the joint + * likelihood for each state (the sum of the layer + * likelihoods for all layers, for a given state). + */ + virtual void compute_loglikelihood() override ; + + /*! + * \brief This is a routine of compute_loglikelihood(). + * This method rescales the loglikelihood values by + * substacting to each value the maximum loglikelihood + * value found in the same data row. + * This method + * \param from the index of the first row + * in the data to consider. + * \param to the index of the past last row + * in the data to consider. + * \param done a promise to fill when the method + * is done. + */ + void compute_loglikelihood_routine(size_t from, + size_t to, + std::promise& done) ; + + /*! + * \brief Computes the data posterior probabilties. + */ + virtual void compute_post_prob() override ; + + /*! + * \brief The routine that effectively computes + * the posterior probabilties. + * \param from the index of the first row + * in the data to consider. + * \param to the index of the past last row + * in the data to consider. + * \param done the partial column (over the classes) + * sum of posterior probabilities. 
If several routines + * are running together, the colsums are retrieved by + * summing up the vectors together. + */ + void compute_post_prob_routine(size_t from, + size_t to, + std::promise& post_prob_colsum) ; + + /*! + * \brief Update the data models for all layers, given + * the current posterior and class probabilities. + */ + virtual void update_models() override ; + + /*! + * \brief the max loglikelihood value for + * each data row. + */ + std::vector loglikelihood_max ; + /*! + * \brief A pointer to the object managing + * the data and their model. + */ + ReadLayer* read_layer ; + +} ; + +#endif // EMREAD_HPP diff --git a/src/Clustering/EMSequence.cpp b/src/Clustering/EMSequence.cpp new file mode 100644 index 0000000..0d75793 --- /dev/null +++ b/src/Clustering/EMSequence.cpp @@ -0,0 +1,310 @@ +#include + +#include +#include +#include // std::promise, std::future +#include // std::pair, std::move() +#include // std::bind(), std::ref() +#include // exp() + +#include // SequenceLayer +#include // getRandomNumberGenerator() +#include // ConsoleProgressBar +#include // ThreadPool +#include // dna::base_composition() + +template +std::ostream& operator << (std::ostream& stream, const std::vector& v) +{ for(const auto& x : v) + { stream << x << " " ; } + return stream ; +} + + +EMSequence::EMSequence(const Matrix2D& seq_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed, + size_t n_threads) + : EMBase(seq_matrix.get_nrow(), + seq_matrix.get_ncol(), + n_class, + n_iter, + n_shift, + flip, + n_threads), + loglikelihood_max(n_row, 0.), + seq_layer(nullptr) +{ this->loglikelihood_max = vector_d(n_row, 0.) 
; + + // initialise post prob randomly + // getRandomGenerator(seed) ; + this->set_post_prob_random(seed) ; + + // data and models + this->seq_layer = new SequenceLayer(seq_matrix, + this->n_class, + this->n_shift, + this->flip, + bckg_class) ; + + // intialise the models with the post prob + this->seq_layer->update_model(this->post_prob, + this->threads) ; + // overwrite last class as background class + if(bckg_class) + { // sequence composition + std::vector base_comp = + dna::base_composition(seq_matrix, + flip) ; + // create a motif + Matrix2D bckg_motif(4, + seq_matrix.get_ncol()-this->n_shift+1) ; + for(size_t i=0; iseq_layer->set_class(this->n_class-1, + bckg_motif) ; + } +} + +EMSequence::EMSequence(const Matrix2D& seq_matrix, + const Matrix3D& motifs, + size_t n_iter, + bool flip, + bool bckg_class, + size_t n_threads) + : EMBase(seq_matrix.get_nrow(), + seq_matrix.get_ncol(), + motifs.get_dim()[0], + n_iter, + seq_matrix.get_ncol() - motifs.get_dim()[1] + 1, + flip, + n_threads), + loglikelihood_max(n_row, 0.), + seq_layer(nullptr) +{ + + this->loglikelihood_max = vector_d(n_row, 0.) 
; + + // initialise post prob randomly + // getRandomGenerator(seed) ; + // this->set_post_prob_random(seed) ; + + // data and models + this->seq_layer = new SequenceLayer(seq_matrix, + motifs, + this->flip, + bckg_class) ; + + // intialise the class prob uniformly + this->set_state_prob_uniform() ; +} + + +EMSequence::~EMSequence() +{ if(this->seq_layer == nullptr) + { delete this->seq_layer ; + this->seq_layer = nullptr ; + } +} + +Matrix3D EMSequence::get_sequence_models() const +{ return seq_layer->get_model() ; } + +EMSequence::exit_codes EMSequence::classify() +{ + size_t bar_update_n = this->n_iter ; + ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; + + // optimize the partition + for(size_t n_iter=0; n_itern_iter; n_iter++) + { // E-step + this->compute_loglikelihood() ; + this->compute_post_prob() ; + // M-step + this->compute_class_prob() ; + this->update_models() ; + this->center_post_state_prob() ; + bar.update() ; + } + bar.update() ; std::cerr << std::endl ; + return EMSequence::exit_codes::ITER_MAX ; +} + +void EMSequence::compute_loglikelihood() +{ // compute the loglikelihood + this->seq_layer->compute_loglikelihoods(this->loglikelihood, + this->loglikelihood_max, + this->threads) ; + // rescale the values + // don't parallelize + if(this->threads == nullptr) + { std::promise promise ; + std::future future = promise.get_future() ; + this->compute_loglikelihood_routine(0, + this->n_row, + promise) ; + future.get() ; + } + // parallelize + else + { size_t n_threads = this->threads->getNThread() ; + + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, this->n_row,n_threads) ; + + // get promises and futures + std::vector> promises(n_threads) ; + std::vector> futures(n_threads) ; + for(size_t i=0; ithreads->addJob(std::move( + std::bind(&EMSequence::compute_loglikelihood_routine, + this, + slice.first, + slice.second, + std::ref(promises[i])))) ; + } + // wait until all threads 
are done working + for(auto& future : futures) + { future.get() ; } + // -------------------------- threads stop --------------------------- + } +} + +void EMSequence::compute_loglikelihood_routine(size_t from, + size_t to, + std::promise& done) +{ + // rescale the values + for(size_t i=from; in_class; j++) + { for(size_t k=0; kn_shift; k++) + { for(size_t l=0; ln_flip; l++) + { this->loglikelihood(i,j,k,l) = + (this->loglikelihood(i,j,k,l) - + this->loglikelihood_max[i]) ; + } + } + } + } + done.set_value(true) ; +} + +void EMSequence::compute_post_prob() +{ // don't parallelize + if(this->threads == nullptr) + { std::promise promise ; + std::future future = promise.get_future() ; + this->compute_post_prob_routine(0, this->n_row, promise) ; + // compute the sum of post prob and the per class sum of post prob + // from the partial results computed on each slice + this->post_prob_tot = 0. ; + this->post_prob_colsum = future.get() ; + for(const auto& prob : this->post_prob_colsum) + { this->post_prob_tot += prob ; } + } + // parallelize + else + { size_t n_threads = this->threads->getNThread() ; + + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, this->n_row,n_threads) ; + + // get promises and futures + // the function run by the threads will compute + // the partial sum per class of post_prob for the given slice + // this should be used to compute the complete sum of post_prob + // and the complete sum per class of post_prob + std::vector> promises(n_threads) ; + std::vector> futures(n_threads) ; + for(size_t i=0; ithreads->addJob(std::move( + std::bind(&EMSequence::compute_post_prob_routine, + this, + slice.first, + slice.second, + std::ref(promises[i])))) ; + } + // wait until all threads are done working + // compute the sum of post prob and the per class sum of post prob + // from the partial results computed on each slice + this->post_prob_tot = 0. 
; + this->post_prob_colsum = vector_d(this->n_class, 0.) ; + for(auto& future : futures) + { auto probs = future.get() ; + for(size_t i=0; in_class; i++) + { double prob = probs[i] ; + this->post_prob_colsum[i] += prob ; + this->post_prob_tot += prob ; + } + } + // -------------------------- threads stop --------------------------- + } +} + + +void EMSequence::compute_post_prob_routine(size_t from, + size_t to, + std::promise& post_prob_colsum) +{ vector_d colsums(this->n_class, 0.) ; + + // reset grand total + // this->post_prob_tot = 0 ; + // this->post_prob_colsum = vector_d(n_class, 0) ; + + // post prob + for(size_t i=from; ipost_prob_rowsum[i] = 0. ; + for(size_t n_class=0; n_classn_class; n_class++) + { for(size_t n_shift=0; n_shiftn_shift; n_shift++) + { for(size_t n_flip=0; n_flipn_flip; n_flip++) + { + double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * + this->post_state_prob(n_class,n_shift,n_flip) ; + this->post_prob(i,n_class,n_shift,n_flip) = p ; + this->post_prob_rowsum[i] += p ; + } + } + } + // normalize + for(size_t n_class=0; n_classn_class; n_class++) + { for(size_t n_shift=0; n_shiftn_shift; n_shift++) + { for(size_t n_flip=0; n_flipn_flip; n_flip++) + { + double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / + this->post_prob_rowsum[i], + SequenceLayer::p_min) ; + this->post_prob(i,n_class,n_shift,n_flip) = p ; + colsums[n_class] += p ; + } + } + } + } + post_prob_colsum.set_value(colsums) ; +} + +void EMSequence::update_models() +{ this->seq_layer->update_model(this->post_prob, + this->threads) ; +} diff --git a/src/Clustering/EMSequence.hpp b/src/Clustering/EMSequence.hpp new file mode 100644 index 0000000..fdeef47 --- /dev/null +++ b/src/Clustering/EMSequence.hpp @@ -0,0 +1,173 @@ +#ifndef EMSEQUENCE_HPP +#define EMSEQUENCE_HPP + +#include + +#include +#include +#include // std::promise + +#include +#include + + +typedef std::vector vector_d ; + + +class EMSequence : public EMBase +{ public: + /*! 
+ * \brief Constructs an object to partition the + * given sequences (rows) according to their motif + * content. + * The sequences models are initialised randomly. + * \param sequence_matrix a matrix containing the sequences + * of interest. + * \param n_class the number of region classes + * to search. + * \param n_iter the number of optimization iterations. + * \param n_shift the number of shift states allowed. + * \param flip whether flipping is allowed. + * \param bckg_class the last class is used to model the background + * by setting all its parameters, at all positions, to the + * background base probabilties. Since the background is constant, + * this class will never be updated. + * \param seed a seed to initialise the random number + * generator. + * \param n_threads the number of parallel threads + * to run the computations. 0 means no parallel + * computing, everything is run on the main thread. + */ + EMSequence(const Matrix2D& sequence_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed="", + size_t n_threads=0) ; + + /*! + * \brief Constructs an object to partition the + * given sequences (rows) according to their motif + * content. + * The sequences class models are initialised using + * the given motifs. The class probabilities are + * initialised uniformlly. + * The shifting freedom is set to (data number + * of columns) - (the model 2nd dimension) + * + 1. + * \param sequence_matrix a matrix containing the sequences + * of interest. + * \param motifs a matrix containing the different initial + * class models with the following dimensions : + * dim1 the number of classes + * dim2 the model length + * dim3 4 for A,C,G,T + * \param n_class the number of region classes + * to search. + * \param n_iter the number of optimization iterations. + * \param flip whether flipping is allowed. 
+ * \param bckg_class indicates that the last class in the + * given motifs is used to model the background and it + * should never be updated. + * \param n_threads the number of parallel threads + * to run the computations. 0 means no parallel + * computing, everything is run on the main thread. + */ + EMSequence(const Matrix2D& sequence_matrix, + const Matrix3D& motifs, + size_t n_iter, + bool flip, + bool bckg_class, + size_t n_threads=0) ; + + EMSequence(const EMSequence& other) = delete ; + + /*! + * \brief Destructor. + */ + virtual ~EMSequence() override ; + + /*! + * \brief Returns the class sequence model. + * \return the class sequence model. + */ + Matrix3D get_sequence_models() const ; + + /*! + * \brief Runs the sequence model optimization and + * the data classification. + * \return a code indicating how the optimization + * ended. + */ + virtual EMSequence::exit_codes classify() override ; + + private: + + /*! + * \brief Computes the data log likelihood given the + * current models, for all layers and the joint + * likelihood for each state (the sum of the layer + * likelihoods for all layers, for a given state). + */ + virtual void compute_loglikelihood() override ; + + /*! + * \brief This is a routine of compute_loglikelihood(). + * This method rescales the loglikelihood values by + * substacting to each value the maximum loglikelihood + * value found in the same data row. + * This method + * \param from the index of the first row + * in the data to consider. + * \param to the index of the past last row + * in the data to consider. + * \param done a promise to fill when the method + * is done. + */ + void compute_loglikelihood_routine(size_t from, + size_t to, + std::promise& done) ; + + /*! + * \brief Computes the data posterior probabilties. + */ + virtual void compute_post_prob() override ; + + /*! + * \brief The routine that effectively computes + * the posterior probabilties. 
+ * \param from the index of the first row + * in the data to consider. + * \param to the index of the past last row + * in the data to consider. + * \param done the partial column (over the classes) + * sum of posterior probabilities. If several routines + * are running together, the colsums are retrieved by + * summing up the vectors together. + */ + void compute_post_prob_routine(size_t from, + size_t to, + std::promise& post_prob_colsum) ; + + /*! + * \brief Update the data models for all layers, given + * the current posterior and class probabilities. + */ + virtual void update_models() override ; + + /*! + * \brief the max loglikelihood value for + * each data row. + */ + std::vector loglikelihood_max ; + /*! + * \brief A pointer to the object managing + * the data and their model. + */ + SequenceLayer* seq_layer ; + +} ; + +#endif // EMSEQUENCE_HPP diff --git a/src/Clustering/ModelComputer.cpp b/src/Clustering/ModelComputer.cpp index 20e6cd5..e9b5acd 100644 --- a/src/Clustering/ModelComputer.cpp +++ b/src/Clustering/ModelComputer.cpp @@ -1,33 +1,34 @@ #include -#include +#include +#include ModelComputer::ModelComputer() : data_layer(nullptr) {} ModelComputer::~ModelComputer() { if(this->data_layer != nullptr) { delete this->data_layer ; this->data_layer = nullptr ; } } -matrix2d_d ModelComputer::get_model() const +Matrix2D ModelComputer::get_model() const { // the model - matrix3d_d model = this->data_layer->get_model() ; - size_t n_class = model.size() ; - size_t l_model = model[0].size() ; - size_t n_categ = model[0][0].size() ; + Matrix3D model = this->data_layer->get_model() ; + size_t n_class = model.get_dim()[0] ; + size_t l_model = model.get_dim()[1] ; + size_t n_categ = model.get_dim()[2] ; // a nice representation of the model - matrix2d_d model_nice(n_class*n_categ, - vector_d(l_model)) ; + Matrix2D model_nice(n_class*n_categ, + l_model) ; for(size_t i=0; i +#include #include class ModelComputer { public: /*! * \brief Constructs an empty object. 
*/ ModelComputer() ; ModelComputer(const ModelComputer& other) = delete ; /*! * \brief Destructor. */ virtual ~ModelComputer() ; /*! * \brief Returns the data model in a nice * format. * 1st dim: the different classes and * the model categories. For instance, * a read model with 2 classes will have * class 1 and class 2 over the rows. * A sequence model with 2 classes will * have class 1 A, class 1 C, class 1 G, * class 1 T, class 2 A, class 2 C, * class 2 G and class 2 T. * 2nd dim: the model length * ___________ * | class1 | /|\ * ___|__________|_\|/ 1 (reads) or 4 (sequences) * | class2 | /|\ * |__________| \|/ 1 (reads) or 4 (sequences) * * <----------> * model length * \return the data model. */ - virtual matrix2d_d get_model() const ; + virtual Matrix2D get_model() const ; protected: /*! * \brief The data layer containing the * data and their models. */ DataLayer* data_layer ; } ; #endif // MODELCOMPUTER_HPP diff --git a/src/Clustering/ReadLayer.cpp b/src/Clustering/ReadLayer.cpp index 511b081..a974e91 100644 --- a/src/Clustering/ReadLayer.cpp +++ b/src/Clustering/ReadLayer.cpp @@ -1,569 +1,478 @@ #include #include // std::invalid_argument #include // numeric_limits #include // log(), exp(), pow() #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() + #include // beta_pmf(), poisson_pmf() #include // rand_real_uniform(), rand_int_uniform() -#include +#include +#include +#include #include +#include + +typedef std::vector vector_d ; -ReadLayer::ReadLayer(const matrix2d_i& data, + +ReadLayer::ReadLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, ThreadPool* threads) : DataLayer(data, n_class, n_shift, flip), - window_means(n_row, - vector_d(n_shift, 0.)) + window_means(n_row, n_shift, 0.) 
{ this->n_category = 1 ; // initialise the empty model - this->model = matrix3d_d(this->n_class, - matrix2d_d(this->l_model, - vector_d(this->n_category, 0))) ; + this->model = Matrix3D(this->n_class, + this->l_model, + this->n_category, + 0) ; // compute window means this->compute_window_means(threads) ; } -ReadLayer::ReadLayer(const matrix2d_i& data, - const matrix3d_d& model, +ReadLayer::ReadLayer(const Matrix2D& data, + const Matrix3D& model, bool flip, ThreadPool* threads) : DataLayer(data, model, flip), - window_means(n_row, - vector_d(n_shift, 0.)) + window_means(n_row, n_shift, 0.) { // check that the model only has one category if(this->n_category > 1) { char msg[4096] ; sprintf(msg, "Error! model is expected to have length 1 on " "3rd dimension, not %zu", this->n_category) ; throw std::invalid_argument(msg) ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::~ReadLayer() {} -void ReadLayer::seed_model_randomly() -{ - // get random values from a beta distribution cannot be done using boost so - // i) generate random number [0,1] x - // ii) compute f(x) where f is beta distribution - - matrix2d_d prob(this->n_row, vector_d(this->n_class, 0.)) ; - double tot_sum = 0. ; - - // sample the prob - // beta distribution parameters - double alpha = pow(this->n_row, -0.5) ; - double beta = 1. ; - for(size_t i=0; in_row; i++) - { double row_sum = 0. 
; - for(size_t j=0; jn_class; j++) - { double x = rand_real_uniform(0., 1.0) ; - // double p = std::max(ReadLayer::p_min, beta_pmf(x, alpha, beta)) ; - double p = beta_pmf(x, alpha, beta) ; - prob[i][j] = p ; - tot_sum += p ; - row_sum += p ; - } - // normalize - for(size_t j=0; jn_class; j++) - { prob[i][j] /= row_sum ; } - } - - // compute the refererences - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_model; j_ref++, j_dat++) - { this->model[j][j_ref][0] += (this->data[i][j_dat] * prob[i][j]) ; } - } - } - // avoid 0's in the model to ensure that pmf_poisson() never - // return 0 - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_model; j++) - { for(size_t k=0; kn_category; k++) - { this->model[i][j][k] = - std::max(this->model[i][j][k], ReadLayer::p_min) ; - } - } - } -} - -void ReadLayer::seed_model_sampling() -{ std::vector choosen(this->n_row, false) ; - - for(size_t i=0; in_class; ) - { size_t index = rand_int_uniform(size_t(0), size_t(this->n_row-1)) ; - // already choose - if(choosen[index]) - { ; } - // not yet choosen as reference - else - { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_model; j_ref++, j_dat++) - { this->model[i][j_ref][0] = this->data[index][j_dat] ; } - choosen[index] = true ; - i++ ; - } - } - // avoid 0's in the model to ensure that pmf_poisson() never - // return 0 - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_model; j++) - { for(size_t k=0; kn_category; k++) - { this->model[i][j][k] = - std::max(this->model[i][j][k], ReadLayer::p_min) ; - } - } - } -} - -void ReadLayer::seed_model_toy() -{ // sample data to initialise the references - std::vector choosen(this->n_row, false) ; - - for(size_t i=0; in_class; ) - { size_t index = i ; - // already choose - if(choosen[index]) - { ; } - // not yet choosen as reference - else - { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_model; j_ref++, j_dat++) - { this->model[i][j_ref][0] = 
this->data[index][j_dat] ; } - choosen[index] = true ; - i++ ; - } - } - // avoid 0's in the model to ensure that pmf_poisson() never - // return 0 - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_model; j++) - { for(size_t k=0; kn_category; k++) - { this->model[i][j][k] = - std::max(this->model[i][j][k], ReadLayer::p_min) ; - } - } - } -} - - -void ReadLayer::compute_loglikelihoods(matrix4d_d& loglikelihood, +void ReadLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; - this->compute_loglikelihoods_routine(0, this->n_row, + this->compute_loglikelihoods_routine(0, + this->n_row, std::ref(loglikelihood), std::ref(loglikelihood_max), promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void ReadLayer::compute_loglikelihoods_routine(size_t from, size_t to, - matrix4d_d& loglikelihood, + Matrix4D& loglikelihood, vector_d& loglikelihood_max, std::promise& done) const { // normalize the models - matrix3d_d model_norm = this->model ; + 
Matrix3D model_norm = this->model ; for(size_t i=0; in_class; i++) { double mean = 0. ; for(size_t j=0; jl_model; j++) - { mean += model_norm[i][j][0] ; } + { mean += model_norm(i,j,0) ; } mean /= this->l_model ; for(size_t j=0; jl_model; j++) - { model_norm[i][j][0] /= mean ; } + { model_norm(i,j,0) /= mean ; } } + // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s_fw=0, s_rev=this->n_shift-1; s_fwn_shift; s_fw++, s_rev--) { // slice is [from_fw,to) // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] // fw |---------->>>----------| // ----------------------------------> data // rev |----------<<<----------| [from_dat_rev, to_dat_rev] // to_dat_rev can be -1 -> int // to_dat_rev from_dat_rev // log likelihood double ll_fw = 0. ; double ll_rev = 0. ; // --------------- forward --------------- size_t from_dat_fw = s_fw ; size_t to_dat_fw = from_dat_fw + this->l_model - 1 ; // --------------- reverse --------------- size_t from_dat_rev = this->n_col - 1 - s_fw ; // size_t to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev; j_dat_fwdata[i][j_dat_fw], - model_norm[j][j_ref_fw][0]* - this->window_means[i][s_fw])) ; + ll = log(poisson_pmf(this->data(i,j_dat_fw), + model_norm(j,j_ref_fw,0)* + this->window_means(i,s_fw))) ; ll_fw += ll ; // ll_fw += std::max(ll, ReadLayer::p_min_log) ; // --------------- reverse --------------- if(this->flip) - { ll = log(poisson_pmf(this->data[i][j_dat_rev], - model_norm[j][j_ref_fw][0]* - this->window_means[i][s_rev])) ; + { ll = log(poisson_pmf(this->data(i,j_dat_rev), + model_norm(j,j_ref_fw,0)* + this->window_means(i,s_rev))) ; ll_rev += ll ; // ll_rev += std::max(ll, ReadLayer::p_min_log) ; } } - loglikelihood[i][j][from_dat_fw][flip_states::FORWARD] = ll_fw ; + loglikelihood(i,j,from_dat_fw,flip_states::FORWARD) = ll_fw ; // keep track of the max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw 
; } if(this->flip) - { loglikelihood[i][j][from_dat_fw][flip_states::REVERSE] = ll_rev ; + { loglikelihood(i,j,from_dat_fw,flip_states::REVERSE) = ll_rev ; // keep track of the max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } } } done.set_value(true) ; } - - -void ReadLayer::update_model(const matrix4d_d& posterior_prob, +void ReadLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) -{ // computing sum over the columns (classes) - size_t n_row = posterior_prob.size() ; - size_t n_class = posterior_prob[0].size() ; - size_t n_shift = posterior_prob[0][0].size() ; - size_t n_flip = posterior_prob[0][0][0].size() ; +{ + // computing sum over the columns (classes) + size_t n_row = posterior_prob.get_dim()[0] ; + size_t n_class = posterior_prob.get_dim()[1] ; + size_t n_shift = posterior_prob.get_dim()[2] ; + size_t n_flip = posterior_prob.get_dim()[3] ; vector_d colsum(n_class, 0.) ; for(size_t i=0; iupdate_model(posterior_prob, + colsum, + threads) ; + /* // don't parallelize if(threads == nullptr) - { std::promise promise ; - std::future future = promise.get_future() ; + { std::promise> promise ; + std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, colsum, promise) ; this->model = future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done - std::vector> promises(n_threads) ; - std::vector> futures(n_threads) ; + std::vector>> promises(n_threads) ; + std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::update_model_routine, this, slice.first, slice.second, posterior_prob, colsum, std::ref(promises[i])))) ; } // reinitialise the model - 
this->model = matrix3d_d(this->n_class, - matrix2d_d(this->l_model, - vector_d(this->n_category, 0))) ; + this->model = Matrix3D(this->n_class, + this->l_model, + this->n_category, + 0.) ; // wait until all threads are done working // and update the model for(auto& future : futures) - { matrix3d_d model_part = future.get() ; + { Matrix3D model_part = future.get() ; for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) - { this->model[i][j][k] += - model_part[i][j][k] ; + { this->model(i,j,k) += + model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // avoid 0's in the model to ensure that pmf_poisson() never // return 0 for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) - { this->model[i][j][k] = - std::max(this->model[i][j][k], ReadLayer::p_min) ; + { this->model(i,j,k) = + std::max(this->model(i,j,k), ReadLayer::p_min) ; } } } + */ } -void ReadLayer::update_model(const matrix4d_d& posterior_prob, +void ReadLayer::update_model(const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, ThreadPool* threads) { // don't parallelize if(threads == nullptr) - { std::promise promise ; - std::future future = promise.get_future() ; + { std::promise> promise ; + std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, posterior_prob_colsum, promise) ; this->model = future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done - std::vector> promises(n_threads) ; - std::vector> futures(n_threads) ; + std::vector>> promises(n_threads) ; + std::vector>> futures(n_threads) ; for(size_t i=0; 
iaddJob(std::move( std::bind(&ReadLayer::update_model_routine, this, slice.first, slice.second, - posterior_prob, - posterior_prob_colsum, + std::ref(posterior_prob), + std::ref(posterior_prob_colsum), std::ref(promises[i])))) ; } // reinitialise the model - this->model = matrix3d_d(this->n_class, - matrix2d_d(this->l_model, - vector_d(this->n_category, 0))) ; + this->model = Matrix3D(this->n_class, + this->l_model, + this->n_category, + 0.) ; // wait until all threads are done working // and update the mode for(auto& future : futures) - { matrix3d_d model_part = future.get() ; + { Matrix3D model_part = future.get() ; for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) - { this->model[i][j][k] += - model_part[i][j][k] ; + { this->model(i,j,k) += + model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // avoid 0's in the model to ensure that pmf_poisson() never // return 0 for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) - { this->model[i][j][k] = - std::max(this->model[i][j][k], ReadLayer::p_min) ; + { this->model(i,j,k) = + std::max(this->model(i,j,k), ReadLayer::p_min) ; } } } } void ReadLayer::update_model_routine(size_t from, size_t to, - const matrix4d_d& posterior_prob, + const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, - std::promise& promise) const + std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; this->check_posterior_prob_colsum_dim(posterior_prob_colsum) ; // partial model - matrix3d_d model = matrix3d_d(this->n_class, - matrix2d_d(this->l_model, - vector_d(this->n_category, 0.))) ; + Matrix3D model(this->n_class, + this->l_model, + this->n_category, + 0.) 
; for(size_t n_class=0; n_class < this->n_class; n_class++) { for(size_t i=from; in_shift; n_shift++) { // --------------- forward --------------- int from_dat_fw = n_shift ; int to_dat_fw = from_dat_fw + this->l_model - 1 ; for(int j_dat_fw=from_dat_fw, j_ref_fw=0; j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++) - { model[n_class][j_ref_fw][0] += - (posterior_prob[i][n_class][n_shift][flip_states::FORWARD] * - this->data[i][j_dat_fw]) / + { model(n_class,j_ref_fw,0) += + (posterior_prob(i,n_class,n_shift,flip_states::FORWARD) * + this->data(i,j_dat_fw)) / posterior_prob_colsum[n_class] ; } // --------------- reverse --------------- if(this->flip) { int from_dat_rev = this->n_col - 1 - n_shift ; int to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(int j_dat_rev=from_dat_rev, j_ref_fw=0; j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) - { model[n_class][j_ref_fw][0] += - (posterior_prob[i][n_class][n_shift][flip_states::REVERSE] * - this->data[i][j_dat_rev]) / + { model(n_class,j_ref_fw,0) += + (posterior_prob(i,n_class,n_shift,flip_states::REVERSE) * + this->data(i,j_dat_rev)) / posterior_prob_colsum[n_class] ; } } } } } promise.set_value(model) ; } void ReadLayer::compute_window_means(ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_window_means_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_window_means_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads 
are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void ReadLayer::compute_window_means_routine(size_t from, size_t to, std::promise& done) { double l_window = double(this->l_model) ; for(size_t i=from; in_shift; from++) { double sum = 0. ; // slice is [from,to) size_t to = from + this->l_model ; for(size_t j=from; jdata[i][j] ;} - this->window_means[i][from] = sum / l_window ; + { sum += this->data(i,j) ;} + this->window_means(i,from) = sum / l_window ; } } done.set_value(true) ; } void ReadLayer::check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const { if(posterior_prob_colsum.size() != this->n_class) { char msg[4096] ; sprintf(msg, "Error! posterior_class_prob matrix size is not " "equal to model class number : %zu / %zu", posterior_prob_colsum.size(), this->n_class) ; throw std::invalid_argument(msg) ; } } diff --git a/src/Clustering/ReadLayer.hpp b/src/Clustering/ReadLayer.hpp index b0d7636..0be7a3c 100644 --- a/src/Clustering/ReadLayer.hpp +++ b/src/Clustering/ReadLayer.hpp @@ -1,227 +1,209 @@ #ifndef READLAYER_HPP #define READLAYER_HPP #include -#include +#include +#include +#include #include +typedef std::vector vector_d ; + class ReadLayer : public DataLayer { public: /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ - ReadLayer(const matrix2d_i& data, + ReadLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, ThreadPool* threads = nullptr) ; /*! * \brief Construct an object with the * given data and model. * \param data the data. * \param the model. 
* \param flip whether flipping is allowed. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ - ReadLayer(const matrix2d_i& data, - const matrix3d_d& model, + ReadLayer(const Matrix2D& data, + const Matrix3D& model, bool flip, ThreadPool* threads = nullptr) ; /*! * Destructor */ virtual ~ReadLayer() override ; - /*! - * \brief Initialises the references randomly. - * Generates the initial references by randomly - * assigning the data to the classes using a beta - * distribution. - */ - virtual void seed_model_randomly() override ; - - /*! - * \brief Sets the model values by - * sampling rows in the data and - * assigning them as initial model - * values. - */ - virtual void seed_model_sampling() override ; - - /*! - * \brief Sets the model values by - * using the first n_class rows in data. - */ - virtual void seed_model_toy() override ; - /*! * \brief Computes the log likelihood of the data * given the current model parameters. * During this process, a normalized version of the * models, having a sum of signal of 1 count in average, * is used (a copy of the models is normalized, meaning * that the original models can still be retrieved the * dedicated getter). * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of loglikelihood. * Its length should be equal to the data row number. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. * \throw std::invalid_argument if the dimensions are * incorrect. 
*/ - virtual void compute_loglikelihoods(matrix4d_d& loglikelihood, + virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ - virtual void update_model(const matrix4d_d& posterior_prob, + virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * This method does the same as the virtual method it * overloads. The only difference is that, for run time * gain, it is given the sum over the columns of the * posterior_prob matrix which is computed by the virtual * method. * \param posterior_prob the data assignment probabilities to * the different classes. * \param posterior_prob_colsum the sum over the columns * (classes) of the posterior_prob matrix. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ - void update_model(const matrix4d_d& posterior_prob, + void update_model(const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, ThreadPool* threads=nullptr) ; protected: /*! * \brief The routine that effectively performs the * loglikelihood computations. * \param from the index of the first row of the data * to considered. * \param to the index of the past last row of the data * to considered. * \param loglikelihood a matrix to store the * results. 
It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param done a promise to be filled when the routine * is done running. */ void compute_loglikelihoods_routine(size_t from, size_t to, - matrix4d_d& loglikelihood, + Matrix4D& loglikelihood, vector_d& loglikelihood_max, std::promise& done) const ; /*! * \brief The routine that effectively update the model. * \param from the index of the first row of the * posterior probabilities to considered. * \param to the index of the past last row of the * posterior probabilities to considered. * \param posterior_prob the data assignment probabilities * to the different classes. * \param * \param promise a promise containing the partial model * computed from the given data slice. If several routines * work together to update the model, the promise matrices * need to be summed up to get the final model. */ void update_model_routine(size_t from, size_t to, - const matrix4d_d& posterior_prob, + const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, - std::promise& promise) const ; + std::promise>& promise) const ; /*! * \brief Computes the mean number of reads present in * each slice (of length l_model), in each row * of the data and store them in this->window_means. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ void compute_window_means(ThreadPool* threads) ; /*! * \brief The routine that effectively computes the * window means. * \param from the index of the first row of the * data to considered. * \param to the index of the past last row of the * data to considered. 
* \param done a promise to fill when the routine * is done running. */ void compute_window_means_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Checks that the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param posterior_class_prob a vector containing the * class probabilities. * It should have a length equal to the number of * classes. * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const ; /*! * \brief contains the data means, for * each window of size l_model. */ - matrix2d_d window_means ; + Matrix2D window_means ; } ; #endif // READLAYER_HPP diff --git a/src/Clustering/ReadModelComputer.cpp b/src/Clustering/ReadModelComputer.cpp index dbfbd5f..fae46a0 100644 --- a/src/Clustering/ReadModelComputer.cpp +++ b/src/Clustering/ReadModelComputer.cpp @@ -1,43 +1,45 @@ #include +#include +#include #include #include -ReadModelComputer::ReadModelComputer(const matrix2d_i& data, - const matrix4d_d& post_prob, +#include + +ReadModelComputer::ReadModelComputer(const Matrix2D& data, + const Matrix4D& post_prob, size_t n_threads) : ModelComputer(), threads(nullptr) -{ - // parameters - size_t n_class = post_prob[0].size() ; - size_t n_shift = post_prob[0][0].size() ; - size_t n_flip = post_prob[0][0][0].size() ; +{ // parameters + size_t n_class = post_prob.get_dim()[1] ; + size_t n_shift = post_prob.get_dim()[2] ; + size_t n_flip = post_prob.get_dim()[3] ; bool flip = n_flip == 2 ; // the threads if(n_threads) { this->threads = new ThreadPool(n_threads) ; } // the data and the model this->data_layer = new ReadLayer(data, n_class, n_shift, flip) ; - this->data_layer->update_model(post_prob, this->threads) ; } ReadModelComputer::~ReadModelComputer() { // threads if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = 
nullptr ; } // data and model if(this->data_layer != nullptr) { delete this->data_layer ; this->data_layer = nullptr ; } } diff --git a/src/Clustering/ReadModelComputer.hpp b/src/Clustering/ReadModelComputer.hpp index 0794ea1..6341fa9 100644 --- a/src/Clustering/ReadModelComputer.hpp +++ b/src/Clustering/ReadModelComputer.hpp @@ -1,41 +1,42 @@ #ifndef READMODELCOMPUTER_HPP #define READMODELCOMPUTER_HPP #include -#include +#include +#include #include class ReadModelComputer : public ModelComputer { public: /*! * \brief Constructs an object to retrieve * the read model given the data and their * classification results. * \param data the data. * \param post_prob the data class assignment * probabilities. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ - ReadModelComputer(const matrix2d_i& data, - const matrix4d_d& post_prob, + ReadModelComputer(const Matrix2D& data, + const Matrix4D& post_prob, size_t n_threads) ; /*! * \brief Destructor. */ virtual ~ReadModelComputer() override ; protected: /*! * \brief the threads. 
*/ ThreadPool* threads ; } ; #endif // READMODELCOMPUTER_HPP diff --git a/src/Clustering/SequenceLayer.cpp b/src/Clustering/SequenceLayer.cpp index 0e1bc21..e923e79 100644 --- a/src/Clustering/SequenceLayer.cpp +++ b/src/Clustering/SequenceLayer.cpp @@ -1,587 +1,415 @@ #include #include // std::invalid_argument #include // numeric_limits #include // log(), pow() #include #include // std::max_element() #include // beta_pmf() #include // rand_real_uniform(), rand_int_uniform() -#include +#include +#include +#include #include -double SequenceLayer::score_subseq(const vector_i& seq, +double SequenceLayer::score_subseq(const Matrix2D& seq, + size_t row, size_t start, - const matrix2d_d& model_log) + const Matrix2D& model_log) { - if(start > seq.size() - model_log.size()) + if(start > seq.get_ncol() - model_log.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu", - start, seq.size() - model_log.size()) ; + start, seq.get_ncol() - model_log.get_nrow()) ; throw std::invalid_argument(msg) ; } - else if(model_log.size() > seq.size()) + else if(model_log.get_nrow() > seq.get_ncol()) { char msg[4096] ; sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)", - model_log.size(), seq.size()) ; + model_log.get_nrow(), seq.get_ncol()) ; throw std::invalid_argument(msg) ; } - else if(model_log[0].size() != 4) + else if(model_log.get_ncol() != 4) { char msg[4096] ; sprintf(msg, "Error! 
given model 2nd dimension is not 4 (%zu)", - model_log[0].size()) ; + model_log.get_ncol()) ; throw std::invalid_argument(msg) ; } size_t from = start ; - size_t to = from + model_log.size() ; // will score [from, to) + size_t to = from + model_log.get_nrow() ; // will score [from, to) - // std::cerr << "scoring subseq : " ; int n_code = dna::char_to_int('N') ; double ll = 0 ; for(size_t i=from, j=0; i get max score if(base == n_code) - { ll += *(std::max_element(std::begin(model_log[j]), - std::end(model_log[j]))) ; + { std::vector row = model_log.get_row(j) ; + ll += *(std::max_element(std::begin(row), + std::end(row))) ; } // A,C,G,T -> get its score else - { ll += model_log[j][base] ; } - // std::cerr << dna::int_to_char(base) << "(" << exp(model_log[j][base]) << ")" ; + { ll += model_log(j,base) ; } } - // std::cerr << " " << ll << std::endl ; return ll ; } -SequenceLayer::SequenceLayer(const matrix2d_i& data, +SequenceLayer::SequenceLayer(const Matrix2D& data, size_t n_class, size_t n_shift, - bool flip) - : DataLayer(data, n_class, n_shift, flip) + bool flip, + bool last_class_cst) + : DataLayer(data, n_class, n_shift, flip), + last_class_cst(last_class_cst) { this->n_category = 4 ; // initialise the empty model - this->model = matrix3d_d(this->n_class, - matrix2d_d(this->l_model, - vector_d(this->n_category, 0))) ; + this->model = Matrix3D(this->n_class, + this->l_model, + this->n_category, + 0.) 
; } -SequenceLayer::SequenceLayer(const matrix2d_i& data, - const matrix3d_d& model, - bool flip) - :DataLayer(data, model,flip) +SequenceLayer::SequenceLayer(const Matrix2D& data, + const Matrix3D& model, + bool flip, + bool last_class_cst) + : DataLayer(data, model,flip), + last_class_cst(last_class_cst) {} SequenceLayer::~SequenceLayer() {} -void SequenceLayer::seed_model_randomly() -{ - // get random values from a beta distribution cannot be done using boost so - // i) generate random number [0,1] x - // ii) compute f(x) where f is beta distribution - - matrix2d_d prob(this->n_row, vector_d(this->n_class, 0.)) ; - double tot_sum = 0. ; - - // sample the prob - // beta distribution parameters - // double alpha = pow(this->n_row, -0.5) ; - // double beta = 1. ; - double alpha = 1 ; - double beta = this->n_row ; - for(size_t i=0; in_row; i++) - { double row_sum = 0. ; - for(size_t j=0; jn_class; j++) - { double x = rand_real_uniform(0., 1.0) ; - double p = std::max(SequenceLayer::p_min, beta_pmf(x, alpha, beta)) ; - prob[i][j] = p ; - tot_sum += p ; - row_sum += p ; - } - // normalize - for(size_t j=0; jn_class; j++) - { prob[i][j] /= row_sum ; } - } - - // compute the refererences - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_model; j_ref++, j_dat++) - { size_t base = this->data[i][j_dat] ; - this->model[j][j_ref][base] += prob[i][j] ; - } - } - } - // normalize - for(size_t i=0; in_class; i++) - { for(size_t j=0; jl_model; j++) - { // sum - double colsum = 0. 
; - for(size_t k=0; kn_category; k++) - { colsum += this->model[i][j][k] ; } - // normalize - // avoid 0's in the model to ensure that pmf_poisson() never - // return 0 - for(size_t k=0; kn_category; k++) - { double p = this->model[i][j][k] / colsum ; - this->model[i][j][k] = - std::max(p, SequenceLayer::p_min) ; - } - } - } -} - -void SequenceLayer::seed_model_sampling() -{ - std::vector choosen(this->n_row, false) ; - - double minor_weight = 1. ; - double major_weight = 7. ; - - for(size_t i=0; in_class; ) - { size_t index = rand_int_uniform(size_t(0), size_t(this->n_row-1)) ; - // already choose - if(choosen[index]) - { ; } - // not yet choosen as reference - else - { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_model; j_ref++, j_dat++) - { size_t base = this->data[index][j_dat] ; - double colsum = 0. ; - for(size_t k=0; kn_category; k++) - { if(k == base) - { this->model[i][j_ref][k] = major_weight ; } - else - { this->model[i][j_ref][k] = minor_weight ; } - colsum += this->model[i][j_ref][k] ; - } - // normalize - for(size_t k=0; kn_category; k++) - { this->model[i][j_ref][k] /= colsum ; } - } - choosen[index] = true ; - i++ ; - } - } - - // NOTE - // no need to check for 0's in the model because it is guaranteed - // not to have any (minor and major_weights > 0) but if it - // changes, a check will be needed -} - -/* -void SequenceLayer::seed_model_toy() -{ - // sample data to initialise the references - std::vector choosen(this->n_row, false) ; - - double minor_weight = 1. ; - double major_weight = 7. ; - - for(size_t i=0; in_class; ) - { size_t index = i ; - // already choose - if(choosen[index]) - { ; } - // not yet choosen as reference - else - { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_model; j_ref++, j_dat++) - { size_t base = this->data[index][j_dat] ; - double colsum = 0. 
; - for(size_t k=0; kn_category; k++) - { if(k == base) - { this->model[i][j_ref][k] = major_weight ; } - else - { this->model[i][j_ref][k] = minor_weight ; } - colsum += this->model[i][j_ref][k] ; - } - // normalize - for(size_t k=0; kn_category; k++) - { this->model[i][j_ref][k] /= colsum ; } - } - choosen[index] = true ; - i++ ; - } - } - - // NOTE - // no need to check for 0's in the model because it is guaranteed - // not to have any (minor and major_weights > 0) but if it - // changes, a check will be needed -} -*/ - -void SequenceLayer::seed_model_toy() -{ - this->model[0][0][0] = 0.8 ; - this->model[0][0][1] = 0.1 ; - this->model[0][0][2] = 0.05 ; - this->model[0][0][3] = 0.05 ; - - this->model[0][1][0] = 0.1 ; - this->model[0][1][1] = 0.7 ; - this->model[0][1][2] = 0.1 ; - this->model[0][1][3] = 0.1 ; - - this->model[0][2][0] = 0.1 ; - this->model[0][2][1] = 0.1 ; - this->model[0][2][2] = 0.7 ; - this->model[0][2][3] = 0.1 ; - - this->model[0][3][0] = 0.1 ; - this->model[0][3][1] = 0.1 ; - this->model[0][3][2] = 0.1 ; - this->model[0][3][3] = 0.7 ; - - this->model[0][4][0] = 0.1 ; - this->model[0][4][1] = 0.1 ; - this->model[0][4][2] = 0.1 ; - this->model[0][4][3] = 0.7 ; - - this->model[0][5][0] = 0.1 ; - this->model[0][5][1] = 0.1 ; - this->model[0][5][2] = 0.7 ; - this->model[0][5][3] = 0.1 ; - - this->model[0][6][0] = 0.1 ; - this->model[0][6][1] = 0.7 ; - this->model[0][6][2] = 0.1 ; - this->model[0][6][3] = 0.1 ; - - this->model[0][7][0] = 0.7 ; - this->model[0][7][1] = 0.1 ; - this->model[0][7][2] = 0.1 ; - this->model[0][7][3] = 0.1 ; -} - -/* -void SequenceLayer::seed_model_toy() -{ - this->model[0][0][0] = 0.2340 ; - this->model[0][0][1] = 0.4307 ; - this->model[0][0][2] = 0.0952 ; - this->model[0][0][3] = 0.2401 ; - - this->model[0][1][0] = 0.1412 ; - this->model[0][1][1] = 0.2819 ; - this->model[0][1][2] = 0.4411 ; - this->model[0][1][3] = 0.1358 ; - - this->model[0][2][0] = 0.2963 ; - this->model[0][2][1] = 0.1578 ; - this->model[0][2][2] = 
0.3153 ; - this->model[0][2][3] = 0.2306 ; - - this->model[0][3][0] = 0.1475 ; - this->model[0][3][1] = 0.3947 ; - this->model[0][3][2] = 0.2290 ; - this->model[0][3][3] = 0.2287 ; - - this->model[0][4][0] = 0.1403 ; - this->model[0][4][1] = 0.1473 ; - this->model[0][4][2] = 0.4608 ; - this->model[0][4][3] = 0.2516 ; - - this->model[0][5][0] = 0.2210 ; - this->model[0][5][1] = 0.2487 ; - this->model[0][5][2] = 0.2073 ; - this->model[0][5][3] = 0.3230 ; - - this->model[0][6][0] = 0.3288 ; - this->model[0][6][1] = 0.1526 ; - this->model[0][6][2] = 0.1529 ; - this->model[0][6][3] = 0.3656 ; - - this->model[0][7][0] = 0.1295 ; - this->model[0][7][1] = 0.3987 ; - this->model[0][7][2] = 0.2997 ; - this->model[0][7][3] = 0.1721 ; -} -*/ - -void SequenceLayer::compute_loglikelihoods(matrix4d_d& loglikelihood, +void SequenceLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // compute the log prob model and the log prob reverse-complement model - matrix3d_d model_log(this->n_class, - matrix2d_d(this->l_model, - vector_d(this->n_category, 0.))) ; - matrix3d_d model_log_rev = model_log ; + std::vector> model_log(this->n_class, + Matrix2D(this->l_model, + this->n_category, + 0.)) ; + std::vector> model_log_rev = model_log ; + /* + Matrix3D model_log(this->n_class, + this->l_model, + this->n_category, + 0.) 
; + Matrix3D model_log_rev = model_log ; + */ for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { // forward - model_log[i][j][k] = log(this->model[i][j][k]) ; + model_log[i](j,k) = log(this->model(i,j,k)) ; // reverse - model_log_rev[i][this->l_model-j-1][this->n_category-k-1] - = log(this->model[i][j][k]) ; + model_log_rev[i](this->l_model-j-1,this->n_category-k-1) + = log(this->model(i,j,k)) ; } } } // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_row, loglikelihood, loglikelihood_max, model_log, model_log_rev, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(model_log), std::ref(model_log_rev), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void SequenceLayer::compute_loglikelihoods_routine(size_t from, size_t to, - matrix4d_d& loglikelihood, + Matrix4D& loglikelihood, vector_d& loglikelihood_max, - const matrix3d_d& model_log, - const matrix3d_d& model_log_rev, + const std::vector>& model_log, + const std::vector>& model_log_rev, std::promise& done) const { // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { - // std::cerr << model[j] << std::endl << std::endl 
; - for(size_t s=0; sn_shift; s++) { // forward strand - { double ll_fw = score_subseq(this->data[i], s, model_log[j]) ; - loglikelihood[i][j][s][flip_states::FORWARD] = ll_fw ; + { double ll_fw = score_subseq(this->data, i, s, model_log[j]) ; + loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; // keep track of max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } } // reverse if(this->flip) - { double ll_rev = score_subseq(this->data[i], s, model_log_rev[j]) ; - loglikelihood[i][j][s][flip_states::REVERSE] = ll_rev ; + { double ll_rev = score_subseq(this->data, i, s, model_log_rev[j]) ; + loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; // keep track of max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } } } done.set_value(true) ; } -void SequenceLayer::update_model(const matrix4d_d& posterior_prob, +void SequenceLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) { // don't parallelize if(threads == nullptr) - { std::promise promise ; - std::future future = promise.get_future() ; + { std::promise> promise ; + std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, promise) ; - this->model = future.get() ; + // this->model = future.get() ; + auto model = future.get() ; + size_t n_class_to_update = this->n_class - this->last_class_cst ; + for(size_t i=0; il_model; j++) + { for(size_t k=0; kn_category; k++) + { this->model(i,j,k) = model(i,j,k) ; } + } + } } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done - std::vector> promises(n_threads) ; - std::vector> futures(n_threads) ; + std::vector>> promises(n_threads) ; + std::vector>> futures(n_threads) ; 
for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(promises[i])))) ; } // reinitialise the model - this->model = matrix3d_d(this->n_class, - matrix2d_d(this->l_model, - vector_d(this->n_category, 0))) ; + /* + this->model = Matrix3D(this->n_class, + this->l_model, + this->n_category, + 0.) ; + */ + size_t n_class_to_update = this->n_class - this->last_class_cst ; + for(size_t i=0; il_model; j++) + { for(size_t k=0; kn_category; k++) + { this->model(i,j,k) = 0. ; } + } + } // wait until all threads are done working // and update the model for(auto& future : futures) - { matrix3d_d model_part = future.get() ; - for(size_t i=0; in_class; i++) + { Matrix3D model_part = future.get() ; + for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) - { this->model[i][j][k] += model_part[i][j][k] ; } + { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // make sure to have no 0 values for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) - { this->model[i][j][k] = - std::max(this->model[i][j][k], SequenceLayer::p_min) ; + { this->model(i,j,k) = + std::max(this->model(i,j,k), SequenceLayer::p_min) ; } } } // normalize to get probs for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { double sum = 0. 
; for(size_t k=0; kn_category; k++) - { sum += this->model[i][j][k] ; } + { sum += this->model(i,j,k) ; } for(size_t k=0; kn_category; k++) - { double p = this->model[i][j][k] / sum ; - this->model[i][j][k] = p ; + { double p = this->model(i,j,k) / sum ; + this->model(i,j,k) = p ; /* - this->model[i][j][k] = + this->model(i,j,k) = std::max(p, SequenceLayer::p_min) ; */ } } } } void SequenceLayer::update_model_routine(size_t from, size_t to, - const matrix4d_d& posterior_prob, - std::promise& promise) const -{ - // dimension checks + const Matrix4D& posterior_prob, + std::promise>& promise) const +{ // dimension checks this->check_posterior_prob_dim(posterior_prob) ; - matrix3d_d model = matrix3d_d(this->n_class, - matrix2d_d(this->l_model, - vector_d(this->n_category, 0))) ; + Matrix3D model(this->n_class, + this->l_model, + this->n_category, + 0.) ; // the int code of A, C, G, T, N static int a_code = dna::char_to_int('A') ; static int c_code = dna::char_to_int('C') ; static int g_code = dna::char_to_int('G') ; static int t_code = dna::char_to_int('T') ; static int n_code = dna::char_to_int('N') ; // the int code of the reverse complement of A, C, G, T static int a_code_r = dna::char_to_int('A', true) ; static int c_code_r = dna::char_to_int('C', true) ; static int g_code_r = dna::char_to_int('G', true) ; static int t_code_r = dna::char_to_int('T', true) ; - for(size_t k=0; k < this->n_class; k++) + size_t n_class_to_update = this->n_class - this->last_class_cst ; + + for(size_t k=0; k < n_class_to_update; k++) + // for(size_t k=0; k < this->n_class; k++) { for(size_t s=0; sn_shift; s++) { for(size_t j=0; jl_model; j++) - { - // base prob on fw and rv strand - vector_d base_prob(this->n_category, 0.) ; - vector_d base_prob_rev(this->n_category,0.) ; - + { // base prob on fw and rv strand + vector_d base_prob_fw(this->n_category, 0.) ; + vector_d base_prob_rv(this->n_category, 0.) 
; for(size_t i=from; idata[i][s+j] ; + { int base = this->data(i,s+j) ; int base_rev = this->n_category - base - 1 ; // N if(base == n_code) { // --------------- forward --------------- - { base_prob[a_code] += - posterior_prob[i][k][s][SequenceLayer::FORWARD] ; - base_prob[c_code] += - posterior_prob[i][k][s][SequenceLayer::FORWARD] ; - base_prob[g_code] += - posterior_prob[i][k][s][SequenceLayer::FORWARD] ; - base_prob[t_code] += - posterior_prob[i][k][s][SequenceLayer::FORWARD] ; + { base_prob_fw[a_code] += + posterior_prob(i,k,s,SequenceLayer::FORWARD) ; + base_prob_fw[c_code] += + posterior_prob(i,k,s,SequenceLayer::FORWARD) ; + base_prob_fw[g_code] += + posterior_prob(i,k,s,SequenceLayer::FORWARD) ; + base_prob_fw[t_code] += + posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) - { base_prob_rev[a_code_r] += - posterior_prob[i][k][s][SequenceLayer::REVERSE] ; - base_prob_rev[c_code_r] += - posterior_prob[i][k][s][SequenceLayer::REVERSE] ; - base_prob_rev[g_code_r] += - posterior_prob[i][k][s][SequenceLayer::REVERSE] ; - base_prob_rev[t_code_r] += - posterior_prob[i][k][s][SequenceLayer::REVERSE] ; + { base_prob_rv[a_code_r] += + posterior_prob(i,k,s,SequenceLayer::REVERSE) ; + base_prob_rv[c_code_r] += + posterior_prob(i,k,s,SequenceLayer::REVERSE) ; + base_prob_rv[g_code_r] += + posterior_prob(i,k,s,SequenceLayer::REVERSE) ; + base_prob_rv[t_code_r] += + posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } // A, C, G, T else - { // --------------- forward --------------- - { base_prob[base] += - posterior_prob[i][k][s][SequenceLayer::FORWARD] ; + { { base_prob_fw[base] += + posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) - { base_prob_rev[base_rev] += - posterior_prob[i][k][s][SequenceLayer::REVERSE] ; + { base_prob_rv[base_rev] += + posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } } // update this position of the model - for(size_t 
i=0,i_rev=base_prob.size()-1; iflip) - { model[k][this->l_model-j-1][i] += base_prob_rev[i] ; } + { model(k,this->l_model-j-1,i) += base_prob_rv[i] ; } } } } } promise.set_value(model) ; } + +void SequenceLayer::set_class(size_t i, const Matrix2D& motif) +{ // check dimensions + if(motif.get_nrow() != this->n_category) + { char msg[4096] ; + sprintf(msg, "Error! the given class model is incompatible " + "with the SequenceLayer (%zu rows instead of %zu)", + motif.get_nrow(), this->n_category) ; + throw std::invalid_argument(msg) ; + } + else if(motif.get_ncol() != this->l_model) + { char msg[4096] ; + sprintf(msg, "Error! the given class model is incompatible " + "with the SequenceLayer (%zu columns instead of %zu)", + motif.get_ncol(), this->l_model) ; + throw std::invalid_argument(msg) ; + } + + for(size_t j=0; jmodel(i,j,k) = motif(k,j) ; } + } +} diff --git a/src/Clustering/SequenceLayer.hpp b/src/Clustering/SequenceLayer.hpp index 844c3e5..a31d2bb 100644 --- a/src/Clustering/SequenceLayer.hpp +++ b/src/Clustering/SequenceLayer.hpp @@ -1,176 +1,203 @@ #ifndef SEQUENCELAYER_HPP #define SEQUENCELAYER_HPP #include #include #include #include // std::promise, std::future -#include +#include +#include +#include #include +typedef std::vector vector_d ; + class SequenceLayer : public DataLayer { public: /*! * \brief Computes the log-likelihood of the sub- - * sequence starting at the offset in the given - * sequence. The subsequence length is determined - * by the model lenght. + * sequence - stored in a given row - and starting + * at the offset in the given sequence matrix. + * The subsequence length is determined by the model + * lenght. * \param seq the sequences in integer format. - * \param start the index at which the sub-sequence + * \param row the row containing the sequence of + * interest. + * \param col the index at which the sub-sequence * is starting (1st index inside the subsequence * of interest). 
* \param model_log a model containing the log * probability model. * \return the log-likelihood of the sub-sequence * given the model. * \throw std::invalid_argument if 1) the offset is * invalid, 2) the sequence and the model have * incompatible dimensions or 3) the model 2n dimension * is not 4 (A,C,G,T). */ - static double score_subseq(const vector_i& seq, - size_t start, - const matrix2d_d& model_log) ; + static double score_subseq(const Matrix2D& seq, + size_t row, + size_t col, + const Matrix2D& model_log) ; public: /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and will never be updated by calls to + * update_model(). */ - SequenceLayer(const matrix2d_i& data, + SequenceLayer(const Matrix2D& data, size_t n_class, size_t n_shift, - bool flip) ; + bool flip, + bool last_class_cst) ; /*! * \brief Construct an object with the * given data and model. + * The shifting freedom is set to (data number + * of columns) - (the model 2nd dimension) + * + 1. * \param data the data. The sequences * should be stored as integer values : * A:0, C:1, G:2, T:3, else:5. - * \param the model. + * \param model the model with the following + * dimensions : + * dim1 the number of classes + * dim2 the model length + * dim3 4 (A,C,G,T) * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and will never be updated by calls to + * update_model(). */ - SequenceLayer(const matrix2d_i& data, - const matrix3d_d& model, - bool flip) ; + SequenceLayer(const Matrix2D& data, + const Matrix3D& model, + bool flip, + bool last_class_cst) ; /*! * Destructor */ virtual ~SequenceLayer() override ; - /*! 
- * \brief Sets the model values randomly. - */ - virtual void seed_model_randomly() ; - - /*! - * \brief Sets the model values by - * sampling rows in the data and - * assigning them as initial model - * values. - */ - virtual void seed_model_sampling() ; - - /*! - * \brief Sets the model values by - * using the first n_class rows in data. - */ - virtual void seed_model_toy() ; - - /*! * \brief Computes the log likelihood of the data * given the current model parameters. * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of loglikelihood. * Its length should be equal to the data row number. * \throw std::invalid_argument if the dimensions are * incorrect. */ - virtual void compute_loglikelihoods(matrix4d_d& loglikelihood, + virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. */ - virtual void update_model(const matrix4d_d& posterior_prob, + virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) override ; + /*! + * \brief Modify the values of th given class + * with the given parameters. + * The given motif should have the same length + * as the current model classes. + * \param i the index of the class to modify, 0-based. + * \param motif the new parameters values. + * Its dimensions should be : + * 1st : 4 for A,C,G,T + * 2nd : the model length. 
+ * \throw std::invalid_argument if the dimensions are not + * compatible with the current model classes. + */ + void set_class(size_t i, const Matrix2D& motif) ; + protected: /*! * \brief The routine that effectively performs the * loglikelihood computations. * \param from the index of the first row of the data * to considered. * \param to the index of the past last row of the data * to considered. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. - * \param model_log a matrix containing the log value - * of the model. - * \param model_log_rev a matrix containing the log values - * of the reverse strand model (the 1st position in the model - * becomes the last in the reverse model and probabilities are - * swapped A<->T and C<->G). + * \param model_log a vector containing the matrices with + * the log values of the model for each class. + * \param model_log_rev a vector containing the matrices with + * the log values of the reverse strand model for each class + * (the 1st position in the model becomes the last in the + * reverse model and probabilities are swapped A<->T and C<->G). * \param done a promise to be filled when the routine * is done running. */ void compute_loglikelihoods_routine(size_t from, size_t to, - matrix4d_d& loglikelihood, + Matrix4D& loglikelihood, vector_d& loglikelihood_max, - const matrix3d_d& model_log, - const matrix3d_d& model_log_rev, + const std::vector>& model_log, + const std::vector>& model_log_rev, std::promise& done) const ; /*! * \brief The routine that effectively update the model. * \param from the index of the first row of the * posterior probabilities to considered. 
* \param to the index of the past last row of the * posterior probabilities to considered. * \param posterior_prob the data assignment probabilities * to the different classes. * \param * \param done a promise containing the partial model * computed from the given data slice. If several routines * work together at updating the model, they need to be * summed and normalized (by the column sum) to get the * final model. */ void update_model_routine(size_t from, size_t to, - const matrix4d_d& posterior_prob, - std::promise& done) const ; + const Matrix4D& posterior_prob, + std::promise>& done) const ; + + /*! + * \brief A flag indicating that the last class of the model + * is constant and should not be updated when calling + * update_model(). + */ + bool last_class_cst ; } ; #endif // SEQUENCELAYER_HPP diff --git a/src/Clustering/SequenceModelComputer.cpp b/src/Clustering/SequenceModelComputer.cpp index b267a4a..0ba262f 100644 --- a/src/Clustering/SequenceModelComputer.cpp +++ b/src/Clustering/SequenceModelComputer.cpp @@ -1,42 +1,45 @@ #include +#include +#include #include -SequenceModelComputer::SequenceModelComputer(const matrix2d_i& data, - const matrix4d_d& post_prob, +SequenceModelComputer::SequenceModelComputer(const Matrix2D& data, + const Matrix4D& post_prob, size_t n_threads) : ModelComputer(), threads(nullptr) { // parameters - size_t n_class = post_prob[0].size() ; - size_t n_shift = post_prob[0][0].size() ; - size_t n_flip = post_prob[0][0][0].size() ; + size_t n_class = post_prob.get_dim()[1] ; + size_t n_shift = post_prob.get_dim()[2] ; + size_t n_flip = post_prob.get_dim()[3] ; bool flip = n_flip == 2 ; // the threads if(n_threads) { this->threads = new ThreadPool(n_threads) ; } // the data and the model this->data_layer = new SequenceLayer(data, n_class, n_shift, - flip) ; + flip, + false) ; this->data_layer->update_model(post_prob, this->threads) ; } SequenceModelComputer::~SequenceModelComputer() { // threads if(this->threads != nullptr) { 
this->threads->join() ; delete this->threads ; this->threads = nullptr ; } // data and model if(this->data_layer != nullptr) { delete this->data_layer ; this->data_layer = nullptr ; } } diff --git a/src/Clustering/SequenceModelComputer.hpp b/src/Clustering/SequenceModelComputer.hpp index 9c69b97..b1b6842 100644 --- a/src/Clustering/SequenceModelComputer.hpp +++ b/src/Clustering/SequenceModelComputer.hpp @@ -1,41 +1,42 @@ #ifndef SEQUENCEMODELCOMPUTER_HPP #define SEQUENCEMODELCOMPUTER_HPP #include -#include +#include +#include #include class SequenceModelComputer : public ModelComputer { public: /*! * \brief Constructs an object to retrieve * the sequence model given the data and their * classification results. * \param data the data. * \param post_prob the data class assignment * probabilities. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ - SequenceModelComputer(const matrix2d_i& data, - const matrix4d_d& post_prob, + SequenceModelComputer(const Matrix2D& data, + const Matrix4D& post_prob, size_t n_threads) ; /*! * \brief Destructor. */ virtual ~SequenceModelComputer() override ; protected: /*! * \brief the threads. 
*/ ThreadPool* threads ; } ; #endif // SEQUENCEMODELCOMPUTER_HPP diff --git a/src/GenomicTools/CorrelationMatrixCreator.cpp b/src/GenomicTools/CorrelationMatrixCreator.cpp index 44162a6..0197664 100644 --- a/src/GenomicTools/CorrelationMatrixCreator.cpp +++ b/src/GenomicTools/CorrelationMatrixCreator.cpp @@ -1,375 +1,374 @@ #include #include #include // std::runtime_error #include // BamFileIn #include // BedFileIn #include -#include +#include /* template std::ostream& operator << (std::ostream& stream, const std::list& l) { for(const auto& p : l) { stream << p << " " ; } return stream ; } template std::ostream& operator << (std::ostream& stream, const std::vector& v) { for(const auto& p : v) { stream << p << " " ; } return stream ; } template std::ostream& operator << (std::ostream& stream, const std::pair& p) { stream << "[" << p.first << " " << p.second << "] " ; return stream ; } template std::ostream& operator << (std::ostream& stream, const std::unordered_map& m) { for(const auto& p : m) { stream << p << " " << std::endl; } return stream ; } */ /* A lambda to sort GenomeRegion by ascending starting coordinate */ auto sortByStartPos = [](const GenomeRegion& r1, const GenomeRegion& r2) -> bool { return r1 < r2 ; } ; CorrelationMatrixCreator::CorrelationMatrixCreator(const std::string& bed_file_path, const std::string& bam_file_path, const std::string& bai_file_path, int from, int to, int bin_size, CorrelationMatrixCreator::methods method) : ReadMatrixCreator(bed_file_path, bam_file_path, bai_file_path, from, to, bin_size, method), target_list_fw(), target_list_rv() { seqan::BedRecord bed_line ; // compute coordinates relative to each region this->compute_relative_bin_coord() ; size_t n_col = this->relative_bin_coord.size() ; // compute number of regions and get valid chromosomes names this->open_bed_file() ; this->open_bam_file() ; seqan::BamHeader header ; seqan::readHeader(header, bam_file) ; size_t n_row = 0 ; while(not seqan::atEnd(this->bed_file)) { 
seqan::readRecord(bed_line, this->bed_file) ; std::string chrom_name = seqan::toCString(bed_line.ref) ; // new chromosome if(this->chrom_map_names.find(chrom_name) == this->chrom_map_names.end()) { int chrom_idx = -1 ; seqan::getIdByName(chrom_idx, seqan::contigNamesCache(seqan::context(this->bam_file)), chrom_name) ; this->chrom_map_names[chrom_name] = chrom_idx ; } n_row++ ; } this->close_bed_file() ; this->close_bam_file() ; // create the count matrix - this->matrix_counts = matrix2d_i(n_row, - vector_i(n_col, 0)) ; + this->matrix_counts = Matrix2D(n_row, n_col, 0.) ; // create the region matrix this->matrix_bins = std::vector> (n_row,std::vector(n_col)) ; this->open_bed_file() ; this->open_bam_file() ; size_t i = 0 ; while(not seqan::atEnd(this->bed_file)) { seqan::readRecord(bed_line, this->bed_file) ; // find the region limits std::string region_chr = seqan::toCString(bed_line.ref) ; // int region_len = bed_line.endPos - bed_line.beginPos ; // int region_mid = bed_line.beginPos + (region_len / 2) ; int region_mid = CorrelationMatrixCreator::get_center_pos(bed_line) ; // compute the absolute bins coordinates for this region // and create the bins in this region for(size_t j=0; jrelative_bin_coord[j] ; this->matrix_bins[i][j] = GenomeRegion(region_chr, this->chrom_map_names[region_chr], region_mid + relative_coord.first, region_mid + relative_coord.second) ; } i++ ; } this->close_bed_file() ; this->close_bam_file() ; } CorrelationMatrixCreator::~CorrelationMatrixCreator() { this->close_bam_file() ; // bed file is closed in ~MatrixCreator() } -matrix2d_i CorrelationMatrixCreator::create_matrix() +Matrix2D CorrelationMatrixCreator::create_matrix() { this->open_bam_file() ; this->open_bai_file() ; // read BAM header seqan::BamHeader bam_header ; seqan::readHeader(bam_header, this->bam_file) ; - for(size_t i=0; imatrix_counts.size(); i++) + for(size_t i=0; imatrix_counts.get_nrow(); i++) { const auto& row = this->matrix_bins[i] ; GenomeRegion 
region(row.front().chromosome, row.front().chromosome_idx, row.front().start, row.back().end) ; bool jump = this->jump_upstream(region, 600) ; if(not jump) { continue ; } // read all relevant targets this->to_downstream_target(region) ; // update count matrix row this->update_count_matrix(i) ; // clean buffers this->clear_target_lists() ; } this->close_bam_file() ; return this->matrix_counts ; } bool CorrelationMatrixCreator::jump_upstream(const GenomeRegion& region, int margin) { bool has_alignment = false ; int rID = -10 ; if(this->chrom_map_names.find(region.chromosome) != this->chrom_map_names.end()) { rID = this->chrom_map_names[region.chromosome] ; } else { char msg[4096] ; sprintf(msg, "Error! chromosome %s is not linked with a valid ID in BAM file", region.chromosome.c_str()) ; std::cerr << msg << std::endl ; return false ; } int start = std::max(0, region.start - margin) ; int end = start + 1 ; bool jump = seqan::jumpToRegion(this->bam_file, has_alignment, rID, start, end, this->bai_file) ; return jump ; } void CorrelationMatrixCreator::to_downstream_target(const GenomeRegion& region) { if(this->method == CorrelationMatrixCreator::methods::READ or this->method == CorrelationMatrixCreator::methods::READ_ATAC) { this->to_downstream_read(region) ; } else { this->to_downstream_fragment(region) ; } } void CorrelationMatrixCreator::to_downstream_read(const GenomeRegion& region) { bool done = false ; seqan::BamAlignmentRecord record ; while(not seqan::atEnd(this->bam_file) and not done) { // QC check and transform record seqan::readRecord(record, this->bam_file) ; if(not CorrelationMatrixCreator::is_good_read(record) or not this->is_valid_chromosome(record)) { continue ; } GenomeRegion target ; try { if(this->method == CorrelationMatrixCreator::methods::READ) { target = GenomeRegion::constructRead(record, this->bam_file) ; } else { target = GenomeRegion::constructReadATAC(record, this->bam_file) ; } } catch(std::invalid_argument& e) { // connect to cerr to write 
in SAM seqan::BamFileOut samFileOut(seqan::context(this->bam_file), std::cerr, seqan::Sam()) ; std::cerr << "std::invalid_argument caught! could not use " "this record as read: " << std::endl ; writeRecord(samFileOut, record) ; std::cerr << "message was : " << e.what() << std::endl << std::endl ; continue ; } // upstream -> continue if(target < region) { continue ; } // overlap -> store else if(target | region) { if(not seqan::hasFlagRC(record)) { this->target_list_fw.push_back(target) ; } else { this->target_list_rv.push_back(target) ; } } // downstream -> stop else { done = true ; } } } void CorrelationMatrixCreator::to_downstream_fragment(const GenomeRegion& region) { bool done = false ; seqan::BamAlignmentRecord record ; while(not seqan::atEnd(this->bam_file) and not done) { // QC check and transform record seqan::readRecord(record, this->bam_file) ; if(not CorrelationMatrixCreator::is_good_pair(record) or not this->is_valid_chromosome(record)) { continue ; } GenomeRegion target ; try { target = GenomeRegion::constructFragment(record, this->bam_file) ; } catch(std::invalid_argument& e) { // connect to cerr to write in SAM seqan::BamFileOut samFileOut(seqan::context(this->bam_file), std::cerr, seqan::Sam()) ; std::cerr << "std::invalid_argument caught! 
could not use " "this record as fragment: " << std::endl ; writeRecord(samFileOut, record) ; std::cerr << "message was : " << e.what() << std::endl << std::endl ; continue ; } // upstream -> continue if(target < region) { continue ; } // overlap -> store else if(target | region) { if(this->method == CorrelationMatrixCreator::methods::FRAGMENT_CENTER) { target = GenomeRegion::constructFragmentCenter(record, this->bam_file) ; if(target | region) { this->target_list_fw.push_back(target) ; } } else { this->target_list_fw.push_back(target) ; } } // downstream -> stop else if(target > region) { // std::cerr << std::endl ; done = true ; } } // std::cerr << "to_downstream_fragment END" << std::endl ; } void CorrelationMatrixCreator::clear_target_lists() { this->target_list_fw.clear() ; this->target_list_rv.clear() ; } /* void CorrelationMatrixCreator::remove_upstream_targets(const GenomeRegion& region) { // forward targets auto iter_fw = this->target_list_fw.cbegin() ; while(iter_fw != this->target_list_fw.end()) { // remove upstream reads if(*iter_fw < region) { iter_fw = this->target_list_fw.erase(iter_fw) ; } // keep overlapping reads, don't stop here else if(*iter_fw | region) { iter_fw++ ; } // stop at first read downstream else { break ; } } // reverse targets auto iter_rv = this->target_list_rv.cbegin() ; while(iter_rv != this->target_list_rv.end()) { // remove upstream reads if(*iter_rv < region) { iter_rv = this->target_list_rv.erase(iter_rv) ; } // keep overlapping reads else if(*iter_rv | region) { iter_rv++ ; } // stop at first read downstream else { break ; } } } */ void CorrelationMatrixCreator::update_count_matrix(size_t row_index) { // forward targets for(const auto& iter : this->target_list_fw) { auto bin_start_end = CorrelationMatrixCreator:: get_bin_indices(iter, this->matrix_bins[row_index]) ; for(int j=bin_start_end.first; jmatrix_counts[row_index][j] += + { this->matrix_counts(row_index,j) += iter.overlap_len(this->matrix_bins[row_index][j]) ; } } // 
reverse targets for(const auto& iter : this->target_list_rv) { auto bin_start_end = CorrelationMatrixCreator:: get_bin_indices(iter, this->matrix_bins[row_index]) ; for(int j=bin_start_end.first; jmatrix_counts[row_index][j] += + { this->matrix_counts(row_index,j) += iter.overlap_len(this->matrix_bins[row_index][j]) ; } } } /* void CorrelationMatrixCreator::update_count_matrix_naive(size_t row_index) { // forward targets for(const auto& iter : target_list_fw) { for(size_t j=0; jmatrix_counts[0].size(); j++) { this->matrix_counts[row_index][j] += iter.overlap_len(this->matrix_bins[row_index][j]) ; } } // reverse targets for(const auto& iter : target_list_rv) { for(size_t j=0; jmatrix_counts[0].size(); j++) { this->matrix_counts[row_index][j] += iter.overlap_len(this->matrix_bins[row_index][j]) ; } } } */ diff --git a/src/GenomicTools/CorrelationMatrixCreator.hpp b/src/GenomicTools/CorrelationMatrixCreator.hpp index 502616f..521f781 100644 --- a/src/GenomicTools/CorrelationMatrixCreator.hpp +++ b/src/GenomicTools/CorrelationMatrixCreator.hpp @@ -1,180 +1,180 @@ #ifndef CORRELATIONMATRIXCREATOR_HPP #define CORRELATIONMATRIXCREATOR_HPP #include #include #include #include // BamFileIn #include // BedFileIn #include -#include +#include /*! * \brief The CorrelationMatrixCreator class allows * to create correlation matrices. * A correlation matrix contains the number of target * mapped at different positions around a set of * reference positions. * This class will read the reference positions from * a BED file and the targets from a BAM file. For each * reference, the region center is computed and then a * region covering the interval [from,to] is build * around the middle and divided into equally sized * bins. Finally, each bin is assigned the number of * target present in the BAM file that are mapped at * that position. * The final matrix contains one row per reference, * with the number of targets counted at each possible * position (bin). relative to this reference. 
*/ class CorrelationMatrixCreator: public ReadMatrixCreator { public: CorrelationMatrixCreator() = delete ; /*! * \brief Constructs an object to build a * correlation matrix. * \param bed_file_path the path to the file containing * the references. * \param bam_file_path the path to the file containing * the targets. * \param bai_file_path the path to index file of the bam * file containing the targets. * \param from the upstream most relative position * to consider around the references. It may * be changed to make sure that the central bin * is centered on +/- 0. * \param to the dowmstream most relative position * to consider around the references. It may * be changed to make sure that the central bin * is centered on +/- 0. * \param bin_size the bin size in base pair. * \param method how the targets should be counted. * READ all the positions inside the reads are * counted. * READ_ATAC only the +4bp position of +strand reads * and the -5bp of -strand reads are counted. It * correspond to the insertion position in ATAC-seq * data. * FRAGMENT all the positions within fragments (the * genome segment between a pair of reads, reads * included) are counted. * FRAGMENT_CENTER only the central position of the * fragements (the genome segment between a pair of * reads, reads included) are counted. */ CorrelationMatrixCreator(const std::string& bed_file_path, const std::string& bam_file_path, const std::string& bai_file_path, int from, int to, int bin_size, CorrelationMatrixCreator::methods method) ; /*! * Destructor. */ virtual ~CorrelationMatrixCreator() ; /*! * \brief Computes the matrix and returns it. * \return the count matrix. */ - virtual matrix2d_i create_matrix() override ; + virtual Matrix2D create_matrix() override ; protected: /*! * \brief Seek in the BAM file right before the last * record upstream the given region. The margin * parameters allows to modify the region start * value. 
* To read a record within the region, a read * operation is required to get ride of the * record right * \param region the region in front of which the * pointer is desired. * \param margin * which streams in the stream vectors to use. * \return whether the reading pointer could be moved * to the desired position. */ bool jump_upstream(const GenomeRegion& region, int margin) ; /*! * \brief A generic routine that reads the following records * until finding the first one located downstream the region * of interest (the definition of the first target downstream * the region of interest depends if READ/READ_ATAC/FRAGMENT * or FRAGMENT_CENTER is set as method). * All record overlapping the region of interest are stored * in the target lists. * The reading pointer is supposed to be located * upstream the region of interest. If this is note the case, * the method will read records until reaching the end of * the file. * \param region the region of interest. */ void to_downstream_target(const GenomeRegion& region) ; /*! * \brief The routine that reads the following records * until finding the first one located downstream the region * of interest if READ or READ_ATAC is set as method. * All record overlapping the region of interest are stored * in the target lists. * The reading pointer is supposed to be located * upstream the region of interest. If this is note the case, * the method will read records until reaching the end of * the file. * \param region the region of interest. */ void to_downstream_read(const GenomeRegion& region) ; /*! * \brief The routine that reads the following records * until finding the first one located downstream the region * of interest if FRAGMENT or FRAGMENT_CENTER is set as * method. * All record overlapping the region of interest are stored * in the target lists. * The reading pointer is supposed to be located * upstream the region of interest. If this is note the case, * the method will read records until reaching the end of * the file. 
* \param region the region of interest. */ void to_downstream_fragment(const GenomeRegion& region) ; /*! * \brief Clear the content of the target lists. */ void clear_target_lists() ; /*! * \brief Update the given row of the count matrix with * the content of the target lists. * \param matrix_row_index the index of the row, in the * count matrix. */ void update_count_matrix(size_t row_index) ; /*! * \brief A buffers containing the * target mapped on the forward strand. * Target without strand (fragments) * are also stored in this list. */ std::list target_list_fw ; /*! * \brief A buffers containing the * target mapped on the reverse strand. */ std::list target_list_rv ; } ; #endif // CORRELATIONMATRIXCREATOR_HPP diff --git a/src/GenomicTools/MatrixCreator.cpp b/src/GenomicTools/MatrixCreator.cpp index 367c755..b0090cc 100644 --- a/src/GenomicTools/MatrixCreator.cpp +++ b/src/GenomicTools/MatrixCreator.cpp @@ -1,41 +1,41 @@ #include #include #include // BedFileIn #include #include -#include +#include MatrixCreator::MatrixCreator(const std::string& bed_file_path, int from, int to) : bed_path(bed_file_path), bed_file(), from(from), to(to), matrix_counts() {} int MatrixCreator::get_center_pos(const seqan::BedRecord& bed_line) { int region_len = bed_line.endPos - bed_line.beginPos ; int region_mid = bed_line.beginPos + (region_len / 2) ; return region_mid ; } MatrixCreator::~MatrixCreator() { this->close_bed_file() ; } void MatrixCreator::open_bed_file() { if(not seqan::open(this->bed_file, this->bed_path.c_str())) { char msg[4096] ; sprintf(msg, "cannot open %s", this->bed_path.c_str()) ; throw std::runtime_error(msg) ; } } void MatrixCreator::close_bed_file() { seqan::close(this->bed_file) ; } diff --git a/src/GenomicTools/MatrixCreator.hpp b/src/GenomicTools/MatrixCreator.hpp index 7fac5bf..746a586 100644 --- a/src/GenomicTools/MatrixCreator.hpp +++ b/src/GenomicTools/MatrixCreator.hpp @@ -1,103 +1,103 @@ #ifndef MATRIXCREATOR_HPP #define MATRIXCREATOR_HPP #include 
#include // BedFileIn, BedRecord #include -#include +#include /*! * \brief The MatrixCreator class is a base class * to be derived by classes that are dedicated to * construct data matrices which rows contains * a signal at different positions (columns) in * this given region. */ class MatrixCreator { public: /*! * \brief Returns the central position of a bed region. * \param bed_line the region of interest. * \return the position of the center. */ static int get_center_pos(const seqan::BedRecord& bed_line) ; public: /*! * \brief Constructs an object. * \param bed_file_path the path to the bed file * containing the coordinates of the regions of * interest. * \param from the downstream most position * to consider, relative to a set of genomic * positions. * \param to the upstream most position to * consider, relative to a set of genomic * positions */ MatrixCreator(const std::string& bed_file_path, int from, int to) ; /*! * Destructor. */ virtual ~MatrixCreator() ; /*! * \brief Creates and return the count matrix. * \return the count matrix. */ - virtual matrix2d_i create_matrix() = 0 ; + virtual Matrix2D create_matrix() = 0 ; protected: /*! * \brief Opens the bed file. * \throw std::runtime_error if the file cannot * be open. */ void open_bed_file() ; /*! * \brief Closes the bed file. * Does nothing if already closed. */ void close_bed_file() ; /*! * \brief Bed file path. */ std::string bed_path ; /*! * \brief An input stream to the * bed file. * Use open_bed_file() to open the stream * and close_bed_file() to close it. */ seqan::BedFileIn bed_file ; /*! * \brief The smallest relative coordinate from the region * center to consider (included). */ int from ; /*! * \brief The biggest relative coordinate from the region * center to consider (not included). */ int to ; /*! * \brief A matrix containing the number of targets * found at each position around each reference. * This is the data structure to fill. 
*/ - matrix2d_i matrix_counts ; + Matrix2D matrix_counts ; } ; #endif // MATRIXCREATOR_HPP diff --git a/src/GenomicTools/ReadMatrixCreator.hpp b/src/GenomicTools/ReadMatrixCreator.hpp index fa57760..424ec1c 100644 --- a/src/GenomicTools/ReadMatrixCreator.hpp +++ b/src/GenomicTools/ReadMatrixCreator.hpp @@ -1,258 +1,258 @@ #ifndef READMATRIXCREATOR_HPP #define READMATRIXCREATOR_HPP #include #include #include // std::pair, std::make_pair() #include // BedFileIn #include // BamFileIn, BamAlignmentRecord #include -#include + class ReadMatrixCreator : public MatrixCreator { public: /*! * \brief A list of values indicating how the data * should be handled when counting the number of * fragments mapped in a given bin. * * FRAGMENT : all positions within a fragment are * accounted for and attributed to the * corresponding bins : * bin1 bin2 * ----|-------|-------|------------> genome * ------- ------- fragments * --> <-- --> <-- pair of reads * ||||| |||||| scoring positions * bin1 gets a score of 5 and bin2 a * score of 6. * * FRAGMENT_CENTER : only the central position * within a fragment is accounted for and * attributed to the corresponding bin : * * bin1 bin2 * ----|-------|-------|------------> genome * ------- ------- fragments * --> <-- --> <-- pair of reads * | | scoring positions * bin1 gets a score of 1 and bin2 also. * * READ : all positions within a read are * accounted for and attributed to the * corresponding bins : * bin1 bin2 * ----|-------|-------|------------> genome * ------- ------- fragments * --> <-- --> <-- reads * | ||| ||| ||| scoring positions * bin1 gets a score of 4 and bin2 a * score of 6. * * READ_ATAC : only the shifted start * of the reads are used. Additionally, the * start position is shifted by +4bp(towards * the right) for reads on the + strand and * -5bp for reads on the - strand (towards the * left). These positions indicate the insertion * position in ATAC-seq data. 
* bin1 bin2 * ----|-------|-------|------------> genome * ------- ------- fragments * --> <-- --> <-- reads * | | | scoring positions * bin1 gets a score of 1 and bin2 a * score of 2. */ enum methods {FRAGMENT=0, FRAGMENT_CENTER, READ, READ_ATAC} ; public: /*! * \brief Computes which bins (from a contiguous * range of bins) are overlapped by a given target * and returns two indices corresponding to : * i) the index of the 1st bin overlapped by the * target * ii) the index of the past last bin overlapepd * by the target. * If the target does not overlapp any bin (it is * located upstream the 1st bin, downstream the * last bin or on a different chromosome), the * index pair 0,0 is returned. * Thus, in any case, a loop of the type * for(i=first,i get_bin_indices(const GenomeRegion& target, const std::vector& bins) ; /*! * \brief Checks that the read is i) is mapped * , ii) passes QC and iii) is not a duplicate, * based on the flag value. * \param read the read of interest. * \return whether the read passes the above tests. */ bool is_good_read(const seqan::BamAlignmentRecord& read) ; /*! * \brief Checks that the read is i) a good read, ii) * a paired read, iii) proplery aligned, iv) the 1st * of the pair based on the flag values and that * v) they forms a proper fragment with its mate mate * (both read should point toward one other). * \param read the read of interest. * \return whether the read and its mate form a proper * fragment. */ bool is_good_pair(const seqan::BamAlignmentRecord& read) ; public: ReadMatrixCreator() = delete ; /*! * \brief Constructs an object to create * a genomic count matrix. * \param bed_file_path the path to the file containing * the references. * \param bam_file_path the path to the file containing * the targets. * \param bai_file_path the path to index file of the bam * file containing the targets. * \param from the downstream most position * to consider, relative to a set of genomic * positions. 
* \param to the upstream most position to * consider, relative to a set of genomic * positions * \param bin_size the size of the bins in * which the regions encompassing the set * of genomic positions will be broken * into. * \param method how the sequenced fragments * should be consider when assigning counts * to the bins. */ ReadMatrixCreator(const std::string& bed_file_path, const std::string& bam_file_path, const std::string& bai_file_path, int from, int to, int bin_size, ReadMatrixCreator::methods method) ; /*! * Destructor. */ virtual ~ReadMatrixCreator() ; protected: /*! * \brief Binarize the given range [from,to] into * equal sized bins having the specified size. * The bin coordinates are stored in bin_coord as * pairs of [start,end) coordinates. One bin is * centered on +/- 0. * */ void compute_relative_bin_coord() ; /*! * \brief Checks whether a record has a valid chromosome, * that is whether this chromosome has been found in the * bed file has well. * \param record a record from the bam file. * \return whether the record chromosome is valid. */ bool is_valid_chromosome(const seqan::BamAlignmentRecord& record) ; /*! * \brief Opens the bam file. * \throw std::runtime_error if the file cannot * be open. */ void open_bam_file() ; /*! * \brief Opens the bam index file. * \throw std::runtime_error if the file cannot * be open. */ void open_bai_file() ; /*! * \brief Closes the bam file. * Does nothing if already closed. */ void close_bam_file() ; /*! * \brief The bin size. */ int bin_size ; /*! * \brief How to consider the sequenced fragments when computing * the bin values. */ ReadMatrixCreator::methods method ; /*! * \brief The relative bin coordinates, compared to a given * position. Each bin has a pair [from,to) where is the * 1st position within the bin and is the 1st position * after the bin. One bin is centered on +/- 0. */ std::vector> relative_bin_coord ; /*! * \brief Bam file path. */ std::string bam_path ; /*! * \brief Bam index file path. 
*/ std::string bai_path ; /*! * \brief An input stream to the * bam file. * Use open_bam_file() to open the stream * and close_bam_file() to close it. */ seqan::BamFileIn bam_file; /*! * \brief An input stream to the * bam index file. * Use open_bai_file() to open the stream * and close_bai_file() to close it. */ seqan::BamIndex bai_file ; /*! * \brief A map containing the valid chromsome * names as keys (as find in the bed file) and * their indices (as found in the BAM header) * as values. */ std::unordered_map chrom_map_names ; /*! * \brief A vector containing containing, * for each reference, the coordinates of * the genomic region covered by the bins. */ std::vector> matrix_bins ; } ; #endif // READMATRIXCREATOR_HPP diff --git a/src/GenomicTools/SequenceMatrixCreator.cpp b/src/GenomicTools/SequenceMatrixCreator.cpp index 5685fb3..837cf73 100644 --- a/src/GenomicTools/SequenceMatrixCreator.cpp +++ b/src/GenomicTools/SequenceMatrixCreator.cpp @@ -1,115 +1,113 @@ #include #include -#include #include // std::invalid_argument, std::runtime_error #include // std::make_pair(), std::move() #include #include // BedFileIn, BedRecord #include // seqan::SeqFileIn #include - +#include SequenceMatrixCreator::SequenceMatrixCreator(const std::string& bed_file_path, const std::string& fasta_file_path, int from, int to) : MatrixCreator(bed_file_path, from, to), fasta_path(fasta_file_path), fasta_file() { seqan::BedRecord bed_line ; // compute number of regions this->open_bed_file() ; size_t n_row = 0 ; size_t n_col = to - from + 1 ; while(not seqan::atEnd(this->bed_file)) { seqan::readRecord(bed_line, this->bed_file) ; n_row++ ; } this->close_bed_file() ; // create the count matrix // init to 'N' because if a part of the matrix // cannot be filled, it wil contain stretches of // 'N' - this->matrix_counts = matrix2d_i(n_row, - vector_i(n_col, dna::char_to_int('N'))) ; + this->matrix_counts = Matrix2D(n_row, n_col, dna::char_to_int('N')) ; } 
SequenceMatrixCreator::~SequenceMatrixCreator() { this->close_fasta_file() ; // bed file closed in ~MatrixCreator() } -matrix2d_i SequenceMatrixCreator::create_matrix() +Matrix2D SequenceMatrixCreator::create_matrix() { std::unordered_map seq_map ; // read the fasta file and store all the sequences this->open_fasta_file() ; while(not seqan::atEnd(this->fasta_file)) { seqan::CharString record_id ; seqan::Dna5String record_seq ; seqan::readRecord(record_id, record_seq, this->fasta_file) ; std::string id = seqan::toCString(record_id) ; // store it if(seq_map.find(id) == seq_map.end()) { seq_map.insert(std::make_pair(std::move(id), std::move(record_seq))) ; } else { char msg[4096] ; sprintf(msg, "Error! header %s found several times in %s", id.c_str(), this->fasta_path.c_str()) ; throw std::runtime_error(msg) ; } } this->close_fasta_file() ; // fill the matrix this->open_bed_file() ; size_t i=0 ; seqan::BedRecord bed_line ; while(not seqan::atEnd(this->bed_file)) { seqan::readRecord(bed_line, this->bed_file) ; std::string region_chr = seqan::toCString(bed_line.ref) ; // get sequence [from, to) int region_mid = MatrixCreator::get_center_pos(bed_line) ; int region_start = std::max(0, region_mid + from) ; int region_end = region_mid + to + 1 ; auto iter = seq_map.find(region_chr) ; if(iter == seq_map.end()) { char msg[4096] ; sprintf(msg, "Error! 
%s sequence cannot be found in %s", region_chr.c_str(), this->fasta_path.c_str()) ; throw std::runtime_error(msg) ; } else { // auto& seq_name = iter->first ; auto& seq = iter->second ; for(int j_seq=region_start, j_mat=0; j_seqsecond); j_seq++, j_mat++) - { this->matrix_counts[i][j_mat] = dna::char_to_int(seq[j_seq]) ; } + { this->matrix_counts(i,j_mat) = dna::char_to_int(seq[j_seq]) ; } } i++ ; } this->close_bed_file() ; return this->matrix_counts ; } void SequenceMatrixCreator::open_fasta_file() { if(not seqan::open(this->fasta_file, this->fasta_path.c_str())) { char msg[4096] ; sprintf(msg, "cannot open %s", this->fasta_path.c_str()) ; throw std::runtime_error(msg) ; } } void SequenceMatrixCreator::close_fasta_file() { seqan::close(this->fasta_file) ; } diff --git a/src/GenomicTools/SequenceMatrixCreator.hpp b/src/GenomicTools/SequenceMatrixCreator.hpp index f298b09..e71dc88 100644 --- a/src/GenomicTools/SequenceMatrixCreator.hpp +++ b/src/GenomicTools/SequenceMatrixCreator.hpp @@ -1,65 +1,65 @@ #ifndef SEQUENCEMATRIXCREATOR_HPP #define SEQUENCEMATRIXCREATOR_HPP #include #include // seqan::SeqFileIn #include -#include +#include class SequenceMatrixCreator : public MatrixCreator { public: SequenceMatrixCreator(const std::string& bed_file_path, const std::string& fasta_file_path, int from, int to) ; /*! * \brief Destructor */ virtual ~SequenceMatrixCreator() ; /*! * \brief Computes the matrix and returns it. * \return the sequence matrix. * \throw std::runtime_error if two sequences * have the same header in the fasta file or * if a sequence/chromosome name present * in the bed cannot be found as sequence * header in the fasta file. */ - virtual matrix2d_i create_matrix() override ; + virtual Matrix2D create_matrix() override ; protected: /*! * \brief Opens the fasta file. * \throw std::runtime_error if the file cannot * be open. */ void open_fasta_file() ; /*! * \brief Closes the fasta file. * \throw std::runtime_error if the file cannot * be open. 
*/ void close_fasta_file() ; /*! * \brief Fasta file path. */ std::string fasta_path ; /*! * \brief An input stream to the * fasta file. * Use open_fasta_file() to open the stream * and close_fasta_file() to close it. */ seqan::SeqFileIn fasta_file ; } ; #endif // SEQUENCEMATRIXCREATOR_HPP diff --git a/src/Matrix/Matrix.hpp b/src/Matrix/Matrix.hpp index fa13945..835883c 100644 --- a/src/Matrix/Matrix.hpp +++ b/src/Matrix/Matrix.hpp @@ -1,654 +1,653 @@ #ifndef MATRIX_HPP #define MATRIX_HPP #include #include // accumulate() #include #include // setw(), setprecision(), fixed #include // out_of_range, invalid_argument #include // swap()f /*! * \brief The Matrix class is a generic class to store data in a matrix. * The matrix dimensionality can be any value : 1 is a vector, 2 is a regular * 2D matrix, 3 is a 3D matrix, etc. * * In order to store the data properly and to perform all operations smoothly, the * internal representation format differs from the "usual format". That is : the user * provides coordinates as (x,y,z,...) where x referes to the row number, y to * the column number, z the the z slice, etc. * Internally however, x corresponds to the column number and y to the row number. * Every other dimension has the same meaning. * * Internal representation : * * Here is an example of a 2x3 matrix (2D) * * {0,1,2,3,4,5} vector is turned to * X * ----------> * 0 1 2 | * 3 4 5 | Y * \|/ * * dimensions are stored as {nx, ny} which corresponds to {ncol, nrow}. Coordinates * are given using the universal format coord=(x,y) which are interpreted as {row, col}. * Thus a simple swap(coord[0],coord[1]) should be performed to ensurethat the user given * coordinates can be used in this referencial. 
* * * Here is an example of a 2x3x2x2 matrix(4D) * {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23} is turned to * * X * -----------> | | * 0 1 2 | | | * 3 4 5 | Y | | * \|/ | Z | * 6 7 8 | | | * 9 10 11 | Y | | * \|/ \|/ | * | A * 12 13 14 | | | * 15 16 17 | Y | | * \|/ | Z | * 18 19 20 | | | * 21 22 23 | Y | | * \|/ \|/ \|/ * * dimensions are stored as {nx, ny, nz, na} which corredponds to {ncol, nrow, nz, na}. * Coordinates are given using the universal format coord=(x,y,z,a) which are interpreted * as {row, col, z, a}. Thus a simple swap(coord[0],coord[1]) should be performed to ensure * that the user given coordinates can be used in this referencial. * * */ template class Matrix { public: // constructors Matrix() = default ; /*! * \brief Constructs an matrix with the given dimension with * 0 values. * \param dim the dimensions. */ Matrix(const std::vector& dim) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param dim the dimensions. * \param value the value to initialize the matrix content * with. */ Matrix(const std::vector& dim, T value) ; /*! * \brief Copy constructor. * \param other the matrix to copy. */ Matrix (const Matrix& other) ; /*! * \brief Destructor. */ virtual ~Matrix() = default ; // methods /*! * \brief Gets the element at the given offset. * \param offset the offset of the element to get. * \throw std::out_of_range exception if the offset * is out of range. * \return the element. */ T get(size_t offset) const ; /*! * \brief Gets the element at the given coordinates. * \param coord the coordinates of the element to get. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ T get(const std::vector& coord) const ; /*! * \brief Sets the element at the given offset * to the given value. * \param offset the offset of the element to set. * \param value the new value. 
* \throw std::out_of_range exception if the offset * is out of range. */ void set(size_t offset, T value) ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param coord the coordinates of the element to set. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ void set(const std::vector& coord, T value) ; /*! * \brief Gets the matrix dimensions. * \return the dimensions. */ std::vector get_dim() const ; /*! * \brief Gets the data vector. * \return a a vector containing the data. */ std::vector get_data() ; /*! * \brief Gets the number of dimensions (the length * of the dimension vector). * \return the number of dimensions */ size_t get_dim_size() const ; /*! * \brief Gets the number of elements contained in the * matrix. * \return the number of element contained in the * matrix. */ size_t get_data_size() const ; /*! * \brief Returns the partial products of the dimensions. * \return the partial products of the dimensions. */ std::vector get_dim_product() const ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stram, size_t precision=4, size_t width=8, char sep=' ') const ; // operator /*! * \brief Assignment operator. * \param other an other matrix to copy the values from. * \return a reference to the current instance. */ Matrix& operator = (const Matrix& other) ; /*! * \brief Adds value to each element. * \param value the value to add. * \return a reference to the instance. */ Matrix& operator += (T value) ; /*! * \brief Substracts value to each element. * \param value the value to substract. * \return a reference to the instance. */ Matrix& operator -= (T value) ; /*! * \brief Multiplies each element by value. 
* \param value the value to multiply the elements by. * \return a reference to the instance. */ Matrix& operator *= (T value) ; /*! * \brief Divides each element by value. * \param value the value to multiply the elements by. * \throw std::invalid_argument if value is 0. * \return a reference to the instance. */ Matrix& operator /= (T value) ; /*! * \brief Comparison operator, returns true if * both matrices are identical, that is do not * have the same data and dimensions. * \param other an other matrix. * \return true if both matrices have the same * data and dimensions. */ bool operator == (const Matrix& other) const ; /*! * \brief Comparison operator, returns true if * both matrices are different, that is do not * have the same data and dimensions. * \param other an other matrix. * \return true if both matrices are different. */ bool operator != (const Matrix& other) const ; /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param coord coord the coordinates of the element to get. * \return a reference to this element. */ T& operator () (const std::vector& coord) ; /*! * \brief Returns a const reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param coord coord the coordinates of the element to get. * \return a const reference to this element. */ const T& operator () (const std::vector& coord) const ; protected: // methods /*! * \brief Computes the partial dimension products and fills * this->dim_prod according to the current values of * this->_dim and this->dim_size. */ void compute_dim_product() ; /*! * \brief Given a vector of at least 2 dimensional coordinates, * it simply swaps the elements at index 0 (row number) and 1 * (column number) to make them fit the x,y,... matrix * reprensetation (x:number of columns, y:number of rows). * \param coord a vector of coordinates (row, column, ...). 
* \return a vector of coordinates corresponding to (x,y,...). */ std::vector swap_coord(const std::vector& coord) const ; /*! * \brief Complementary function of convert_coord(). Given * a vector of coordinates in (x,y,...) format, it turns it * into (row,col,...) format. * \param coord a vector of coordinates (x,y, ...). * \return a vector of coordinates corresponding to (row,col,...). */ std::vector convert_coord_back(const std::vector& coord) const ; /*! * \brief Checks whether a given offset is a valid offset or * whether it is out of range. * \param offset the offset to check. * \return whether the offset is valid. */ bool is_valid(size_t offset) const ; /*! * \brief Checks whether coordinates in (x,y,...) format are * valid or whether they are out of range. * \param offset the offset to check. * \return whether the offset is valid. */ bool is_valid(const std::vector& coord) const ; /*! * \brief Converts a vector of VALID (x,y,...) coordinates to a * the corresponding offset allowing to get an element in the * data vector. * If the coordinate vector has a (row, column, ...) format, the * result will be wrong. * \param coord a vector of coordinates with (x,y,...) format. * \return the corresponding offset. */ size_t convert_to_offset(const std::vector& coord) const ; /*! * \brief Complementary function of convert_to_offset(). Given an * offset, this function returns the corresponding coordinate * vector in (x,y,...) format. * \param offset a given offset. * \return the corresponding vector of (x,y,..) coordinates. */ std::vector convert_to_coord(size_t offset) const ; // fields /*! * \brief The dimensions values. */ std::vector _dim ; /*! * \brief Stores the data. */ std::vector _data ; /*! * \brief The number of dimensions. */ size_t _dim_size ; /*! * \brief The number of data elements stored. */ size_t _data_size ; /*! * \brief Contains the partial product of the dimensions. 
That is, * the ith element contains the product of all the i-1 precedent * dimensions : * element 0 : 1, element 1 : x, element 2 : x*y, element 3 : x*y*z, * and so one. * This is used for coordinates to offset and offset to coordinates * conversions. */ std::vector _dim_prod ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix operator + (Matrix m, T value) { Matrix other(m) ; other += value ; return other ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix operator - (Matrix m, T value) { Matrix other(m) ; other -= value ; return other ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix operator * (Matrix m, T value) { Matrix other(m) ; other *= value ; return other ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template const Matrix operator / (Matrix m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. 
*/ template std::ostream& operator << (std::ostream& stream, const Matrix& m) { m.print(stream) ; return stream ; } // method implementation template Matrix::Matrix(const std::vector& dim) : Matrix(dim, 0) {} template Matrix::Matrix(const std::vector& dim, T value) { this->_dim_size = dim.size() ; this->_dim = this->swap_coord(dim) ; - this->_data_size = std::accumulate(dim.begin(), dim.end(), 1, std::multiplies()) ; + this->_data_size = std::accumulate(dim.begin(), dim.end(), (size_t)1, std::multiplies()) ; this->_data = std::vector(this->_data_size, value) ; this->compute_dim_product() ; } template Matrix::Matrix(const Matrix &other) { *this = other ; } template T Matrix::get(size_t offset) const { if(not this->is_valid(offset)) { throw std::out_of_range("offset is out of range!") ; } return this->_data[offset] ; } template T Matrix::get(const std::vector& coord) const { std::vector coord_new = this->swap_coord(coord) ; if(not this->is_valid(coord_new)) { throw std::out_of_range("coordinates are out of range!") ; } return this->_data[this->convert_to_offset(coord_new)] ; } template void Matrix::set(size_t offset, T value) { if(not this->is_valid(offset)) { throw std::out_of_range("offset is out of range!") ; } this->_data[offset] = value ; } template void Matrix::set(const std::vector& coord, T value) { std::vector coord_new = this->swap_coord(coord) ; if(not this->is_valid(coord_new)) { throw std::out_of_range("coordinates are out of range!") ; } this->_data[this->convert_to_offset(coord_new)] = value ; } template std::vector Matrix::get_dim() const { return this->swap_coord(this->_dim) ; } template std::vector Matrix::get_data() { return this->_data ; } template size_t Matrix::get_dim_size() const { return this->_dim_size ; } template size_t Matrix::get_data_size() const { return this->_data_size ; } template std::vector Matrix::get_dim_product() const { return this->_dim_prod ; } template void Matrix::print(std::ostream& stream, size_t precision, size_t width, 
char sep) const { stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; for(size_t i=0; iget_data_size(); i++) { stream << std::setw(width) << this->get(i) << sep ; } } template Matrix& Matrix::operator = (const Matrix& other) -{ - this->_dim = other._dim ; +{ this->_dim = other._dim ; this->_dim_size = other._dim_size ; this->_data = other._data ; this->_data_size = other._data_size ; this->_dim_prod = other._dim_prod ; return *this ; } template Matrix& Matrix::operator += (T value) { for(auto& i : this->_data) { i += value ; } return *this ; } template Matrix& Matrix::operator -= (T value) { for(auto& i : this->_data) { i -= value ; } return *this ; } template Matrix& Matrix::operator *= (T value) { for(auto& i : this->_data) { i *= value ; } return *this ; } template Matrix& Matrix::operator /= (T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } for(auto& i : this->_data) { i /= value ; } return *this ; } template bool Matrix::operator == (const Matrix& other) const { if(&other == this) { return true ; } // check dim if(this->_dim_size != other._dim_size) { return false ; } for(size_t i=0; i_dim_size; i++) { if(this->_dim[i] != other._dim[i]) { return false ; } } // check data if(this->_data_size != other._data_size) { return false ; } for(size_t i=0; i_data_size; i++) { if(this->_data[i] != other._data[i]) { return false ; } } return true ; } template bool Matrix::operator !=(const Matrix& other) const { return not ((*this) == other) ;} template T& Matrix::operator () (const std::vector& coord) { std::vector coord_new = this->swap_coord(coord) ; return this->_data[this->convert_to_offset(coord_new)] ; } template const T& Matrix::operator () (const std::vector& coord) const { std::vector coord_new = this->swap_coord(coord) ; return this->_data[this->convert_to_offset(coord_new)] ; } template void Matrix::compute_dim_product() { this->_dim_prod = std::vector(this->_dim_size, 0) ; 
this->_dim_prod[0] = 1 ; if(this->_dim_size > 1) { this->_dim_prod[1] = this->_dim[0] ; } if(this->_dim_size > 2) { for(size_t i=2; i_dim_size; i++) { this->_dim_prod[i] = this->_dim_prod[i-1]*this->_dim[i-1] ; } } } template std::vector Matrix::swap_coord(const std::vector &coord) const { std::vector coord_new = coord ; // reformat coord = (row,col,...) = (y,y,...) into coord = (col,row,...) = (x,y,...) if(this->_dim_size > 1) { std::swap(coord_new[0], coord_new[1]) ; } return coord_new ; } template bool Matrix::is_valid(size_t offset) const { if(offset > this->_data_size-1) { return false ; } return true ; } template bool Matrix::is_valid(const std::vector& coord) const { if(coord.size() != this->_dim_size) { return false ; } for(size_t i=0; i this->_dim[i]) { return false ; } } return true ; } template size_t Matrix::convert_to_offset(const std::vector& coord) const { size_t offset = 0 ; for(size_t i=0; i_dim_size; i++) { offset += coord[i] * this->_dim_prod[i] ; } return offset ; } template std::vector Matrix::convert_to_coord(size_t offset) const { std::vector coord(this->_dim_size, 0) ; for(int i=this->_dim_size-1; i>=0; i--) { size_t c = offset / this->_dim_prod[i] ; coord[i] = c ; offset -= (this->_dim_prod[i]*c) ; } return coord ; } #endif // MATRIX_HPP diff --git a/src/Matrix/Matrix2D.hpp b/src/Matrix/Matrix2D.hpp index 297344c..53b2f15 100644 --- a/src/Matrix/Matrix2D.hpp +++ b/src/Matrix/Matrix2D.hpp @@ -1,481 +1,556 @@ #ifndef MATRIX2D_HPP #define MATRIX2D_HPP #include #include #include #include // ifstream #include #include // setw(), setprecision(), fixed #include // istringstream #include // runtime_error, out_of_range #define BUFFER_SIZE 4096 /*! The Matrix2D class is a specialisation of the Matrix * class to make work with 2D matrices easier. * * A text format is defined to store such matrices. * In this format, each row is written on a single line * and the values should separated by any blank character * (tab, space, multiple spaces, ...). 
Empty lines are * not allowed. * * ---- start ---- * 1 2 3 * 4 5 6 * 7 8 9 * ----- end ----- * * Constructing a matrix from an empty file (0 bytes or only an EOL char) returns a null * matrix (0x0 dimensions). Writting a null matrix (that is with at least one null * dimension creates an empty file. * */ template class Matrix2D : public Matrix { public: // constructors Matrix2D() = default ; /*! * \brief Constructs a matrix with the given dimensions, * filled with 0 values. * \param nrow the number of rows. * \param ncol the number of columns. */ Matrix2D(size_t nrow, size_t ncol) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param nrow the number of rows. * \param ncol the number of columns. * \param value the value to initialize the matrix content * with. */ Matrix2D(size_t nrow, size_t ncol, T value) ; /*! * \brief Copy constructor * \param other the matrix to copy the content from. */ Matrix2D(const Matrix2D& other) ; /*! * \brief Constructs a matrix from a text file. A matrix contructed * from an empty file (or a file containing only one EOL char) returns * an empty matrix (null dimensions). * \param file_address the address of the file containing the matrix. * \throw std::runtime_error if anything happen while reading the * file (format error, file not found, etc). */ Matrix2D(const std::string& file_address) ; /*! * \brief Destructor. */ virtual ~Matrix2D() = default ; // methods overloaded in Matrix using Matrix::get ; using Matrix::set ; // methods /*! * \brief Gets the element at the given coordinates. * \param row the row number of the element to set. * \param col the column number of the element to set. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ T get(size_t row, size_t col) const ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param row the row number of the element to set. 
* \param col the column number of the element to set. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ void set(size_t row, size_t col, T value) ; /*! * \brief Gets the number of rows. * \return the number of rows. */ size_t get_nrow() const ; /*! * \brief Gets the number of columns. * \return the number of columns. */ size_t get_ncol() const ; /*! * \brief Gets the values in the i-th row. * \param i the row of interest. * \throw std::out_of_range if i is out of range. * \return the values in this row. */ std::vector get_row(size_t i) const ; /*! * \brief Gets the values in the i-th column. * \param i the column of interest. * \throw std::out_of_range if i is out of range. * \return the values in this column. */ std::vector get_col(size_t i) const ; /*! * \brief Sets the values of a given rows with the values of a given * vector. * \param i the row of interest. * \param values the new values. * \throw std::out_of_range if i is out of range. * \throw std::invalid_argument if values does not have a length equal * to the number of columns of the matrix. */ void set_row(size_t i, const std::vector& values) ; /*! * \brief Sets the values of a given column with the values of a given * vector. * \param i the column of interest. * \param values the new values. * \throw std::out_of_range if i is out of range. * \throw std::invalid_argument if values does not have a length equal * to the number of rows of the matrix. */ void set_col(size_t i, const std::vector& values) ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stram, size_t precision=4, size_t width=8, char sep=' ') const override ; // operators /*! * \brief Returns a reference to the corrresponding * element. 
This method does not perform any check on * the coordinates. * \param row the row number of the element to set. * \param col the column number of the element to set. * \return a reference to this element. */ T& operator () (size_t row, size_t col) ; /*! * \brief Returns a const reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param row the row number of the element to set. * \param col the column number of the element to set. * \return a const reference to this element. */ const T& operator () (size_t row, size_t col) const ; + private: + /*! + * \brief Converts a pair of VALID (x,y) coordinates to a + * the corresponding offset allowing to get an element in the + * data vector. + * \param row the row index. + * \param col the column index. + * \return the corresponding offset. + */ + size_t convert_to_offset(size_t row, size_t col) const ; + + /*! + * \brief Computes and stores the offsets at which + * each row start. + */ + void compute_row_offsets() ; + + /*! + * \brief Computes and stores the offsets at which + * each row start. + */ + void compute_col_offsets() ; + + /*! + * \brief Contains the offsets at which each row starts. + * Each element corresponds to the corresponding rows + * (1st element -> 1st row). + */ + std::vector _row_offsets ; + /*! + * \brief Contains the offsets at which each row starts. + * Each element corresponds to the corresponding rows + * (1st element -> 1st row). + */ + std::vector _col_offsets ; + } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix2D operator + (Matrix2D m, T value) { Matrix2D other(m) ; m += value ; return m ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. 
*/ template const Matrix2D operator - (Matrix2D m, T value) { Matrix2D other(m) ; m -= value ; return m ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix2D operator * (Matrix2D m, T value) { Matrix2D other(m) ; m *= value ; return m ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template const Matrix2D operator / (Matrix2D m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix2D other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. */ template std::ostream& operator << (std::ostream& stream, const Matrix2D& m) { m.print(stream) ; return stream ; } // other usefull functions /*! * \brief Produces a transpose of the given matrix. * \param m a matrix. 
*/ template Matrix2D transpose(const Matrix2D& m) ; // method implementation template Matrix2D transpose(const Matrix2D& m) { std::vector dim = m.get_dim() ; size_t nrow = dim[0] ; size_t ncol = dim[1] ; Matrix2D m2(ncol, nrow, 0) ; for(size_t i=0; i Matrix2D::Matrix2D(size_t nrow, size_t ncol) : Matrix2D(nrow, ncol, static_cast(0)) {} template Matrix2D::Matrix2D(size_t nrow, size_t ncol, T value) - : Matrix({nrow, ncol}, value) -{} + : Matrix({nrow, ncol}, value), + _row_offsets(nrow), + _col_offsets(ncol) +{ this->compute_row_offsets() ; + this->compute_col_offsets() ; +} template Matrix2D::Matrix2D(const Matrix2D& other) : Matrix(other) -{} +{ this->_row_offsets = other._row_offsets ; + this->_col_offsets = other._col_offsets ; +} template Matrix2D::Matrix2D(const std::string &file_address) // : Matrix({0,0}) { this->_dim = {0,0} ; this->_data = std::vector() ; this->_dim_size = this->_dim.size() ; this->_data_size = this->_data.size() ; this->_dim_prod = std::vector(this->_dim_size, 0) ; std::ifstream file(file_address, std::ifstream::in) ; if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! cannot open %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } std::string buffer_str ; std::vector buffer_vec ; T buffer_T ; // read file size_t n_line = 0 ; size_t row_len = 0 ; while(getline(file, buffer_str)) { // check stream status and read content if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } if(buffer_str.size() == 0) { // this file only contains one eol char and should be considered as empty, // -> returns empty matrix not an error if(n_line == 0 and file.peek() == EOF and file.eof()) { break ; } file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // parse line buffer_vec.clear() ; std::istringstream buffer_ss(buffer_str) ; while(buffer_ss >> buffer_T) { buffer_vec.push_back(buffer_T) ; } // check for an error which likely indicates that a value could not be // casted into a type T (mixed data types in the file) if(buffer_ss.fail() and not buffer_ss.eof()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check that number of column is constant if(n_line == 0) { row_len = buffer_vec.size() ; } else if(buffer_vec.size() != row_len) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! variable number of columns in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // update matrix content for(auto i : buffer_vec) { this->_data.push_back(i) ; this->_data_size++ ; } this->_dim[1]++ ; n_line++ ; } file.close() ; this->_dim[0] = row_len ; this->compute_dim_product() ; + + this->_row_offsets = std::vector(this->_dim[1]) ; + this->_col_offsets = std::vector(this->_dim[0]) ; + this->compute_row_offsets() ; + this->compute_col_offsets() ; } template T Matrix2D::get(size_t row, size_t col) const { try { return this->get({row, col}) ; } catch(std::out_of_range& e) { throw e ; } } template void Matrix2D::set(size_t row, size_t col, T value) { try { this->set({row, col}, value) ; } catch(std::out_of_range& e) { throw e ; } } template size_t Matrix2D::get_nrow() const { return this->_dim[1] ; } template size_t Matrix2D::get_ncol() const { return this->_dim[0] ; } template std::vector Matrix2D::get_row(size_t i) const { if(i>=this->get_nrow()) { throw std::out_of_range("row index is out of range!") ; } std::vector row(this->get_ncol()) ; for(size_t j=i*this->get_ncol(), n=0; nget_ncol(); j++, n++) { row[n] = this->_data[j] ; } return row ; } template std::vector 
Matrix2D::get_col(size_t i) const { if(i>=this->get_ncol()) { throw std::out_of_range("column index is out of range!") ; } std::vector col(this->get_nrow()) ; for(size_t j=i, n=0; nget_nrow(); j+=this->get_ncol(), n++) { col[n] = this->_data[j] ; } return col ; } template void Matrix2D::set_row(size_t i, const std::vector& values) { if(i>=this->get_nrow()) { throw std::out_of_range("row index is out of range!") ; } else if(values.size() != this->get_ncol()) { throw std::invalid_argument("the given vector length is not equal to the number of columns!") ; } for(size_t j=i*this->get_ncol(), n=0; nget_ncol(); j++, n++) { this->_data[j] = values[n] ; } } template void Matrix2D::set_col(size_t i, const std::vector& values) { if(i>=this->get_ncol()) { throw std::out_of_range("row index is out of range!") ; } else if(values.size() != this->get_nrow()) { throw std::invalid_argument("the given vector length is not equal to the number of rows!") ; } for(size_t n=0, j=i; nget_nrow(); n++, j+=this->get_ncol()) { this->_data[j] = values[n] ; } } template void Matrix2D::print(std::ostream& stream, size_t precision, size_t width, char sep) const { stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; size_t n = 0 ; size_t n_tot = this->get_nrow()*this->get_ncol() ; for(size_t i=0; iget_nrow(); i++) { for(size_t j=0; jget_ncol(); j++, n++) { stream << std::setw(width) << (*this)(i,j) << sep ; } if(n T& Matrix2D::operator () (size_t row, size_t col) -{ std::vector coord = {col, row} ; - return this->_data[this->convert_to_offset(coord)] ; +{ // std::vector coord = {col, row} ; + // return this->_data[this->convert_to_offset(coord)] ; + return this->_data[this->convert_to_offset(row, col)] ; } template const T& Matrix2D::operator () (size_t row, size_t col) const -{ std::vector coord = {col, row} ; - return this->_data[this->convert_to_offset(coord)] ; +{ // std::vector coord = {col, row} ; + // return this->_data[this->convert_to_offset(coord)] ; + 
return this->_data[this->convert_to_offset(row, col)] ; } + +template +void Matrix2D::compute_row_offsets() +{ for(size_t i=0; i_dim[1]; i++) + { this->_row_offsets[i] = i * this->_dim_prod[1] ; } +} + +template +void Matrix2D::compute_col_offsets() +{ for(size_t i=0; i_dim[0]; i++) + { this->_col_offsets[i] = i * this->_dim_prod[0] ; } +} + +template +size_t Matrix2D::convert_to_offset(size_t row, size_t col) const +{ /* + size_t offset = 0 ; + + for(size_t i=0; i_dim_size; i++) + { offset += coord[i] * this->_dim_prod[i] ; } + + return offset ; + */ + size_t offset = this->_row_offsets[row] + this->_col_offsets[col] ; + return offset ; +} #endif // MATRIX2D_HPP diff --git a/src/Matrix/Matrix3D.hpp b/src/Matrix/Matrix3D.hpp index a812f4b..122027f 100644 --- a/src/Matrix/Matrix3D.hpp +++ b/src/Matrix/Matrix3D.hpp @@ -1,444 +1,536 @@ #ifndef MATRIX3D_HPP #define MATRIX3D_HPP #include #include #include #include #include // setw(), setprecision(), fixed #include // ifstream #include // istringstream #include // runtime_error, out_of_range #include // equal() #define BUFFER_SIZE 4096 /*! * The Matrix3D class is a specialisation of the Matrix * class to make work with 3D matrices more easily. * * A text file format is defined to store such matrices. The specifications are as * follows : * Absolutely NO empty lines are allowed! * The following lines should contain : * * 1st line : a slice header, ',,0' indicates that a slice of the 3rd dimension * is beginning (this is a z slice). * 2nd - Nth line : the firt slice, as a 2d matrix (the exemple below has dimensions 3x4). * N+1th line : a slice header, ',,1' indicates that the 2nd slice is beginning. * N+1th - ... : the second slice * and so on... * * Example of a 3x4x2 3D matrix * ---- start ---- * ,,0 * 1 2 3 4 * 5 6 7 8 * 8 9 10 11 *,,1 * 12 13 14 15 * 16 17 18 19 * 20 21 22 23 * ----- end ----- * * Constructing a matrix from an empty file (0 bytes or only an EOL char) returns a null * matrix (0x0x0 dimensions). 
Writting a null matrix (that is with at least one null * dimension creates an empty file. * */ template class Matrix3D : public Matrix { public: // constructors Matrix3D() = default ; /*! * \brief Constructs a matrix with the given dimensions, * filled with 0 values. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. */ Matrix3D(size_t dim1, size_t dim2, size_t dim3) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. * \param value the value to initialize the matrix content * with. */ Matrix3D(size_t dim1, size_t dim2, size_t dim3, T value) ; /*! * \brief Copy constructor * \param other the matrix to copy the content from. */ Matrix3D(const Matrix3D& other) ; /*! * \brief Constructs a matrix from a text file. A matrix contructed * from an empty file (or a file containing only one EOL char) returns * an empty matrix (null dimensions). * \param file_address the address of the file containing the matrix. * \throw std::runtime_error if anything happen while reading the * file (format error, file not found, etc). */ Matrix3D(const std::string& file_address) ; /*! * \brief Destructor. */ virtual ~Matrix3D() = default ; // methods overloaded from Matrix using Matrix::get ; using Matrix::set ; // methods /*! * \brief Gets the element at the given coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ T get(size_t dim1, size_t dim2, size_t dim3) const ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. 
* \param dim3 the third dimension coordinate. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ void set(size_t dim1, size_t dim2, size_t dim3, T value) ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stream, size_t precision=4 ,size_t width=8, char sep=' ') const override ; // operators /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \return a reference to this element. */ - T& operator() (size_t dim1, size_t dim2, size_t dim3) ; + T& operator () (size_t dim1, size_t dim2, size_t dim3) ; /*! * \brief Returns a constant reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \return a constant reference to this element. */ - const T& operator() (size_t dim1, size_t dim2, size_t dim3) const ; + const T& operator () (size_t dim1, size_t dim2, size_t dim3) const ; private: // methods /*! * \brief Checks whether a given string is a slice header * (such as ",,0"), as found in files storing Matrix3D. * \param str the string to check. * \return whether the string is a slice header. */ bool is_header(const std::string& str) const ; + /*! + * \brief Converts a triplet of VALID (dim1, dim2, dim3) coordinates + * to a the corresponding offset allowing to get an element in the + * data vector. + * \param dim1 the index of the 1st dimension slice (row). 
+ * \param dim2 the index of the 2nd dimension slice (column). + * \param dim3 the index of the 3rd dimension slice. + * \return the corresponding offset. + */ + size_t convert_to_offset(size_t dim1, size_t dim2, size_t dim3) const ; + + /*! + * \brief Computes and stores the offsets at which + * each slice on the 1st dimension (row) starts. + */ + void compute_dim1_offsets() ; + + /*! + * \brief Computes and stores the offsets at which + * each slice on the 2nd dimension (column) starts. + */ + void compute_dim2_offsets() ; + + /*! + * \brief Computes and stores the offsets at which + * each slice on the 3rd dimension (3rd dimension + * slice) starts. + */ + void compute_dim3_offsets() ; + + /*! + * \brief Contains the offsets at which each x slice + * starts. Each element corresponds to the corresponding + * x slice (1st element -> 1st x slice (row)). + */ + std::vector _dim1_offsets ; + /*! + * \brief Contains the offsets at which each y slice + * starts. Each element corresponds to the corresponding + * y slice (1st element -> 1st y slice (column)). + */ + std::vector _dim2_offsets ; + /*! + * \brief Contains the offsets at which each x slice + * starts. Each element corresponds to the corresponding + * x slice (1st element -> 1st z slice). + */ + std::vector _dim3_offsets ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix3D operator + (Matrix3D m, T value) { Matrix3D other(m) ; m += value ; return m ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix3D operator - (Matrix3D m, T value) { Matrix3D other(m) ; m -= value ; return m ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. 
*/ template const Matrix3D operator * (Matrix3D m, T value) { Matrix3D other(m) ; m *= value ; return m ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template const Matrix3D operator / (Matrix3D m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix3D other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. */ template std::ostream& operator << (std::ostream& stream, const Matrix3D& m) { m.print(stream) ; return stream ; } // method implementation template Matrix3D::Matrix3D(size_t dim1, size_t dim2, size_t dim3) : Matrix3D(dim1, dim2, dim3, 0) {} template Matrix3D::Matrix3D(size_t dim1, size_t dim2, size_t dim3, T value) - : Matrix({dim1, dim2, dim3}, value) -{} + : Matrix({dim1, dim2, dim3}, value), + _dim1_offsets(dim1), + _dim2_offsets(dim2), + _dim3_offsets(dim3) +{ this->compute_dim1_offsets() ; + this->compute_dim2_offsets() ; + this->compute_dim3_offsets() ; +} template Matrix3D::Matrix3D(const Matrix3D &other) : Matrix(other) -{} +{ this->_dim1_offsets = other._dim1_offsets ; + this->_dim2_offsets = other._dim2_offsets ; + this->_dim3_offsets = other._dim3_offsets ; +} template Matrix3D::Matrix3D(const std::string &file_address) { this->_dim = {0,0,0} ; this->_data = std::vector() ; this->_dim_size = this->_dim.size() ; this->_data_size = this->_data.size() ; this->_dim_prod = std::vector(this->_dim_size, 0) ; std::ifstream file(file_address, std::ifstream::in) ; if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! 
cannot open %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } std::string buffer_str ; std::vector buffer_vec ; T buffer_T ; // read file size_t n_line = 0, n_line_data = 0 ; // number of line and of data line read size_t row_len = 0, col_len = 0 ; // length of row and column in nber of values size_t row_len_cur = 0, col_len_cur = 0 ; // current number of values read in row and col while(getline(file, buffer_str)) { if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check empty line if(buffer_str.size() == 0) { // this file only contains one eol char and should be considered as empty, // -> returns empty matrix not an error if(n_line == 0 and file.peek() == EOF and file.eof()) { break ; } file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check whether it is the beginning of a slice // 1st line in file should be one like this if(this->is_header(buffer_str)) { // check that slice have a constant number of rows if(this->_dim[2] == 1) { col_len = col_len_cur ; // this->_dim[0] = row_len ; // this->_dim[1] = col_len ; } else if(col_len_cur != col_len) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions 1 in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } this->_dim[2]++ ; col_len_cur = 0 ; n_line++ ; continue ; } // 1st line in file should be a header and entering // this block is forbidden if(n_line == 0) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
first line is not a slice header in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // parse line row_len_cur = 0 ; buffer_vec.clear() ; std::istringstream buffer_ss(buffer_str) ; while(buffer_ss >> buffer_T) { buffer_vec.push_back(buffer_T) ; row_len_cur++ ; } // check for an error which likely indicates that a value could not be // casted into a type T (mixed data types in the file) if(buffer_ss.fail() and not buffer_ss.eof()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check that number of column is constant if(n_line_data == 0) { row_len = row_len_cur ; } else if(row_len_cur != row_len) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions 2 in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // update matrix content for(auto i : buffer_vec) { this->_data.push_back(i) ; this->_data_size++ ; } col_len_cur++ ; n_line_data++ ; n_line++ ; // update matrix dimensions this->_dim[0] = row_len_cur ; this->_dim[1] = col_len_cur ; } // check dimensions of last slice if(col_len_cur != this->_dim[1]) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
slice have variable dimensions in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } file.close() ; this->compute_dim_product() ; + + this->_dim1_offsets = std::vector(this->_dim[1]) ; + this->_dim2_offsets = std::vector(this->_dim[0]) ; + this->_dim3_offsets = std::vector(this->_dim[2]) ; + this->compute_dim1_offsets() ; + this->compute_dim2_offsets() ; + this->compute_dim3_offsets() ; } template T Matrix3D::get(size_t dim1, size_t dim2, size_t dim3) const { try { return this->get({dim1, dim2, dim3}) ; } catch(std::out_of_range& e) { throw e ; } } template void Matrix3D::set(size_t dim1, size_t dim2, size_t dim3, T value) { try { return this->set({dim1, dim2, dim3}, value) ; } catch(std::out_of_range& e) { throw e ; } } template T& Matrix3D::operator () (size_t dim1, size_t dim2, size_t dim3) -{ std::vector coord = {dim2, dim1, dim3} ; - return this->_data[this->convert_to_offset(coord)] ; -} +{ return this->_data[this->convert_to_offset(dim1, dim2, dim3)] ; } + +template +const T& Matrix3D::operator () (size_t dim1, size_t dim2, size_t dim3) const +{ return this->_data[this->convert_to_offset(dim1, dim2, dim3)] ; } template void Matrix3D::print(std::ostream& stream, size_t precision, size_t width, char sep) const { // if the matrix has at least one 0 dimension (no data), don't do anything if(this->_dim[0]==0 or this->_dim[1]==0 or this->_dim[2]==0) { return ; } stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; std::vector dim = this->get_dim() ; size_t n = 0 ; size_t n_tot = std::accumulate(dim.begin(), dim.end(), 1, std::multiplies()) ; for(size_t z=0; z -const T& Matrix3D::operator () (size_t dim1, size_t dim2, size_t dim3) const -{ std::vector coord = {dim2, dim1, dim3} ; - return this->_data[this->convert_to_offset(coord)] ; -} - - template bool Matrix3D::is_header(const std::string& str) const { if(str[0] == ',' and str[1] == ',' and str.find(',', 2) == std::string::npos) { return true ; } return false ; } 
+template +void Matrix3D::compute_dim1_offsets() +{ for(size_t i=0; i_dim[1]; i++) + { this->_dim1_offsets[i] = i * this->_dim_prod[1] ; } +} + +template +void Matrix3D::compute_dim2_offsets() +{ for(size_t i=0; i_dim[0]; i++) + { this->_dim2_offsets[i] = i * this->_dim_prod[0] ; } +} + +template +void Matrix3D::compute_dim3_offsets() +{ for(size_t i=0; i_dim[2]; i++) + { this->_dim3_offsets[i] = i * this->_dim_prod[2] ; } +} + +template +size_t Matrix3D::convert_to_offset(size_t dim1, size_t dim2, size_t dim3) const +{ /* + size_t offset = 0 ; + + for(size_t i=0; i_dim_size; i++) + { offset += coord[i] * this->_dim_prod[i] ; } + + return offset ; + */ + size_t offset = this->_dim1_offsets[dim1] + + this->_dim2_offsets[dim2] + + this->_dim3_offsets[dim3] ; + return offset ; +} + #endif // MATRIX3D_HPP diff --git a/src/Matrix/Matrix4D.hpp b/src/Matrix/Matrix4D.hpp index d0a280a..28266ce 100644 --- a/src/Matrix/Matrix4D.hpp +++ b/src/Matrix/Matrix4D.hpp @@ -1,594 +1,719 @@ #ifndef MATRIX4D_HPP #define MATRIX4D_HPP #include #include #include #include // runtime_error, out_of_range #include #include // setw(), setprecision(), fixed #include // ifstream #include // sstream #define BUFFER_SIZE 4096 /*! * The Matrix4D class is a specialisation of the Matrix * class to make work with 4D matrices more easily. * * A text file format is defined to store such matrices. The specifications are as * follows : * Absolutely NO empty lines are allowed! * The following lines should contain : * * 1st line : a slice header ',,,0' indicating that a slice of the 4th dimension * is beginning. * 3nd - Nth line : the slice of the 4th dimension. It contains slice in the 3rd dimension * which are 2D matrices separated by headers (',,0' and ',,1', in the * example below, they have 2x3 dimensions). * N+1th line : ',,,1' indicating that the 2nd slice of the 4th dimension is beginning. * and so on... 
* Example * ---- start ---- * ,,,0 * ,,0 * 1 2 3 * 4 5 6 * ,,1 * 7 8 9 * 10 11 12 * ,,,1 * ,,0 * 21 22 23 * 24 25 26 * ,,1 * 27 28 29 * 30 31 32 * ----- end ----- * * Constructing a matrix from an empty file (0 bytes or only an EOL char) returns a null * matrix (0x0x0x0 dimensions). Writting a null matrix (that is with at least one null * dimension creates an empty file. * */ template class Matrix4D : public Matrix { public: // constructors Matrix4D() = default ; /*! * \brief Constructs a matrix with the given dimensions, * filled with 0 values. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. * \param dim4 the fourth dimension. */ Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. * \param dim4 the fourth dimension. * \param value the value to initialize the matrix content * with. */ Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) ; /*! * \brief Copy constructor * \param other the matrix to copy the content from. */ Matrix4D(const Matrix4D& other) ; /*! * \brief Constructs a matrix from a text file. A matrix contructed * from an empty file (or a file containing only one EOL char) returns * an empty matrix (null dimensions). * \param file_address the address of the file containing the matrix. * \throw std::runtime_error if anything happen while reading the * file (format error, file not found, etc). */ Matrix4D(const std::string& file_address) ; /*! * \brief Destructor. */ virtual ~Matrix4D() = default ; // methods overloaded from Matrix using Matrix::get ; using Matrix::set ; // methods OK /*! * \brief Gets the element at the given coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. 
* \param dim3 the third dimension coordinate. * \param dim4 the fourth dimension coordinate. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ T get(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the fourth dimension coordinate. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ void set(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stream, size_t precision=4 ,size_t width=8, char sep=' ') const override ; - // operators OK + // operators /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the third dimension coordinate. * \return a reference to this element. */ T& operator() (size_t dim1, size_t dim2, size_t dim3, size_t dim4) ; /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the third dimension coordinate. * \return a reference to this element. */ const T& operator() (size_t dim1, size_t dim2, size_t dim3, size_t dim4) const ; private: // methods /*! 
* \brief Checks whether a given string is a 3D header * (such as ",,0"), as found in files storing Matrix4D. * \param str the string to check. * \return whether the string is such a slice header. */ bool is_header_3d(const std::string& str) const ; /*! * \brief Checks whether a given string is a 4D header * (such as ",,,0"), as found in files storing Matrix4D. * \param str the string to check. * \return whether the string is such a slice header. */ bool is_header_4d(const std::string& str) const ; /*! * \brief Routine to load 4D matrices from files. * This method reads from a std::ifstream object, * from the current pointer location until i) a 4D * header line is found (such as ',,,1') or ii) until * it cannot read anymore from the stream. All * data are pushed back into the data vector and * the dimensions of the data read are stored into * the dim vector (these data are actually a 3D * matrix). If the method returned because it * found another 4D header, it returns true, false * otherwise. * To read an entire 4D matrix from a file, simply * use this scheme : i) read the 1st 4D header * ii) call this function while it returns true. * \param file_name a reference to a string containing * the address of the file currently read (for exception * messages). * \param file a reference to the std::ifstream to read * from. Obviously, the stream state will be modified as * the method reads from it. However, it will never be * closed by the method. * \param data a reference to an empty vector where the * read data will be pushed back. * \param dim a reference to an empty vector where the * dimensions of the read data will be stored. * \return whether the last piece of data read from the * stream was a 4D header. */ bool get_3d_slice(const std::string& file_name, std::ifstream& file, std::vector& data, std::vector& dim) const ; + /*! 
+ * \brief Converts a quadruplet of VALID (dim1, dim2, dim3, dim4) + * coordinates to a the corresponding offset allowing to get an + * element in the data vector. + * \param dim1 the index of the 1st dimension slice. + * \param dim2 the index of the 2nd dimension slice. + * \param dim3 the index of the 3rd dimension slice. + * \param dim4 the index of the 4th dimension slice. + * \return the corresponding offset. + */ + size_t convert_to_offset(size_t dim1, + size_t dim2, + size_t dim3, + size_t dim4) const ; + + /*! + * \brief Computes and stores the offsets at which + * each slice on the 1st dimension starts. + */ + void compute_dim1_offsets() ; + + /*! + * \brief Computes and stores the offsets at which + * each slice on the 2nd dimension starts. + */ + void compute_dim2_offsets() ; + + /*! + * \brief Computes and stores the offsets at which + * each slice on the 3rd dimension starts. + */ + void compute_dim3_offsets() ; + + /*! + * \brief Computes and stores the offsets at which + * each slice on the 4th dimension starts. + */ + void compute_dim4_offsets() ; + + /*! + * \brief Contains the offsets at which each dim1 slice + * starts. Each element corresponds to the corresponding + * dim1 slice (1st element -> 1st dim1 slice). + */ + std::vector _dim1_offsets ; + /*! + * \brief Contains the offsets at which each dim2 slice + * starts. Each element corresponds to the corresponding + * y slice (1st element -> 1st dim2 slice). + */ + std::vector _dim2_offsets ; + /*! + * \brief Contains the offsets at which each dim3 slice + * starts. Each element corresponds to the corresponding + * x slice (1st element -> 1st dim3 slice). + */ + std::vector _dim3_offsets ; + /*! + * \brief Contains the offsets at which each dim4 slice + * starts. Each element corresponds to the corresponding + * x slice (1st element -> 1st dim4 slice). + */ + std::vector _dim4_offsets ; + } ; // operators /*! * \brief Addition operator. 
* \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix4D operator + (Matrix4D m, T value) { Matrix4D other(m) ; m += value ; return m ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix4D operator - (Matrix4D m, T value) { Matrix4D other(m) ; m -= value ; return m ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix4D operator * (Matrix4D m, T value) { Matrix4D other(m) ; m *= value ; return m ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template const Matrix4D operator / (Matrix4D m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix4D other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. 
*/ template std::ostream& operator << (std::ostream& stream, const Matrix4D& m) { m.print(stream) ; return stream ; } // method implementation template Matrix4D::Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4) - : Matrix({dim1, dim2, dim3, dim4}, 0) + : Matrix4D(dim1, dim2, dim3, dim4, 0) {} template Matrix4D::Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) - : Matrix({dim1, dim2, dim3, dim4}, value) -{} + : Matrix({dim1, dim2, dim3, dim4}, value), + _dim1_offsets(dim1), + _dim2_offsets(dim2), + _dim3_offsets(dim3), + _dim4_offsets(dim4) +{ this->compute_dim1_offsets() ; + this->compute_dim2_offsets() ; + this->compute_dim3_offsets() ; + this->compute_dim4_offsets() ; +} template Matrix4D::Matrix4D(const Matrix4D &other) : Matrix(other) -{} +{ this->_dim1_offsets = other._dim1_offsets ; + this->_dim2_offsets = other._dim2_offsets ; + this->_dim3_offsets = other._dim3_offsets ; + this->_dim4_offsets = other._dim4_offsets ; +} template Matrix4D::Matrix4D(const std::string &file_address) { this->_dim = {0,0,0,0} ; this->_data = std::vector() ; this->_dim_size = this->_dim.size() ; this->_data_size = this->_data.size() ; this->_dim_prod = std::vector(this->_dim_size, 0) ; std::ifstream file(file_address, std::ifstream::in) ; if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! cannot open %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } std::string buffer_str ; std::vector buffer_t ; std::vector dim ; // read 1st line getline(file, buffer_str) ; // empty line if(buffer_str.size() == 0) { // this file only contains one eol char and should be considered as empty, // -> returns empty matrix not an error if(file.peek() == EOF and file.eof()) { file.close() ; return ; } file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! 
while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } bool found_4d_header = this->is_header_4d(buffer_str) ; do { if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check empty line if(buffer_str.size() == 0) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // this is the beginning of a 3D slice -> get it using routine if(found_4d_header) { try { // get slice buffer_t.clear() ; dim.clear() ; found_4d_header = this->get_3d_slice(file_address, file, buffer_t, dim); // update data for(const auto& i : buffer_t) { this->_data.push_back(i) ; this->_data_size++ ; } // update dim only for the 1st slice (the 1st slice set the dimensions) if(this->_dim[3] == 0) { this->_dim[0] = dim[0] ; this->_dim[1] = dim[1] ; this->_dim[2] = dim[2] ; } // check dimensions of the slice else { if(dim[0] != this->_dim[0] or dim[1] != this->_dim[1] or dim[2] != this->_dim[2]) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } } this->_dim[3]++ ; } catch(std::runtime_error& e) { file.close() ; throw e ; } } // this is an error, everything between two ',,,N' header // should be read at once. The only way out of the loop // is that no more header has been read because of eof else if(not found_4d_header and not file.eof()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! 
while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } } while(found_4d_header) ; file.close() ; this->compute_dim_product() ; + + this->_dim1_offsets = std::vector(this->_dim[1]) ; + this->_dim2_offsets = std::vector(this->_dim[0]) ; + this->_dim3_offsets = std::vector(this->_dim[2]) ; + this->_dim4_offsets = std::vector(this->_dim[3]) ; + this->compute_dim1_offsets() ; + this->compute_dim2_offsets() ; + this->compute_dim3_offsets() ; + this->compute_dim4_offsets() ; } template T Matrix4D::get(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const { try { return this->get({dim1, dim2, dim3, dim4}) ; } catch(std::out_of_range& e) { throw e ; } } template void Matrix4D::set(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) { try { this->set({dim1, dim2, dim3, dim4}, value) ; } catch(std::out_of_range& e) { throw e ; } } template void Matrix4D::print(std::ostream &stream, size_t precision, size_t width, char sep) const { // if the matrix has at least one 0 dimension (no data), don't do anything if(this->_dim[0]==0 or this->_dim[1]==0 or this->_dim[2]==0 or this->_dim[3]==0) { return ; } stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; std::vector dim = this->get_dim() ; size_t n = 0 ; size_t n_tot = std::accumulate(dim.begin(), dim.end(), 1, std::multiplies()) ; for(size_t dim4=0; dim4 T& Matrix4D::operator () (size_t dim1, size_t dim2, size_t dim3, size_t dim4) -{ std::vector coord = {dim2, dim1, dim3, dim4} ; - return this->_data[this->convert_to_offset(coord)] ; -} +{ return this->_data[this->convert_to_offset(dim1, dim2, dim3, dim4)] ; } template const T& Matrix4D::operator () (size_t dim1, size_t dim2, size_t dim3, size_t dim4) const -{ std::vector coord = {dim2, dim1, dim3, dim4} ; - return this->_data[this->convert_to_offset(coord)] ; -} +{ return this->_data[this->convert_to_offset(dim1, dim2, dim3, dim4)] ; } template bool Matrix4D::is_header_3d(const std::string &str) const { if(str[0] 
== ',' and str[1] == ',' and str.find(',', 2) == std::string::npos) { return true ; } return false ; } template bool Matrix4D::is_header_4d(const std::string &str) const { if(str[0] == ',' and str[1] == ',' and str[2] == ',' and str.find(',', 3) == std::string::npos) { return true ; } return false ; } template bool Matrix4D::get_3d_slice(const std::string& file_name, std::ifstream& file, std::vector &data, std::vector &dim) const { bool found_4d_header = false ; // the flag to return dim = {0,0,0} ; std::string buffer_str ; std::vector buffer_vec ; T buffer_T ; size_t n_line = 0, n_line_data = 0 ; // number of line and of data line read size_t row_len = 0, col_len = 0 ; // length of row and column in nber of values size_t row_len_cur = 0, col_len_cur = 0 ; // current number of values read in row and col while(getline(file, buffer_str)) { if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } // check empty line if(buffer_str.size() == 0) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s (empty line)", file_name.c_str()) ; throw std::runtime_error(msg) ; } // check whether this is the beginning of a 4D slice header, if so // break if(this->is_header_4d(buffer_str)) { found_4d_header = true ; break ; } // check whether it is the beginning of a slice // 1st line in file should be if(this->is_header_3d(buffer_str)) { // check that slice have a constant number of rows if(dim[2] == 1) { col_len = col_len_cur ; // dim[0] = row_len ; // dim[1] = col_len ; } else if(col_len_cur != col_len) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } dim[2]++ ; col_len_cur = 0 ; n_line++ ; continue ; } // 1st line in file should be a header and entering // this block is forbidden if(n_line == 0) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
first line is not a slice header in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } // parse line row_len_cur = 0 ; buffer_vec.clear() ; std::istringstream buffer_ss(buffer_str) ; while(buffer_ss >> buffer_T) { buffer_vec.push_back(buffer_T) ; row_len_cur++ ; } // check for an error which likely indicates that a value could not be // casted into a type T (mixed data types in the file) if(buffer_ss.fail() and not buffer_ss.eof()) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_name.c_str()) ; throw std::runtime_error(msg) ; } // check that number of column is constant if(n_line_data == 0) { row_len = row_len_cur ; } else if(row_len_cur != row_len) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } // update matrix content for(auto i : buffer_vec) { data.push_back(i) ; } col_len_cur++ ; n_line_data++ ; n_line++ ; // update dimension dim[0] = row_len_cur ; dim[1] = col_len_cur ; } // check dimensions of last slice if(col_len_cur != dim[1]) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
slice have variable dimensions 333 in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } return found_4d_header ; } +template +void Matrix4D::compute_dim1_offsets() +{ for(size_t i=0; i_dim[1]; i++) + { this->_dim1_offsets[i] = i * this->_dim_prod[1] ; } +} + +template +void Matrix4D::compute_dim2_offsets() +{ for(size_t i=0; i_dim[0]; i++) + { this->_dim2_offsets[i] = i * this->_dim_prod[0] ; } +} + +template +void Matrix4D::compute_dim3_offsets() +{ for(size_t i=0; i_dim[2]; i++) + { this->_dim3_offsets[i] = i * this->_dim_prod[2] ; } +} + +template +void Matrix4D::compute_dim4_offsets() +{ for(size_t i=0; i_dim[3]; i++) + { this->_dim4_offsets[i] = i * this->_dim_prod[3] ; } +} + +template +size_t Matrix4D::convert_to_offset(size_t dim1, + size_t dim2, + size_t dim3, + size_t dim4) const +{ /* + size_t offset = 0 ; + + for(size_t i=0; i_dim_size; i++) + { offset += coord[i] * this->_dim_prod[i] ; } + + return offset ; + */ + size_t offset = this->_dim1_offsets[dim1] + + this->_dim2_offsets[dim2] + + this->_dim3_offsets[dim3] + + this->_dim4_offsets[dim4] ; + return offset ; +} + #endif // MATRIX4D_HPP diff --git a/src/Random/Random.cpp b/src/Random/Random.cpp index 201bb70..fdc7d02 100755 --- a/src/Random/Random.cpp +++ b/src/Random/Random.cpp @@ -1,30 +1,46 @@ #include "Random.hpp" +#include +#include bool rand_bernoulli(double p) { std::bernoulli_distribution dist(p) ; return dist(getRandomGenerator()) ; } std::vector rand_bernoulli(double p, size_t n) { std::vector vector(n) ; std::bernoulli_distribution dist(p) ; for(size_t i=0; i dist(m, sd) ; return dist(getRandomGenerator()) ; } std::vector rand_normal(double m, double sd, double n) { std::vector vector(n) ; std::normal_distribution dist(m, sd) ; for(size_t i=0; i #include #include #include "RandomNumberGenerator.hpp" /*! * \brief Generates a random number from a * Bernouilli distribution of parameter p. * \param p the probability of success. * \return a random number. 
*/ bool rand_bernoulli(double p) ; /*! * \brief Generates n random number from a * Bernouilli distribution of parameter p. * Not faster than rand_bernoulli(double p) * \param p the probability of success. * \param n the number of values to sample. * \return a vector of n random numbers. */ std::vector rand_bernoulli(double p, size_t n) ; /*! * \brief Generates a random number from a * Normal distribution of mean m and standard * deviation sd. * \param m the mean. * \param sd the standard deviation. * \return a random number. */ double rand_normal(double m, double sd) ; /*! * \brief Generates n random numbers from a * Normal distribution of mean m and standard * deviation sd. * More efficient for sampling than * rand_normal(double m, double sd). * \param m the mean. * \param sd the standard deviation. * \param n the number of values to sample. * \return a vector of n random numbers. */ std::vector rand_normal(double m, double sd, size_t n) ; +/*! + * Generates a random string made of [0-9a-zA-Z] + * characters. + * \param length the length of the string to + * generate. + * \return the generated string. + */ +std::string rand_string(size_t length) ; + /*! Generates a real random number from a uniform * distribution comprised between min and max. * \param min the lower limit of the distribution. * \param max the upper limit of the distribution. * \return a random number. */ template T rand_real_uniform(T min, T max) { std::uniform_real_distribution dist(min, max) ; return dist(getRandomGenerator()) ; } /*! Generates n real random numbers from a uniform * distribution comprised between min and max. * \param min the lower limit of the distribution. * \param max the upper limit of the distribution. * \param n the number of value to sample. * \return a vector of n random number. 
*/ template std::vector rand_real_uniform(T min, T max, size_t n) { assert(n > 0) ; std::vector vector(n) ; std::uniform_real_distribution dist(min, max) ; for(size_t i=0; i T rand_int_uniform(T min, T max) { std::uniform_int_distribution dist(min, max) ; return dist(getRandomGenerator()) ; } /*! Generates n random integers from a uniform * distribution comprised between min and max. * \param min the lower limit of the distribution. * \param max the upper limit of the distribution. * \param n the number of value to sample. * \return a vector of n random number. */ template std::vector rand_int_uniform(T min, T max, size_t n) { assert(n > 0) ; std::vector vector(n) ; std::uniform_int_distribution dist(min, max) ; for(size_t i=0; i #include #include #include -#include +#include #include #include #include #include #include // std::invalid_argument std::string file_bed = "/local/groux/scATAC-seq/data/toy_data/peaks.bed" ; std::string file_bam = "/local/groux/scATAC-seq/data/toy_data/sc_reads.bam" ; std::string file_bai = "/local/groux/scATAC-seq/data/toy_data/sc_reads.bam.bai" ; // GenomeRegion test suite SUITE(GenomeRegion) { // displays message TEST(message) { std::cout << "Starting GenomicTools tests..." 
<< std::endl ; } // tests vonstructor with value TEST(constructor_value) { std::string chr = "chr1" ; int idx = 0 ; GenomeRegion r1(chr, idx, 0, 10) ; CHECK_EQUAL(chr, r1.chromosome) ; CHECK_EQUAL(0, r1.start) ; CHECK_EQUAL(10, r1.end) ; CHECK_EQUAL(10, r1.length) ; GenomeRegion r2(chr, idx, 1, 10) ; CHECK_EQUAL(chr, r2.chromosome) ; CHECK_EQUAL(1, r2.start) ; CHECK_EQUAL(10, r2.end) ; CHECK_EQUAL(9, r2.length) ; CHECK_THROW(GenomeRegion(chr, idx, -1, 10), std::invalid_argument) ; CHECK_THROW(GenomeRegion(chr, idx, 0, -10), std::invalid_argument) ; } // tests constructFragment factory function to create regions from bam /* TEST(test_contructFragment) { // expected content of bam file std::vector regions ; regions.push_back(GenomeRegion("chr1", 400, 480)) ; regions.push_back(GenomeRegion("chr1", 470, 550)) ; regions.push_back(GenomeRegion("chr1", 560, 800)) ; regions.push_back(GenomeRegion("chr1", 560, 640)) ; regions.push_back(GenomeRegion("chr1", 610, 690)) ; regions.push_back(GenomeRegion("chr1", 670, 750)) ; regions.push_back(GenomeRegion("chr1", 730, 810)) ; regions.push_back(GenomeRegion("chr1", 770, 850)) ; regions.push_back(GenomeRegion("chr1", 950, 1150)) ; regions.push_back(GenomeRegion("chr1", 960, 1040)) ; regions.push_back(GenomeRegion("chr1", 1010, 1090)) ; regions.push_back(GenomeRegion("chr1", 1060, 1140)) ; regions.push_back(GenomeRegion("chr1", 1070, 1150)) ; regions.push_back(GenomeRegion("chr1", 1350, 1430)) ; regions.push_back(GenomeRegion("chr1", 1360, 1440)) ; regions.push_back(GenomeRegion("chr1", 1410, 1490)) ; regions.push_back(GenomeRegion("chr1", 1500, 1600)) ; regions.push_back(GenomeRegion("chr1", 1600, 1700)) ; regions.push_back(GenomeRegion("chr2", 400, 480)) ; regions.push_back(GenomeRegion("chr2", 470, 550)) ; regions.push_back(GenomeRegion("chr2", 560, 800)) ; regions.push_back(GenomeRegion("chr2", 560, 640)) ; regions.push_back(GenomeRegion("chr2", 610, 690)) ; regions.push_back(GenomeRegion("chr2", 670, 750)) ; 
regions.push_back(GenomeRegion("chr2", 730, 810)) ; regions.push_back(GenomeRegion("chr2", 770, 850)) ; regions.push_back(GenomeRegion("chr2", 950, 1150)) ; regions.push_back(GenomeRegion("chr2", 960, 1040)) ; regions.push_back(GenomeRegion("chr2", 1010, 1090)) ; regions.push_back(GenomeRegion("chr2", 1060, 1140)) ; regions.push_back(GenomeRegion("chr2", 1070, 1150)) ; regions.push_back(GenomeRegion("chr2", 1350, 1430)) ; regions.push_back(GenomeRegion("chr2", 1360, 1440)) ; regions.push_back(GenomeRegion("chr2", 1410, 1490)) ; regions.push_back(GenomeRegion("chr2", 1500, 1600)) ; regions.push_back(GenomeRegion("chr2", 1600, 1700)) ; seqan::BamAlignmentRecord record ; std::string bam_path = "src/Unittests/data/sc_reads.bam" ; // read file for fragments starting on + strand seqan::BamFileIn bam_file(bam_path.c_str()) ; // header seqan::BamHeader bam_header ; seqan::readHeader(bam_header, bam_file) ; for(size_t i=0; not seqan::atEnd(bam_file); i++) { seqan::readRecord(record, bam_file) ; if(seqan::hasFlagFirst(record) and not seqan::hasFlagRC(record)) { std::cout << regions[i] << " " << GenomeRegion::constructFragment(record) << std::endl ; CHECK_EQUAL(regions[i], GenomeRegion::constructFragment(record)) ; } } seqan::close(bam_file) ; // read file for fragments starting on - strand seqan::BamFileIn bam_file(bam_path.c_str()) ; // header seqan::BamHeader bam_header ; seqan::readHeader(bam_header, bam_file) ; for(size_t i=0; not seqan::atEnd(bam_file); i++) { seqan::readRecord(record, bam_file) ; if(seqan::hasFlagFirst(record) and seqan::hasFlagRC(record)) { CHECK_EQUAL(regions[i], GenomeRegion::constructFragment(record)) ; } } seqan::close(bam_file) ; } */ TEST(test_contructRead) { // expected content of bam file std::list regions_exp ; // chromosome 1 -> has index 0 in BAM file header regions_exp.push_back(GenomeRegion("chr1", 0, 400, 435)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 400, 435)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 445, 480)) ; 
regions_exp.push_back(GenomeRegion("chr1", 0, 445, 480)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 470, 505)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 470, 505)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 515, 550)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 515, 550)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 560, 595)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 560, 595)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 560, 595)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 560, 595)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 605, 640)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 605, 640)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 610, 645)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 610, 645)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 655, 690)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 655, 690)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 670, 705)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 670, 705)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 715, 750)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 715, 750)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 730, 765)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 730, 765)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 765, 800)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 765, 800)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 770, 805)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 770, 805)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 775, 810)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 775, 810)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 815, 850)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 815, 850)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 950, 985)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 950, 985)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 960, 995)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 960, 995)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1005, 
1040)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1005, 1040)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1010, 1045)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1010, 1045)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1055, 1090)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1055, 1090)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1060, 1095)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1060, 1095)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1070, 1105)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1070, 1105)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1105, 1140)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1105, 1140)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1115, 1150)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1115, 1150)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1115, 1150)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1115, 1150)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1350, 1385)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1350, 1385)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1360, 1395)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1360, 1395)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1395, 1430)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1395, 1430)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1405, 1440)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1405, 1440)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1410, 1445)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1410, 1445)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1455, 1490)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1455, 1490)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1500, 1535)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1500, 1535)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1565, 1600)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1565, 1600)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1600, 1635)) ; 
regions_exp.push_back(GenomeRegion("chr1", 0, 1600, 1635)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1665, 1700)) ; regions_exp.push_back(GenomeRegion("chr1", 0, 1665, 1700)) ; // chromosome 2 -> has index 1 in BAM file header regions_exp.push_back(GenomeRegion("chr2", 1, 400, 435)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 400, 435)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 445, 480)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 445, 480)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 470, 505)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 470, 505)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 515, 550)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 515, 550)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 560, 595)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 560, 595)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 560, 595)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 560, 595)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 605, 640)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 605, 640)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 610, 645)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 610, 645)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 655, 690)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 655, 690)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 670, 705)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 670, 705)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 715, 750)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 715, 750)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 730, 765)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 730, 765)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 765, 800)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 765, 800)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 770, 805)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 770, 805)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 775, 810)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 775, 
810)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 815, 850)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 815, 850)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 950, 985)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 950, 985)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 960, 995)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 960, 995)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1005, 1040)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1005, 1040)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1010, 1045)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1010, 1045)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1055, 1090)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1055, 1090)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1060, 1095)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1060, 1095)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1070, 1105)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1070, 1105)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1105, 1140)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1105, 1140)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1115, 1150)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1115, 1150)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1115, 1150)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1115, 1150)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1350, 1385)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1350, 1385)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1360, 1395)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1360, 1395)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1395, 1430)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1395, 1430)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1405, 1440)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1405, 1440)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1410, 1445)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1410, 1445)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1455, 
1490)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1455, 1490)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1500, 1535)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1500, 1535)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1565, 1600)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1565, 1600)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1600, 1635)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1600, 1635)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1665, 1700)) ; regions_exp.push_back(GenomeRegion("chr2", 1, 1665, 1700)) ; // open file seqan::BamFileIn bam_file ; if (!seqan::open(bam_file, file_bam.c_str())) { char msg[4096] ; sprintf(msg, "ERROR: could not open input file %s", file_bam.c_str()) ; } // read file seqan::BamAlignmentRecord record ; seqan::BamHeader header ; seqan::readHeader(header, bam_file) ; std::list regions_val ; while(not seqan::atEnd(bam_file)) { seqan::readRecord(record, bam_file) ; regions_val.push_back(GenomeRegion::constructRead(record, bam_file)) ; } seqan::close(bam_file) ; // compare CHECK_EQUAL(regions_exp.size(), regions_val.size()) ; auto iter_exp = regions_exp.begin() ; auto iter_val = regions_val.begin() ; while(iter_exp != regions_exp.end()) { CHECK_EQUAL(*iter_exp, *iter_val) ; iter_exp++ ; iter_val++ ; } } // tests the method to check overlaps TEST(overlap) { GenomeRegion r1("chr1", 0, 20, 30) ; // reference GenomeRegion r2("chr1", 0, 20, 30) ; // same as reference GenomeRegion r3("chr1", 0, 0, 45) ; // totally contain reference GenomeRegion r4("chr1", 0, 0, 10) ; // no overlap, upstream reference GenomeRegion r5("chr1", 0, 15, 25) ; // partial overlap reference GenomeRegion r6("chr1", 0, 22, 29) ; // inside reference GenomeRegion r7("chr1", 0, 25, 35) ; // partial overlap reference GenomeRegion r8("chr1", 0, 35, 45) ; // no overlap, downstream reference GenomeRegion r9("chr2", 1, 20, 30) ; // diff chromosome // always check reciprocity CHECK_EQUAL(true, r1 | r1) ; CHECK_EQUAL(true, r1 | r2) ; 
CHECK_EQUAL(true, r2 | r1) ; CHECK_EQUAL(true, r1 | r3) ; CHECK_EQUAL(true, r3 | r1) ; CHECK_EQUAL(false, r1 | r4) ; CHECK_EQUAL(false, r4 | r1) ; CHECK_EQUAL(true, r1 | r5) ; CHECK_EQUAL(true, r5 | r1) ; CHECK_EQUAL(true, r1 | r6) ; CHECK_EQUAL(true, r6 | r1) ; CHECK_EQUAL(true, r1 | r7) ; CHECK_EQUAL(true, r7 | r1) ; CHECK_EQUAL(false, r1 | r8) ; CHECK_EQUAL(false, r8 | r1) ; CHECK_EQUAL(false, r1 | r9) ; CHECK_EQUAL(false, r9 | r1) ; } // tests the methods to get overlap length TEST(overlap_len) { GenomeRegion r1("chr1", 0, 10, 20) ; // reference GenomeRegion r2("chr1", 0, 10, 20) ; // same as reference GenomeRegion r3("chr1", 0, 0, 45) ; // totally contain reference GenomeRegion r4("chr2", 1, 10, 20) ; // diff chromosome // always check reciprocity CHECK_EQUAL(10, r1.overlap_len(r1)) ; CHECK_EQUAL(10, r1.overlap_len(r2)) ; CHECK_EQUAL(10, r1.overlap_len(r2)) ; CHECK_EQUAL(10, r1.overlap_len(r3)) ; CHECK_EQUAL(10, r1.overlap_len(r3)) ; CHECK_EQUAL(0, r1.overlap_len(r4)) ; CHECK_EQUAL(0, r1.overlap_len(r4)) ; // slide a smaller region along reference, from before to after std::vector overlaps = {0,0,1,2,3,4,4,4,4,4,4,4,3,2,1,0,0,0} ; int len = 4 ; for(int i=0, start=5; start<23; i++, start++) { int end = start + len ; GenomeRegion s1("chr1", 0, start, end) ; CHECK_EQUAL(overlaps[i], r1.overlap_len(s1)) ; CHECK_EQUAL(overlaps[i], s1.overlap_len(r1)) ; } } // tests the is upstream and is downstream operators TEST(upstream_downstream) { GenomeRegion r1("chr1", 0, 10, 20) ; // reference GenomeRegion r2("chr1", 0, 10, 20) ; // same as reference GenomeRegion r3("chr1", 0, 0, 45) ; // totally contain reference GenomeRegion r4("chr2", 1, 10, 20) ; // diff chromosome (downstream has 0 < 1) // always check reciprocity CHECK_EQUAL(false, r1 < r1) ; CHECK_EQUAL(false, r1 > r1) ; CHECK_EQUAL(false, r1 < r2) ; CHECK_EQUAL(false, r1 > r2) ; CHECK_EQUAL(false, r1 < r3) ; CHECK_EQUAL(false, r1 < r3) ; CHECK_EQUAL(false, r3 < r1) ; CHECK_EQUAL(false, r3 < r1) ; // not on the same 
chromosome -> depends on the index value CHECK_EQUAL(r1 < r4, true) ; CHECK_EQUAL(r1 > r4, false) ; CHECK_EQUAL(r4 < r1, false) ; CHECK_EQUAL(r4 > r1, true) ; // slide a smaller region along reference, from before to after std::vector s1_upstream = {1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} ; // s1 < r1 std::vector r1_downstream = {1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} ; // r1 > s1 std::vector s1_downstream = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1} ; // s1 > r1 std::vector r1_upstream = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1} ; // r1 < s1 int len = 4 ; for(int i=0, start=5; start<23; i++, start++) { // the sliding one int end = start + len ; GenomeRegion s1("chr1", 0, start, end) ; CHECK_EQUAL(s1_upstream[i], s1 < r1) ; CHECK_EQUAL(r1_downstream[i], r1 > s1) ; CHECK_EQUAL(r1_upstream[i], r1 < s1) ; CHECK_EQUAL(s1_downstream[i], s1 > r1) ; } } } // CorrelationMatrixCreator test suite SUITE(CorrelationMatrixCreator) { // displays message TEST(message) { std::cout << "Starting CorrelationMatrixCreator tests..." 
<< std::endl ; } // tests matrix creation with full fragments TEST(create_matrix_fragment) { CorrelationMatrixCreator creator(file_bed, file_bam, file_bai, -500, 500, 100, CorrelationMatrixCreator::FRAGMENT) ; - matrix2d_i m_val = creator.create_matrix() ; - matrix2d_i m_exp(2, - vector_i(9, 0)) ; - m_exp[0][0] = 420 ; m_exp[0][1] = 480 ; m_exp[0][2] = 380 ; - m_exp[0][3] = 0 ; m_exp[0][4] = 440 ; m_exp[0][5] = 600 ; - m_exp[0][6] = 0 ; m_exp[0][7] = 0 ; m_exp[0][8] = 400 ; - - m_exp[1][0] = 420 ; m_exp[1][1] = 480 ; m_exp[1][2] = 380 ; - m_exp[1][3] = 0 ; m_exp[1][4] = 440 ; m_exp[1][5] = 600 ; - m_exp[1][6] = 0 ; m_exp[1][7] = 0 ; m_exp[1][8] = 400 ; - - CHECK_EQUAL(m_exp.size(), m_val.size()) ; - CHECK_EQUAL(m_exp[0].size(), m_val[0].size()) ; - - for(size_t i=0; i m_val = creator.create_matrix() ; + Matrix2D m_exp(2, 9, 0) ; + m_exp(0,0) = 420 ; m_exp(0,1) = 480 ; m_exp(0,2) = 380 ; + m_exp(0,3) = 0 ; m_exp(0,4) = 440 ; m_exp(0,5) = 600 ; + m_exp(0,6) = 0 ; m_exp(0,7) = 0 ; m_exp(0,8) = 400 ; + + m_exp(1,0) = 420 ; m_exp(1,1) = 480 ; m_exp(1,2) = 380 ; + m_exp(1,3) = 0 ; m_exp(1,4) = 440 ; m_exp(1,5) = 600 ; + m_exp(1,6) = 0 ; m_exp(1,7) = 0 ; m_exp(1,8) = 400 ; + + CHECK_EQUAL(m_exp.get_nrow(), m_val.get_nrow()) ; + CHECK_EQUAL(m_exp.get_ncol(), m_val.get_ncol()) ; + + for(size_t i=0; i m_val = creator.create_matrix() ; + Matrix2D m_exp(2, 9, 0) ; + m_exp(0,0) = 2 ; m_exp(0,1) = 6 ; m_exp(0,2) = 4 ; + m_exp(0,3) = 0 ; m_exp(0,4) = 2 ; m_exp(0,5) = 8 ; + m_exp(0,6) = 0 ; m_exp(0,7) = 0 ; m_exp(0,8) = 4 ; + + m_exp(1,0) = 2 ; m_exp(1,1) = 6 ; m_exp(1,2) = 4 ; + m_exp(1,3) = 0 ; m_exp(1,4) = 2 ; m_exp(1,5) = 8 ; + m_exp(1,6) = 0 ; m_exp(1,7) = 0 ; m_exp(1,8) = 4 ; + + CHECK_EQUAL(m_exp.get_nrow(), m_val.get_nrow()) ; + CHECK_EQUAL(m_exp.get_ncol(), m_val.get_ncol()) ; + + for(size_t i=0; i m_val = creator.create_matrix() ; + Matrix2D m_exp(2, 9, 0) ; + m_exp(0,0) = 280 ; m_exp(0,1) = 250 ; m_exp(0,2) = 310 ; + m_exp(0,3) = 0 ; m_exp(0,4) = 280 ; m_exp(0,5) = 420 
; + m_exp(0,6) = 0 ; m_exp(0,7) = 0 ; m_exp(0,8) = 350 ; + + m_exp(1,0) = 280 ; m_exp(1,1) = 250 ; m_exp(1,2) = 310 ; + m_exp(1,3) = 0 ; m_exp(1,4) = 280 ; m_exp(1,5) = 420 ; + m_exp(1,6) = 0 ; m_exp(1,7) = 0 ; m_exp(1,8) = 350 ; + + CHECK_EQUAL(m_exp.get_nrow(), m_val.get_nrow()) ; + CHECK_EQUAL(m_exp.get_ncol(), m_val.get_ncol()) ; + + for(size_t i=0; i m_val = creator.create_matrix() ; + Matrix2D m_exp(2, 9, 0) ; + m_exp(0,0) = 8 ; m_exp(0,1) = 8 ; m_exp(0,2) = 8 ; + m_exp(0,3) = 0 ; m_exp(0,4) = 8 ; m_exp(0,5) = 12 ; + m_exp(0,6) = 0 ; m_exp(0,7) = 0 ; m_exp(0,8) = 10 ; + + m_exp(1,0) = 8 ; m_exp(1,1) = 8 ; m_exp(1,2) = 8 ; + m_exp(1,3) = 0 ; m_exp(1,4) = 8 ; m_exp(1,5) = 12 ; + m_exp(1,6) = 0 ; m_exp(1,7) = 0 ; m_exp(1,8) = 10 ; + + CHECK_EQUAL(m_exp.get_nrow(), m_val.get_nrow()) ; + CHECK_EQUAL(m_exp.get_ncol(), m_val.get_ncol()) ; + + for(size_t i=0; i #include // accumulate() #include #include #include #include /*! * \brief Given a matrix and an offset, this methods converts * the offset into a coordinates vector (row, col, ...). It is * a simple copy/paste of Matrix::convert_to_coord() which is * private. * \param m a matrix. * \param offset an offset * \return a vector of coordinates (row,col,...) corresponding to * the offset for the given matrix. */ std::vector convert_to_coord(const Matrix& m, size_t offset) { std::vector dim = m.get_dim() ; // (row, col, ...) format if(dim.size() > 1) { std::swap(dim[0], dim[1]) ; } // (x,y,...) format std::vector coord(dim.size(), 0) ; std::vector dim_prod(dim.size(), 0) ; dim_prod[0] = 1 ; if(dim.size() > 1) { dim_prod[1] = dim[0] ; } if(dim.size() > 2) { for(size_t i=2; i=0; i--) { size_t c = offset / dim_prod[i] ; coord[i] = c ; offset -= (dim_prod[i]*c) ; } if(dim.size() > 1) { std::swap(coord[0], coord[1]) ; } // (row,col,...) format return coord ; } -/* // Matrix test suite SUITE(Matrix) { // displays message TEST(message) { std::cout << "Starting Matrix tests..." 
<< std::endl ; } // tests normal constructor TEST(constructor) { std::vector dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_1.size(), m1.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_1, m1.get_dim(), dim_1.size()) ; CHECK_EQUAL(data_size_1, m1.get_data_size()) ; // always has a zero dimension : 0 / 0x1 / 0x1x2/ ... / 0x1x...x10 Matrix m2(dim_2) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_2.size(), m2.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_2, m2.get_dim(), dim_2.size()) ; CHECK_EQUAL(data_size_2, m2.get_data_size()) ; CHECK_EQUAL(data_size_2, m2.get_data().size()) ; // is a 0 dimension matrix : 0 / 0x0 / 0x0x...x0 Matrix m3(dim_3) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_3.size(), m3.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_3, m3.get_dim(), dim_3.size()) ; CHECK_EQUAL(data_size_3, m3.get_data_size()) ; CHECK_EQUAL(data_size_3, m3.get_data().size()) ; } } // tests contructor with value TEST(constructor_value) { std::vector dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_1.size(), m1.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_1, m1.get_dim(), dim_1.size()) ; CHECK_EQUAL(data_size_1, m1.get_data_size()) ; for(const auto x : m1.get_data()) { CHECK_EQUAL(i, x) ; } // always has a zero dimension : 0 / 0x1 / 0x1x2/ ... 
/ 0x1x...x10 Matrix m2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_2.size(), m2.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_2, m2.get_dim(), dim_2.size()) ; CHECK_EQUAL(data_size_2, m2.get_data_size()) ; CHECK_EQUAL(data_size_2, m2.get_data().size()) ; for(const auto x : m2.get_data()) { CHECK_EQUAL(i, x) ; } // is a 0 dimension matrix : 0 / 0x0 / 0x0x...x0 Matrix m3(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_3.size(), m3.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_3, m3.get_dim(), dim_3.size()) ; CHECK_EQUAL(data_size_3, m3.get_data_size()) ; CHECK_EQUAL(data_size_3, m3.get_data().size()) ; for(const auto x : m3.get_data()) { CHECK_EQUAL(i, x) ; } } } // tests the get() method, compare a value get with offset with the value get with coordinates // (computed from offset) TEST(get) { std::vector dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... 
/ 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... 
/ 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... 
/ 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... 
/ 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m1_2(m1) ; CHECK_EQUAL(true, m1 == m1_2) ; // always has a zero dimension : 0 / 0x1 / 0x1x2/ ... 
/ 0x1x...x10 Matrix m2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m2_2(m2) ; CHECK_EQUAL(true, m2 == m2_2) ; // is a 0 dimension matrix : 0 / 0x0 / 0x0x...x0 Matrix m3(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j m3_2(m3) ; CHECK_EQUAL(true, m3 == m3_2) ; } } // tests the () operator TEST(parenthesis_operator) { std::vector dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim = {i,j} ; Matrix2D m(i,j) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; } } } // tests contructor with value TEST(constructor_value) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { std::vector dim = {i,j} ; Matrix2D m(i,j,n) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; for(const auto& i : m.get_data()) { CHECK_EQUAL(n, i) ; } } } } // tests the copy constructor TEST(constructor_copy) { for(size_t i=1; i<11; i++) { std::vector dim ; // has non-0 dimensions : 1x2 / 2x3 / ... 
dim = {i, i+1} ; Matrix2D m1(i,i+1) ; for(size_t j=0; j m1_2(m1) ; CHECK_EQUAL(true, m1 == m1_2) ; // always has a zero dimension : // has a zero dimension : 0x1 / 0x2 / ... dim = {0, i} ; Matrix2D m2(0,i) ; for(size_t j=0; j m2_2(m2) ; CHECK_EQUAL(true, m2 == m2_2) ; // is a 0 dimension matrix : 0x0 dim = {0, 0} ; Matrix2D m3(0,0) ; for(size_t j=0; j m3_2(m3) ; CHECK_EQUAL(true, m3 == m3_2) ; } } // tests the get() method, compare a value get with offset with the value get with coordinates // (computed from offset) TEST(get) { for(size_t i=1; i<11; i++) { std::vector dim ; // has non-0 dimensions : 1x2 / 2x3 / ... Matrix2D m1(i,i+1, i) ; dim = {i,i+1} ; for(size_t j=0; j coord = convert_to_coord(m1, j) ; CHECK_EQUAL(m1.get(j), m1.get(coord[0], coord[1])) ; } // has a zero dimension : 0x1 / 0x2 / ... Matrix2D m2(0,i,i) ; dim = {0,i} ; for(size_t j=0; j coord = convert_to_coord(m2, j) ; CHECK_EQUAL(m2.get(j), m2.get(coord[0], coord[1])) ; } // has zero dimensions : 0x0 Matrix2D m3(0,0,i) ; dim = {0,0} ; for(size_t j=0; j coord = convert_to_coord(m3, j) ; CHECK_EQUAL(m3.get(j), m3.get(coord[0], coord[1])) ; } } } // test the set() method, set a value and then check it using get() TEST(set) { for(size_t i=1; i<11; i++) { std::vector dim ; // has non-0 dimensions : 1x2 / 2x3 / ... Matrix2D m1(i,i+1, i) ; dim = {i,i+1} ; for(size_t j=0; j coord = convert_to_coord(m1, j) ; m1.set(coord[0], coord[1], j) ; } for(size_t j=0; j m2(0,i,i) ; dim = {0,i} ; for(size_t j=0; j coord = convert_to_coord(m2, j) ; m2.set(coord[0], coord[1], j) ; } for(size_t j=0; j m3(0,0,i) ; dim = {0,0} ; for(size_t j=0; j coord = convert_to_coord(m3, j) ; m3.set(coord[0], coord[1], j) ; } for(size_t j=0; j m1(i,i+1) ; CHECK_EQUAL(i, m1.get_nrow()) ; // always has a zero dimension : // has a zero dimension : 0x1 / 0x2 / ... 
Matrix2D m2(0,i) ; CHECK_EQUAL(0, m2.get_nrow()) ; // is a 0 dimension matrix : 0x0 Matrix2D m3(0,0) ; CHECK_EQUAL(0, m3.get_nrow()) ; } } // tests get_ncol() TEST(get_ncol) { for(size_t i=1; i<11; i++) { // has non-0 dimensions : 1x2 / 2x3 / ... Matrix2D m1(i,i+1) ; CHECK_EQUAL(i+1, m1.get_ncol()) ; // always has a zero dimension : // has a zero dimension : 0x1 / 0x2 / ... Matrix2D m2(0,i) ; CHECK_EQUAL(i, m2.get_ncol()) ; // is a 0 dimension matrix : 0x0 Matrix2D m3(0,0) ; CHECK_EQUAL(0, m3.get_ncol()) ; } } // tests get_row() TEST(get_row) { for(size_t i=0; i<11; i++) { Matrix2D m(5,i) ; for(size_t j=0; j row(m.get_ncol()) ; for(size_t n=0, k=j*m.get_ncol(); n m(i,5) ; for(size_t j=0; j col(m.get_nrow()) ; for(size_t n=0, k=j; n m(5,i) ; for(size_t j=0; j new_row(i, 999) ; m.set_row(j, new_row) ; CHECK_EQUAL(i, m.get_row(j).size()) ; CHECK_ARRAY_EQUAL(new_row, m.get_row(j), new_row.size()) ; } CHECK_THROW(m.set_row(9999, std::vector(i,0)), std::out_of_range) ; CHECK_THROW(m.set_row(0, std::vector(i+1,0)), std::invalid_argument) ; } } // tests set_col() TEST(set_col) { for(size_t i=0; i<11; i++) { Matrix2D m(i,5) ; for(size_t j=0; j new_col(i, 999) ; m.set_col(j, new_col) ; CHECK_EQUAL(i, m.get_col(j).size()) ; CHECK_ARRAY_EQUAL(new_col, m.get_col(j), new_col.size()) ; } CHECK_THROW(m.set_col(9999, std::vector(i,0)), std::out_of_range) ; CHECK_THROW(m.set_col(0, std::vector(i+1,0)), std::invalid_argument) ; } } TEST(parenthesis_operator) { for(size_t i=1; i<11; i++) { std::vector dim ; // has non-0 dimensions : 1x2 / 2x3 / ... 
Matrix2D m1(i,i+1, i) ; dim = {i,i+1} ; for(size_t j=0; j coord = convert_to_coord(m1, j) ; m1(coord[0], coord[1]) = j ; } for(size_t j=0; j m2(0,i,i) ; dim = {0,i} ; for(size_t j=0; j coord = convert_to_coord(m2, j) ; m2(coord[0], coord[1]) = j ; } for(size_t j=0; j m3(0,0,i) ; dim = {0,0} ; for(size_t j=0; j coord = convert_to_coord(m3, j) ; m3(coord[0], coord[1]) = j ; } for(size_t j=0; j> v_int({{0,1,2,3},{4,5,6,7}}) ; std::vector> v_char({{'A','A','A'},{'C','C','C'}, {'G','G','G'},{'T','T','T'}}) ; std::vector> v_double({{0.,1.,2.,3.},{4.,5.,6.,7.}}) ; Matrix2D m_int(2,4) ; m_int.set_row(0, {0,1,2,3}) ; m_int.set_row(1, {4,5,6,7}) ; Matrix2D m_char(4,3) ; m_char.set_row(0, {'A','A','A'}) ; m_char.set_row(1, {'C','C','C'}) ; m_char.set_row(2, {'G','G','G'}) ; m_char.set_row(3, {'T','T','T'}) ; Matrix2D m_dbl(2,4) ; m_dbl.set_row(0, {0.,1.,2.,3.}) ; m_dbl.set_row(1, {4.,5.,6.,7.}) ; // matrix of int Matrix2D m_int1(file_int1) ; // this one is perfect Matrix2D m_int2(file_int2) ; // this one has inhomogeneous spaceers but is OK CHECK_EQUAL(m_int, m_int1) ; CHECK_EQUAL(m_int, m_int2) ; // matrix with only 1 int Matrix2D m_int3(file_int7) ; CHECK_EQUAL( Matrix2D(1,1,1), m_int3) ; // empty matrix (empty file) Matrix2D m_int4(file_int8) ; CHECK_EQUAL(Matrix2D(0,0), m_int4) ; // empty matrix (only eol in file) Matrix2D m_int5(file_int9) ; CHECK_EQUAL(Matrix2D(0,0), m_int5) ; // these files are not well formatted CHECK_THROW(m_int2 = Matrix2D(file_int3), std::runtime_error) ; // data are inhomogeneous CHECK_THROW(m_int2 = Matrix2D(file_int4), std::runtime_error) ; // empty line CHECK_THROW(m_int2 = Matrix2D(file_int5), std::runtime_error) ; // empty line CHECK_THROW(m_int2 = Matrix2D(file_int6), std::runtime_error) ; // empty line // matrix of char Matrix2D m_char1(file_char1) ; CHECK_EQUAL(m_char, m_char1) ; // matrix of double Matrix2D m_dbl1(file_double1) ; CHECK_EQUAL(m_dbl, m_dbl1) ; // file does not exist CHECK_THROW(Matrix2D m_int2(file_ghost), 
std::runtime_error) ; } // tests file format, writting a matrix and reading it should return the // same matrix, uses set() and the == operator // loading an empty file is not allowed (has no meaning, the file is empty) TEST(file_format) { for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { Matrix2D m(i,j) ; for(size_t a=0; a m2("./src/Unittests/data/matrix2d_out.mat") ; // any matrix with at least one zero dimension is a null // matrix if(i==0 or j==0) { CHECK_EQUAL(Matrix2D(0,0), m2) ; } else { CHECK_EQUAL(m, m2) ; } } } } } SUITE(Matrix3D) { // displays message TEST(message) { std::cout << "Starting Matrix3D tests..." << std::endl ; } // tests constructor TEST(constructor) { for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; } } } } // test constructor value TEST(constructor_value) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; for(const auto& i : m.get_data()) { CHECK_EQUAL(n, i) ; } } } } } // tests copy constructor TEST(constructor_copy) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; Matrix3D m2(m) ; CHECK_EQUAL(m, m2) ; } } } } // tests contructor from file, uses the == operator TEST(constructor_file) { std::string file_int1("./src/Unittests/data/matrix3d_int1.mat") ; std::string file_int2("./src/Unittests/data/matrix3d_int2.mat") ; std::string 
file_int3("./src/Unittests/data/matrix3d_int3.mat") ; std::string file_int4("./src/Unittests/data/matrix3d_int4.mat") ; std::string file_int5("./src/Unittests/data/matrix3d_int5.mat") ; std::string file_int6("./src/Unittests/data/matrix3d_int6.mat") ; std::string file_int7("./src/Unittests/data/matrix3d_int7.mat") ; std::string file_int8("./src/Unittests/data/matrix3d_int8.mat") ; std::string file_int9("./src/Unittests/data/matrix3d_int9.mat") ; std::string file_int10("./src/Unittests/data/matrix3d_int10.mat") ; std::string file_int11("./src/Unittests/data/matrix3d_int11.mat") ; std::string file_int12("./src/Unittests/data/matrix3d_int12.mat") ; std::string file_int13("./src/Unittests/data/matrix3d_int13.mat") ; std::string file_int14("./src/Unittests/data/matrix3d_int14.mat") ; std::string file_double("./src/Unittests/data/matrix3d_double.mat") ; std::string file_ghost("./src/Unittests/data/foo.mat") ; std::vector v_int = {-1,0,2,0, 0,3,0,4, 0,0,0,0, 0,0,0,0, 0,5,-6,0, 0,7,0,0} ; std::vector v_int2 = {1} ; std::vector v_dbl = {-1.,0., 2.,0., 0.,3., 0.,4., 0.,0., 0.,0., 0.,0., 0.,0., 0.,5.,-6.,0., 0.,7., 0.,0.} ; std::vector dim = {2,4,3} ; std::vector dim2 = {1,1,1} ; // matrix of int Matrix3D m_int(file_int1) ; CHECK_EQUAL(dim.size(), m_int.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m_int.get_dim(), dim.size()) ; CHECK_EQUAL(v_int.size(), m_int.get_data_size()) ; CHECK_ARRAY_EQUAL(v_int, m_int.get_data(), v_int.size()) ; // matrix with only 1 int Matrix3D m_int2(file_int12) ; CHECK_EQUAL(Matrix3D(1,1,1,1), m_int2) ; // empty matrix (empty file) Matrix3D m_int3(file_int13) ; CHECK_EQUAL(Matrix3D(0,0,0), m_int3) ; // empty matrix (only eol in file) Matrix3D m_int4(file_int13) ; CHECK_EQUAL(Matrix3D(0,0,0), m_int4) ; // these files are not well formatted CHECK_THROW(Matrix3D m_int3(file_int2), std::runtime_error) ; // mixed data types CHECK_THROW(Matrix3D m_int3(file_int3), std::runtime_error) ; // slice of variable dim CHECK_THROW(Matrix3D m_int3(file_int4), 
std::runtime_error) ; // slice of variable dim CHECK_THROW(Matrix3D m_int3(file_int5), std::runtime_error) ; // slice of variable dim CHECK_THROW(Matrix3D m_int3(file_int6), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int7), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int8), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int9), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int10), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int11), std::runtime_error) ; // empty line // this file does not exist CHECK_THROW(Matrix3D m_int3(file_ghost), std::runtime_error) ; // matrix of double Matrix3D m_double(file_double) ; CHECK_EQUAL(dim.size(), m_double.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m_double.get_dim(), dim.size()) ; CHECK_EQUAL(v_int.size(), m_double.get_data_size()) ; CHECK_ARRAY_EQUAL(v_int, m_double.get_data(), v_int.size()) ; } // tests get() TEST(get) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; for(size_t l=0; l coord = convert_to_coord(m, l) ; CHECK_EQUAL(m.get(l), m.get(coord[0], coord[1], coord[2])) ; } } } } } // tests set() TEST(set) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; for(size_t l=0; l coord = convert_to_coord(m, l) ; m.set(coord[0], coord[1], coord[2], l) ; } for(size_t l=0; l dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; for(size_t l=0; l coord = convert_to_coord(m, l) ; m(coord[0], coord[1], coord[2]) = l ; } for(size_t l=0; l m(i,j,k) ; for(size_t a=0; a m2("./src/Unittests/data/matrix3d_out.mat") ; // any matrix with at least one zero dimension is a null // matrix if(i==0 or j==0 or k==0) { CHECK_EQUAL(Matrix3D(0,0,0), m2) ; } else { CHECK_EQUAL(m, m2) ; } } } } } } SUITE(Matrix4D) { // displays message TEST(message) { 
std::cout << "Starting Matrix4D tests..." << std::endl ; } // constructor TEST(constructor) { for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,l) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; } } } } } // test constructor value TEST(constructor_value) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,l,n) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; for(const auto& i : m.get_data()) { CHECK_EQUAL(n, i) ; } } } } } } // tests copy constructor TEST(constructor_copy) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,l,n) ; Matrix4D m2(m) ; CHECK_EQUAL(m, m2) ; } } } } } // tests contructor from file, uses the == operator TEST(constructor_file) { std::string file_int1("./src/Unittests/data/matrix4d_int1.mat") ; std::string file_int2("./src/Unittests/data/matrix4d_int2.mat") ; std::string file_int3("./src/Unittests/data/matrix4d_int3.mat") ; std::string file_int4("./src/Unittests/data/matrix4d_int4.mat") ; std::string file_int5("./src/Unittests/data/matrix4d_int5.mat") ; std::string file_int6("./src/Unittests/data/matrix4d_int6.mat") ; std::string file_int7("./src/Unittests/data/matrix4d_int7.mat") ; std::string file_int8("./src/Unittests/data/matrix4d_int8.mat") ; std::string file_int9("./src/Unittests/data/matrix4d_int9.mat") ; std::string file_int10("./src/Unittests/data/matrix4d_int10.mat") ; std::string 
file_int11("./src/Unittests/data/matrix4d_int11.mat") ; std::string file_int12("./src/Unittests/data/matrix4d_int12.mat") ; std::string file_int13("./src/Unittests/data/matrix4d_int13.mat") ; std::string file_int14("./src/Unittests/data/matrix4d_int14.mat") ; std::string file_int15("./src/Unittests/data/matrix4d_int15.mat") ; std::string file_int16("./src/Unittests/data/matrix4d_int16.mat") ; std::string file_int17("./src/Unittests/data/matrix4d_int17.mat") ; std::string file_int18("./src/Unittests/data/matrix4d_int18.mat") ; std::string file_int19("./src/Unittests/data/matrix4d_int19.mat") ; std::string file_int20("./src/Unittests/data/matrix4d_int20.mat") ; std::string file_dbl1("./src/Unittests/data/matrix4d_double1.mat") ; std::string file_ghost("./src/Unittests/data/foo.mat") ; std::vector v_int = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23,24, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23,24} ; std::vector v_dbl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23,24, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23,24} ; std::vector dim = {2,3,2,4} ; // matrix of int Matrix4D m_int(file_int1) ; CHECK_EQUAL(dim.size(), m_int.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m_int.get_dim(), dim.size()) ; CHECK_EQUAL(v_int.size(), m_int.get_data_size()) ; CHECK_ARRAY_EQUAL(v_int, m_int.get_data(), v_int.size()) ; // matrix with only 1 int Matrix4D m_int2(file_int18) ; CHECK_EQUAL(Matrix4D(1,1,1,1,1), m_int2) ; // empty matrix (empty file) Matrix4D m_int3(file_int19) ; CHECK_EQUAL(Matrix4D(0,0,0,0), m_int3) ; // empty matrix (only eol in file) Matrix4D m_int4(file_int20) ; CHECK_EQUAL(Matrix4D(0,0,0,0), m_int3) ; // these files are not well formatted CHECK_THROW(Matrix4D m_int5(file_int2), std::runtime_error) ; // empty lines CHECK_THROW(Matrix4D m_int5(file_int3), std::runtime_error) ; // empty lines CHECK_THROW(Matrix4D m_int5(file_int4), std::runtime_error) ; // 
empty lines CHECK_THROW(Matrix4D m_int5(file_int5), std::runtime_error) ; // empty lines CHECK_THROW(Matrix4D m_int5(file_int6), std::runtime_error) ; // empty lines CHECK_THROW(Matrix4D m_int5(file_int7), std::runtime_error) ; // first line problem CHECK_THROW(Matrix4D m_int5(file_int8), std::runtime_error) ; // first line problem CHECK_THROW(Matrix4D m_int5(file_int9), std::runtime_error) ; // first line problem CHECK_THROW(Matrix4D m_int5(file_int10), std::runtime_error) ; // second line problem CHECK_THROW(Matrix4D m_int5(file_int11), std::runtime_error) ; // extra column CHECK_THROW(Matrix4D m_int5(file_int12), std::runtime_error) ; // missing column CHECK_THROW(Matrix4D m_int5(file_int13), std::runtime_error) ; // extra row CHECK_THROW(Matrix4D m_int5(file_int14), std::runtime_error) ; // extra 2d slice CHECK_THROW(Matrix4D m_int5(file_int15), std::runtime_error) ; // extra 2d slice CHECK_THROW(Matrix4D m_int5(file_int16), std::runtime_error) ; // last line problem CHECK_THROW(Matrix4D m_int5(file_int17), std::runtime_error) ; // mixded data types // this file does not exist CHECK_THROW(Matrix4D m_int3(file_ghost), std::runtime_error) ; // matrix of double Matrix4D m_dbl(file_dbl1) ; CHECK_EQUAL(dim.size(), m_dbl.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m_dbl.get_dim(), dim.size()) ; CHECK_EQUAL(v_dbl.size(), m_dbl.get_data_size()) ; CHECK_ARRAY_EQUAL(v_dbl, m_dbl.get_data(), v_dbl.size()) ; } // tests get() TEST(get) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,l,n) ; for(size_t a=0; a coord = convert_to_coord(m, a) ; CHECK_EQUAL(m.get(a), m.get(coord[0], coord[1], coord[2], coord[3])) ; } } } } } } // tests set() TEST(set) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,n) ; for(size_t a=0; a 
coord = convert_to_coord(m, a) ; m.set(coord[0], coord[1], coord[2], coord[3], a) ; } for(size_t a=0; a dim = {i,j,k,l} ; Matrix4D m(i,j,k,l) ; for(size_t a=0; a m2("./src/Unittests/data/matrix4d_out.mat") ; // any matrix with at least one zero dimension is a null // matrix if(i==0 or j==0 or k==0 or l==0) { CHECK_EQUAL(Matrix4D(0,0,0,0), m2) ; } else { CHECK_EQUAL(m, m2) ; } } } } } } } -*/ + diff --git a/src/Utility/dna_utility.cpp b/src/Utility/dna_utility.cpp index a2f22ad..11af0e0 100644 --- a/src/Utility/dna_utility.cpp +++ b/src/Utility/dna_utility.cpp @@ -1,151 +1,142 @@ #include #include #include -#include +#include #include // seqan::SeqFileIn int dna::map(char base, bool rev_compl) { static bool init = false ; static std::unordered_map hash_map ; static std::unordered_map hash_map_rev ; if(not init) { hash_map['A'] = 0 ; hash_map['a'] = 0 ; hash_map['C'] = 1 ; hash_map['c'] = 1 ; hash_map['G'] = 2 ; hash_map['g'] = 2 ; hash_map['T'] = 3 ; hash_map['t'] = 3 ; hash_map['N'] = 4 ; hash_map['n'] = 4 ; hash_map_rev['A'] = hash_map['T'] ; hash_map_rev['a'] = hash_map['t'] ; hash_map_rev['C'] = hash_map['G'] ; hash_map_rev['c'] = hash_map['g'] ; hash_map_rev['G'] = hash_map['C'] ; hash_map_rev['g'] = hash_map['c'] ; hash_map_rev['T'] = hash_map['A'] ; hash_map_rev['t'] = hash_map['a'] ; hash_map_rev['N'] = hash_map['N'] ; hash_map_rev['n'] = hash_map['n'] ; init = true ; } try { if(rev_compl) { return hash_map_rev.at(base) ; } else { return hash_map.at(base) ; } } // key could not be found catch(std::out_of_range& e) { char msg[256] ; sprintf(msg, "Error! 
Invalid DNA base : %c", base) ; throw std::invalid_argument(msg) ; } } char dna::map(int base, bool rev_compl) { static bool init = false ; static std::unordered_map hash_map ; static std::unordered_map hash_map_rev ; if(not init) { hash_map[0] = 'A' ; hash_map[1] = 'C' ; hash_map[2] = 'G' ; hash_map[3] = 'T' ; hash_map[4] = 'N' ; hash_map_rev[4] = hash_map[4] ; hash_map_rev[3] = hash_map[0] ; hash_map_rev[2] = hash_map[1] ; hash_map_rev[1] = hash_map[2] ; hash_map_rev[0] = hash_map[3] ; init = true ; } try { if(rev_compl) { return hash_map_rev.at(base) ; } else { return hash_map.at(base) ; } } // key could not be found catch(std::out_of_range& e) { char msg[256] ; sprintf(msg, "Error! Invalid DNA code : %i", base) ; throw std::invalid_argument(msg) ; } } int dna::char_to_int(char c, bool rev_compl) { return dna::map(c, rev_compl) ; } -matrix2d_i dna::char_to_int(const matrix2d_c& matrix) +Matrix2D dna::char_to_int(const Matrix2D& matrix) { - size_t n_row = matrix.size() ; - size_t n_col = matrix[0].size() ; + size_t n_row = matrix.get_nrow() ; + size_t n_col = matrix.get_ncol() ; - matrix2d_i data_int(n_row, - vector_i(n_col)) ; + Matrix2D data_int(n_row, n_col) ; for(size_t i=0; i dna::base_composition(const Matrix2D& sequences, bool both_strands) { - // open file - seqan::SeqFileIn file_in; - if (not seqan::open(file_in, file_address.c_str())) - { char msg[4096] ; - sprintf(msg, "Error! Could not open %s", - file_address.c_str()) ; - throw std::invalid_argument(msg) ; - } - - // read - matrix2d_i seq_matrix ; - seqan::CharString id ; - seqan::Dna5String seq ; - size_t i = 0 ; - size_t seq_l = 0; - while(not seqan::atEnd(file_in)) - { seqan::readRecord(id, seq, file_in) ; - // get sequence length - if(i == 0) - { seq_l = seqan::length(seq) ; } - // sequence length should be constant - else if(seqan::length(seq) != seq_l) - { char msg[4096] ; - sprintf(msg, "Error! 
Sequences of variable length in %s", - file_address.c_str()) ; - throw std::invalid_argument(msg) ; + double total = 0. ; + std::vector base_comp(4,0.) ; + + int base_N = dna::map('N') ; + + for(size_t i=0; i adding a count to each + // is equivalent to not changing anything + if(base == base_N) + { continue ; } + else + { base_comp[base] += 1; + total += 1. ; + } + // reverse complement strand + if(both_strands) + { // size_t c_hash_rev = dna::hash(c, true) ; + base_comp[4-base-1] += 1. ; + total += 1. ; + } } - // store - seq_matrix.push_back(vector_i(seq_l)) ; - for(size_t j=0; j +#include namespace dna { /*! * \brief Contains the mapping to convert * DNA characters to integer codes. * Lower and capital characters are accepted. * \param base the character of interest. * \param rev_compl whether the reverse * complement of the character is of interest. * \return the corresponding DNA code. */ int map(char base, bool rev_compl=false) ; /*! * \brief Contains the mapping to convert * DNA code to characters. * Only capital characters are returned. * \param base the code of interest. * \param rev_compl whether the reverse * complement of the code is of interest. * \return the corresponding DNA character. */ char map(int base, bool rev_compl=false) ; /*! * \brief Converts a DNA character (A, C, * G, T) to an integer. * \param c the DNA character of interest. * \return the character integer code. * \throw std::invalid_argument if the * given character is not a valid DNA * character. */ int char_to_int(char c, bool rev_compl= false) ; /*! * \brief Converts a DNA character matrix (A, C, * G, T) to an integer matrix. * The DNA characters are converted using * SequenceLayer::char_to_int(char). * param file_address the address of the file to load. * \return the corresponding int matrix. */ - matrix2d_i char_to_int(const matrix2d_c& matrix) ; + Matrix2D char_to_int(const Matrix2D& matrix) ; /*! * \brief Converts an int DNA code to * a regular DNA character (A, C, G, T). 
* This method is the reverse method of * char_to_int(char). * \param n the DNA code of interest. * \return the DNA character. * \throw std::invalid_argument if the * given int is not a valid DNA * code. */ char int_to_char(int n, bool rev_compl=false) ; /*! - * \brief Loads the content of a fasta file and stores the - * data in a int matrix where each row contains one sequence. - * The sequence in the file should all have the same length. - * The DNA characters are converted using - * SequenceLayer::char_to_int(char) ; - * \param file_address the address of the file to load. - * \throw std::invalid_argument if the file cannot be read, - * if an invalid DNA character is detected in the sequences - * or if the sequences have variable lengths. - * \return a matrix containing the sequences on the rows and - * the characters over the columns. + * \brief Computes the base composition of a set of + * sequences, in integer format, contained in a matrix. + * \param sequences a matrix containing the sequences + * of interest. + * \param both_strands also accounts for the reverse + * complement of the sequences. + * \throw std::invalid_argument if a non-supported + * character is found in the matrix. + * \return a vector of 4 values corresponding to the + * frequencies of A,C,G and T + * respectively. 
*/ - matrix2d_i read_fasta(const std::string& file_address) ; + std::vector base_composition(const Matrix2D& sequences, bool both_strands) ; } #endif // DNA_UTILITY_HPP diff --git a/src/main_cormat.cpp b/src/main_cormat.cpp index 3328b91..1efb02d 100644 --- a/src/main_cormat.cpp +++ b/src/main_cormat.cpp @@ -1,187 +1,261 @@ -#include #include #include #include +#include -#include -#include -#include -#include +#include +#include -using namespace seqan; +#include +#include +#include +#include -template -std::ostream& operator << (std::ostream& stream, const std::vector& v) +class TestTimer { - for(const auto& p : v) - { stream << p << " " ; } - return stream ; + public: + TestTimer(const std::string & name) : name(name), + start(boost::date_time::microsec_clock::local_time()) + { + } + + ~TestTimer() + { + using namespace std; + using namespace boost; + + posix_time::ptime now(date_time::microsec_clock::local_time()); + posix_time::time_duration d = now - start; + + cout << name << " completed in " << d.total_milliseconds() / 1000.0 << + " seconds" << endl; + } + + private: + std::string name; + boost::posix_time::ptime start; +}; + + +void f_vector2d(size_t nrow, size_t ncol) +{ std::vector> m ; + + { TestTimer timer("f_vector2d init") ; + m = std::vector>(nrow, + std::vector(ncol, 0.)) ; + for(size_t i=0; i -std::ostream& operator << (std::ostream& stream, const std::pair& p) -{ - stream << "[" << p.first << " " << p.second << "] " ; - return stream ; +void f_matrix2d(size_t nrow, size_t ncol) +{ Matrix2D m ; + + { TestTimer timer("f_matrix2d init") ; + m = Matrix2D(nrow, ncol, 0.) 
; + } + + { TestTimer timer("f_matrix2d writting") ; + for(size_t i=0; i>> m ; + + { TestTimer timer("f_vector3d init") ; + m = std::vector>>(dim1, + std::vector>(dim2, + std::vector(dim3,0.))) ; + for(size_t i=0; i get_bin_indices(const GenomeRegion& target, - const std::vector& bins) -{ // the bin range and chromosome - int chromosome_idx = bins.front().chromosome_idx ; - int bin_size = bins.front().end - bins.front().start ; - int from = bins.front().start ; - int to = bins.back().end ; - - // not on the same chromosome - if(target.chromosome_idx != chromosome_idx) - { return std::make_pair(0,0) ; } - // target goes over all bins - else if(target.start <= from and - target.end >= to) - { return std::make_pair(0, bins.size()) ; } - // check if overlap - else - { // define whether target limits are inside - int bin_start = -1 ; - int bin_end = -1 ; - - // define whether target limits are inside - bool target_start_in = false ; - bool target_end_in = false ; - if(target.start >= from and - target.start < to) - { target_start_in = true ; } - if(target.end > from and - target.end <= to) - { target_end_in = true ; } - - // start - if(not target_start_in) - { bin_start = 0 ; } - else - { bin_start = (target.start - from) / bin_size ; } - - // end - if(target_start_in and not target_end_in) - { bin_end = bin_start + 1 ; } - else if(not target_start_in and not target_end_in) - { bin_end = 0 ; } - else - { bin_end = ((target.end - 1 - from) / bin_size) + 1 ; } - - return std::make_pair(bin_start, bin_end) ; +void f_matrix3d(size_t dim1, size_t dim2, size_t dim3) +{ Matrix3D m ; + + { TestTimer timer("f_matrix3d init") ; + m = Matrix3D(dim1, dim2, dim3) ; + for(size_t i=0; i get_bin_indices_naive(const GenomeRegion& target, - const std::vector& bins) -{ int bin_start = 0 ; - int bin_end = 0 ; - - GenomeRegion range(bins.front().chromosome, - bins.front().chromosome_idx, - bins.front().start, - bins.back().end) ; - - // no overlap - if(not (target | range)) - { return 
std::make_pair(0,0) ; } - else - { // start - if(target.start < bins.front().start) - { bin_start = 0 ; } - else - { for(int i=0; i< (int)bins.size(); i++) - { if(target.start >= bins[i].start and - target.start < bins[i].end) - { bin_start = i ; - break ; + +void f_vector4d(size_t dim1, size_t dim2, size_t dim3, size_t dim4) +{ std::vector>>> m ; + + { TestTimer timer("f_vector4d init") ; + m = std::vector>>>(dim1, + std::vector>>(dim2, + std::vector>(dim3, + std::vector(dim4, 0)))) ; + for(size_t i=0; i bins.back().end) - { bin_end = bins.size() ; } - else - { - for(int i=0; i<(int)bins.size(); i++) - { if(target.end <= bins[i].end and - target.end > bins[i].start) - { bin_end = i+1 ; - break ; + } + + { TestTimer timer("f_vector4d writting") ; + for(size_t i=0; i get_bin_indices(const GenomeRegion& target, - const std::vector& bins) -{ // the bin range and chromosome - GenomeRegion range(bins.front().chromosome, - bins.front().chromosome_idx, - bins.front().start, - bins.back().end) ; - // no overlap - if(not (target | range)) - { return std::make_pair(0,0) ; } - // overlap - else - { // target goes over all bins - if(target.start <= range.start and - target.end >= range.end) - { return std::make_pair(0, bins.size()) ; } - else - { int bin_start = -1 ; - int bin_end = -1 ; - int bin_size = bins.front().end - bins.front().start ; - - // start - if(target.start <= range.start) - { bin_start = 0 ; } - else - { bin_start = (target.start - range.start) / bin_size ; } - - // end - if(target.end >= range.end) - { bin_end = bins.size() ; } - else - { bin_end = ((target.end - 1 - range.start) / bin_size) + 1 ; } - return std::make_pair(bin_start, bin_end) ; +void f_matrix4d(size_t dim1, size_t dim2, size_t dim3, size_t dim4) +{ Matrix4D m ; + + { TestTimer timer("f_matrix4d init") ; + m = Matrix4D(dim1, dim2, dim3,dim4, 0) ; + for(size_t i=0; i v((size_t)70462*(size_t)23*(size_t)971*(size_t)2) ; + std::cout << "vector allocated" << std::endl ; return 0; } diff --git 
a/src/main_em.cpp b/src/main_em.cpp index 9b5b30e..954a16c 100644 --- a/src/main_em.cpp +++ b/src/main_em.cpp @@ -1,93 +1,92 @@ #include #include #include #include #include #include -#include using namespace std ; void get_size(const vector>>>>& m) { size_t size_d = 0 ; size_t size_m4 = 0 ; size_t size_m3 = 0 ; size_t size_m2 = 0 ; size_t size_m = 0 ; std::cout << "sizeof m : " << sizeof(m) << std::endl ; std::cout << "sizeof m[0] : " << sizeof(m[0]) << std::endl ; std::cout << "sizeof m[0][0] : " << sizeof(m[0][0]) << std::endl ; std::cout << "sizeof m[0][0][0] : " << sizeof(m[0][0][0]) << std::endl ; std::cout << "sizeof m[0][0][0][0] : " << sizeof(m[0][0][0][0]) << std::endl ; for(const auto& m4 : m) { size_m4 += sizeof(m4) ; for(const auto& m3 : m4) { size_m3 += sizeof(m3) ; for(const auto& m2 : m3) { size_m2 += sizeof(m2) ; for(const auto& m : m2) { size_m += sizeof(m) ; size_d += m.capacity() * sizeof(int) ; } } } } std::cout << "size of matrix" << std::endl << "size of m4 : " << size_m4 << std::endl << "size of m3 : " << size_m3 << std::endl << "size of m2 : " << size_m2 << std::endl << "size of m : " << size_m << std::endl << "size of data : " << size_d << std::endl ; } void get_size(const vector>>>& m) { size_t size_d = 0 ; size_t size_m3 = 0 ; size_t size_m2 = 0 ; size_t size_m = 0 ; std::cout << "sizeof m : " << sizeof(m) << std::endl ; std::cout << "sizeof m[0] : " << sizeof(m[0]) << std::endl ; std::cout << "sizeof m[0][0] : " << sizeof(m[0][0]) << std::endl ; std::cout << "sizeof m[0][0][0] : " << sizeof(m[0][0][0]) << std::endl ; for(const auto& m3 : m) { size_m3 += sizeof(m3) ; for(const auto& m2 : m3) { size_m2 += sizeof(m2) ; for(const auto& m : m2) { size_m += sizeof(m) ; size_d += m.capacity() * sizeof(int) ; } } } std::cout << "size of matrix" << std::endl << "size of m3 : " << size_m3 << std::endl << "size of m2 : " << size_m2 << std::endl << "size of m : " << size_m << std::endl << "size of data : " << size_d << std::endl ; } void 
get_size(const vector& v) { std::cout << "sizeof v : " << sizeof(v) << std::endl ; std::cout << "size of data : " << v.size() * sizeof(int) << std::endl ; } int main() { vector>>>> m(2, vector>>>(97998, vector>>(5, vector>(201, vector(2))))) ; get_size(m) ; vector v(97998*5*201*2) ; get_size(v) ; return EXIT_SUCCESS ; } diff --git a/src/main_em2.cpp b/src/main_em2.cpp index 41702c1..5e7e185 100644 --- a/src/main_em2.cpp +++ b/src/main_em2.cpp @@ -1,35 +1,36 @@ #include #include #include -#include +#include +#include int main() { std::string data_path1 = "/local/groux/scATAC-seq/results/10xgenomics_PBMC_5k/" "ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" ; std::string data_path2 = "/local/groux/scATAC-seq/results/10xgenomics_PBMC_5k/" "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" ; size_t n_class = 10 ; size_t n_shift = 21 ; bool flip = false ; size_t n_iter = 20 ; std::string seed = "08july2019" ; - EMEngine em_new(std::vector{read_matrix2d_i(data_path1), read_matrix2d_i(data_path2)}, - std::vector{}, + EMEngine em_new(std::vector>{Matrix2D(data_path1), Matrix2D(data_path2)}, + std::vector>{}, n_class, n_iter, n_shift, flip, EMEngine::seeding_codes::RANDOM, seed) ; em_new.classify() ; - std::cout << em_new.get_read_models() << std::endl << std::endl << std::endl ; + em_new.get_read_models() ; return EXIT_SUCCESS ; }