seeding_toy = function(data, k, shift) { l_slice = ncol(data) - shift + 1 references = matrix(nrow=k, ncol=l_slice) for(i in 1:k) { from = floor(shift / 2) + 1 to = from + l_slice - 1 references[i,] = data[i, from:to] } return(references) } seeding_random = function(data, n_class, shift) { l_slice = ncol(data) - shift + 1 n_row = nrow(data) p = matrix(nrow=n_row, ncol=n_class) for(m in 1:n_class) { p[,m] = rbeta(n_row, n_row**-0.5, 1) } c = (t(p) %*% data) / colSums(p) p = matrix(nrow=n_row, ncol=n_class) for(i in 1:n_class) { p[,i] = rbeta(n_row, n_row**-0.5, 1) } from = floor(shift/2)+1 to = from + l_slice - 1 c = (t(p) %*% data[,from:to]) / colSums(p) return(c) } em_shape_shift_flip = function(c,q,data) { K=dim(c)[1]; L=dim(c)[2]; N=dim(data)[1]; S=dim(q)[2] l=array(dim=c(N,K,S,2)); p=array(dim=c(N,K,S,2)) for(i in 1:K) { c[i,]=c[i,]/mean(c[i,]) } rm=matrix(nrow=N, ncol=S) for(k in 1:S) { rm[,k] = rowMeans(data[,k:(k+L-1)]) } for(i in 1:N) { for (j in 1:K) { for (k in 1:S) { l[i,j,k,1]=sum(dpois(data[i,k:(k+L-1)],c[j,]*rm[i,k],log=T)) l[i,j,k,2]=sum(dpois(rev(data[i,])[k:(k+L-1)],c[j,]*rm[i,S-k+1],log=T)) # xxx = c(k:(k+L-1)) # print(c(xxx[1],xxx[length(xxx)])-1) # for(m in 1:length(xxx)) # { n = xxx[m] # print(sprintf("%d %d %d %d: dpois(%d, %f*%f)", i-1, j-1, k-1, n-1, data[i,n], c[j,m], rm[i,k])) # } # for(m in 1:length(xxx)) # { n = xxx[m] # print(sprintf("%d %d %d %d: dpois(%d, %f*%f)", i-1, j-1, k-1, n-1, rev(data[i,])[n], c[j,m], rm[i,S-k+1])) # } } } } # print(" : LOGLIKELIHOOD") # print(l) # print(" : LOGLIKELIHOOD MAX") # print(apply(l, 1, max)) p_row_sum = vector(length=N) for(i in 1:N) { p[i,,,] = q*exp(l[i,,,]-max(l[i,,,])) p_row_sum[i] = sum(p[i,,,]) p[i,,,] = p[i,,,]/sum(p[i,,,]) } # print(" : POST PROB ROW") # print(p_row_sum) # print(" : POST PROB") # print(p) q = apply(p, c(2,3,4), mean) c = 0; for(k in 1:S) { c = c + (t(p[,,k,1]) %*% data[,k:(k+L-1)]) + (t(p[,,k,2]) %*% t(apply(data[,(S-k+1):(S+L-k)],1,rev))) } c = c / apply(p,2,sum) m= sum((1:S)*apply(q,2,sum)); s=sum(((1:S)-m)**2*apply(q,2,sum))**0.5 for (i in 1:K) { q[i,,1] = q[i,,2] = sum(q[i,,]) * dnorm(1:S,floor(S/2)+1,s) / sum(dnorm(1:S,floor(S/2)+1,s)) / 2 } return(list(references=c, class_prob=q, post_prob=p, loglikelihood=l)) } n_iter = 20 flip = TRUE n_shift = 21 n_class = 4 n_flip = flip + 1 data = as.matrix(read.table("/local/groux/scATAC-seq/data/ctcf_mnase_encode.txt", h=F)) # references = seeding_toy(data, n_class, n_shift) references = seeding_random(data, n_class, n_shift) m = apply(references, 1, mean) references = references / m class_prob = array(dim=c(n_class, n_shift, n_flip), 1) ; class_prob = class_prob / sum(class_prob) l = list(references=references, class_prob=class_prob) for(i in 1:n_iter) { # print("------------------- ITER -------------------") l = em_shape_shift_flip(l$references, l$class_prob, data) # print(" :LOGLIKELIHOOD") # print(l$loglikelihood) # print(" :POST PROB") # print(l$post_prob) # print(" : REFERENCES") # print(l$references) # print("-------------------------------------------------") } write.table(l$references, quote=F, row.names=F, col.names=F, sep='\t')