setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) #' Converts a sequence in character format #' to integer format A->0, C->1, N->2, G->3 #' T->4. #' \param seq a vector containing the sequence #' in character format. #' \return a vector containing the sequence #' in integer format. #' \author Romain Groux char.to.int = function(seq) { seq_int = vector(length=length(seq)) for(i in 1:length(seq)) { if(seq[i] == 'A') { seq_int[i] = 0 } if(seq[i] == 'C') { seq_int[i] = 1 } if(seq[i] == 'N') { seq_int[i] = 2 } if(seq[i] == 'G') { seq_int[i] = 3 } if(seq[i] == 'T') { seq_int[i] = 4 } } return(seq_int) } #' Generates the reverse complement of a kmer. #' \param kmer a vector containing the kmer in #' integer format. #' \return a vector containing the reverse #' complement kmer #' \author Romain Groux get_rev_compl = function(kmer) { kmer_rv = vector(length=length(kmer), mode="numeric") i_rv = length(kmer) for(i in 1:length(kmer)) { if(kmer[i] == 0) { kmer_rv[i_rv] = 4 } # A if(kmer[i] == 1) { kmer_rv[i_rv] = 3 } # C if(kmer[i] == 2) { kmer_rv[i_rv] = 2 } # N if(kmer[i] == 3) { kmer_rv[i_rv] = 1 } # G if(kmer[i] == 4) { kmer_rv[i_rv] = 0 } # T } return(kmer_rv) } #' Generates a hash given a kmer. #' Kmers with a same length are guaranteed #' to have different hashes. #' AA..AA will generate a hash of 1, #' AA..AC will generate a hash of 2, #' AA..AN will generate a hash of 3, #' AA..AG will generate a hash of 4, #' AA..AT will generate a hash of 5, #' TT..TG will generate a hash of 5**k - 1, #' TT..TT will generate a hash of 5**k #' \param seq a vector containing the kmer #' in integer format : A->0, C->1, N->2, G->3, #' T->4. #' \return the kmer hash #' \author Romain Groux hash = function(seq) { k = length(seq) ; z = 5 h = 0 for(i in 0:(length(seq)-1)) { if(seq[i+1] == 0) { h = h + (0*(z**(k-i-1))) } # A if(seq[i+1] == 1) { h = h + (1*(z**(k-i-1))) } # C if(seq[i+1] == 2) { h = h + (2*(z**(k-i-1))) } # N if(seq[i+1] == 3) { h = h + (3*(z**(k-i-1))) } # G if(seq[i+1] == 4) { h = h + (4*(z**(k-i-1))) } # T } return(h+1) } #' Computes the hash of a sequence and of #' its reverse complement and returns the #' smallest one. #' \param seq a vector containing the #' sequence in integer format : : A->0, #' C->1, N->2, G->3, T->4. #' \author Romain Groux hash.min(seq) { seq_r = get_rev_compl(seq) return(min(hash(seq), hash(seq_r))) } #' Generates all kmers for a given value of K #' and return them in lexicographic order. #' \param k the kmer length. #' \return a matrix with the different kmers #' on the rows and k columns. The kmers are #' in integer format : A->0, C->1, N->2, G->3, #' T->4. #' \author Romain Groux generate_all_kmers = function(k) { kmers = matrix(nrow=5**k, ncol=k, data=-1) n = k currentWord = rep(1, n) i = 1 while(n > 0) { kmers[i,] = currentWord i = i + 1 while(n>0 && currentWord[n+1-1] == 5) { currentWord[n] = 1 n = n - 1 } if(n > 0) { currentWord[n] = currentWord[n] + 1 n = k } } return(kmers - 1) } data = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_peaks", "peaks_rmsk_sampled_sequences_1kb.mat"))) data = as.matrix(read.table(file.path("data/toy_data/simulated_sequences_2class_flip.mat"))) data = apply(data, 1, char.to.int) k = 5 n_kmer = 5**k hmax = ceiling(n_kmer / 2) n_shift = ncol(data) - k + 1 # transitions and counts counts = vector(length=n_kmer, mode="numeric") kmers = generate_all_kmers(k) counts = vector(length=n_kmer, mode="numeric") t_out = matrix(nrow=n_kmer, ncol=n_kmer, data=0) t_in = t_out t_all = t_out for(i in 1:nrow(data)) { for(j in 1:n_shift) { # no in transition (1st kmer) if(j == 1) { # kmer1 < kmer2 from1 = j ; to1 = from1 + k - 1 ; kmer1 = data[i,from1:to1] ; from2 = j+1 ; to2 = from2 + k - 1 ; kmer2 = data[i,from2:to2] ; kmer1r = get_rev_compl(kmer2) ; kmer2r = get_rev_compl(kmer1) ; idx1 = hash(kmer1) ; idx1r = hash(kmer1r) ; idx2 = hash(kmer2) ; idx2r = hash(kmer2r) ; # out transition kmer1 -> kmer2 t_out[idx1,idx2] = t_out[idx1,idx2] + 1 t_out[idx1r,idx2r] = t_out[idx1r,idx2r] + 1 # number of edges t_all[idx1,idx2] = t_all[idx1,idx2] + 1 t_all[idx2,idx1] = t_all[idx2,idx1] + 1 t_all[idx1r,idx2r] = t_all[idx1r,idx2r] + 1 t_all[idx2r,idx1r] = t_all[idx2r,idx1r] + 1 # counts counts[idx1] = counts[idx1] + 1 counts[idx1r] = counts[idx1r] + 1 } # no out transition (last kmer) else if(j == n_shift) { # kmer1 < kmer2 from1 = j-1 ; to1 = from1 + k - 1 ; kmer1 = data[i,from1:to1] ; from2 = j ; to2 = from2 + k - 1 ; kmer2 = data[i,from2:to2] ; kmer1r = get_rev_compl(kmer2) ; kmer2r = get_rev_compl(kmer1) ; idx1 = hash(kmer1) ; idx1r = hash(kmer1r) ; idx2 = hash(kmer2) ; idx2r = hash(kmer2r) ; # in transition kmer1 <- kmer2 t_in[idx1,idx2] = t_in[idx1,idx2] + 1 t_in[idx1r,idx2r] = t_in[idx1r,idx2r] + 1 # number of edges t_all[idx1,idx2] = t_all[idx1,idx2] + 1 t_all[idx2,idx1] = t_all[idx2,idx1] + 1 t_all[idx1r,idx2r] = t_all[idx1r,idx2r] + 1 t_all[idx2r,idx1r] = t_all[idx2r,idx1r] + 1 # counts # no need, kmer2 was counted at last iteration as kmer2 } # both out and in transitions (middle) else { # kmer0 < kmer1 < kmer2 from0 = j ; to0 = from0 + k - 1 ; kmer0 = data[i,from0:to0] ; from1 = j ; to1 = from1 + k - 1 ; kmer1 = data[i,from1:to1] ; from2 = j+1 ; to2 = from2 + k - 1 ; kmer2 = data[i,from2:to2] ; kmer0r = get_rev_compl(kmer2) ; kmer1r = get_rev_compl(kmer1) ; kmer2r = get_rev_compl(kmer0) ; idx0 = hash(kmer0) ; idx0r = hash(kmer0r) ; idx1 = hash(kmer1) ; idx1r = hash(kmer1r) ; idx2 = hash(kmer2) ; idx2r = hash(kmer2r) ; # out transition kmer1 -> kmer2 t_out[idx1,idx2] = t_out[idx1,idx2] + 1 t_out[idx1r,idx2r] = t_out[idx1r,idx2r] + 1 # in transition kmer0 -> kmer1 t_in[idx1,idx0] = t_in[idx1,idx0] + 1 t_in[idx1r,idx0r] = t_in[idx1r,idx0r] + 1 # number of edges t_all[idx0,idx1] = t_all[idx0,idx1] + 1 t_all[idx1,idx0] = t_all[idx1,idx0] + 1 t_all[idx1,idx2] = t_all[idx1,idx2] + 1 t_all[idx2,idx1] = t_all[idx2,idx1] + 1 t_all[idx0r,idx1r] = t_all[idx0r,idx1r] + 1 t_all[idx1r,idx0r] = t_all[idx1r,idx0r] + 1 t_all[idx1r,idx2r] = t_all[idx1r,idx2r] + 1 t_all[idx2r,idx1r] = t_all[idx2r,idx1r] + 1 # counts counts[idx1] = counts[idx1] + 1 counts[idx1r] = counts[idx1r] + 1 } } } # spectral clustering # t_all is the affinity matrix # compute the degree matrix d = diag(apply(t_in, 1, sum)) # sum rows # unormalized laplacian u = d - t_in # get eigen values and vectors evL = eigen(u, symmetric=TRUE) # plot eigen values plot(1:20, rev(evL$values)[1:20], type='b') # partition partitions = list() for(n_clust in 2:20) { print(n_clust) # get K biggest eigen values and vectors -> embedding space z = evL$vectors[,(ncol(evL$vectors)-n_clust+1):ncol(evL$vectors)] partitions[[n_clust]] = kmeans(z, centers=n_clust, iter.max=100, nstart=10) } plot(evL$vectors[,3124:3125]) # motif 1 is ACGTTGCA kmers_motif1 = matrix(ncol=k, data=c(0,1,2,3,3, 1,2,3,3,2, 2,3,3,2,1, 3,3,2,1,0), byrow=T) # motif 2 is GCGAATTT kmers_motif2 = matrix(ncol=k, data=c(2,1,2,0,0, 1,3,0,0,3, 3,0,0,3,3, 0,0,3,3,3), byrow=T) idx1 = apply(kmers_motif1, 1, hash) idx2 = apply(kmers_motif2, 1, hash) partitions[[2]]$size partitions[[2]]$cluster[idx1] partitions[[2]]$cluster[idx2] c1 = which(partitions[[2]]$cluster == 1) c2 = which(partitions[[2]]$cluster == 2) plot(evL$vectors[,3124:3125], col=partitions[[2]]$cluster+1, cex=0.1) points(evL$vectors[idx1,3124:3125], col=2) points(evL$vectors[idx2,3124:3125], col=3) par(mfrow=c(3,1)) plot(t_all[idx1[1],], type='l', ylim=c(0,50)) ; abline(v=idx1, col="red", lwd=0.2) ; abline(v=idx1[1], col="blue") plot(t_all[idx1[2],], type='l', ylim=c(0,50)) ; abline(v=idx1, col="red", lwd=0.2) plot(t_all[idx1[3],], type='l', ylim=c(0,50)) ; abline(v=idx1, col="red", lwd=0.2) boxplot(counts, counts[idx1], counts[idx2], outline=F) # reconstruct kmers best.k = 2 partition = partitions[[best.k]] clusters = partition$cluster c1 = which(clusters == 1) c2 = which(clusters == 2) best1 = which.max(counts[c1]) best2 = which.max(counts[c2])