setwd(file.path("/", "local", "groux", "scATAC-seq")) # required librairies and functions library(abind) # functions #' Converts a vector of characters containing a DNA sequence #' into a vector of integers : A->1, C->2, G->3, T->4. Any #' non ACGT character triggers an error. #' \param sequence the DNA sequence stored as a vector of #' characters. #' \return a vector of integers. #' \author Romain Groux dna.to.int = function(sequence) { seq.len = length(sequence) seq.int = vector(length=seq.len, mode="numeric") for(i in 1:seq.len) { if(sequence[i] == "A") { seq.int[i] = 1 } else if(sequence[i] == "C") { seq.int[i] = 2 } else if(sequence[i] == "G") { seq.int[i] = 3 } else if(sequence[i] == "T") { seq.int[i] = 4} else { stop(sprintf("Error! Unrecognized character in DNA sequence at position %d : %s", i, sequence[i])) } } return(seq.int) } #' The complementary function to dna.to.int(). #' \param sequence the DNA stored as a vector of int : #' A->1, C->2, G->3, T->4 #' \return a vector of characters. #' \author Romain Groux int.to.dna = function(sequence) { seq.len = length(sequence) seq.let = vector(length=seq.len, mode="character") for(i in 1:seq.len) { if(sequence[i] == 1) { seq.let[i] = "A" } else if(sequence[i] == 2) { seq.let[i] = "C" } else if(sequence[i] == 3) { seq.let[i] = "G" } else if(sequence[i] == 4) { seq.let[i] = "T"} else { stop(sprintf("Error! Unrecognized character in int sequence at position %d : %d", i, sequence[i])) } } return(seq.let) } simulate_data = function(n_seq, l_seq, classes, prob_bg, p_classes, p_flip) { # the alphabet A->1, C->2, G->3, T->4 alphabet = c(1, 2, 3, 4) l_alpha = length(alphabet) # binding site length l_bs = ncol(classes[,,1]) # number of classes n_class = dim(classes)[3] # checks if(length(p_classes )!= n_class) { stop(sprintf("Error! %d classes detected but %d class probability given!", n_class, length(p_classes))) } for(k in 1:n_class) { if((nrow(classes[,,k]) != 4) || (ncol(classes[,,k]) != l_bs)) { stop(sprintf("Error! Check the dimensions of class %d motif : %d / %d!", k, nrow(classes[,,k]), ncol(classes[,,k]))) } } # last position (comprised) for a binding site to begin and be entirely in the seq last_pos_bs = l_seq - l_bs + 1 # data structures sequences = matrix(data=0, n_seq, l_seq) # the sequences bs_starts = vector(length=n_seq, mode="numeric") # the starting positions of the binding site bs_flips = vector(length=n_seq, mode="numeric") # the orientation of the binding site bs_classes = vector(length=n_seq, mode="numeric") # the class from which the binding site was sampled bs_contents = matrix(data=0, nrow=n_seq, ncol=l_bs) # the binding site sequences bs_probs = array(data=0, dim=c(l_alpha, l_bs, n_class)) # the class binding site probability matrices for(i in 1:n_seq) { # sample from a uniform distribution where the binding site should start bs_starts[i] = sample(1:last_pos_bs, 1) # sample a class class = sample(1:n_class, 1, prob=p_classes) bs_classes[i] = class # sample a flip state (0->forward, 1->reverse) flip = rbinom(1, 1, prob=p_flip) bs_flips[i] = flip # to store the int seq seq = vector(length=l_seq, mode="numeric") seq_bs = vector(length=l_bs, mode="numeric") # over the sequence j = 1 while(j <= l_seq) { # binding site starts if(j == bs_starts[i]) { for(k in 0:(l_bs-1)) { # reverse strand if(flip) { base = sample(alphabet, 1, prob=rev(classes[,,class][,l_bs-k])) seq[j+k] = base seq_bs[k+1] = base bs_probs[l_alpha-base+1, l_bs-k, class] = bs_probs[l_alpha-base+1, l_bs-k, class] + 1 } # forward strand else { base = sample(alphabet, 1, prob=classes[,,class][,k+1]) seq[j+k] = base seq_bs[k+1] = base bs_probs[base, k+1, class] = bs_probs[base, k+1, class] + 1 } } j = j + k # this is background sequence } else { seq[j] = sample(alphabet, 1, prob=prob_bg) } j = j + 1 } sequences[i,] = int.to.dna(seq) bs_contents[i,] = int.to.dna(seq_bs) } # normalize for(i in 1:n_class) { bs_probs[,,i] = bs_probs[,,i] / colSums(bs_probs[,,i]) } return(list(sequences=sequences, sites=bs_contents, motifs=bs_probs, starts=bs_starts, flips=bs_flips, classes=bs_classes)) } # some general parameters n_seq = 10 # number of sequences l_seq = 10 # length of sequences # the base probabilities inside the binding site (A,C,G,T) motif_class1 = matrix(data=c(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0), nrow=4, ncol=8, byrow=T) motif_class2 = matrix(data=c(0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0), nrow=4, ncol=8, byrow=T) dir.create(file.path("data", "toy_data"), showWarnings=FALSE) # ------------------------------------------- 1 classes with 1 motif/sequence, no flip, a really toy exemple ------------------------------------------- data = matrix(nrow=4, ncol=10, byrow=T, data=c('T', 'T', 'C', 'C', 'T', 'T', 'A', 'G', 'C', 'T', 'T', 'T', 'C', 'C', 'T', 'T', 'G', 'C', 'T', 'A', 'T', 'T', 'C', 'C', 'T', 'T', 'C', 'T', 'A', 'G', 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'A', 'G', 'C')) write.table(data, file=file.path("data", "toy_data", "simulated_sequences_toy.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') rm(data) # -------------------------------------------------------- 1 classes with 1 motif/sequence, no flip ----------------------------------------------------- # seed set.seed(20190715) # the class binding site motif_classes = array(data=motif_class1, dim=c(dim(motif_class1), 1)) # the class probability p_classes = c(1) # the probability of having a binding site on the reverse strand p_flip = 0 # the base probabilities outside the binding site (A,C,G,T) prob_bg = rep(0.25, 4) # simulate the data data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip) # save write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(motif_class1, file=file.path("data", "toy_data", "simulated_sequences_motif.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$sites, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$starts, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$flips, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$classes, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') # clean rm(motif_classes, p_classes, p_flip, prob_bg, data) # -------------------------------------------------------- 1 classes with 1 motif/sequence, flip -------------------------------------------------------- # seed set.seed(201803142) # the class binding site motif_classes = array(data=motif_class1, dim=c(dim(motif_class1), 1)) # the class probability p_classes = c(1) # the probability of having a binding site on the reverse strand p_flip = 0.5 # the base probabilities outside the binding site (A,C,G,T) prob_bg = rep(0.25, 4) # simulate the data data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip) # save write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_1class_flip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$sites, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$starts, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_1class_flip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$flips, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$classes, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') # clean rm(motif_classes, p_classes, p_flip, prob_bg, data) # -------------------------------------------------------- 2 classes with 1 bs/sequence, flip -------------------------------------------------------- # seed set.seed(201803143) # the class binding site motif_classes = abind(motif_class1, motif_class2, along=3) # the class probability p_classes = c(0.5, 0.5) # the probability of having a binding site on the reverse strand p_flip = 0.5 # the base probabilities outside the binding site (A,C,G,T) prob_bg = rep(0.25, 4) # simulate the data data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip) # save write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_2class_flip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$sites, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$starts, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_2class_flip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$motifs[,,2], file=file.path("data", "toy_data", "simulated_sequences_2class_flip_motif2.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$flips, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$classes, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') # clean rm(motif_classes, p_classes, p_flip, prob_bg, data)