setwd(file.path("/", "local", "groux", "scATAC-seq"))

# required librairies and functions
library(abind)

# functions

#' Converts a vector of characters containing a DNA sequence 
#' into a vector of integers : A->1, C->2, G->3, T->4. Any 
#' non ACGT character triggers an error.
#' \param sequence the DNA sequence stored as a vector of 
#' characters.
#' \return a vector of integers.
#' \author Romain Groux
dna.to.int = function(sequence)
{ seq.len = length(sequence)
  seq.int = vector(length=seq.len, mode="numeric")
  for(i in 1:seq.len)
  { if(sequence[i] == "A")
  { seq.int[i] = 1 }
    else if(sequence[i] == "C")
    { seq.int[i] = 2 }
    else if(sequence[i] == "G")
    { seq.int[i] = 3 }
    else if(sequence[i] == "T")
    { seq.int[i] = 4}
    else
    { stop(sprintf("Error! Unrecognized character in DNA sequence at position %d : %s", i, sequence[i])) }
  }
  return(seq.int)
}


#' The complementary function to dna.to.int().
#' \param sequence the DNA stored as a vector of int :
#' A->1, C->2, G->3, T->4
#' \return a vector of characters.
#' \author Romain Groux
int.to.dna = function(sequence)
{ seq.len = length(sequence)
  seq.let = vector(length=seq.len, mode="character")
  for(i in 1:seq.len)
  { if(sequence[i] == 1)
  { seq.let[i] = "A" }
    else if(sequence[i] == 2)
    { seq.let[i] = "C" }
    else if(sequence[i] == 3)
    { seq.let[i] = "G" }
    else if(sequence[i] == 4)
    { seq.let[i] = "T"}
    else
    { stop(sprintf("Error! Unrecognized character in int sequence at position %d : %d", i, sequence[i])) }
  }
  return(seq.let)
}

simulate_data = function(n_seq, l_seq, classes, prob_bg, p_classes, p_flip)
{ 
  # the alphabet A->1, C->2, G->3, T->4
  alphabet = c(1, 2, 3, 4)
  l_alpha  = length(alphabet)
  
  # binding site length
  l_bs     = ncol(classes[,,1])
  # number of classes
  n_class  = dim(classes)[3]
  
  # checks
  if(length(p_classes )!= n_class)
  { stop(sprintf("Error! %d classes detected but %d class probability given!", n_class, length(p_classes))) }
  for(k in 1:n_class)
  { if((nrow(classes[,,k]) != 4) || (ncol(classes[,,k]) != l_bs))
    { stop(sprintf("Error! Check the dimensions of class %d motif : %d / %d!", k, nrow(classes[,,k]), ncol(classes[,,k]))) }
  }
  
  # last position (comprised) for a binding site to begin and be entirely in the seq
  last_pos_bs = l_seq - l_bs + 1
  
  # data structures
  sequences   = matrix(data=0, n_seq, l_seq)                 # the sequences
  bs_starts   = vector(length=n_seq, mode="numeric")         # the starting positions of the binding site
  bs_flips    = vector(length=n_seq, mode="numeric")         # the orientation of the binding site
  bs_classes  = vector(length=n_seq, mode="numeric")         # the class from which the binding site was sampled 
  bs_contents = matrix(data=0, nrow=n_seq, ncol=l_bs)        # the binding site sequences
  bs_probs    = array(data=0, dim=c(l_alpha, l_bs, n_class)) # the class binding site probability matrices
  
  for(i in 1:n_seq)
  {
    # sample from a uniform distribution where the binding site should start
    bs_starts[i]  = sample(1:last_pos_bs, 1)
    # sample a class
    class         = sample(1:n_class, 1, prob=p_classes)
    bs_classes[i] = class
    # sample a flip state (0->forward, 1->reverse)
    flip          = rbinom(1, 1, prob=p_flip)
    bs_flips[i]   = flip
    # to store the int seq
    seq           = vector(length=l_seq, mode="numeric")
    seq_bs        = vector(length=l_bs, mode="numeric")
    
    # over the sequence
    j = 1
    while(j <= l_seq)
    { # binding site starts
      if(j == bs_starts[i])
      { for(k in 0:(l_bs-1))
        { # reverse strand
          if(flip)
          { base                                    = sample(alphabet, 1, prob=rev(classes[,,class][,l_bs-k]))
            seq[j+k]                                = base
            seq_bs[k+1]                             = base
            bs_probs[l_alpha-base+1, l_bs-k, class] = bs_probs[l_alpha-base+1, l_bs-k, class] + 1
          }
          # forward strand
          else
          { base                        = sample(alphabet, 1, prob=classes[,,class][,k+1])
            seq[j+k]                    = base
            seq_bs[k+1]                 = base
            bs_probs[base, k+1, class]  = bs_probs[base, k+1, class] + 1
          }
        }
        j = j + k
      # this is background sequence
      } else {
        seq[j] = sample(alphabet, 1, prob=prob_bg)
      }
      j = j + 1
    }
    sequences[i,]   = int.to.dna(seq)
    bs_contents[i,] = int.to.dna(seq_bs)
  }
  # normalize
  for(i in 1:n_class)
  { bs_probs[,,i] = bs_probs[,,i] / colSums(bs_probs[,,i]) }
  return(list(sequences=sequences, sites=bs_contents, motifs=bs_probs, starts=bs_starts, flips=bs_flips, classes=bs_classes))
}


# some general parameters
n_seq         = 10  # number of sequences
l_seq         = 10  # length of sequences

# the base probabilities inside the binding site (A,C,G,T)
motif_class1 = matrix(data=c(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
                             0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
                             0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0,
                             0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0), 
                      nrow=4, ncol=8, byrow=T)
motif_class2 = matrix(data=c(0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0,
                             0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                             1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                             0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0), 
                      nrow=4, ncol=8, byrow=T)


dir.create(file.path("data", "toy_data"), showWarnings=FALSE)

# ------------------------------------------- 1 classes with 1 motif/sequence, no flip, a really toy exemple  -------------------------------------------

data = matrix(nrow=4, ncol=10, byrow=T,
              data=c('T', 'T', 'C', 'C', 'T', 'T', 'A', 'G', 'C', 'T',
                     'T', 'T', 'C', 'C', 'T', 'T', 'G', 'C', 'T', 'A',
                     'T', 'T', 'C', 'C', 'T', 'T', 'C', 'T', 'A', 'G',
                     'T', 'T', 'C', 'C', 'T', 'T', 'T', 'A', 'G', 'C'))
write.table(data, file=file.path("data", "toy_data", "simulated_sequences_toy.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
rm(data)


# -------------------------------------------------------- 1 classes with 1 motif/sequence, no flip ----------------------------------------------------- 

# seed
set.seed(20190715)

# the class binding site
motif_classes = array(data=motif_class1, dim=c(dim(motif_class1), 1))
# the class probability
p_classes     = c(1)
# the probability of having a binding site on the reverse strand
p_flip        = 0
# the base probabilities outside the binding site (A,C,G,T)
prob_bg       = rep(0.25, 4)

# simulate the data
data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip)
# save
write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(motif_class1,   file=file.path("data", "toy_data", "simulated_sequences_motif.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$sites,       file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$starts,      file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$flips,       file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$classes,     file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
# clean
rm(motif_classes, p_classes, p_flip, prob_bg, data)


# -------------------------------------------------------- 1 classes with 1 motif/sequence, flip -------------------------------------------------------- 

# seed
set.seed(201803142)

# the class binding site
motif_classes = array(data=motif_class1, dim=c(dim(motif_class1), 1))
# the class probability
p_classes     = c(1)
# the probability of having a binding site on the reverse strand
p_flip        = 0.5
# the base probabilities outside the binding site (A,C,G,T)
prob_bg       = rep(0.25, 4)

# simulate the data
data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip)
# save
write.table(data$sequences,   file=file.path("data", "toy_data", "simulated_sequences_1class_flip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$sites,       file=file.path("data", "toy_data", "simulated_sequences_1class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$starts,      file=file.path("data", "toy_data", "simulated_sequences_1class_flip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_1class_flip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$flips,       file=file.path("data", "toy_data", "simulated_sequences_1class_flip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$classes,     file=file.path("data", "toy_data", "simulated_sequences_1class_flip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
# clean
rm(motif_classes, p_classes, p_flip, prob_bg, data)


# -------------------------------------------------------- 2 classes with 1 bs/sequence, flip -------------------------------------------------------- 

# seed
set.seed(201803143)

# the class binding site
motif_classes = abind(motif_class1, motif_class2, along=3)
# the class probability
p_classes     = c(0.5, 0.5)
# the probability of having a binding site on the reverse strand
p_flip        = 0.5
# the base probabilities outside the binding site (A,C,G,T)
prob_bg       = rep(0.25, 4)

# simulate the data
data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip)
# save
write.table(data$sequences,   file=file.path("data", "toy_data", "simulated_sequences_2class_flip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$sites,       file=file.path("data", "toy_data", "simulated_sequences_2class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$starts,      file=file.path("data", "toy_data", "simulated_sequences_2class_flip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_2class_flip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$motifs[,,2], file=file.path("data", "toy_data", "simulated_sequences_2class_flip_motif2.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$flips,       file=file.path("data", "toy_data", "simulated_sequences_2class_flip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$classes,     file=file.path("data", "toy_data", "simulated_sequences_2class_flip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
# clean
rm(motif_classes, p_classes, p_flip, prob_bg, data)