Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F61634681
EMSequence.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, May 7, 23:42
Size
14 KB
Mime Type
text/x-c
Expires
Thu, May 9, 23:42 (2 d)
Engine
blob
Format
Raw Data
Handle
17538345
Attached To
R8820 scATAC-seq
EMSequence.cpp
View Options
#include <EMSequence.hpp>
#include <string>
#include <vector>
#include <future>
// std::promise, std::future
#include <utility>
// std::pair, std::move()
#include <functional>
// std::bind(), std::ref()
#include <cmath>
// exp()
#include <SequenceLayer.hpp>
// SequenceLayer
#include <RandomNumberGenerator.hpp>
// getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp>
// ConsoleProgressBar
#include <ThreadPool.hpp>
// ThreadPool
#include <dna_utility.hpp>
// dna::base_composition()
Matrix2D
<
double
>
EMSequence
::
generate_bckg_motif
(
const
Matrix2D
<
int
>&
sequence_matrix
,
size_t
motif_length
,
bool
reverse_compl
)
{
// sequence composition
std
::
vector
<
double
>
base_comp
=
dna
::
base_composition
(
sequence_matrix
,
reverse_compl
)
;
// create a motif
Matrix2D
<
double
>
bckg_motif
(
4
,
motif_length
)
;
for
(
size_t
i
=
0
;
i
<
bckg_motif
.
get_nrow
();
i
++
)
{
for
(
size_t
j
=
0
;
j
<
bckg_motif
.
get_ncol
();
j
++
)
{
bckg_motif
(
i
,
j
)
=
base_comp
[
i
]
;
}
}
return
bckg_motif
;
}
EMSequence
::
EMSequence
(
const
Matrix2D
<
int
>&
seq_matrix
,
size_t
n_class
,
size_t
n_iter
,
size_t
n_shift
,
bool
flip
,
bool
bckg_class
,
const
std
::
string
&
seed
,
size_t
n_threads
)
:
EMBase
(
seq_matrix
.
get_nrow
(),
seq_matrix
.
get_ncol
(),
n_class
,
n_iter
,
n_shift
,
flip
,
n_threads
),
loglikelihood_max
(
n_row
,
0.
),
seq_layer
(
nullptr
)
{
this
->
loglikelihood_max
=
vector_d
(
n_row
,
0.
)
;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this
->
set_post_prob_random
(
seed
)
;
// data and models
this
->
seq_layer
=
new
SequenceLayer
(
seq_matrix
,
this
->
n_class
,
this
->
n_shift
,
this
->
flip
,
bckg_class
)
;
// intialise the models with the post prob
this
->
seq_layer
->
update_model
(
this
->
post_prob
,
this
->
threads
)
;
// compute background and
// overwrite last class as background class
if
(
bckg_class
)
{
size_t
motif_len
=
seq_matrix
.
get_ncol
()
-
this
->
n_shift
+
1
;
Matrix2D
<
double
>
bckg_motif
=
EMSequence
::
generate_bckg_motif
(
seq_matrix
,
motif_len
,
this
->
flip
)
;
this
->
seq_layer
->
set_class
(
this
->
n_class
-
1
,
bckg_motif
)
;
}
}
EMSequence
::
EMSequence
(
Matrix2D
<
int
>&&
seq_matrix
,
size_t
n_class
,
size_t
n_iter
,
size_t
n_shift
,
bool
flip
,
bool
bckg_class
,
const
std
::
string
&
seed
,
size_t
n_threads
)
:
EMBase
(
seq_matrix
.
get_nrow
(),
seq_matrix
.
get_ncol
(),
n_class
,
n_iter
,
n_shift
,
flip
,
n_threads
),
loglikelihood_max
(
n_row
,
0.
),
seq_layer
(
nullptr
)
{
this
->
loglikelihood_max
=
vector_d
(
n_row
,
0.
)
;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this
->
set_post_prob_random
(
seed
)
;
// compute background before giving data to
// SequenceLayer
Matrix2D
<
double
>
bckg_motif
;
if
(
bckg_class
)
{
size_t
motif_len
=
seq_matrix
.
get_ncol
()
-
this
->
n_shift
+
1
;
bckg_motif
=
EMSequence
::
generate_bckg_motif
(
seq_matrix
,
motif_len
,
this
->
flip
)
;
}
// data and models
this
->
seq_layer
=
new
SequenceLayer
(
std
::
move
(
seq_matrix
),
this
->
n_class
,
this
->
n_shift
,
this
->
flip
,
bckg_class
)
;
// intialise the models with the post prob
this
->
seq_layer
->
update_model
(
this
->
post_prob
,
this
->
threads
)
;
// overwrite last class as background class
if
(
bckg_class
)
{
this
->
seq_layer
->
set_class
(
this
->
n_class
-
1
,
bckg_motif
)
;
}
}
EMSequence
::
EMSequence
(
const
Matrix2D
<
int
>&
seq_matrix
,
const
Matrix3D
<
double
>&
motifs
,
size_t
n_iter
,
bool
flip
,
bool
bckg_class
,
size_t
n_threads
)
:
EMBase
(
seq_matrix
.
get_nrow
(),
seq_matrix
.
get_ncol
(),
motifs
.
get_dim
()[
0
],
n_iter
,
seq_matrix
.
get_ncol
()
-
motifs
.
get_dim
()[
1
]
+
1
,
flip
,
n_threads
),
loglikelihood_max
(
n_row
,
0.
),
seq_layer
(
nullptr
)
{
this
->
loglikelihood_max
=
vector_d
(
n_row
,
0.
)
;
// data and models
// background motif (if any) is the last of the given motifs
this
->
seq_layer
=
new
SequenceLayer
(
seq_matrix
,
motifs
,
this
->
flip
,
bckg_class
)
;
// intialise the class prob uniformly
this
->
set_state_prob_uniform
()
;
}
EMSequence
::
EMSequence
(
Matrix2D
<
int
>&&
seq_matrix
,
Matrix3D
<
double
>&&
motifs
,
size_t
n_iter
,
bool
flip
,
bool
bckg_class
,
size_t
n_threads
)
:
EMBase
(
seq_matrix
.
get_nrow
(),
seq_matrix
.
get_ncol
(),
motifs
.
get_dim
()[
0
],
n_iter
,
seq_matrix
.
get_ncol
()
-
motifs
.
get_dim
()[
1
]
+
1
,
flip
,
n_threads
),
loglikelihood_max
(
n_row
,
0.
),
seq_layer
(
nullptr
)
{
this
->
loglikelihood_max
=
vector_d
(
n_row
,
0.
)
;
// data and models
// background motif (if any) is the last of the given motifs
this
->
seq_layer
=
new
SequenceLayer
(
std
::
move
(
seq_matrix
),
std
::
move
(
motifs
),
this
->
flip
,
bckg_class
)
;
// intialise the class prob uniformly
this
->
set_state_prob_uniform
()
;
}
EMSequence
::~
EMSequence
()
{
if
(
this
->
seq_layer
==
nullptr
)
{
delete
this
->
seq_layer
;
this
->
seq_layer
=
nullptr
;
}
}
Matrix3D
<
double
>
EMSequence
::
get_sequence_models
()
const
{
return
seq_layer
->
get_model
()
;
}
EMSequence
::
exit_codes
EMSequence
::
classify
()
{
size_t
bar_update_n
=
this
->
n_iter
;
ConsoleProgressBar
bar
(
std
::
cerr
,
bar_update_n
,
60
,
"classifying"
)
;
// optimize the partition
for
(
size_t
n_iter
=
0
;
n_iter
<
this
->
n_iter
;
n_iter
++
)
{
// E-step
this
->
compute_loglikelihood
()
;
this
->
compute_post_prob
()
;
// M-step
this
->
compute_class_prob
()
;
this
->
update_models
()
;
this
->
center_post_state_prob
()
;
bar
.
update
()
;
}
bar
.
update
()
;
std
::
cerr
<<
std
::
endl
;
return
EMSequence
::
exit_codes
::
ITER_MAX
;
}
void
EMSequence
::
compute_loglikelihood
()
{
// compute the loglikelihood
this
->
seq_layer
->
compute_loglikelihoods
(
this
->
loglikelihood
,
this
->
loglikelihood_max
,
this
->
threads
)
;
// rescale the values
// don't parallelize
if
(
this
->
threads
==
nullptr
)
{
std
::
promise
<
bool
>
promise
;
std
::
future
<
bool
>
future
=
promise
.
get_future
()
;
this
->
compute_loglikelihood_routine
(
0
,
this
->
n_row
,
promise
)
;
future
.
get
()
;
}
// parallelize
else
{
size_t
n_threads
=
this
->
threads
->
getNThread
()
;
// compute the slices on which each thread will work
std
::
vector
<
std
::
pair
<
size_t
,
size_t
>>
slices
=
ThreadPool
::
split_range
(
0
,
this
->
n_row
,
n_threads
)
;
// get promises and futures
std
::
vector
<
std
::
promise
<
bool
>>
promises
(
n_threads
)
;
std
::
vector
<
std
::
future
<
bool
>>
futures
(
n_threads
)
;
for
(
size_t
i
=
0
;
i
<
n_threads
;
i
++
)
{
futures
[
i
]
=
promises
[
i
].
get_future
()
;
}
// distribute work to threads
// -------------------------- threads start --------------------------
for
(
size_t
i
=
0
;
i
<
n_threads
;
i
++
)
{
auto
slice
=
slices
[
i
]
;
this
->
threads
->
addJob
(
std
::
move
(
std
::
bind
(
&
EMSequence
::
compute_loglikelihood_routine
,
this
,
slice
.
first
,
slice
.
second
,
std
::
ref
(
promises
[
i
]))))
;
}
// wait until all threads are done working
for
(
auto
&
future
:
futures
)
{
future
.
get
()
;
}
// -------------------------- threads stop ---------------------------
}
}
void
EMSequence
::
compute_loglikelihood_routine
(
size_t
from
,
size_t
to
,
std
::
promise
<
bool
>&
done
)
{
// rescale the values
for
(
size_t
i
=
from
;
i
<
to
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
this
->
n_class
;
j
++
)
{
for
(
size_t
k
=
0
;
k
<
this
->
n_shift
;
k
++
)
{
for
(
size_t
l
=
0
;
l
<
this
->
n_flip
;
l
++
)
{
this
->
loglikelihood
(
i
,
j
,
k
,
l
)
=
(
this
->
loglikelihood
(
i
,
j
,
k
,
l
)
-
this
->
loglikelihood_max
[
i
])
;
}
}
}
}
done
.
set_value
(
true
)
;
}
void
EMSequence
::
compute_post_prob
()
{
// don't parallelize
if
(
this
->
threads
==
nullptr
)
{
std
::
promise
<
vector_d
>
promise
;
std
::
future
<
vector_d
>
future
=
promise
.
get_future
()
;
this
->
compute_post_prob_routine
(
0
,
this
->
n_row
,
promise
)
;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this
->
post_prob_tot
=
0.
;
this
->
post_prob_colsum
=
future
.
get
()
;
for
(
const
auto
&
prob
:
this
->
post_prob_colsum
)
{
this
->
post_prob_tot
+=
prob
;
}
}
// parallelize
else
{
size_t
n_threads
=
this
->
threads
->
getNThread
()
;
// compute the slices on which each thread will work
std
::
vector
<
std
::
pair
<
size_t
,
size_t
>>
slices
=
ThreadPool
::
split_range
(
0
,
this
->
n_row
,
n_threads
)
;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std
::
vector
<
std
::
promise
<
vector_d
>>
promises
(
n_threads
)
;
std
::
vector
<
std
::
future
<
vector_d
>>
futures
(
n_threads
)
;
for
(
size_t
i
=
0
;
i
<
n_threads
;
i
++
)
{
futures
[
i
]
=
promises
[
i
].
get_future
()
;
}
// distribute work to threads
// -------------------------- threads start --------------------------
for
(
size_t
i
=
0
;
i
<
n_threads
;
i
++
)
{
auto
slice
=
slices
[
i
]
;
this
->
threads
->
addJob
(
std
::
move
(
std
::
bind
(
&
EMSequence
::
compute_post_prob_routine
,
this
,
slice
.
first
,
slice
.
second
,
std
::
ref
(
promises
[
i
]))))
;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this
->
post_prob_tot
=
0.
;
this
->
post_prob_colsum
=
vector_d
(
this
->
n_class
,
0.
)
;
for
(
auto
&
future
:
futures
)
{
auto
probs
=
future
.
get
()
;
for
(
size_t
i
=
0
;
i
<
this
->
n_class
;
i
++
)
{
double
prob
=
probs
[
i
]
;
this
->
post_prob_colsum
[
i
]
+=
prob
;
this
->
post_prob_tot
+=
prob
;
}
}
// -------------------------- threads stop ---------------------------
}
}
void
EMSequence
::
compute_post_prob_routine
(
size_t
from
,
size_t
to
,
std
::
promise
<
vector_d
>&
post_prob_colsum
)
{
vector_d
colsums
(
this
->
n_class
,
0.
)
;
// reset grand total
// this->post_prob_tot = 0 ;
// this->post_prob_colsum = vector_d(n_class, 0) ;
// post prob
for
(
size_t
i
=
from
;
i
<
to
;
i
++
)
{
// reset row sum to 0
this
->
post_prob_rowsum
[
i
]
=
0.
;
for
(
size_t
n_class
=
0
;
n_class
<
this
->
n_class
;
n_class
++
)
{
for
(
size_t
n_shift
=
0
;
n_shift
<
this
->
n_shift
;
n_shift
++
)
{
for
(
size_t
n_flip
=
0
;
n_flip
<
this
->
n_flip
;
n_flip
++
)
{
double
p
=
exp
(
this
->
loglikelihood
(
i
,
n_class
,
n_shift
,
n_flip
))
*
this
->
post_state_prob
(
n_class
,
n_shift
,
n_flip
)
;
this
->
post_prob
(
i
,
n_class
,
n_shift
,
n_flip
)
=
p
;
this
->
post_prob_rowsum
[
i
]
+=
p
;
}
}
}
// normalize
for
(
size_t
n_class
=
0
;
n_class
<
this
->
n_class
;
n_class
++
)
{
for
(
size_t
n_shift
=
0
;
n_shift
<
this
->
n_shift
;
n_shift
++
)
{
for
(
size_t
n_flip
=
0
;
n_flip
<
this
->
n_flip
;
n_flip
++
)
{
double
p
=
std
::
max
(
this
->
post_prob
(
i
,
n_class
,
n_shift
,
n_flip
)
/
this
->
post_prob_rowsum
[
i
],
SequenceLayer
::
p_min
)
;
this
->
post_prob
(
i
,
n_class
,
n_shift
,
n_flip
)
=
p
;
colsums
[
n_class
]
+=
p
;
}
}
}
}
post_prob_colsum
.
set_value
(
colsums
)
;
}
void
EMSequence
::
update_models
()
{
this
->
seq_layer
->
update_model
(
this
->
post_prob
,
this
->
threads
)
;
}
Event Timeline
Log In to Comment