Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F96107156
data_processing.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Dec 22, 16:07
Size
2 KB
Mime Type
text/x-python
Expires
Tue, Dec 24, 16:07 (1 d, 18 h)
Engine
blob
Format
Raw Data
Handle
23122091
Attached To
R13271 Optical_Trapping_ML
data_processing.py
View Options
import
numpy
as
np
from
sklearn.model_selection
import
train_test_split
def normalize_data_model1(docs):
    """Build normalized records for model 1.

    For every document the ``data.transmission`` sequence is converted to a
    NumPy array and divided by the document's ``normalization_factor``.

    Args:
        docs: Iterable of dicts providing the keys ``data`` (with a
            ``transmission`` entry), ``normalization_factor``, ``gram_type``,
            ``bacteria`` and ``name``.

    Returns:
        A new list with one dict per input document, holding the keys
        ``transmission_normalized``, ``label``, ``bacteria`` and ``name``.
    """

    def _to_record(entry):
        # Read the raw trace first, then the factor, exactly once each.
        raw_trace = np.array(entry['data']['transmission'])
        scale = entry['normalization_factor']
        return {
            'transmission_normalized': raw_trace / scale,
            'label': entry['gram_type'],  # gram classification label
            'bacteria': entry['bacteria'],  # strain
            'name': entry['name'],  # name, used for filtering
        }

    return [_to_record(entry) for entry in docs]
def normalize_data_model2(docs):
    """Build normalized records for model 2.

    Each document's ``data.transmission`` values are turned into a NumPy
    array and divided by that document's ``normalization_factor``.

    Args:
        docs: Iterable of dicts providing the keys ``data`` (with a
            ``transmission`` entry), ``normalization_factor``, ``gram_type``,
            ``name`` and ``bacteria``.

    Returns:
        A new list with one dict per input document, holding the keys
        ``transmission_normalized``, ``label``, ``name`` and ``bacteria``
        (in that order).
    """
    records = []
    for item in docs:
        signal = np.array(item['data']['transmission'])
        divisor = item['normalization_factor']
        record = {'transmission_normalized': signal / divisor}
        # Remaining fields are copied through; 'label' carries the gram type.
        record['label'] = item['gram_type']
        record['name'] = item['name']
        record['bacteria'] = item['bacteria']
        records.append(record)
    return records
def split_data(docs):
    """Split documents into train and test lists, one bacteria family at a time.

    Documents are grouped by their ``'bacteria'`` value and every group is
    split separately with ``train_test_split(test_size=0.2, random_state=42)``.
    The per-family results are concatenated into the two returned lists.

    Families are processed in order of first appearance in ``docs`` so the
    order of the returned lists is the same on every run.

    Args:
        docs: Iterable of dicts, each with a hashable ``'bacteria'`` entry.

    Returns:
        A ``(train_docs, test_docs)`` tuple of lists.

    NOTE(review): a family with too few documents to leave a non-empty train
    set is expected to make ``train_test_split`` raise ``ValueError`` --
    confirm against the scikit-learn documentation.
    """
    # Group in a single pass. A dict (insertion-ordered) is used instead of a
    # set because set iteration order over strings depends on hash
    # randomization and would make the output order vary between runs.
    docs_by_family = {}
    for doc in docs:
        docs_by_family.setdefault(doc['bacteria'], []).append(doc)

    train_docs = []
    test_docs = []
    for family_docs in docs_by_family.values():
        train_family_docs, test_family_docs = train_test_split(
            family_docs, test_size=0.2, random_state=42
        )
        train_docs.extend(train_family_docs)
        test_docs.extend(test_family_docs)
    return train_docs, test_docs
def augment_data(docs, noise_level=0.05, shift_max=10):
    """Extend ``docs`` in place with noisy and shifted copies of each document.

    For every document already in ``docs``, two augmentation rounds are run.
    Each round appends:

    * a copy whose ``transmission_normalized`` has Gaussian noise added
      (mean 0, standard deviation ``noise_level``); its name gets the
      suffix ``'_noise'``;
    * a copy whose ``transmission_normalized`` is circularly shifted with
      ``np.roll`` by a random integer drawn from
      ``[-shift_max, shift_max]`` (both ends included); its name gets the
      suffix ``'_shift'``.

    ``label`` and ``bacteria`` are copied unchanged.

    Args:
        docs: List of dicts with the keys ``transmission_normalized``,
            ``label``, ``bacteria`` and ``name``. The list is modified in
            place.
        noise_level: Standard deviation of the additive Gaussian noise.
        shift_max: Largest absolute shift, in samples. ``0`` is allowed and
            yields unshifted copies.

    Returns:
        The same ``docs`` list object, with the augmented documents appended
        after the originals.
    """
    augmented_docs = []
    for doc in docs:
        transmission = np.array(doc['transmission_normalized'])
        for _ in range(2):  # two augmentation rounds per document
            # Additive Gaussian noise; using the array's shape (rather than
            # its length) keeps this valid for non-1-D traces as well.
            noise = np.random.normal(0, noise_level, transmission.shape)
            augmented_docs.append({
                'transmission_normalized': transmission + noise,
                'label': doc['label'],
                'bacteria': doc['bacteria'],
                'name': doc['name'] + '_noise',
            })

            # randint's upper bound is exclusive: the +1 makes the range
            # symmetric and avoids a ValueError when shift_max is 0.
            shift = np.random.randint(-shift_max, shift_max + 1)
            augmented_docs.append({
                'transmission_normalized': np.roll(transmission, shift),
                'label': doc['label'],
                'bacteria': doc['bacteria'],
                'name': doc['name'] + '_shift',
            })

    # Extend only after the loop so the list is not grown while iterating it.
    docs.extend(augmented_docs)
    return docs
Event Timeline
Log In to Comment