Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F97171684
antibiotic_level_ensemble.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Jan 3, 03:55
Size
8 KB
Mime Type
text/x-python
Expires
Sun, Jan 5, 03:55 (2 d)
Engine
blob
Format
Raw Data
Handle
23344049
Attached To
R13271 Optical_Trapping_ML
antibiotic_level_ensemble.py
View Options
import
pandas
as
pd
import
pickle
from
data_processing
import
*
from
visu
import
*
from
sklearn.ensemble
import
VotingClassifier
from
sklearn.metrics
import
accuracy_score
,
f1_score
,
precision_score
,
recall_score
,
make_scorer
from
sklearn.model_selection
import
cross_val_score
,
StratifiedKFold
,
train_test_split
from
sklearn.utils
import
resample
from
sklearn.svm
import
SVC
from
imblearn.over_sampling
import
SMOTE
from
sklearn.pipeline
import
make_pipeline
from
sklearn.preprocessing
import
StandardScaler
,
PolynomialFeatures
from
sklearn.decomposition
import
PCA
from
sklearn.neighbors
import
KNeighborsClassifier
from
sklearn.semi_supervised
import
SelfTrainingClassifier
import
numpy
as
np
from
xgboost
import
XGBClassifier
import
optuna
from
sklearn.impute
import
SimpleImputer
from
sklearn.pipeline
import
Pipeline
from
collections
import
Counter
import
seaborn
as
sns
import
matplotlib.pyplot
as
plt
# Ensemble methods to classify bacterial death by antibiotic level.
# Custom evaluation metric: negated F1 score of the thresholded predictions.
def custom_loss(preds, dtrain):
    """Score thresholded predictions with a negated F1 value.

    Args:
        preds: Array of raw scores; entries strictly above 0.5 are treated
            as class 1, everything else as class 0.
        dtrain: Object exposing ``get_label()``, which returns the true
            labels aligned with ``preds``.

    Returns:
        The tuple ``('custom_f1', -f1)``. The F1 score is negated, so a
        better fit yields a smaller value.
    """
    eps = 1e-9  # keeps each ratio finite when its denominator would be 0
    truth = dtrain.get_label()

    predicted_pos = preds > 0.5
    predicted_neg = ~predicted_pos
    actual_pos = truth == 1
    actual_neg = truth == 0

    tp = sum(predicted_pos & actual_pos)
    fp = sum(predicted_pos & actual_neg)
    fn = sum(predicted_neg & actual_pos)

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * (precision * recall) / (precision + recall + eps)
    return 'custom_f1', -f1
def stratified_split_with_antibiotics(docs, test_size=0.2, val_size=0.1):
    """Split labelled docs into train/val/test, stratified by antibiotic level.

    Docs whose ``'living_state'`` is ``None`` are set aside as unknown and
    take no part in the split.

    Args:
        docs: Iterable of dict-like records with ``'living_state'`` and
            ``'antibiotics_quantity'`` keys.
        test_size: Fraction of the labelled docs held out for testing.
        val_size: Fraction of the labelled docs (before the test split)
            held out for validation.

    Returns:
        ``(train_docs, val_docs, test_docs, unknown_docs)``. The first three
        are empty lists when there are no labelled docs.
    """
    labelled, unlabelled = [], []
    for record in docs:
        bucket = unlabelled if record['living_state'] is None else labelled
        bucket.append(record)

    if not labelled:
        return [], [], [], unlabelled

    remainder, test_docs = train_test_split(
        labelled,
        test_size=test_size,
        stratify=[record['antibiotics_quantity'] for record in labelled],
        random_state=42,
    )
    if not remainder:
        return [], [], test_docs, unlabelled

    # val_size refers to the full labelled set, so rescale it to the share
    # of what remains once the test docs have been removed.
    train_docs, val_docs = train_test_split(
        remainder,
        test_size=val_size / (1 - test_size),
        stratify=[record['antibiotics_quantity'] for record in remainder],
        random_state=42,
    )
    return train_docs, val_docs, test_docs, unlabelled
def prepare_features_with_antibiotics(docs):
    """Turn docs into a feature table, a label series and antibiotic levels.

    Args:
        docs: Sequence of dict records. Each must have ``'living_state'``
            and ``'antibiotics_quantity'``; feature keys that are absent
            come through as ``None``.

    Returns:
        A 3-tuple of:
        * ``pandas.DataFrame`` with one row per doc and one column per
          feature;
        * ``pandas.Series`` of the non-``None`` ``'living_state'`` values,
          or ``None`` when no doc carries a label;
        * list with the ``'antibiotics_quantity'`` of every doc.
    """
    feature_names = (
        'mean_trap',
        'std_trap',
        'q25_trap',
        'q75_trap',
        'median_trap',
        'min_trap',
        'max_trap',
        'most_prob_trap',
        'mean_on_to_trapping',
        'std_on_to_trapping',
        'q25_on_to_trapping',
        'q75_on_to_trapping',
        'median_on_to_trapping',
        'min_on_to_trapping',
        'max_on_to_trapping',
        'most_prob_on_to_trapping',
    )

    rows = [{name: doc.get(name) for name in feature_names} for doc in docs]
    labels = [
        doc['living_state'] for doc in docs if doc['living_state'] is not None
    ]
    quantities = [doc['antibiotics_quantity'] for doc in docs]

    label_series = pd.Series(labels) if labels else None
    return pd.DataFrame(rows), label_series, quantities
def prepare_known_data_for_knn(docs, levels=(0, 32)):
    """Build aligned features and labels from labelled docs at given levels.

    Only docs whose ``'antibiotics_quantity'`` is in ``levels`` AND whose
    ``'living_state'`` is not ``None`` contribute. The feature row and the
    label are appended together, so the returned frame and series always
    have the same length, even when ``docs`` also contains unlabeled
    records at the selected levels.

    Args:
        docs: Iterable of dict records with ``'antibiotics_quantity'`` and
            ``'living_state'`` keys; feature keys that are absent come
            through as ``None``.
        levels: Antibiotic quantities to keep. Defaults to ``(0, 32)``.

    Returns:
        ``(X, y)``: a ``pandas.DataFrame`` with one column per feature and a
        ``pandas.Series`` of the matching ``'living_state'`` values.
    """
    feature_names = (
        'mean_trap',
        'std_trap',
        'q25_trap',
        'q75_trap',
        'median_trap',
        'min_trap',
        'max_trap',
        'most_prob_trap',
        'mean_on_to_trapping',
        'std_on_to_trapping',
        'q25_on_to_trapping',
        'q75_on_to_trapping',
        'median_on_to_trapping',
        'min_on_to_trapping',
        'max_on_to_trapping',
        'most_prob_on_to_trapping',
    )

    rows = []
    labels = []
    for doc in docs:
        if doc['antibiotics_quantity'] not in levels:
            continue
        state = doc['living_state']
        if state is None:
            # An unlabeled doc must not add a feature row: that would leave
            # X longer than y and break the downstream fit.
            continue
        rows.append({name: doc.get(name) for name in feature_names})
        labels.append(state)

    # Explicit columns keep the feature schema even when nothing matched.
    return pd.DataFrame(rows, columns=list(feature_names)), pd.Series(labels)
if __name__ == '__main__':
    # Script entry point: load the pickled docs, split the labelled ones,
    # then predict 'living_state' for the unlabeled docs and report, per
    # antibiotic level, the share predicted as state 1.
    print("Loading data from pickle file...")
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing. Only run this against trusted, locally produced files.
    with open('data_analysed.pkl', 'rb') as f:
        docs_analyzed = pickle.load(f)
    with open('data_meas.pkl', 'rb') as f:
        docs_meas = pickle.load(f)
    print("Data loaded successfully.")
    print(f"Unknown docs: {len(docs_meas)}")

    # normalize_docs is not defined in this file; presumably it is supplied
    # by one of the star imports (data_processing / visu) -- TODO confirm.
    norm_docs = normalize_docs(docs_analyzed, docs_meas)
    train_docs, val_docs, test_docs, unknown_docs = stratified_split_with_antibiotics(
        norm_docs, test_size=0.2, val_size=0.1
    )

    if train_docs and val_docs and test_docs:
        X_train, y_train, train_antibiotics = prepare_features_with_antibiotics(train_docs)
        X_val, y_val, val_antibiotics = prepare_features_with_antibiotics(val_docs)
        X_test, y_test, test_antibiotics = prepare_features_with_antibiotics(test_docs)
        X_unknown, _, unknown_antibiotics = prepare_features_with_antibiotics(unknown_docs)

        # Impute missing values using statistics fitted on the training set.
        # The validation/test matrices are prepared here but not evaluated
        # further down in this script.
        imputer = SimpleImputer(strategy='mean')
        X_train_imputed = imputer.fit_transform(X_train)
        X_val_imputed = imputer.transform(X_val)
        X_test_imputed = imputer.transform(X_test)

        # Check for unknown docs BEFORE transforming them: transforming an
        # empty frame raises, which would make the "no unknown samples"
        # branch below unreachable.
        if unknown_docs:
            X_unknown_imputed = imputer.transform(X_unknown)

            # Train a k-NN classifier on known data with antibiotic levels 0 and 32
            X_known, y_known = prepare_known_data_for_knn(norm_docs)
            # Use a dedicated imputer here; refitting the shared one would
            # silently overwrite the training-set statistics it holds.
            knn_imputer = SimpleImputer(strategy='mean')
            X_known_imputed = knn_imputer.fit_transform(X_known)
            knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
            knn_model.fit(X_known_imputed, y_known)
            y_unknown_pred_knn = knn_model.predict(X_unknown_imputed)

            # Provisional labels; they are overwritten by the self-training
            # predictions further down.
            for doc, pred in zip(unknown_docs, y_unknown_pred_knn):
                doc['living_state'] = pred

            # Self-training semi-supervised learning
            ensemble_model = VotingClassifier(
                estimators=[
                    ('knn', KNeighborsClassifier(n_neighbors=3)),
                    ('svc', SVC(kernel='linear', probability=True, random_state=42)),
                    ('xgb', XGBClassifier(random_state=42)),
                ],
                voting='soft',
            )
            # The estimator is passed positionally because its keyword name
            # differs between scikit-learn releases (base_estimator was
            # renamed to estimator); the first positional slot is stable.
            self_training_model = SelfTrainingClassifier(
                ensemble_model, criterion='k_best', k_best=10
            )

            X_combined = np.vstack((X_train_imputed, X_unknown_imputed))
            # -1 is the label SelfTrainingClassifier treats as "unlabeled".
            y_combined = np.hstack((y_train, [-1] * len(X_unknown_imputed)))
            self_training_model.fit(X_combined, y_combined)

            y_unknown_pred_self_training = self_training_model.predict(X_unknown_imputed)
            for doc, pred in zip(unknown_docs, y_unknown_pred_self_training):
                doc['living_state'] = pred

            # Calculate and print the percentages of dead bacteria for each antibiotic level
            # NOTE(review): living_state == 1 is counted as "dead" here --
            # confirm this matches the label encoding of the data.
            print("\nSelf-Training Predictions for Each Antibiotic Level:")
            totals = Counter(doc['antibiotics_quantity'] for doc in unknown_docs)
            dead_counts = Counter(
                doc['antibiotics_quantity']
                for doc in unknown_docs
                if doc['living_state'] == 1
            )
            for level in sorted(totals):
                dead_percentage = (dead_counts[level] / totals[level]) * 100
                print(f"Antibiotic Level {level} µg/mL: {dead_percentage:.2f}% Dead")
        else:
            print("No unknown samples available for prediction.")
    else:
        print("Not enough data for training, validation, and testing.")
Event Timeline
Log In to Comment