Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F96088627
main.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Dec 22, 12:16
Size
6 KB
Mime Type
text/x-python
Expires
Tue, Dec 24, 12:16 (2 d)
Engine
blob
Format
Raw Data
Handle
23117766
Attached To
R13271 Optical_Trapping_ML
main.py
View Options
#main.py
from
data_processing_AMP
import
*
from
data_processing_VIAB
import
*
import
pickle
import
pandas
as
pd
from
sklearn.model_selection
import
train_test_split
from
sklearn.metrics
import
classification_report
,
accuracy_score
,
roc_auc_score
from
xgboost
import
XGBClassifier
import
optuna
from
optuna.samplers
import
TPESampler
import
matplotlib.pyplot
as
plt
from
imblearn.over_sampling
import
SMOTE
import
numpy
as
np
from
sklearn.preprocessing
import
StandardScaler
# Feature columns fed to the scaler and the classifier. They are listed by
# hand so the model input does not depend on the column order or the extra
# columns produced by the data_processing_* helpers.
selected_features = [
    'min_trapping',
    'max_trapping',
    'mean_trapping',
    'std_trapping',
    'q25_trapping',
]
def augment_data(X, y, noise_factor=0.05, random_state=42):
    """
    Balance the classes with SMOTE, then jitter the resampled features with Gaussian noise.

    The noise is added to every row returned by SMOTE (original and synthetic
    samples alike) and is drawn from a generator seeded with ``random_state``,
    so two calls with the same inputs return the same augmented data.

    Args:
        X (pd.DataFrame): Feature set.
        y (pd.Series): Target variable.
        noise_factor (float): Multiplier applied to standard-normal noise
            before it is added to the features. Defaults to 0.05.
        random_state (int | None): Seed used for both SMOTE and the noise
            generator. Defaults to 42; pass None for non-deterministic output.

    Returns:
        tuple: Augmented feature set and target variable.
    """
    smote = SMOTE(random_state=random_state)
    X_res, y_res = smote.fit_resample(X, y)

    # Use a local, seeded generator instead of the global NumPy state so the
    # jitter is reproducible and does not disturb other users of np.random.
    rng = np.random.default_rng(random_state)
    noise = rng.normal(loc=0.0, scale=1.0, size=X_res.shape)

    # Build a new object rather than using `+=`, which would fail on
    # integer-typed arrays and mutate the SMOTE output in place.
    X_res = X_res + noise_factor * noise
    return X_res, y_res
def objective(trial):
    """
    Objective function for optimizing XGBoost hyperparameters using Optuna.

    Trains an XGBClassifier on the augmented training data and scores it on
    the held-out split. The data is read from the module-level names
    ``X_train_VIAB_aug``, ``y_train_VIAB_aug``, ``X_test_VIAB_norm`` and
    ``y_test_VIAB``, which the ``__main__`` section must define before the
    study is run.

    Args:
        trial (optuna.trial.Trial): A trial object that suggests hyperparameter values.

    Returns:
        float: ROC AUC score of the model on ``X_test_VIAB_norm`` / ``y_test_VIAB``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 10.0),
        # Fixed (non-searched) settings; these are NOT part of study.best_params.
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
    }

    model = XGBClassifier(**param)
    model.fit(X_train_VIAB_aug[selected_features], y_train_VIAB_aug)

    # NOTE(review): the score is computed on the same test split that is later
    # used for the final report, so the reported metrics are optimistically
    # biased. Consider tuning on a separate validation split instead.
    positive_proba = model.predict_proba(X_test_VIAB_norm[selected_features])[:, 1]

    # Return the ROC AUC score for evaluation
    return roc_auc_score(y_test_VIAB, positive_proba)
if __name__ == '__main__':
    # Pipeline overview:
    #   1. Train and test a model that predicts death on the labelled
    #      viability (VIAB) dataset.
    #   2. Apply the same model to the AMP dataset, whose death labels are
    #      unknown, and save a bar plot of the predictions.

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # run this on pickle files produced by a trusted source.
    print("Loading data from pickle files...")
    with open('docs_analysed.pkl', 'rb') as f:
        docs_analyzed_VIAB = pickle.load(f)
    with open('docs_meas.pkl', 'rb') as f:
        docs_meas_VIAB = pickle.load(f)
    with open('docs_analysed_AMP.pkl', 'rb') as f:
        docs_analyzed_AMP = pickle.load(f)
    with open('docs_meas_AMP.pkl', 'rb') as f:
        docs_meas_AMP = pickle.load(f)
    print("Data loaded successfully.")

    norm_docs_VIAB = extract_features_VIAB(docs_analyzed_VIAB)
    print('first dataset', norm_docs_VIAB.iloc[0])
    norm_docs_AMP = normalize_docs_AMP(docs_analyzed_AMP, docs_meas_AMP)
    print('second dataset', norm_docs_AMP.iloc[0])

    X_VIAB = norm_docs_VIAB.drop(columns=['dead'])
    y_VIAB = norm_docs_VIAB['dead']

    # Perform a stratified split to maintain the same ratio of classes in both training and testing sets
    X_train_VIAB, X_test_VIAB, y_train_VIAB, y_test_VIAB = train_test_split(
        X_VIAB, y_VIAB, test_size=0.3, stratify=y_VIAB, random_state=42
    )

    # Normalize the data. The scaler is fitted on the training split only and
    # then reused for the test split and the AMP dataset.
    scaler = StandardScaler()
    X_train_VIAB_norm = scaler.fit_transform(X_train_VIAB[selected_features])
    X_test_VIAB_norm = scaler.transform(X_test_VIAB[selected_features])
    X_norm_docs_AMP = scaler.transform(norm_docs_AMP[selected_features])

    # Convert back to DataFrame for easier handling
    X_train_VIAB_norm = pd.DataFrame(X_train_VIAB_norm, columns=selected_features)
    X_test_VIAB_norm = pd.DataFrame(X_test_VIAB_norm, columns=selected_features)
    X_norm_docs_AMP = pd.DataFrame(X_norm_docs_AMP, columns=selected_features)

    print("Training set class distribution:\n", y_train_VIAB.value_counts())
    print("Test set class distribution:\n", y_test_VIAB.value_counts())

    # Data Augmentation (training split only)
    X_train_VIAB_aug, y_train_VIAB_aug = augment_data(X_train_VIAB_norm, y_train_VIAB)

    # Use Optuna to find the best hyperparameters for XGBoost.
    # NOTE(review): the study uses Optuna's default, unseeded sampler, so the
    # selected hyperparameters can differ between runs; pass
    # sampler=TPESampler(seed=42) to create_study for a reproducible search.
    study = optuna.create_study(direction='maximize')
    # Adjust n_trials and timeout as needed
    study.optimize(objective, n_trials=150, timeout=1000)
    print("Best hyperparameters:", study.best_params)

    # Train the best model with the found hyperparameters and augmented data.
    # study.best_params only contains the values suggested through `trial`,
    # so the fixed settings used inside objective() are passed again here;
    # otherwise the final model would differ from the one that was tuned.
    best_model = XGBClassifier(
        **study.best_params,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    )
    best_model.fit(X_train_VIAB_aug[selected_features], y_train_VIAB_aug)

    # Evaluate the best model
    y_pred_VIAB = best_model.predict(X_test_VIAB_norm[selected_features])
    print("Accuracy:", accuracy_score(y_test_VIAB, y_pred_VIAB))
    print("Classification Report:\n", classification_report(y_test_VIAB, y_pred_VIAB))

    # Predict the 'dead' status for the second dataset
    predicted_dead = best_model.predict(X_norm_docs_AMP[selected_features])

    # Add the predicted dead status back to the DataFrame. The predictions are
    # a plain array, so they are assigned row by row in positional order.
    norm_docs_AMP['predicted_dead'] = predicted_dead
    print(norm_docs_AMP)

    # Group by antibiotics_quantity and count the number of 0s and 1s
    death_counts = (
        norm_docs_AMP
        .groupby(['antibiotics_quantity', 'predicted_dead'])
        .size()
        .unstack(fill_value=0)
    )

    death_counts.plot(kind='bar', stacked=False, color=['blue', 'yellow'])
    plt.xlabel('Antibiotics Quantity')
    plt.ylabel('Number of Samples')
    plt.title('Predicted Death vs Alive by Antibiotics Quantity')
    plt.legend(['Alive (0)', 'Dead (1)'])
    plt.savefig('bar_plot.png')
    plt.close()
Event Timeline
Log In to Comment