Page MenuHomec4science

antibiotic_level_ensemble.py
No OneTemporary

File Metadata

Created
Fri, Jan 3, 03:55

antibiotic_level_ensemble.py

import pandas as pd
import pickle
from data_processing import *
from visu import *
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.utils import resample
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
import numpy as np
from xgboost import XGBClassifier
import optuna
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
# Ensemble methods to classify bacterial death by antibiotic level.
# Custom loss function
def custom_loss(preds, dtrain):
    """Return the negated F1 score of ``preds`` thresholded at 0.5.

    Despite the name, this does not return a gradient/hessian pair: it
    returns a ``(metric_name, value)`` tuple.  The ``(preds, dtrain)``
    signature together with ``dtrain.get_label()`` looks like an XGBoost
    custom evaluation metric -- NOTE(review): confirm against the caller.

    Args:
        preds: Array-like of predicted scores; values strictly greater than
            0.5 count as the positive class.
        dtrain: Object exposing ``get_label()`` that returns the true labels
            (1 = positive, 0 = negative).

    Returns:
        The tuple ``('custom_f1', -f1)``.  F1 is negated so that a lower
        value means a better fit.
    """
    labels = np.asarray(dtrain.get_label())
    predicted_positive = np.asarray(preds) > 0.5
    actual_positive = labels == 1
    actual_negative = labels == 0

    # Count on boolean masks in NumPy instead of looping over the arrays
    # element by element with the builtin sum().
    true_positives = np.count_nonzero(predicted_positive & actual_positive)
    false_positives = np.count_nonzero(predicted_positive & actual_negative)
    false_negatives = np.count_nonzero(~predicted_positive & actual_positive)

    # The 1e-9 terms keep every division defined when a count is zero.
    precision = true_positives / (true_positives + false_positives + 1e-9)
    recall = true_positives / (true_positives + false_negatives + 1e-9)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-9)
    return 'custom_f1', -f1
def stratified_split_with_antibiotics(docs, test_size=0.2, val_size=0.1):
    """Split labelled docs into train/val/test sets, stratified by antibiotic level.

    Docs whose ``'living_state'`` is ``None`` are set aside as unknown and
    take no part in the split.  Both splits are stratified on
    ``'antibiotics_quantity'`` and use ``random_state=42``.

    Args:
        docs: Iterable of dict-like records with ``'living_state'`` and
            ``'antibiotics_quantity'`` keys.
        test_size: Fraction of the labelled docs held out for testing.
        val_size: Fraction of the labelled docs held out for validation.

    Returns:
        ``(train_docs, val_docs, test_docs, unknown_docs)``.  The first three
        are empty lists when there are no labelled docs.
    """
    known_docs = [doc for doc in docs if doc['living_state'] is not None]
    unknown_docs = [doc for doc in docs if doc['living_state'] is None]

    if len(known_docs) == 0:
        return [], [], [], unknown_docs

    known_levels = [doc['antibiotics_quantity'] for doc in known_docs]
    train_val_docs, test_docs = train_test_split(
        known_docs,
        test_size=test_size,
        stratify=known_levels,
        random_state=42,
    )

    if len(train_val_docs) == 0:
        return [], [], test_docs, unknown_docs

    # val_size is a fraction of all labelled docs; rescale it to a fraction
    # of what remains after the test split.
    relative_val_size = val_size / (1 - test_size)
    remaining_levels = [doc['antibiotics_quantity'] for doc in train_val_docs]
    train_docs, val_docs = train_test_split(
        train_val_docs,
        test_size=relative_val_size,
        stratify=remaining_levels,
        random_state=42,
    )
    return train_docs, val_docs, test_docs, unknown_docs
def prepare_features_with_antibiotics(docs):
    """Build the feature table, labels and antibiotic levels for ``docs``.

    Each doc contributes one row of 16 features: eight summary statistics
    (mean, std, q25, q75, median, min, max, most_prob) for each of the
    ``trap`` and ``on_to_trapping`` measurements.  A feature missing from a
    doc is stored as ``None``.

    Args:
        docs: Iterable of dict records with ``'living_state'`` and
            ``'antibiotics_quantity'`` keys plus optional feature keys.

    Returns:
        ``(X, y, antibiotics_quantity)`` where ``X`` is a DataFrame with one
        row per doc, ``y`` is a Series of the non-``None`` living states (or
        ``None`` when no doc is labelled), and ``antibiotics_quantity`` is a
        list with one entry per doc.
    """
    stat_names = ('mean', 'std', 'q25', 'q75', 'median', 'min', 'max', 'most_prob')
    measurement_names = ('trap', 'on_to_trapping')

    rows = []
    labels = []
    antibiotics_quantity = []
    for doc in docs:
        # Measurement-major order: all *_trap columns, then all *_on_to_trapping.
        row = {
            f'{stat}_{measurement}': doc.get(f'{stat}_{measurement}')
            for measurement in measurement_names
            for stat in stat_names
        }
        state = doc['living_state']
        if state is not None:
            labels.append(state)
        antibiotics_quantity.append(doc['antibiotics_quantity'])
        rows.append(row)

    feature_frame = pd.DataFrame(rows)
    label_series = pd.Series(labels) if labels else None
    return feature_frame, label_series, antibiotics_quantity
def prepare_known_data_for_knn(docs):
    """Build an aligned (X, y) training set from labelled docs at levels 0 and 32.

    Only docs whose ``'antibiotics_quantity'`` is 0 or 32 AND whose
    ``'living_state'`` is not ``None`` are used.  A feature row and its label
    are always appended together, so ``X`` and ``y`` have the same length
    even when ``docs`` also contains unlabelled records.

    Args:
        docs: Iterable of dict records with ``'antibiotics_quantity'`` and
            ``'living_state'`` keys plus optional feature keys.

    Returns:
        ``(X, y)``: a DataFrame with one row of 16 features per selected doc
        (missing features stored as ``None``) and a Series of the matching
        living states.
    """
    feature_names = (
        'mean_trap', 'std_trap', 'q25_trap', 'q75_trap',
        'median_trap', 'min_trap', 'max_trap', 'most_prob_trap',
        'mean_on_to_trapping', 'std_on_to_trapping', 'q25_on_to_trapping',
        'q75_on_to_trapping', 'median_on_to_trapping', 'min_on_to_trapping',
        'max_on_to_trapping', 'most_prob_on_to_trapping',
    )
    rows = []
    labels = []
    for doc in docs:
        if doc['antibiotics_quantity'] not in (0, 32):
            continue
        state = doc['living_state']
        if state is None:
            # An unlabelled doc cannot train the classifier; adding its
            # features without a label would misalign X and y.
            continue
        rows.append({name: doc.get(name) for name in feature_names})
        labels.append(state)
    return pd.DataFrame(rows), pd.Series(labels)
if __name__ == '__main__':
    # Script entry point: load the pickled docs, split the labelled ones,
    # then predict the living state of the unlabelled ones and report the
    # share predicted dead per antibiotic level.
    print("Loading data from pickle file...")
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only load pickles that come from a trusted source.
    with open('data_analysed.pkl', 'rb') as f:
        docs_analyzed = pickle.load(f)
    with open('data_meas.pkl', 'rb') as f:
        docs_meas = pickle.load(f)
    print("Data loaded successfully.")
    print(f"Unknown docs: {len(docs_meas)}")

    norm_docs = normalize_docs(docs_analyzed, docs_meas)
    train_docs, val_docs, test_docs, unknown_docs = stratified_split_with_antibiotics(
        norm_docs, test_size=0.2, val_size=0.1)

    if not (train_docs and val_docs and test_docs):
        print("Not enough data for training, validation, and testing.")
    elif not unknown_docs:
        # Checked before any feature preparation: an empty unknown set gives
        # a column-less DataFrame, and imputer.transform() would raise on it
        # instead of ever reaching this message.
        print("No unknown samples available for prediction.")
    else:
        X_train, y_train, train_antibiotics = prepare_features_with_antibiotics(train_docs)
        X_val, y_val, val_antibiotics = prepare_features_with_antibiotics(val_docs)
        X_test, y_test, test_antibiotics = prepare_features_with_antibiotics(test_docs)
        X_unknown, _, unknown_antibiotics = prepare_features_with_antibiotics(unknown_docs)

        # Impute missing values with the column means of the training set.
        # NOTE(review): the validation and test matrices are prepared but not
        # used for any evaluation further down.
        imputer = SimpleImputer(strategy='mean')
        X_train_imputed = imputer.fit_transform(X_train)
        X_val_imputed = imputer.transform(X_val)
        X_test_imputed = imputer.transform(X_test)
        X_unknown_imputed = imputer.transform(X_unknown)

        # Train a k-NN classifier on known data with antibiotic levels 0 and 32.
        # It gets its own imputer so the training-set imputer above is not
        # refitted as a side effect.
        X_known, y_known = prepare_known_data_for_knn(norm_docs)
        knn_imputer = SimpleImputer(strategy='mean')
        X_known_imputed = knn_imputer.fit_transform(X_known)
        knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
        knn_model.fit(X_known_imputed, y_known)
        y_unknown_pred_knn = knn_model.predict(X_unknown_imputed)
        # NOTE(review): these k-NN labels are overwritten by the self-training
        # predictions below before anything reads them.
        for doc, pred in zip(unknown_docs, y_unknown_pred_knn):
            doc['living_state'] = pred

        # Self-training semi-supervised learning
        ensemble_model = VotingClassifier(estimators=[
            ('knn', KNeighborsClassifier(n_neighbors=3)),
            ('svc', SVC(kernel='linear', probability=True, random_state=42)),
            ('xgb', XGBClassifier(random_state=42))
        ], voting='soft')
        # The estimator is passed positionally: its keyword was renamed from
        # `base_estimator` to `estimator` in newer scikit-learn releases, and
        # the first positional slot works with both.
        self_training_model = SelfTrainingClassifier(ensemble_model, criterion='k_best', k_best=10)
        X_combined = np.vstack((X_train_imputed, X_unknown_imputed))
        # -1 marks the unknown rows as unlabelled for the self-training fit.
        y_combined = np.hstack((y_train, [-1] * len(X_unknown_imputed)))
        self_training_model.fit(X_combined, y_combined)
        y_unknown_pred_self_training = self_training_model.predict(X_unknown_imputed)
        for doc, pred in zip(unknown_docs, y_unknown_pred_self_training):
            doc['living_state'] = pred

        # Calculate and print the percentages of dead bacteria for each antibiotic level
        # (a predicted living_state of 1 is counted as dead).
        print("\nSelf-Training Predictions for Each Antibiotic Level:")
        totals = Counter(doc['antibiotics_quantity'] for doc in unknown_docs)
        dead_counts = Counter(
            doc['antibiotics_quantity'] for doc in unknown_docs if doc['living_state'] == 1)
        for level in sorted(totals):
            dead_percentage = (dead_counts[level] / totals[level]) * 100
            print(f"Antibiotic Level {level} µg/mL: {dead_percentage:.2f}% Dead")

Event Timeline