explore_data.py

import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc, precision_recall_curve)
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.utils import resample

from data_processing import *
from visu import *
if __name__ == '__main__':
    print("Loading data from pickle file...")
    with open('data_analysed.pkl', 'rb') as f:
        docs_analyzed = pickle.load(f)
    with open('data_meas.pkl', 'rb') as f:
        docs_meas = pickle.load(f)
    print("Data loaded successfully.")
    norm_docs = normalize_docs(docs_analyzed, docs_meas)
    train_docs, val_docs, test_docs, unknown_docs = stratified_split(norm_docs)
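    # NOTE: stratified_split comes from the data_processing wildcard import and
    # its implementation is not shown in this file. The sketch below is only an
    # assumption of its behavior (the name, split fractions, and the None check
    # are placeholders): docs with an unknown label are set aside, and the
    # labeled rest is split into train/val/test with class ratios preserved.
    from sklearn.model_selection import train_test_split

    def stratified_split_sketch(docs, seed=42):
        unknown = [d for d in docs if d.get('living_state') is None]
        labeled = [d for d in docs if d.get('living_state') is not None]
        labels = [d['living_state'] for d in labeled]
        # First carve off the training set, then halve the remainder.
        train, rest = train_test_split(labeled, test_size=0.4,
                                       stratify=labels, random_state=seed)
        rest_labels = [d['living_state'] for d in rest]
        val, test = train_test_split(rest, test_size=0.5,
                                     stratify=rest_labels, random_state=seed)
        return train, val, test, unknown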
    # Prepare features for training, validation, and test sets
    X_train, y_train = prepare_features(train_docs)
    X_val, y_val = prepare_features(val_docs)
    X_test, y_test = prepare_features(test_docs)
    X_unknown, _ = prepare_features(unknown_docs)
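    # NOTE: prepare_features is also imported from data_processing. A minimal
    # sketch of the assumed contract (hypothetical, for orientation only):
    # turn a list of dicts into a numeric feature DataFrame X plus a label
    # vector y.
    def prepare_features_sketch(docs):
        df = pd.DataFrame(docs)
        y = df.pop('living_state') if 'living_state' in df else None
        X = df.select_dtypes(include=[np.number])  # keep numeric columns only
        return X, y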
    # Train the initial model
    model_initial = RandomForestClassifier(random_state=42)
    model_initial.fit(X_train, y_train)
    # Validate the initial model
    y_val_pred_initial = model_initial.predict(X_val)
    val_accuracy_initial = accuracy_score(y_val, y_val_pred_initial)
    print(f'Initial Validation Accuracy: {val_accuracy_initial}')
    # Predict living_state for unknown_docs with the initial model
    y_unknown_pred_initial = model_initial.predict(X_unknown)
    # Update unknown_docs with predicted living_state
    for doc, pred in zip(unknown_docs, y_unknown_pred_initial):
        doc['living_state'] = pred
    # Optionally, evaluate the initial model on the test set
    y_test_pred_initial = model_initial.predict(X_test)
    test_accuracy_initial = accuracy_score(y_test, y_test_pred_initial)
    print(f'Initial Test Accuracy: {test_accuracy_initial}')
    # Plot feature importances for the initial model
    plot_feature_importances(model_initial, X_train)
    # Plot feature correlations for the initial feature set
    plot_feature_correlations(X_train)
    # Remove highly correlated features
    X_train_reduced = remove_highly_correlated_features(X_train, model_initial)
    X_val_reduced = X_val[X_train_reduced.columns]
    X_test_reduced = X_test[X_train_reduced.columns]
    X_unknown_reduced = X_unknown[X_train_reduced.columns]
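    # NOTE: remove_highly_correlated_features lives in data_processing; the
    # sketch below is an assumption of the usual recipe (the 0.9 threshold and
    # tie-breaking by feature importance are placeholders): for each pair of
    # features whose absolute correlation exceeds the threshold, drop the one
    # the fitted model considers less important.
    def remove_highly_correlated_features_sketch(X, model, threshold=0.9):
        corr = X.corr().abs()
        importances = pd.Series(model.feature_importances_, index=X.columns)
        cols = list(X.columns)
        to_drop = set()
        for i in range(len(cols)):
            for j in range(i + 1, len(cols)):
                if corr.iloc[i, j] > threshold:
                    # Keep the more important feature of the pair.
                    weaker = cols[i] if importances[cols[i]] < importances[cols[j]] else cols[j]
                    to_drop.add(weaker)
        return X.drop(columns=sorted(to_drop))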
    # Train a new model on the reduced feature set
    model_reduced = RandomForestClassifier(random_state=42)
    model_reduced.fit(X_train_reduced, y_train)
    # Validate the new model
    y_val_pred_reduced = model_reduced.predict(X_val_reduced)
    val_accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)
    print(f'Reduced Validation Accuracy: {val_accuracy_reduced}')
    # Predict living_state for unknown_docs with the reduced model
    y_unknown_pred_reduced = model_reduced.predict(X_unknown_reduced)
    # Update unknown_docs with predicted living_state
    for doc, pred in zip(unknown_docs, y_unknown_pred_reduced):
        doc['living_state'] = pred
    # Optionally, evaluate the reduced model on the test set
    y_test_pred_reduced = model_reduced.predict(X_test_reduced)
    test_accuracy_reduced = accuracy_score(y_test, y_test_pred_reduced)
    print(f'Reduced Test Accuracy: {test_accuracy_reduced}')
    # Plot feature importances for the reduced model
    plot_feature_importances(model_reduced, X_train_reduced)
    # Plot feature correlations for the reduced feature set
    plot_feature_correlations(X_train_reduced)
    # Remove additional highly correlated features
    X_train_reduced_v2 = remove_highly_correlated_features_v2(X_train_reduced, model_reduced)
    X_val_reduced_v2 = X_val[X_train_reduced_v2.columns]
    X_test_reduced_v2 = X_test[X_train_reduced_v2.columns]
    X_unknown_reduced_v2 = X_unknown[X_train_reduced_v2.columns]
    # Train a new model on the further reduced feature set
    model_reduced_v2 = RandomForestClassifier(random_state=42)
    model_reduced_v2.fit(X_train_reduced_v2, y_train)
    # Validate the new model
    y_val_pred_reduced_v2 = model_reduced_v2.predict(X_val_reduced_v2)
    val_accuracy_reduced_v2 = accuracy_score(y_val, y_val_pred_reduced_v2)
    print(f'Further Reduced Validation Accuracy: {val_accuracy_reduced_v2}')
    # Predict living_state for unknown_docs with the further reduced model
    y_unknown_pred_reduced_v2 = model_reduced_v2.predict(X_unknown_reduced_v2)
    # Update unknown_docs with predicted living_state
    for doc, pred in zip(unknown_docs, y_unknown_pred_reduced_v2):
        doc['living_state'] = pred
    # Optionally, evaluate the further reduced model on the test set
    y_test_pred_reduced_v2 = model_reduced_v2.predict(X_test_reduced_v2)
    test_accuracy_reduced_v2 = accuracy_score(y_test, y_test_pred_reduced_v2)
    print(f'Further Reduced Test Accuracy: {test_accuracy_reduced_v2}')
    # Plot feature importances for the further reduced model
    plot_feature_importances(model_reduced_v2, X_train_reduced_v2)
    # Plot feature correlations for the further reduced feature set
    plot_feature_correlations(X_train_reduced_v2)
    # Cross-validation on the further reduced model
    cv_scores = cross_val_score(model_reduced_v2, X_train_reduced_v2, y_train, cv=5)
    print(f'Cross-Validation Scores: {cv_scores}')
    print(f'Cross-Validation Mean Score: {np.mean(cv_scores)}')
    # Precision, Recall, and F1 Score for the test set
    precision = precision_score(y_test, y_test_pred_reduced_v2)
    recall = recall_score(y_test, y_test_pred_reduced_v2)
    f1 = f1_score(y_test, y_test_pred_reduced_v2)
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
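    # For reference, these scores derive from the confusion matrix:
    # precision = TP / (TP + FP), recall = TP / (TP + FN),
    # F1 = 2 * precision * recall / (precision + recall).
    # A quick cross-check (assumes binary 0/1 labels, as the rest of this
    # script does):
    from sklearn.metrics import confusion_matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred_reduced_v2).ravel()
    print(f'Confusion matrix: TP={tp}, FP={fp}, FN={fn}, TN={tn}')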
    # ROC and AUC for the test set
    y_test_proba = model_reduced_v2.predict_proba(X_test_reduced_v2)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_test_proba)
    roc_auc = auc(fpr, tpr)
    print(f'Test AUC: {roc_auc}')
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
    # Precision-Recall Curve for the test set
    precision, recall, _ = precision_recall_curve(y_test, y_test_proba)
    plt.figure()
    plt.plot(recall, precision, color='b', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()
    # Bootstrapping for the Further Reduced Model
    n_iterations = 1000
    n_size = int(len(X_train_reduced_v2) * 0.5)
    bootstrap_scores = []
    # Ensure y_train is a Series so positional .iloc indexing works below.
    y_train = pd.Series(y_train)
    for i in range(n_iterations):
        X_sample, y_sample = resample(X_train_reduced_v2, y_train, n_samples=n_size)
        model = RandomForestClassifier(random_state=42)
        model.fit(X_sample, y_sample)
        y_pred_sample = model.predict(X_test_reduced_v2)
        score = accuracy_score(y_test, y_pred_sample)
        bootstrap_scores.append(score)
    bootstrap_mean = np.mean(bootstrap_scores)
    bootstrap_std = np.std(bootstrap_scores)
    print(f'Bootstrapped Accuracy Mean: {bootstrap_mean}, Std: {bootstrap_std}')
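    # An optional addition: the bootstrap distribution also yields a confidence
    # interval directly via percentiles (95% here; the level is a choice, not
    # something the original script computes).
    ci_lower, ci_upper = np.percentile(bootstrap_scores, [2.5, 97.5])
    print(f'Bootstrapped 95% CI for accuracy: [{ci_lower:.3f}, {ci_upper:.3f}]')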
    # Stratified k-Fold Cross-Validation for the Further Reduced Model
    skf = StratifiedKFold(n_splits=5)
    cv_scores = []
    for train_index, test_index in skf.split(X_train_reduced_v2, y_train):
        X_train_fold, X_val_fold = X_train_reduced_v2.iloc[train_index], X_train_reduced_v2.iloc[test_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train_fold, y_train_fold)
        y_pred_fold = model.predict(X_val_fold)
        score = accuracy_score(y_val_fold, y_pred_fold)
        cv_scores.append(score)
    cv_mean = np.mean(cv_scores)
    cv_std = np.std(cv_scores)
    print(f'k-Fold Cross-Validation Mean Score: {cv_mean}, Std: {cv_std}')
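    # Note: cross_val_score already uses stratified folds for classifiers by
    # default, so the manual loop above can be cross-checked in one line by
    # passing the splitter explicitly:
    cv_scores_check = cross_val_score(model_reduced_v2, X_train_reduced_v2, y_train, cv=skf)
    print(f'k-Fold cross-check scores: {cv_scores_check}')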
    # Plot relationships between features and living_state
    sns.boxplot(x='living_state', y='antibiotics_quantity',
                data=pd.concat([X_train_reduced_v2, pd.Series(y_train, name='living_state')], axis=1))
    plt.title('Antibiotics Quantity vs. Living State')
    plt.show()
    sns.boxplot(x='living_state', y='min_trap',
                data=pd.concat([X_train_reduced_v2, pd.Series(y_train, name='living_state')], axis=1))
    plt.title('Min Trap vs. Living State')
    plt.show()
    plot_classifications_by_antibiotic_level_with_counts(unknown_docs + train_docs + val_docs + test_docs)
    plot_classifications_by_antibiotic_level(unknown_docs + train_docs + val_docs + test_docs, model_reduced_v2,
                                             X_train_reduced_v2.columns)
    # Perform active learning query
    query_indices = active_learning_query(model_reduced_v2, X_unknown_reduced_v2)
    queried_samples = [unknown_docs[i] for i in query_indices]
    print("Queried samples for manual labeling:", query_indices)
    # Simulate manual labeling (in reality, this would be done by domain experts)
    def get_manual_labels(samples):
        manual_labels = []
        for sample in samples:
            # Here we simulate the manual labeling process.
            # In a real scenario, you would replace this with actual manual labeling.
            if sample['antibiotics_quantity'] > 16:
                manual_labels.append(1)  # Assume high antibiotic levels lead to death
            else:
                manual_labels.append(0)  # Assume low antibiotic levels mean the sample stays alive
        return manual_labels

    # Get manual labels for the queried samples
    new_labels = get_manual_labels(queried_samples)
    # Update the queried samples with the new labels
    for sample, label in zip(queried_samples, new_labels):
        sample['living_state'] = label
    # Add the newly labeled samples to the training set
    train_docs += queried_samples
    # Prepare the combined features and target
    X_combined, y_combined = prepare_features(train_docs + val_docs + test_docs)
    # Caution: the combined set includes the test docs, so the test accuracy
    # reported below is optimistic; the model has already seen those samples.
    # Ensure X_unknown_reduced_v2 uses the same feature names as X_combined
    X_unknown_reduced_v2 = X_unknown_reduced_v2.reindex(columns=X_combined.columns, fill_value=0)
    X_test_reduced_v2 = X_test_reduced_v2.reindex(columns=X_combined.columns, fill_value=0)
    # Retrain the model on the combined dataset
    model_combined = RandomForestClassifier(random_state=42)
    model_combined.fit(X_combined, y_combined)
    # Validate the new model
    y_val_pred_combined = model_combined.predict(X_val)
    val_accuracy_combined = accuracy_score(y_val, y_val_pred_combined)
    plot_predictions_vs_actual(y_val, y_val_pred_combined, 'Validation Set Predictions vs Actual')
    print(f'Combined Validation Accuracy after Active Learning: {val_accuracy_combined}')
    # Predict living_state for unknown_docs with the new combined model
    y_unknown_pred_combined = model_combined.predict(X_unknown_reduced_v2)
    # Update unknown_docs with predicted living_state
    for doc, pred in zip(unknown_docs, y_unknown_pred_combined):
        doc['living_state'] = pred
    # Check for overfitting by comparing training and validation accuracy
    y_train_pred = model_combined.predict(X_combined)
    train_accuracy_combined = accuracy_score(y_combined, y_train_pred)
    print(f'Combined Training Accuracy after Active Learning: {train_accuracy_combined}')
    # Optionally, evaluate the new combined model on the test set
    y_test_pred_combined = model_combined.predict(X_test_reduced_v2)
    test_accuracy_combined = accuracy_score(y_test, y_test_pred_combined)
    print(f'Combined Test Accuracy after Active Learning: {test_accuracy_combined}')
    plot_predictions_vs_actual(y_test, y_test_pred_combined, 'Test Set Predictions vs Actual')
    # Plot feature importances for the combined model
    plot_feature_importances(model_combined, X_combined)
    # Plot feature correlations for the combined feature set
    plot_feature_correlations(X_combined)
    # Cross-validation on the combined model
    cv_scores_combined = cross_val_score(model_combined, X_combined, y_combined, cv=5)
    print(f'Cross-Validation Scores after Active Learning: {cv_scores_combined}')
    print(f'Cross-Validation Mean Score after Active Learning: {np.mean(cv_scores_combined)}')
    # Plot classifications by antibiotic level with counts
    plot_classifications_by_antibiotic_level_with_counts(unknown_docs + train_docs + val_docs + test_docs)
    # Plot classifications by antibiotic level for the combined model
    plot_classifications_by_antibiotic_level(unknown_docs + train_docs + val_docs + test_docs, model_combined,
                                             X_combined.columns)
