Page MenuHomec4science

data_processing.py
No OneTemporary

File Metadata

Created
Sun, Dec 22, 16:07

data_processing.py

import numpy as np
from sklearn.model_selection import train_test_split
def normalize_data_model1(docs):
    """Build normalized records for model 1 from raw documents.

    For every document, the values under ``doc['data']['transmission']`` are
    converted to a NumPy array and divided by ``doc['normalization_factor']``.

    Args:
        docs: Iterable of dict-like documents providing the keys ``'data'``
            (itself containing ``'transmission'``), ``'normalization_factor'``,
            ``'gram_type'``, ``'bacteria'`` and ``'name'``.

    Returns:
        A new list with one dict per input document, holding the keys
        ``'transmission_normalized'``, ``'label'``, ``'bacteria'`` and
        ``'name'``. The input documents are not modified.
    """

    def _to_record(doc):
        # Scale the raw transmission values by the per-document factor.
        scaled = np.array(doc['data']['transmission']) / doc['normalization_factor']
        return {
            'transmission_normalized': scaled,
            'label': doc['gram_type'],    # gram classification label
            'bacteria': doc['bacteria'],  # strain (per the original comment)
            'name': doc['name'],          # name, kept for filtering
        }

    return [_to_record(doc) for doc in docs]
def normalize_data_model2(docs):
    """Build normalized records for model 2 from raw documents.

    Each document's ``doc['data']['transmission']`` values are turned into a
    NumPy array and divided by ``doc['normalization_factor']``.

    Args:
        docs: Iterable of dict-like documents providing the keys ``'data'``
            (itself containing ``'transmission'``), ``'normalization_factor'``,
            ``'gram_type'``, ``'name'`` and ``'bacteria'``.

    Returns:
        A new list with one dict per input document, holding the keys
        ``'transmission_normalized'``, ``'label'``, ``'name'`` and
        ``'bacteria'`` (in that order). The input documents are not modified.
    """
    records = []
    for entry in docs:
        raw_values = np.array(entry['data']['transmission'])
        scaled_values = raw_values / entry['normalization_factor']
        records.append(
            dict(
                transmission_normalized=scaled_values,
                label=entry['gram_type'],
                name=entry['name'],
                bacteria=entry['bacteria'],
            )
        )
    return records
def split_data(docs):
    """Split documents into train and test sets, one bacteria family at a time.

    Documents are grouped by their ``'bacteria'`` value and every group is
    split on its own with ``train_test_split(test_size=0.2, random_state=42)``,
    so each family contributes to both sets whenever it has enough documents.

    Args:
        docs: Iterable of dict-like documents, each with a hashable
            ``'bacteria'`` value.

    Returns:
        A ``(train_docs, test_docs)`` tuple of lists. Families appear in the
        order in which they are first seen in ``docs``.
    """
    # Group in a single pass. A dict keeps insertion order, so the output
    # ordering is reproducible from run to run (iterating a set of strings is
    # not, because string hashing is randomized per interpreter process).
    docs_by_family = {}
    for doc in docs:
        docs_by_family.setdefault(doc['bacteria'], []).append(doc)

    train_docs = []
    test_docs = []
    for family_docs in docs_by_family.values():
        if len(family_docs) < 2:
            # A single document cannot be split: train_test_split raises
            # ValueError because the resulting train set would be empty.
            # Keep the lone document for training instead of crashing.
            train_docs.extend(family_docs)
            continue
        train_family_docs, test_family_docs = train_test_split(
            family_docs, test_size=0.2, random_state=42
        )
        train_docs.extend(train_family_docs)
        test_docs.extend(test_family_docs)
    return train_docs, test_docs
def augment_data(docs, noise_level=0.05, shift_max=10):
    """Extend ``docs`` with noisy and shifted copies of every document.

    For each input document, two rounds of augmentation are generated. Every
    round adds one copy with Gaussian noise and one copy circularly shifted
    with ``np.roll``, so each document yields four augmented documents.

    Args:
        docs: List of dicts with the keys ``'transmission_normalized'``,
            ``'label'``, ``'bacteria'`` and ``'name'``. NOTE: this list is
            extended in place.
        noise_level: Standard deviation of the zero-mean Gaussian noise.
        shift_max: Largest absolute shift; the shift is drawn uniformly from
            ``[-shift_max, shift_max]``, both ends included.

    Returns:
        The same ``docs`` list object, with the augmented documents appended
        after the originals. Noisy copies have ``'_noise'`` appended to their
        name and shifted copies ``'_shift'``.
    """
    augmented_docs = []
    for doc in docs:
        transmission = np.array(doc['transmission_normalized'])
        for _ in range(2):  # two augmentation rounds per document
            # Noisy copy: independent Gaussian noise on every sample.
            noisy_transmission = transmission + np.random.normal(
                0, noise_level, transmission.shape
            )
            augmented_docs.append({
                'transmission_normalized': noisy_transmission,
                'label': doc['label'],
                'bacteria': doc['bacteria'],
                'name': doc['name'] + '_noise',
            })

            # Shifted copy. randint's upper bound is exclusive, so add 1 to
            # make the range symmetric and to accept shift_max == 0.
            shift = np.random.randint(-shift_max, shift_max + 1)
            shifted_transmission = np.roll(transmission, shift)
            augmented_docs.append({
                'transmission_normalized': shifted_transmission,
                'label': doc['label'],
                'bacteria': doc['bacteria'],
                'name': doc['name'] + '_shift',
            })

    # Extend in place (and return the same list) so callers that rely on
    # either the mutation or the return value keep working.
    docs.extend(augmented_docs)
    return docs

Event Timeline