diff --git a/ME-390-Exercises/ME-390-05-knn/data/labeled_penguins.csv b/ME-390-Exercises/ME-390-05-knn/data/labeled_penguins.csv new file mode 100644 index 0000000..a04de0d --- /dev/null +++ b/ME-390-Exercises/ME-390-05-knn/data/labeled_penguins.csv @@ -0,0 +1,205 @@ +species,bill_length_mm,body_mass_g +Chinstrap,54.2,4300.0 +Gentoo,50.0,5700.0 +Adelie,39.6,3900.0 +Gentoo,45.7,4400.0 +Gentoo,46.5,4550.0 +Adelie,36.9,3500.0 +Adelie,43.2,4775.0 +Adelie,39.5,3800.0 +Chinstrap,47.5,3900.0 +Adelie,41.5,4000.0 +Gentoo,49.8,5950.0 +Adelie,39.8,4650.0 +Adelie,38.8,3275.0 +Chinstrap,45.7,3600.0 +Adelie,41.4,3700.0 +Chinstrap,46.6,3800.0 +Gentoo,46.1,4500.0 +Gentoo,50.8,5600.0 +Adelie,34.6,3200.0 +Chinstrap,46.8,3650.0 +Adelie,38.5,3325.0 +Chinstrap,51.7,3775.0 +Chinstrap,51.3,3750.0 +Gentoo,47.8,5650.0 +Chinstrap,51.3,3650.0 +Adelie,40.6,3550.0 +Adelie,40.8,4300.0 +Adelie,39.6,3550.0 +Chinstrap,50.7,4050.0 +Chinstrap,50.3,3300.0 +Chinstrap,46.0,4150.0 +Gentoo,48.7,4450.0 +Adelie,36.0,3100.0 +Adelie,40.5,3200.0 +Gentoo,44.9,5100.0 +Gentoo,43.2,4450.0 +Chinstrap,42.4,3600.0 +Chinstrap,50.8,4100.0 +Chinstrap,45.2,3250.0 +Gentoo,42.9,5000.0 +Gentoo,49.9,5400.0 +Gentoo,45.1,5050.0 +Gentoo,48.6,5800.0 +Adelie,40.1,4300.0 +Gentoo,41.7,4700.0 +Gentoo,43.3,4575.0 +Adelie,39.5,3300.0 +Gentoo,49.0,5550.0 +Adelie,36.6,3475.0 +Gentoo,45.1,5000.0 +Adelie,35.3,3800.0 +Adelie,36.5,2850.0 +Chinstrap,51.0,4100.0 +Adelie,36.0,3700.0 +Chinstrap,55.8,4000.0 +Chinstrap,46.4,3700.0 +Chinstrap,58.0,3700.0 +Chinstrap,52.0,4050.0 +Chinstrap,50.9,3675.0 +Adelie,38.2,3950.0 +Adelie,39.7,4250.0 +Adelie,42.5,4500.0 +Gentoo,45.5,4650.0 +Gentoo,42.7,3950.0 +Gentoo,48.7,5350.0 +Gentoo,49.3,5850.0 +Chinstrap,43.5,3400.0 +Chinstrap,49.3,4050.0 +Adelie,38.9,3625.0 +Adelie,37.8,4250.0 +Adelie,36.7,3800.0 +Gentoo,49.6,4750.0 +Chinstrap,46.9,2700.0 +Adelie,39.7,3200.0 +Gentoo,48.7,5350.0 +Adelie,37.3,3775.0 +Chinstrap,45.4,3525.0 +Adelie,40.6,3475.0 +Gentoo,45.2,4750.0 +Gentoo,49.8,5700.0 +Gentoo,48.8,6000.0 +Gentoo,45.4,4800.0 +Gentoo,50.4,5550.0 +Adelie,38.1,3175.0 +Gentoo,50.0,5700.0 +Gentoo,48.4,4625.0 +Gentoo,43.6,4900.0 +Gentoo,50.1,5000.0 +Chinstrap,50.2,3800.0 +Chinstrap,42.5,3350.0 +Chinstrap,45.2,3950.0 +Gentoo,44.4,5250.0 +Adelie,41.3,4400.0 +Chinstrap,46.4,3450.0 +Gentoo,54.3,5650.0 +Gentoo,46.1,5100.0 +Adelie,41.1,4300.0 +Chinstrap,47.6,3850.0 +Adelie,40.2,3450.0 +Gentoo,48.5,4850.0 +Gentoo,50.4,5750.0 +Gentoo,47.5,4950.0 +Chinstrap,49.0,3950.0 +Chinstrap,51.3,3700.0 +Gentoo,50.8,5200.0 +Chinstrap,50.6,3800.0 +Gentoo,45.5,5000.0 +Chinstrap,43.2,2900.0 +Adelie,41.8,4450.0 +Chinstrap,50.8,4450.0 +Gentoo,46.2,5300.0 +Adelie,38.2,3900.0 +Chinstrap,45.9,3575.0 +Chinstrap,52.7,3725.0 +Gentoo,46.3,5050.0 +Adelie,32.1,3050.0 +Chinstrap,49.7,3600.0 +Adelie,37.6,3600.0 +Gentoo,43.4,4600.0 +Chinstrap,45.5,3500.0 +Chinstrap,45.6,3525.0 +Adelie,41.1,3900.0 +Adelie,36.4,3325.0 +Chinstrap,49.6,3775.0 +Chinstrap,51.5,3250.0 +Gentoo,40.9,4650.0 +Adelie,36.8,3500.0 +Adelie,40.7,3725.0 +Chinstrap,46.7,3300.0 +Chinstrap,49.5,3800.0 +Gentoo,45.8,4200.0 +Chinstrap,46.5,3500.0 +Gentoo,51.5,5500.0 +Adelie,37.3,3350.0 +Adelie,33.1,2900.0 +Chinstrap,45.7,3650.0 +Gentoo,43.3,4400.0 +Chinstrap,46.1,3250.0 +Chinstrap,53.5,4500.0 +Adelie,41.1,3200.0 +Chinstrap,50.0,3900.0 +Adelie,39.0,3050.0 +Gentoo,49.1,5500.0 +Chinstrap,50.9,3550.0 +Gentoo,45.0,5050.0 +Chinstrap,50.5,3400.0 +Chinstrap,52.2,3450.0 +Gentoo,46.9,4875.0 +Adelie,39.6,3500.0 +Adelie,37.2,3900.0 +Adelie,38.6,3800.0 +Gentoo,46.2,4800.0 +Chinstrap,46.2,3650.0 +Chinstrap,48.5,3400.0 
+Adelie,42.1,4000.0 +Adelie,35.5,3700.0 +Chinstrap,40.9,3200.0 +Adelie,37.6,3300.0 +Adelie,39.2,4675.0 +Adelie,41.6,3950.0 +Gentoo,46.2,4375.0 +Adelie,38.1,3700.0 +Gentoo,46.8,5150.0 +Chinstrap,49.0,4300.0 +Gentoo,49.1,5150.0 +Adelie,35.5,3350.0 +Chinstrap,52.0,4800.0 +Chinstrap,42.5,3350.0 +Gentoo,44.9,4750.0 +Adelie,37.0,3400.0 +Adelie,39.2,4250.0 +Chinstrap,51.4,3950.0 +Adelie,35.7,3150.0 +Adelie,35.0,3450.0 +Adelie,39.7,3900.0 +Adelie,36.0,3450.0 +Adelie,36.5,3150.0 +Adelie,37.8,3400.0 +Gentoo,50.5,5000.0 +Gentoo,51.3,5300.0 +Chinstrap,50.2,3775.0 +Chinstrap,50.5,4050.0 +Chinstrap,48.1,3325.0 +Adelie,40.2,3400.0 +Gentoo,45.3,4300.0 +Gentoo,47.7,4750.0 +Gentoo,48.4,5400.0 +Gentoo,51.1,5250.0 +Gentoo,52.2,5400.0 +Chinstrap,51.9,3950.0 +Adelie,38.6,3750.0 +Gentoo,45.5,4200.0 +Adelie,33.5,3600.0 +Chinstrap,50.1,3400.0 +Chinstrap,52.0,4150.0 +Chinstrap,49.2,4400.0 +Gentoo,46.7,5200.0 +Gentoo,46.5,4400.0 +Chinstrap,47.0,3700.0 +Chinstrap,49.8,3675.0 +Gentoo,43.5,4700.0 +Chinstrap,52.8,4550.0 +Gentoo,47.6,5400.0 +Adelie,35.9,3800.0 diff --git a/ME-390-Exercises/ME-390-05-knn/data/unlabeled_penguins.csv b/ME-390-Exercises/ME-390-05-knn/data/unlabeled_penguins.csv new file mode 100644 index 0000000..711015e --- /dev/null +++ b/ME-390-Exercises/ME-390-05-knn/data/unlabeled_penguins.csv @@ -0,0 +1,51 @@ +bill_length_mm,body_mass_g +45.9,3500.0 +47.8,3900.0 +53.8,4200.0 +48.2,3500.0 +45.5,4000.0 +49.4,3200.0 +51.1,4400.0 +48.2,3700.0 +49.9,3800.0 +48.6,4200.0 +47.5,3600.0 +48.9,3300.0 +48.9,3700.0 +52.1,4100.0 +43.4,3200.0 +51.8,3500.0 +50.4,4000.0 +43.2,5100.0 +50.3,4600.0 +48.9,5200.0 +49.4,5000.0 +46.7,4300.0 +53.0,5400.0 +46.2,4800.0 +46.2,5300.0 +47.1,5400.0 +41.6,4300.0 +52.6,6100.0 +47.7,5100.0 +48.2,5100.0 +47.3,5300.0 +48.8,5800.0 +40.4,4300.0 +48.6,4900.0 +37.9,3900.0 +40.0,4100.0 +39.8,3700.0 +37.1,3400.0 +37.8,3700.0 +37.1,4000.0 +38.8,3800.0 +40.4,3400.0 +37.4,3500.0 +39.1,3900.0 +36.2,3500.0 +41.2,3300.0 +39.3,3800.0 +35.6,3000.0 +37.4,3100.0 +36.7,3400.0 diff --git a/ME-390-Exercises/ME-390-05-knn/helpers.py b/ME-390-Exercises/ME-390-05-knn/helpers.py new file mode 100644 index 0000000..a6e69f8 --- /dev/null +++ b/ME-390-Exercises/ME-390-05-knn/helpers.py @@ -0,0 +1,298 @@ +from typing import Optional, Tuple, List, Dict + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + + +def preprocess_data( + df: pd.DataFrame, + label: str, + train_size: float = 0.6, + val_size: float = 0.2, + seed: Optional[int] = None, +) -> Tuple[ + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + List[str], + Dict[int, str], +]: + """Transforms data into numpy arrays and splits it into a train, val and test set + + Args: + df: Data to split + label: name of the training label + train_size: proportion of the data used for training + val_size: proportion of the data used for validation + seed: random seed + + Returns: + object: Tuple containing the training features, training label, + validation features, validation label, test features, test label, + names of the features and map from label to label_name + """ + + df = df.sort_values(by=label) + df[label] = df[label].astype("category") + + df = df.sample(frac=1, random_state=seed) + train, val, test = ( + df[: int(len(df) * train_size)], + df[int(len(df) * train_size) : int(len(df) * (train_size + val_size))], + df[int(len(df) * (train_size + val_size)) :], + ) + + X_train = train.drop(columns=label).to_numpy() + X_val = val.drop(columns=label).to_numpy() + X_test = 
test.drop(columns=label).to_numpy() + + y_train = train[label].cat.codes.to_numpy() + y_val = val[label].cat.codes.to_numpy() + y_test = test[label].cat.codes.to_numpy() + + label_map = dict(enumerate(df[label].cat.categories)) + feature_names = list(df.drop(columns=label).columns) + + return X_train, y_train, X_val, y_val, X_test, y_test, feature_names, label_map + + +def plot_labeled( + X: np.ndarray, + y: np.ndarray, + label_map: Dict[int, str], + feature_names: List[str], + figsize: Tuple[float, float] = (6, 6), + title: Optional[str] = None, +): + """Plots labeled data + + Supports only 2D features and up to 10 different classes + + Args: + X: features + y: labels + label_map: Dictionary mapping class number to class name + feature_names: List of feature names + figsize: Figure size + title: The plot title + """ + # Support up to 10 different classes + markers = ["o", "v", "s", "8", "p", "x", "D", "<", "^", ">"] + colors = ( + # Default color palette if 3 classes, otherwise use another color palette + ([0, 0.5, 0], [0.25, 0.25, 1], [0.85, 0.85, 0]) + if len(label_map) == 3 + else sns.color_palette(n_colors=len(label_map)) + ) + + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize) + for i, class_name in label_map.items(): + ax.scatter( + x=X[y == i, 0], + y=X[y == i, 1], + color=colors[i], + marker=markers[i], + label=class_name, + alpha=0.3, + s=70, + ) + + plt.axis("equal") + plt.xlabel(feature_names[0], fontsize=12) + plt.ylabel(feature_names[1], fontsize=12) + plt.legend(prop={"size": 12}) + if title is not None: + plt.title(title) + + plt.show() + + +def plot_unlabeled( + X: np.ndarray, + feature_names: List[str], + figsize: Tuple[float, float] = (6, 6), + title: Optional[str] = None, +): + """Plots unlabeled data + + Supports only 2D features + + Args: + X: features + feature_names: List of feature names + figsize: Figure size + title: The plot title + """ + + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize) + ax.scatter( + x=X[:, 0], + y=X[:, 1], + color="grey", + marker="o", + label="Unknown", + alpha=0.3, + s=70, + ) + + plt.axis("equal") + plt.xlabel(feature_names[0], fontsize=12) + plt.ylabel(feature_names[1], fontsize=12) + plt.legend(prop={"size": 12}) + + if title is not None: + plt.title(title) + + plt.show() + + +def plot_nearest_neighbors( + sample: np.ndarray, + X: np.ndarray, + y: np.ndarray, + neighbor_indices: np.ndarray, + label_map: Dict[int, str], + feature_names: List[str], + figsize: Tuple[float, float] = (6, 6), + title: Optional[str] = None, +): + """ + + Args: + sample: Sample of shape (2, ) + X: Dataset of shape (N, 2) + y: Labels of shape (N, ) + neighbor_indices: Indices of k-nearest neighbors + label_map: Dictionary mapping class number to class name + feature_names: List of feature names + figsize: Figure size + title: The plot title + """ + # Support up to 10 different classes + markers = ["o", "v", "s", "8", "p", "x", "D", "<", "^", ">"] + colors = ( + # Default color palette if 3 classes, otherwise use another color palette + ([0, 0.5, 0], [0.25, 0.25, 1], [0.85, 0.85, 0]) + if len(label_map) == 3 + else sns.color_palette(n_colors=len(label_map)) + ) + + # Plot all points + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize) + for i, class_name in label_map.items(): + ax.scatter( + x=X[y == i, 0], + y=X[y == i, 1], + color=colors[i], + marker=markers[i], + label=class_name, + alpha=0.3, + s=70, + ) + + # Plot nearest neighbors + for i, class_name in label_map.items(): + class_indices = neighbor_indices[y[neighbor_indices] == i] 
+ if len(class_indices) > 0: + ax.scatter( + x=X[class_indices][:, 0], + y=X[class_indices][:, 1], + color=colors[i], + marker=markers[i], + alpha=0.8, + s=80, + ) + + # Plot sample + ax.scatter( + x=sample[0], + y=sample[1], + marker="*", + color="#CCCC00", + alpha=1, + s=200, + label="Unknown", + edgecolors="#000000", + linewidths=1, + ) + + plt.axis("equal") + plt.xlabel(feature_names[0], fontsize=12) + plt.ylabel(feature_names[1], fontsize=12) + plt.legend(prop={"size": 12}) + if title is not None: + plt.title(title) + + plt.show() + + +def plot_predictions( + samples: np.ndarray, + predicted_labels: np.ndarray, + X: Optional[np.ndarray], + y: Optional[np.ndarray], + label_map: Dict[int, str], + feature_names: List[str], + figsize: Tuple[float, float] = (6, 6), + title: Optional[str] = None, +): + """ + + Args: + samples: Sample of shape (M, 2) + predicted_labels: Labels of shape (M, ) + X: Training dataset of shape (N, 2) - Set to None to avoid plotting them + y: Training labels of shape (N, ) - Set to None to avoid plotting them + label_map: Dictionary mapping class number to class name + feature_names: List of feature names + figsize: Figure size + title: The plot title + """ + # Support up to 10 different classes + markers = ["o", "v", "s", "8", "p", "x", "D", "<", "^", ">"] + colors = ( + # Default color palette if 3 classes, otherwise use another color palette + ([0, 0.5, 0], [0.25, 0.25, 1], [0.85, 0.85, 0]) + if len(label_map) == 3 + else sns.color_palette(n_colors=len(label_map)) + ) + + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize) + + # Plot training labels + if X is not None and y is not None: + for i, class_name in label_map.items(): + ax.scatter( + x=X[y == i, 0], + y=X[y == i, 1], + color=colors[i], + marker=markers[i], + alpha=0.2, + s=50, + ) + + # Plot predictions + for i, class_name in label_map.items(): + ax.scatter( + x=samples[predicted_labels == i, 0], + y=samples[predicted_labels == i, 1], + color=colors[i], + marker="*", + label=class_name, + alpha=0.9, + s=140, + ) + + plt.axis("equal") + plt.xlabel(feature_names[0], fontsize=12) + plt.ylabel(feature_names[1], fontsize=12) + plt.legend(prop={"size": 12}) + if title is not None: + plt.title(title) + + plt.show() diff --git a/ME-390-Exercises/ME-390-05-knn/images/manhattan_euclidean.png b/ME-390-Exercises/ME-390-05-knn/images/manhattan_euclidean.png new file mode 100644 index 0000000..ff11bda Binary files /dev/null and b/ME-390-Exercises/ME-390-05-knn/images/manhattan_euclidean.png differ diff --git a/ME-390-Exercises/ME-390-05-knn/images/penguins.jpg b/ME-390-Exercises/ME-390-05-knn/images/penguins.jpg new file mode 100644 index 0000000..8a1ff40 Binary files /dev/null and b/ME-390-Exercises/ME-390-05-knn/images/penguins.jpg differ diff --git a/ME-390-Exercises/ME-390-05-knn/kNN.ipynb b/ME-390-Exercises/ME-390-05-knn/kNN.ipynb new file mode 100644 index 0000000..3368a7b --- /dev/null +++ b/ME-390-Exercises/ME-390-05-knn/kNN.ipynb @@ -0,0 +1,969 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "framed-supply", + "metadata": {}, + "source": [ + "# k-Nearest Neighbors" + ] + }, + { + "cell_type": "markdown", + "id": "other-width", + "metadata": {}, + "source": [ + "
\n", + "\n", + "This notebook was developed for the CS-233 Introduction to Machine Learning course at EPFL, adapted for the CIVIL-226 Introduction to Machine Learning for Engineers course, and re-adapted for the ME-390.\n", + "We thank contributers in CS-233 ([CVLab](https://www.epfl.ch/labs/cvlab)) and CIVIL-226 ([VITA](https://www.epfl.ch/labs/vita/)).\n", + " \n", + "**Author(s):** Jan Bednarík, minor changes by Tom Winandy\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "nearby-prompt", + "metadata": {}, + "source": [ + "In this exercise, you will implement a k-Nearest Neighbors (k-NN) classifier and see how to use a validation set to find optimal hyper-parameters.\n", + "\n", + "The `### START CODE HERE ###` and `### END CODE HERE ###` comments in code cells indicate where you should write code.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cordless-blood", + "metadata": {}, + "outputs": [], + "source": [ + "# Function to align all tables to the left (useful for later on)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hazardous-velvet", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "structural-crazy", + "metadata": {}, + "source": [ + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "refined-sixth", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from typing import Any, Callable\n", + "\n", + "# Helper file with functions for pre-processing and visualization\n", + "import helpers" + ] + }, + { + "cell_type": "markdown", + "id": "cubic-dominant", + "metadata": {}, + "source": [ + "## 0. Intro\n", + "\n", + "*Background: You and your friends went on an expedition to the Antarctic, where you befriended a group of penguins. After taking a few measurements, you went back home. You're now wondering if it is possible to classify the penguins you befriended by building a classifier.*\n", + "\n", + "To build such a classifier, you will need some labeled data. Luckily for you, the Palmer Penguins dataset contains exactly the data you need.\n", + "\n", + "You decide to implement a kNN classifier. As a reminder, it works in the following way: For an unknown data point, find the k nearest neighbors from a labeled dataset, and return the majority vote (most frequent class) of these neighbors.\n", + "\n", + "![penguins](images/penguins.jpg)" + ] + }, + { + "cell_type": "markdown", + "id": "alien-charles", + "metadata": {}, + "source": [ + "## 1. Data loading & pre-processing" + ] + }, + { + "cell_type": "markdown", + "id": "informational-venice", + "metadata": {}, + "source": [ + "In this exercise, you will use the [Palmer Penguins dataset](https://github.com/allisonhorst/palmerpenguins) and your goal is to classify 3 species of Penguins: Adélie, Chinstrap and Gentoo.\n", + "\n", + "Let's load the dataset using the [pandas](https://pandas.pydata.org/) library. Don't worry, there's no need to know how to use this library for this exercise." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "talented-blend", + "metadata": {}, + "outputs": [], + "source": [ + "# In Pandas, a data table is called a DataFrame (abbreviated to df)\n", + "labeled_df = pd.read_csv('data/labeled_penguins.csv')\n", + "\n", + "print(f\"There are {labeled_df.shape[0]} rows and {labeled_df.shape[1]} columns.\")\n", + "# Show the first 5 rows of the data\n", + "labeled_df.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "terminal-complex", + "metadata": {}, + "source": [ + "Now, we'll use the function `preprocess_data()` from `helpers.py` to shuffle the data, convert the labels to integers, split it into a training, validation and test set, and then return the result as NumPy arrays.\n", + "Feel free to read the code in `helpers.py` to better understand what this function does (completely optional)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rotary-resolution", + "metadata": {}, + "outputs": [], + "source": [ + "# We'll use 60% of our data as training data, 20% for our validation data and the remaining 20% as test data\n", + "# Here, we use a random seed to ensure that the data shuffling and splitting can be reproduced\n", + "X_train, y_train, X_val, y_val, X_test, y_test, feature_names, label_map = helpers.preprocess_data(df=labeled_df, label=\"species\", train_size=0.6, val_size=0.2, seed=42)\n" + ] + }, + { + "cell_type": "markdown", + "id": "clean-monday", + "metadata": {}, + "source": [ + "Run the following cells to better understand how the preprocessed data looks like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "conventional-clause", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Features: {feature_names}\")\n", + "print(f\"Map from label to label name: {label_map}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "disciplinary-article", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualisation of X_train and y_train (separation of the features and the labels)\n", + "print('Training set features (first 10 samples):')\n", + "print(f'X_train: \\n {X_train[:10]}')\n", + "\n", + "print('\\nTraining set labels (first 10 samples):')\n", + "print(f'y_train: \\n {y_train[:10]}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "spare-ranking", + "metadata": {}, + "outputs": [], + "source": [ + "# Show shapes\n", + "print('Training set shape:')\n", + "print(f'X: {X_train.shape}, y: {y_train.shape}')\n", + "\n", + "print('\\nValidation set shape:')\n", + "print(f'X: {X_val.shape}, y: {y_val.shape}')\n", + "\n", + "print('\\nTest set shape:')\n", + "print(f'X: {X_test.shape}, y: {y_test.shape}')" + ] + }, + { + "cell_type": "markdown", + "id": "endless-pattern", + "metadata": {}, + "source": [ + "Let's also load our unlabeled measurements of penguins. This is the data we want to classify." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "printable-belize", + "metadata": {}, + "outputs": [], + "source": [ + "unlabeled_df = pd.read_csv('data/unlabeled_penguins.csv')\n", + "\n", + "print(f\"There are {unlabeled_df.shape[0]} rows and {unlabeled_df.shape[1]} columns.\")\n", + "# Show the first 5 rows of the data\n", + "unlabeled_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "measured-spoke", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert this unlabeled data to a matrix\n", + "X_unlabeled = unlabeled_df.to_numpy()\n", + "\n", + "print('Unlabeled dataset shape:')\n", + "print(f'X: {X_unlabeled.shape}')" + ] + }, + { + "cell_type": "markdown", + "id": "stopped-enzyme", + "metadata": {}, + "source": [ + "To summarize, we have an unlabeled dataset of penguins (`X_unlabeled`) that we want to classify. \n", + "\n", + "We made the assumption that the labeled Palmer Penguins dataset comes from a similar distribution, so we decided to use it for our classifier. We then split this dataset into a training set (`X_train`, `y_train`), a validation set (`X_val`, `y_val`) and a test set (`X_test`, `y_test`)." + ] + }, + { + "cell_type": "markdown", + "id": "gothic-pastor", + "metadata": {}, + "source": [ + "## 2. Data visualization & scaling" + ] + }, + { + "cell_type": "markdown", + "id": "native-maldives", + "metadata": {}, + "source": [ + "Let's take a look at our data using the `plot_labeled()` and `plot_unlabeled()` functions of `helpers.py`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "stainless-synthesis", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot training data with labels\n", + "helpers.plot_labeled(X_train, y_train, label_map, feature_names, title=\"Training data\")\n", + "# Plot validation data with labels\n", + "helpers.plot_labeled(X_val, y_val, label_map, feature_names, title=\"Validation data\")\n", + "# Plot test data with labels\n", + "helpers.plot_labeled(X_test, y_test, label_map, feature_names, title=\"Test data\")\n", + "# Plot unlabeled data\n", + "helpers.plot_unlabeled(X_unlabeled, feature_names, title=\"Unlabeled data\")" + ] + }, + { + "cell_type": "markdown", + "id": "recognized-peace", + "metadata": {}, + "source": [ + "**Question:** What do you notice? How will this affect the kNN classifier?" + ] + }, + { + "cell_type": "markdown", + "id": "iraqi-pasta", + "metadata": {}, + "source": [ + "**Answer:** YOUR ANSWER HERE" + ] + }, + { + "cell_type": "markdown", + "id": "disciplinary-johns", + "metadata": {}, + "source": [ + "As features can represent different physical units and come in vastly different scales, it is common practice to normalize the data before using kNN.\n", + "\n", + "Here, we'll use Z-score standardization. That is, for each feature, compute: $x_{norm} = \\frac {x - \\mu_x} {\\sigma_x}$\n", + "\n", + "Let's implement the function `normalize()`:\n", + "\n", + "**Note:** Remember that you should not use any of the knowledge you get from the test data when implementing a model. This includes the normalization step, where you should use the mean and standard deviation of the **training set** to normalize all the other datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "extensive-community", + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the mean and standard deviation for each feature of the training set\n", + "\n", + "### START CODE HERE ### (≈ 2 lines of code)\n", + "mean = ...\n", + "std = ...\n", + "### END CODE HERE ###\n", + "\n", + "\n", + "# Implement the normalize function\n", + "def normalize(X: np.ndarray, mean: np.ndarray, std: np.ndarray):\n", + " \"\"\" Normalization of array using Z-score standardization\n", + " Args:\n", + " X: Dataset of shape (N, D)\n", + " mean: Mean of shape (D, )\n", + " std: Standard deviation of shape(D, )\n", + " \"\"\"\n", + " ### START CODE HERE ### (≈ 1 line of code)\n", + " X_normalized = ...\n", + " ### END CODE HERE ###\n", + " return X_normalized\n", + "\n", + "# Normalize features of the training, val and test set using the mean and std of the training set features\n", + "X_train = normalize(X_train, mean, std)\n", + "X_val = normalize(X_val, mean, std)\n", + "X_test = normalize(X_test, mean, std)\n", + "X_unlabeled = normalize(X_unlabeled, mean, std)\n", + "\n", + "# Let's rename the features to indicate that they've been normalized\n", + "feature_names = ['Bill length (normalized)', 'Body mass (normalized)']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "center-daughter", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify that normalization is implemented correctly \n", + "# We check that the mean and std of each feature of the training set is very close to 0 and 1 respectively\n", + "if (np.allclose(X_train.mean(axis=0), np.zeros_like(X_train)) and (np.allclose(X_train.std(axis=0), np.ones_like(X_train)))):\n", + " print(\"All good!\")\n", + "else:\n", + " print(\"normalize() doesn't work correctly :(\")" + ] + }, + { + "cell_type": "markdown", + "id": "complex-indianapolis", + "metadata": {}, + "source": [ + "Now that the data is normalized, let's plot it again, using the same functions from `helpers` as before." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dominican-karma", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot normalized training data with labeled, and normalized test data without labels\n", + "\n", + "### START CODE HERE ### (≈ 4 lines of code)\n", + "...\n", + "### END CODE HERE ###" + ] + }, + { + "cell_type": "markdown", + "id": "saved-communist", + "metadata": {}, + "source": [ + "Wow, this is much better! Now that the data is normalized, we can implement our classifier." + ] + }, + { + "cell_type": "markdown", + "id": "civilian-coating", + "metadata": {}, + "source": [ + "## 3. Distance metrics" + ] + }, + { + "cell_type": "markdown", + "id": "allied-sample", + "metadata": {}, + "source": [ + "In this part, we'll work on the distance metrics used by the classifier. 
We have seen two distance metrics:\n", + "\n", + "**The L1 (Manhattan) distance:**\n", + "\n", + "$$\n", + "d(\\mathbf{p}, \\mathbf{q})=\\sum_{i=1}^{n} | p_{i}-q_{i} |\n", + "$$\n", + "\n", + "It is also sometimes known as city block or taxicab distance, in allusion to the fact that the shortest path by car in a city with a grid layout (such as Manhattan) would result in this distance.\n", + "\n", + "**The L2 (Euclidean) distance:**\n", + "\n", + "$$\n", + "d(\\mathbf{p}, \\mathbf{q})=\\sqrt{\\sum_{i=1}^{n}\\left(p_{i}-q_{i}\\right)^{2}}\n", + "$$\n", + "\n", + "This is the distance with which you are probably most familar with, it can be seen as the straight line distance between two points.\n", + "\n", + "![distances](images/manhattan_euclidean.png)\n" + ] + }, + { + "cell_type": "markdown", + "id": "dependent-percentage", + "metadata": {}, + "source": [ + "Let's implement functions that compute the distance between a single vector to all vectors of the training data. Until the end of this exercise, we define \"sample\" as a single entry of the dataset. The entry is a vector of D attributes, and the code should stay flexible for any number of attributes.\n", + "\n", + "**Hint:** NumPy's [broadcasting](https://numpy.org/doc/stable/user/basics.broadcasting.html) is your friend." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "secondary-affair", + "metadata": {}, + "outputs": [], + "source": [ + "def manhattan_dist(sample: np.ndarray, X: np.ndarray):\n", + " \"\"\"Computes the Manhattan distance between a sample and the training features\n", + "\n", + " Args:\n", + " sample: Sample of shape (D, )\n", + " X: Dataset of shape (N, D)\n", + "\n", + " Returns:\n", + " dist: Distances of shape (N,)\n", + "\n", + " \"\"\"\n", + " ### START CODE HERE ### (≈ 1 line of code)\n", + " distances = ...\n", + " ### END CODE HERE ### \n", + " return distances\n", + " \n", + "def euclidean_dist(sample: np.ndarray, X: np.ndarray):\n", + " \"\"\"Computes the Euclidean distance between a sample and the training features\n", + "\n", + " Args:\n", + " sample: Sample of shape (D, )\n", + " X: Dataset of shape (N, D)\n", + "\n", + " Returns:\n", + " dist: Distances of shape (N,)\n", + "\n", + " \"\"\"\n", + " ### START CODE HERE ### (≈ 1 line of code)\n", + " distances = ...\n", + " ### END CODE HERE ### \n", + " return distances\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cultural-strategy", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify implementation\n", + "print(f'Manhattan: {np.round(manhattan_dist(X_val[0], X_train[:3]), decimals=1)}')\n", + "print(f'Euclidean: {np.round(euclidean_dist(X_val[0], X_train[:3]), decimals=1)}')" + ] + }, + { + "cell_type": "markdown", + "id": "exterior-black", + "metadata": {}, + "source": [ + "**Expected output:** \n", + "\n", + "| | |\n", + "|---|--------------------------------------------------|\n", + "| **Manhattan** | [4.2 2.7 2.2] |\n", + "| **Euclidean** | [3.1 2. 2. ] |" + ] + }, + { + "cell_type": "markdown", + "id": "drawn-seller", + "metadata": {}, + "source": [ + "## 4. Finding neighbors" + ] + }, + { + "cell_type": "markdown", + "id": "supposed-third", + "metadata": {}, + "source": [ + "It's time to find the nearest neighbors.\n", + "Implement `find_nearest_neighbors()` according to its documentation.\n", + "\n", + "**Hint:** [`np.argsort()`](https://numpy.org/doc/stable/reference/generated/numpy.argsort.html) is a very useful function when working with indices." 
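To make the broadcasting and `np.argsort()` hints concrete, here is one possible way to combine them on a toy dataset; this is only a sketch of the technique, not the unique solution to the exercise:

```python
import numpy as np

X_demo = np.array([[0.0, 0.0], [1.0, 1.0], [3.0, 0.0], [0.0, 2.0]])  # (N, D)
sample = np.array([0.5, 0.5])                                        # (D, )

# Broadcasting: (N, D) - (D, ) -> (N, D); then reduce over the feature axis
manhattan = np.abs(X_demo - sample).sum(axis=1)            # shape (N, )
euclidean = np.sqrt(((X_demo - sample) ** 2).sum(axis=1))  # shape (N, )

# np.argsort returns the indices that would sort the distances in ascending
# order, so the first k entries are the k nearest neighbors
k = 2
neighbor_indices = np.argsort(euclidean)[:k]
print(neighbor_indices)  # [0 1] for this toy data
```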
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "constitutional-norfolk", + "metadata": {}, + "outputs": [], + "source": [ + "def find_nearest_neighbors(\n", + " sample: np.ndarray, \n", + " X: np.ndarray, \n", + " distance_fn: Callable = euclidean_dist, \n", + " k: int = 1):\n", + " \"\"\"Finds the indices of the k-Nearest Neighbors to a sample\n", + " Args:\n", + " sample: Sample of shape (D, )\n", + " X: Dataset of shape (N, D)\n", + " distance_fn: Distance function\n", + " k: Number of nearest neighbors\n", + "\n", + " Returns:\n", + " indices: Neighbor indices of shape (k, )\n", + " \"\"\"\n", + " ### START CODE HERE ### (≈ 2 lines of code)\n", + " neighbor_indices = ...\n", + " ### END CODE HERE ###\n", + " \n", + " return neighbor_indices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "atlantic-overview", + "metadata": {}, + "outputs": [], + "source": [ + "# Try changing the sample position, distance function and value of k\n", + "# Does your find_nearest_neighbors() implementation seem correct?\n", + "sample = np.array([-0.5, -1.5])\n", + "distance_fn = euclidean_dist\n", + "k = 3\n", + "\n", + "# Plot the nearest neighbors using helpers.py\n", + "neighbor_indices = find_nearest_neighbors(sample, X_train, distance_fn=distance_fn, k=k)\n", + "helpers.plot_nearest_neighbors(sample, X_train, y_train, neighbor_indices, label_map, feature_names)" + ] + }, + { + "cell_type": "markdown", + "id": "figured-judge", + "metadata": {}, + "source": [ + "The opaque points correspond to the nearest neighbors to the unknown point (star)." + ] + }, + { + "cell_type": "markdown", + "id": "dying-canadian", + "metadata": {}, + "source": [ + "Now that you have a function that finds the nearest neighbor, let's implement the function `predict_single()` which returns the majority class of the k-nearest neighbors of a given sample. Do not worry about tie-breakers (i.e. if multiple classes have as many points in proximity to a sample).\n", + "\n", + "**Hints:**\n", + "- Feel free to reuse functions you implemented previously\n", + "- Labels are numbered from 0 to n-1 (with n the number of classes) \n", + "- [`np.bincount()`](https://numpy.org/doc/stable/reference/generated/numpy.bincount.html) and [`np.argmax()`](https://numpy.org/doc/stable/reference/generated/numpy.argmax.html) can be very useful." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "canadian-surfing", + "metadata": {}, + "outputs": [], + "source": [ + "def predict_single(\n", + " sample: np.ndarray, \n", + " X: np.ndarray, \n", + " y: np.ndarray, \n", + " distance_fn: Callable, \n", + " k: int = 1):\n", + " \"\"\" Finds the k-Nearest Neighbors to a sample and returns the majority class\n", + " Args:\n", + " sample: Sample of shape (D, )\n", + " X: Dataset of shape (N, D)\n", + " y: Labels of shape (N, )\n", + " distance_fn: Distance function\n", + " k: number of nearest neighbors\n", + "\n", + " Returns:\n", + " label: Predicted label, the majority class of the k-Nearest Neighbors\n", + "\n", + " \"\"\"\n", + " \n", + " ### START CODE HERE ### (≈ 4 lines of code)\n", + " \n", + " predicted_label = ...\n", + " ### END CODE HERE ### \n", + " \n", + " return predicted_label" + ] + }, + { + "cell_type": "markdown", + "id": "civilian-campus", + "metadata": {}, + "source": [ + "Now, we can reuse `predict_single()` to predict several samples at once. 
\n", + "\n", + "**Hint:** There are several ways to do this, such as using a simple for-loop or with the function [`np.apply_along_axis()`](https://numpy.org/doc/stable/reference/generated/numpy.apply_along_axis.html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "divine-encounter", + "metadata": {}, + "outputs": [], + "source": [ + "# With for-loop\n", + "def predict(\n", + " samples: np.ndarray, \n", + " X: np.ndarray = X_train, \n", + " y: np.ndarray = y_train, \n", + " distance_fn: Callable = euclidean_dist, \n", + " k: int = 1):\n", + " \"\"\" Finds the k-Nearest Neighbors to a matrix of samples and returns \n", + " the majority class for each of these samples as an array\n", + " Args:\n", + " samples: Samples of shape (M, D)\n", + " X: Dataset of shape (N, D)\n", + " y: Labels of shape (N, )\n", + " distance_fn: Distance function\n", + " k: number of nearest neighbors\n", + "\n", + " Returns:\n", + " labels: Predicted labels of shape (M, )\n", + "\n", + " \"\"\"\n", + " ### START CODE HERE ### (≈ 1 to 5 lines of code depending on the implementation)\n", + "\n", + " predicted_labels = ...\n", + " ### END CODE HERE ###\n", + " \n", + " return predicted_labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "happy-kazakhstan", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify implementation\n", + "print(f'Predicted labels: {predict(X_val[:6], X_train, y_train, euclidean_dist, 5)}')" + ] + }, + { + "cell_type": "markdown", + "id": "lasting-charge", + "metadata": {}, + "source": [ + "**Expected output:** \n", + "\n", + "| | |\n", + "|---|--------------------------------------------------|\n", + "| **Predicted labels** | [1 1 2 1 2 0] |" + ] + }, + { + "cell_type": "markdown", + "id": "rental-blanket", + "metadata": {}, + "source": [ + "Let's visualize what predictions look like on our validation set. \n", + "In the following plot, stars represent the validation data (and the color represents their predicted class)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "flexible-george", + "metadata": {}, + "outputs": [], + "source": [ + "# Try changing the distance function and value of k\n", + "distance_fn = euclidean_dist\n", + "k = 3\n", + "\n", + "# Plot the predictions using helpers.py\n", + "predicted_val_labels = predict(X_val, X_train, y_train, euclidean_dist, k=5)\n", + "helpers.plot_predictions(samples=X_val, predicted_labels=predicted_val_labels, X=X_train, y=y_train, \n", + " label_map=label_map, feature_names=feature_names, \n", + " title=\"Predicted class for validation set\")" + ] + }, + { + "cell_type": "markdown", + "id": "governing-response", + "metadata": {}, + "source": [ + "## 5. Selecting hyper-parameters" + ] + }, + { + "cell_type": "markdown", + "id": "cooperative-vertex", + "metadata": {}, + "source": [ + "Now is the time to select hyper-parameters. For kNN, the two parameters we can select is the distance function and the number of neighbors k." + ] + }, + { + "cell_type": "markdown", + "id": "fifth-calcium", + "metadata": {}, + "source": [ + "First, we need a metric to measure how well our model is doing. 
\n", + "\n", + "Here, we'll use the accuracy, defined as:\n", + "$$ \\text{accuracy} = \\frac{\\text{# correct predictions}}{\\text{# all predictions}}$$\n", + "\n", + "Let's implement this function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "assured-hampshire", + "metadata": {}, + "outputs": [], + "source": [ + "def accuracy(labels_gt: np.ndarray, labels_pred: np.ndarray):\n", + " \"\"\" Computes accuracy.\n", + " \n", + " Args:\n", + " labels_gt: labels (ground-truth) of shape (M, ).\n", + " labels_pred: Predicted labels of shape (M, ).\n", + " \n", + " Returns:\n", + " float: Accuracy, in range [0, 1].\n", + " \"\"\"\n", + " ### START CODE HERE ### (≈ 1 line of code)\n", + " acc = ...\n", + " ### END CODE HERE ###\n", + " return acc" + ] + }, + { + "cell_type": "markdown", + "id": "approximate-density", + "metadata": {}, + "source": [ + "Now, let's compute the accuracy for both distance functions at all values of k. We can only afford to do that because our dataset is relatively small, so this exhaustive search isn't too computationally expensive." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "awful-wrist", + "metadata": {}, + "outputs": [], + "source": [ + "k_values = []\n", + "euclidean_accs = []\n", + "manhattan_accs = []\n", + "for i in range(1, len(X_train) + 1):\n", + " ### START CODE HERE ### (≈ 2 lines of code)\n", + " manhattan_acc = ...\n", + " euclidean_acc = ...\n", + " ### END CODE HERE ###\n", + " k_values.append(i)\n", + " manhattan_accs.append(manhattan_acc)\n", + " euclidean_accs.append(euclidean_acc)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "norwegian-trail", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot accuracies\n", + "plt.figure(figsize=(8,4), dpi=100)\n", + "plt.plot(k_values, manhattan_accs, label=\"Manhattan\", alpha=0.8)\n", + "plt.plot(k_values, euclidean_accs, label=\"Euclidean\", alpha=0.8)\n", + "plt.xlabel(\"Value of k\")\n", + "plt.ylabel(\"Accuracy\")\n", + "plt.legend()\n", + "plt.title(\"Performance on the validation set for different values of $k$ and distance functions\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "sacred-australian", + "metadata": {}, + "source": [ + "**Question:** What do you notice at the highest value of k? How can this be explained?" + ] + }, + { + "cell_type": "markdown", + "id": "wooden-georgia", + "metadata": {}, + "source": [ + "**Answer:** YOUR ANSWER HERE" + ] + }, + { + "cell_type": "markdown", + "id": "defensive-picture", + "metadata": {}, + "source": [ + "The classifier seems to perform well for both distance metrics and for values of k ranging from 1 to 50, but it is hard to pinpoint which value of k is optimal, and which distance metric is better. This is why, for small datasets, it is often preferred to use [K-Fold Cross Validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation) instead of just a single validation set, as it provides more accurate results and is less sensitive to noise. You will see how to implement cross-validation later in this course. For now, just select a value of k and a distance metric that provides good result." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "religious-allen", + "metadata": {}, + "outputs": [], + "source": [ + "# Choose a good value of k and a good distance metric\n", + "k = 25\n", + "distance_fn = euclidean_dist" + ] + }, + { + "cell_type": "markdown", + "id": "legislative-filename", + "metadata": {}, + "source": [ + "Now that we selected good hyperparameters, it is time to check how well our model performs on the test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "attended-football", + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the test accuracy\n", + "### START CODE HERE ### (≈ 1 line of code)\n", + "test_accuracy = ...\n", + "### END CODE HERE ###\n", + "print(f\"Test accuracy: {test_accuracy * 100: .2f}%\")" + ] + }, + { + "cell_type": "markdown", + "id": "innocent-facing", + "metadata": {}, + "source": [ + "## 6. Final results" + ] + }, + { + "cell_type": "markdown", + "id": "breathing-nursery", + "metadata": {}, + "source": [ + "It's finally time to use our newly built classifier to assign classes to the unlabeled data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unavailable-diary", + "metadata": {}, + "outputs": [], + "source": [ + "# Predict classes of unlabeled data\n", + "### START CODE HERE ### (≈ 1 line of code)\n", + "predicted_labels = ...\n", + "### END CODE HERE ###\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efficient-gathering", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot predictions\n", + "helpers.plot_predictions(samples=X_unlabeled, predicted_labels=predicted_labels, X=X_train, y=y_train, \n", + " label_map=label_map, feature_names=feature_names, \n", + " title=\"Predicted class for unlabeled data\")" + ] + }, + { + "cell_type": "markdown", + "id": "literary-interpretation", + "metadata": {}, + "source": [ + "**Question:** What are your thoughts on this class assignment? Do these results look reasonable?" + ] + }, + { + "cell_type": "markdown", + "id": "subtle-guide", + "metadata": {}, + "source": [ + "**Answer:** YOUR ANSWER HERE" + ] + }, + { + "cell_type": "markdown", + "id": "subtle-housing", + "metadata": {}, + "source": [ + "Good job on completing this exercise!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "toc-showcode": false, + "toc-showmarkdowntxt": false, + "toc-showtags": false + }, + "nbformat": 4, + "nbformat_minor": 5 +}