diff --git a/ME-390-Exercises/ME-390-04-logistic-regression/logistic_regression_Sol.ipynb b/ME-390-Exercises/ME-390-04-logistic-regression/logistic_regression_Sol.ipynb new file mode 100644 index 0000000..392aeb9 --- /dev/null +++ b/ME-390-Exercises/ME-390-04-logistic-regression/logistic_regression_Sol.ipynb @@ -0,0 +1,3129 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "38d358b2edfa7258c44bf997454910cc", + "grade": false, + "grade_id": "cell-a3edf79fa89d828d", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "# Homework - Logistic Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "da7dc5428d8decef51a7cd3c78041558", + "grade": false, + "grade_id": "cell-dd226607fd592d7a", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "
\n", + "\n", + "This notebook was developed for the CS-233 Introduction to Machine Learning course at EPFL, adapted for the CIVIL-226 Introduction to Machine Learning for Engineers course, and re-adapted for the ME-390.\n", + "We thank contributers in CS-233 ([CVLab](https://www.epfl.ch/labs/cvlab)) and CIVIL-226 ([VITA](https://www.epfl.ch/labs/vita/)).\n", + " \n", + "**Author(s):** Jan Bednarík, minor changes by Tom Winandy\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "1a4287e09afa7221b8075b3ef0381cb8", + "grade": false, + "grade_id": "cell-29aba77143b75021", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "4749c91aff183a6a69f159fd7cf551cf", + "grade": false, + "grade_id": "cell-6c4dc1386371cb36", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "# Function to align all tables to the left (useful for later on)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "514a51ab056d3901500c397e76667761", + "grade": false, + "grade_id": "cell-5c5603eceb000cf6", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple, List\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import helpers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "9d47edc0fda0c8e6a405064fe700118d", + "grade": false, + "grade_id": "cell-b9652397ee687d17", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "# Part 1: Logistic regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "22b8d0359b75db31f31da66f11d0f647", + "grade": false, + "grade_id": "cell-12a759793df81f98", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "The first part of this exercise consists of implementing logistic regression for a binary classification problem. In this part, you will use a slightly modified version of the Palmer Penguins dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "1a7f3b3b31cb0fc1402cf2549d6d5636", + "grade": false, + "grade_id": "cell-34a8a71f636f9a09", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 1.1. 
Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "25019d479c4409b680db276ad3a25572", + "grade": false, + "grade_id": "cell-c025d0311563a63c", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Loading the data & normalization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "708089d9d52828704d5279ee4f775a54", + "grade": false, + "grade_id": "cell-99def0ebc4b0507a", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "First, you'll work on a relatively simple dataset: a modified version of the Palmers Penguins dataset with only two features (bill length and flipper length), and two species (Gentoo and Chinstrap). Using `helpers.preprocess_data()` (check helpers.py for more info), we'll obtain a training and test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "penguins = pd.read_csv('data/penguins.csv')\n", + "penguins.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_penguins, y_train_penguins, X_test_penguins, y_test_penguins, feature_names, label_map = helpers.preprocess_data(penguins, label=\"species\", train_size=0.70, seed=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "a946ecea30e26b88b4f2b633eb9327f5", + "grade": false, + "grade_id": "cell-6f0f94ffbaf6e12b", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Next, you need to normalize this data with the mean and standard deviation from the training set and keep the original variable name. For example, the normalized `X_train_penguins` should be called `X_train_penguins`.\n", + "\n", + "**Note:** Remember that you should not use any of the knowledge you get from the test data when implementing a model. This includes the normalization step, where you should use the mean and standard deviation of the **training set** to normalize both the **training** and **test** set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "93671be7ef033be6da2d9c4410bdbad9", + "grade": false, + "grade_id": "cell-af2ef90ad754aa43", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "# Normalize features of the training and test set using the mean and std of the training set features\n", + "### START CODE HERE ###\n", + "mean = np.mean(X_train_penguins, axis=0)\n", + "std = np.std(X_train_penguins, axis=0)\n", + "\n", + "X_train_penguins = (X_train_penguins - mean) / std\n", + "X_test_penguins = (X_test_penguins - mean) / std\n", + "### END CODE HERE ###" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": true, + "nbgrader": { + "cell_type": "code", + "checksum": "36da7112adcd966ac107c29a810c60f7", + "grade": true, + "grade_id": "cell-ce6c80d103e4b490", + "locked": true, + "points": 2, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# These assertions check that your normalization didn't go completely wrong. \n", + "# Passing these assertions does not mean the correctness of your implementation\n", + "# We may use other tests to check about it\n", + "assert np.allclose(X_train_penguins.mean(axis=0), 0)\n", + "assert np.allclose(X_train_penguins.std(axis=0), 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "0d0827c64519e77ceb6e7f4e0fb56837", + "grade": false, + "grade_id": "cell-b0eb41b993dd1c5d", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Let's preview the arrays for the training and test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preview of X_train_penguins and y_train_penguins (separation of the features and the labels)\n", + "print('Training set features:')\n", + "print(f'X_train_penguins: \\n {X_train_penguins[:3]}')\n", + "\n", + "print('\\nTraining set labels:')\n", + "print(f'y_train_penguins: \\n {y_train_penguins[:3]}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "48b59962e61a38b1df1198929efde876", + "grade": false, + "grade_id": "cell-d2e1475a6abc6da5", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Expected output:**\n", + "\n", + "| | |\n", + "|-----------------|--------------------------------------------------|\n", + "| **X_train_penguins** | [[ 0.75973314 1.26418483]
[ 0.62957146 -0.72722878]
[-1.84350033 -1.87612124]] |\n", + "| **y_train_penguins** | [1 0 0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preview of X_train_penguins and y_train_penguins (separation of the features and the labels)\n", + "print('Test set features:')\n", + "print(f'X_test_penguins: \\n {X_test_penguins[:3]}')\n", + "\n", + "print('\\nTraining set labels:')\n", + "print(f'y_test_penguins: \\n {y_test_penguins[:3]}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "8f3173863c62208c52cac1169deb1926", + "grade": false, + "grade_id": "cell-d7c48102306b8ddc", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Expected output:**\n", + "\n", + "| | |\n", + "|-----------------|--------------------------------------------------|\n", + "| **X_test_penguins** | [[-1.58317698 0.1918852 ]
[-1.48555573 1.11099917]
[ 0.59703104 0.57484935]] |\n", + "| **y_test_penguins** | [1 1 1]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show shapes\n", + "print('Training set shape:')\n", + "print(f'X: {X_train_penguins.shape}, y: {y_train_penguins.shape}')\n", + "\n", + "print('\\nTest set shape:')\n", + "print(f'X: {X_test_penguins.shape}, y: {y_test_penguins.shape}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "e063343d68dd72367ce3ab6e633ec119", + "grade": false, + "grade_id": "cell-baa0bd8769a75b18", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Notation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "fb2df912eb016004d483215c912fe2cb", + "grade": false, + "grade_id": "cell-81df8f0141a0a158", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Now that we have pre-processed our dataset, here's how it looks:\n", + "\n", + "- features: $\\boldsymbol{X} \\in \\mathbb{R}^{N \\times d}$, $\\forall \\ \\boldsymbol{x}^{(i)} \\in \\boldsymbol{X}: \\boldsymbol{x}^{(i)} \\in \\mathbb{R}^{d}$\n", + "- labels: $\\boldsymbol{y} \\in \\mathbb{R}^{N}$, $\\forall \\ y^{(i)} \\in \\boldsymbol{y}: y^{(i)} \\in \\{0, 1\\}$ \n", + " \n", + " where $N$ is the number of examples in our dataset, and $d$ is the number of features per example \n", + " \n", + "\n", + "For the weights, we have:\n", + " \n", + " \n", + " - weights: $\\mathbf{w} \\in \\mathbb{R}^{d}$\n", + " - bias: $b \\in \\mathbb{R}$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "06a9aca8976831bfb7433e6949beec33", + "grade": false, + "grade_id": "cell-ef11677a4d9dd097", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + " **Note:**\n", + " \n", + " $\\boldsymbol{X}$ is called the design matrix, where $\\boldsymbol{X}_{i, :}$ denotes $\\boldsymbol{x}^{(i)}$. \n", + " Note that a single example $\\boldsymbol{x}^{(i)}$ is a column vector of shape $(d \\times 1)$, while the design matrix $\\boldsymbol{X}$ is of shape $(N \\times d)$, where each row represents an example and each column represents a feature." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "1bca5f6c87cf557ef97fd5fde67f80ff", + "grade": false, + "grade_id": "cell-1b220c79317ae0db", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 1.2. Diving into logistic regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "7847a144d1a7b54dd438ff7283cd7f50", + "grade": false, + "grade_id": "cell-650742405fb1f5ee", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "In this section, you are going to implement the different functions needed for logistic regression." 
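Before implementing anything, here is a minimal sketch (with made-up sizes and names) that makes the notation above concrete: the design matrix has one row per example, and combining it with the weights and bias yields one score per example.

```python
import numpy as np

N, d = 4, 2                          # made-up sizes: 4 examples, 2 features
rng = np.random.default_rng(0)
X_toy = rng.normal(size=(N, d))      # design matrix X, shape (N, d)
w_toy = rng.normal(size=d)           # weights w, shape (d,)
b_toy = 0.1                          # bias b, a scalar

scores = X_toy @ w_toy + b_toy       # z^(i) = w^T x^(i) + b for every example at once
print(scores.shape)                  # (4,)
```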
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d2d3395a85adfc12f2e84d298ab832f1", + "grade": false, + "grade_id": "cell-82488d3bf84a193e", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Sigmoid" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "e078b19043dcf71aa44d2987b268f51e", + "grade": false, + "grade_id": "cell-b99bf8ea309547ce", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "A key element of logistic regression is the sigmoid function. This function takes any real-valued input and outputs a value in [0, 1]. The sigmoid function is defined as:\n", + "$$\\sigma(z)= \\frac {e^z}{1+e^z}= \\frac{1}{1+e^{-z}}$$\n", + "\n", + "Here's a plot of the sigmoid:\n", + "\n", + "\"Sigmoid\"\n", + "\n", + "**Task: Implement `sigmoid()`** \n", + "\n", + "**Hint:** Use `np.exp(x)` to take the exponential of a number. You can find the documentation for this method [here](https://numpy.org/doc/stable/reference/generated/numpy.exp.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "377f1727b51a649917a85ae49e0ab277", + "grade": false, + "grade_id": "cell-e999e5baf8517826", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def sigmoid(z: np.ndarray) -> np.ndarray:\n", + " \"\"\" Sigmoid function\n", + " \n", + " Args:\n", + " z: Input data of shape (N, )\n", + " \n", + " Returns:\n", + " np.ndarray: Sigmoid of z of shape (N, ), where each value is in [0, 1]\n", + " \"\"\"\n", + " ### START CODE HERE ###\n", + " s = 1 / (1 + np.exp(-z))\n", + " ### END CODE HERE ###\n", + " return s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "566f8d0999f1a8f58d2402568d41bb28", + "grade": true, + "grade_id": "cell-a4d76e12a5ac0331", + "locked": true, + "points": 2, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "# Verify implementation\n", + "a = sigmoid(np.array([3, 0.5, -1]))\n", + "print(f'a: {np.round(a, 4)}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "7dc20c08d153332ab772855f215e36ad", + "grade": false, + "grade_id": "cell-fa1fcbd71f17c4d2", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Expected output:**\n", + "\n", + "| | |\n", + "|---|--------------------------------------------------|\n", + "| **a** | [0.9526 0.6225 0.2689] |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "12b23a0026f87e84bdbe1f3f41bea935", + "grade": false, + "grade_id": "cell-53f299fe5ba02f43", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Logistic output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": 
"dc23527de0273c16f11f6b66a9fb5dfc", + "grade": false, + "grade_id": "cell-49f5ed2ec12d410d", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Now that you have a function that computes the sigmoid function, you can implement a function that gives the logistic output. As a reminder, this function outputs the estimated probability that y = 1 for an input $\\boldsymbol{x}$, that is: $ \\hat{y}^{(i)} = p(y^{(i)} = 1 | \\boldsymbol{x}^{(i)})$. \n", + "The logistic output (of a single example) is defined as:\n", + "$$\\hat{y}^{(i)} = \\sigma(\\mathbf{w}^{T} \\boldsymbol{x}^{(i)} + b) = \\frac{1}{1 + e^{-(\\mathbf{w}^{T} \\boldsymbol{x}^{(i)} + b)}}$$ " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "a735e751651dc531e564426cbe818924", + "grade": false, + "grade_id": "cell-a8ccffd1eb9e3fb0", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "For all examples, the output is defined as: \n", + "$$\\mathbf{\\hat{y}} = \\sigma(\\mathbf{X} \\mathbf{w} + b \\mathbf{1})$$\n", + "where $\\mathbf{1}$ is a vector of ones of shape $(N \\times 1)$, and the sigmoid is applied element-wise.\n", + "\n", + "**Task: Implement `logistic_output()`**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "decba212b3a4b33548994cf2d06f2eba", + "grade": false, + "grade_id": "cell-3374a361666852bd", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Hint:** To vectorize this operation , use `np.matmul(a, b)` (or equivalently `a @ b`) for matrix multiplication. \n", + "You can find the documentation for this method [here](https://numpy.org/doc/stable/reference/generated/numpy.matmul.html).\n", + "\n", + "**Hint:** Remember that you have already coded the sigmoid function. This might help you in the logistic output implementation. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "b321519a17a69840c3d3011cb63f5eb8", + "grade": false, + "grade_id": "cell-6caec77f1b106e66", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def logistic_output(X: np.ndarray, w: np.ndarray, b: float) -> np.ndarray:\n", + " \"\"\" Output of logistic regression\n", + " \n", + " Args:\n", + " X: Dataset of shape (N, d)\n", + " w: Weights of logistic regression model of shape (d, )\n", + " b: bias, a scalar\n", + " Returns:\n", + " y_hat (np.ndarray): Output of logistic regression of shape (N, )\n", + " \"\"\"\n", + " ### START CODE HERE ###\n", + " y_hat = sigmoid(X @ w + np.ones(X.shape[0]) * b)\n", + " ### END CODE HERE ###\n", + " return y_hat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "69cb10984e50fb4ee8eb204544d4ebcf", + "grade": true, + "grade_id": "cell-b981de04e23093f5", + "locked": true, + "points": 5, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "# Verify implementation\n", + "X = X_train_penguins[:3]\n", + "w = np.array([1, 1])\n", + "b = 1\n", + "\n", + "y_hat = logistic_output(X, w, b)\n", + "print(f'y_hat: {np.round(y_hat, decimals=4)}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "74c49813f04f5b3117da951bab664458", + "grade": false, + "grade_id": "cell-80ab5e0354ce2714", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Expected output:**\n", + "\n", + "| | |\n", + "|-----------------|--------------------------------------------------|\n", + "| **y_hat** | [0.9536 0.7114 0.0618] |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "0cb2a33a9eadee9516cdd0ab14efb66b", + "grade": false, + "grade_id": "cell-67547d64e2468fa6", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Binary cross-entropy loss" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f907e2fa5e73cacdb45ae7a2efaca54a", + "grade": false, + "grade_id": "cell-0ab39c541f3ffca8", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "In order to train your model, you also need a loss function, which penalizes outputs based on how far off they are from the ground-truth. 
Here, we'll use the logistic loss / binary cross-entropy loss.\n", + "\n", + "It is defined as:\n", + "\n", + "$$\\text{BCE}(\\mathbf{w}, b) = - \\frac{1}{N}\\sum^{N}_{i=1} y^{(i)} \\log(\\hat{y}^{(i)}) + (1-y^{(i)}) \\log(1- \\hat{y}^{(i)})$$\n", + "where $\\log(x)$ refers to the natural logarithm of $x$.\n", + "\n", + "**Task: Implement `bce_loss()`**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d90f356070ff267317e1cb4ef47530e9", + "grade": false, + "grade_id": "cell-ff9acc1f454da6de", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "source": [ + "**Hint:** Use `np.log(x)` to take the natural logarithm of x. You can find the documentation for this method [here](https://numpy.org/doc/stable/reference/generated/numpy.log.html).\n", + "\n", + "
\n", + "Due to floating-point arithmetic, values very close to 0 will get rounded to 0 (likewise for values very close to 1). \n", + "\n", + "However, the natural logarithm log(x) is undefined for values equal to 0. An easy way to fix this is to add a small term $\\epsilon$ to the logarithm. For example, write `np.log(x + epsilon)` instead of `np.log(x)`. \n", + " \n", + "In order to avoid problems in later parts of this exercise, add `epsilon` (set to $10^{-9}$) whenever you call `np.log()`.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "4c5ca13c35a7de1b178866c3e02f6dc3", + "grade": false, + "grade_id": "cell-386968d72cf26d7d", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def bce_loss(X: np.ndarray, y: np.ndarray, w: np.ndarray, b: float) -> float:\n", + " \"\"\" Binary cross-entropy loss function\n", + " \n", + " Args:\n", + " X: Dataset of shape (N, d)\n", + " y: Labels of shape (N, )\n", + " w: Weights of logistic regression model of shape (d, )\n", + " b: bias, a scalar\n", + " \n", + " Returns:\n", + " float: binary cross-entropy loss.\n", + " \"\"\"\n", + " # Add the epsilon term to the np.log() in your implementation (e.g. do np.log(x + epsilon) instead of np.log(x))\n", + " # Epsilon is there to avoid log(0)\n", + " epsilon = 1e-9\n", + " \n", + " ### START CODE HERE ###\n", + " N = X.shape[0]\n", + " y_hat = logistic_output(X, w, b)\n", + " loss = -(1/N) * np.sum(y * np.log(y_hat + epsilon) + (1 - y) * np.log(1 - y_hat + epsilon))\n", + " ### END CODE HERE ###\n", + " \n", + " return loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "a021d900a409db2e56144a5b9fa8a23a", + "grade": true, + "grade_id": "cell-207f9debdf5b4bb3", + "locked": true, + "points": 10, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "# Verify implementation\n", + "X = X_train_penguins[:3]\n", + "y = y_train_penguins[:3]\n", + "w = np.array([1, 1])\n", + "b = 1\n", + "\n", + "loss = bce_loss(X, y, w, b)\n", + "print(f'Loss: {loss:.3f}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "cec4f41cbb64146e9abdad9221b118f2", + "grade": false, + "grade_id": "cell-7a9d074931ea7dba", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Expected output:**\n", + "\n", + "| | |\n", + "|-----------------|--------------------------------------------------|\n", + "| **Loss** | 0.451|" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "ba2e5ac4ae349714ca15f4d1dd35ca0a", + "grade": false, + "grade_id": "cell-07d0f9b9903a2b75", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Binary cross-entropy loss gradient" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "562876a4bdd46b484dc80f846ff03545", + "grade": false, + "grade_id": "cell-0dabc2f9cc729e75", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "After having computed the loss, you need to update the parameters of the model in order to decrease it. 
This can be done with gradient descent, and for that, you'll need the gradient of your loss function.\n", + "\n", + "In class, we have seen the gradient of the binary cross-entropy loss defined as:\n", + "$$\\frac{\\partial \\text{BCE}(\\mathbf{w},b)}{\\partial \\mathbf{w}} = \\frac{1}{N}\\sum^{N}_{i=1}\\boldsymbol{x}^{(i)} (\\hat{y}^{(i)} - y^{(i)}) $$\n", + "$$ \\frac{\\partial \\text{BCE}(\\mathbf{w},b)}{\\partial b} = \\frac{1}{N}\\sum^{N}_{i=1}\\hat{y}^{(i)} - y^{(i)}$$\n", + "\n", + "A vectorized implementation of that first expression is the following: \n", + "\n", + "$$\\frac{\\partial \\text{BCE}(\\mathbf{w},b)}{\\partial \\mathbf{w}} = \\frac{1}{N} \\ \\mathbf{X}^T (\\mathbf{\\hat{y}} - \\mathbf{y})$$\n", + "\n", + "**Task: Implement `bce_gradient()`**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "cf4df60e7e2c621aac722ddce2fbaa87", + "grade": false, + "grade_id": "cell-363cee6ca6251a9e", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def bce_gradient(X: np.ndarray, y: np.ndarray, w: np.ndarray, b: float) -> Tuple[np.ndarray, float]:\n", + " \"\"\" Gradient of the binary-cross entropy loss\n", + " \n", + " Args:\n", + " X: Dataset of shape (N, d)\n", + " y: Labels of shape (N, )\n", + " w: Weights of logistic regression model of shape (d, )\n", + " b: bias, a scalar\n", + " \n", + " Returns:\n", + " dw (np.ndarray) gradient of the loss with respect to w of shape (d, )\n", + " db (float) gradient of the loss with respect to b, a scalar\n", + " \"\"\"\n", + " ### START CODE HERE ###\n", + " N = X.shape[0]\n", + " y_hat = logistic_output(X, w, b)\n", + " dw = (1/N) * X.T @ (y_hat - y)\n", + " db = (1/N) * sum(y_hat - y)\n", + " ### END CODE HERE ###\n", + " \n", + " return dw, db" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "c328baf7bf8f92c90a7f6b154d3767a9", + "grade": true, + "grade_id": "cell-64f2d5148122e30d", + "locked": true, + "points": 10, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "# Verify implementation\n", + "X = X_train_penguins[:3]\n", + "y = y_train_penguins[:3]\n", + "w = np.array([1, -1])\n", + "b = -0.5\n", + "\n", + "dw, db = bce_gradient(X, y, w, b)\n", + "print(f'dw: {np.round(dw, decimals=4)}')\n", + "print(f'db: {db:.4f}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "fbb1132b72a86886781573011022b6a9", + "grade": false, + "grade_id": "cell-5cfb006208bff804", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Expected output:**\n", + "\n", + "| | |\n", + "|----|-----------------------------------|\n", + "| **dw** | [-0.2748 -0.7195] |\n", + "| **db** | 0.1184 |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "725196c5857c491c259aaf1bd8983a12", + "grade": false, + "grade_id": "cell-f402f06e5d04a8e4", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + 
"nbgrader": { + "cell_type": "markdown", + "checksum": "bcce7b221a2d3e97069ee0d52b8c9d7c", + "grade": false, + "grade_id": "cell-20d859b7370fc8be", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "The output of logistic regression estimates $ \\hat{y}^{(i)} = P(y^{(i)} = 1 | \\boldsymbol{x}^{(i)})$.\n", + "\n", + "Based on this output, you'll want to classify each example in one of the 2 categories. To do so, you'll need to set $\\text{predicted label}^{(i)}$ to 1 when $ \\hat{y}^{(i)} \\gt 0.5$ (as it means it is more likely to be of class 1 than of class 0), and to 0 otherwise. \n", + "\n", + "**Task: Implement `classify()`**\n", + "\n", + "**Hint:** Use `np.where(condition, [x, y])`. You can find the documentation for this method [here](https://numpy.org/doc/stable/reference/generated/numpy.where.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "4a2dbb07795f4e3575f1594e07d29d8e", + "grade": false, + "grade_id": "cell-5c92d62d33b944f1", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def classify(y_hat: np.ndarray) -> np.ndarray:\n", + " \"\"\" Classification function for binary class logistic regression. \n", + " \n", + " Args:\n", + " y_hat (np.array): Output of logistic regression of shape (N, )\n", + " Returns:\n", + " np.array: Label assignments of data of shape (N, )\n", + " \"\"\"\n", + " ### START CODE HERE ###\n", + " labels_pred = np.where(y_hat > 0.5, 1, 0)\n", + " ### END CODE HERE ###\n", + " return labels_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "7bee1351424f79a8449f705fa22695e0", + "grade": true, + "grade_id": "cell-b6f0cd5acea4b630", + "locked": true, + "points": 3, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "y_hat = np.array([0.25, 0.75])\n", + "print(classify(y_hat))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "01a7f29ad34d4642df5c4f088558aa36", + "grade": false, + "grade_id": "cell-54ef487260985115", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Expected output:** [0 1] " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "115dfd36116ba1fcb12659231f301b7c", + "grade": false, + "grade_id": "cell-f3237226b328f5df", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Accuracy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "1ea0a36a53b56c01e87e6ff470ae498d", + "grade": false, + "grade_id": "cell-dacad3b4c6341c69", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "To measure how well your model is doing, we'll first consider the accuracy as our metric. 
It corresponds to the fraction of predictions our model got right.\n", + "\n", + "$$\\text{Accuracy} = \\frac{\\text{Number of correct predictions}}{\\text{Total number of predictions}}$$\n", + "\n", + "\n", + "\n", + "**Task: Implement `accuracy()`**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "e01cc8a17786d8ab053e550361a6de59", + "grade": false, + "grade_id": "cell-9da5f76d65e389ce", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def accuracy(labels_gt: np.ndarray, labels_pred: np.ndarray) -> float:\n", + " \"\"\"Computes the accuracy.\n", + "\n", + " Args:\n", + " labels_gt: labels (ground-truth) of shape (M, ).\n", + " labels_pred: Predicted labels of shape (M, ).\n", + "\n", + " Returns:\n", + " float: Accuracy, in range [0, 1].\n", + " \"\"\"\n", + " ### START CODE HERE ###\n", + " return sum(labels_gt == labels_pred) / labels_gt.shape[0]\n", + " ### END CODE HERE ###" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "66c8b448450e7b2aec8a51ba3e5f49fb", + "grade": true, + "grade_id": "cell-bceb4550413f7dd6", + "locked": true, + "points": 3, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "# Check that output is in [0, 1] (ensures that output is not a percentage)\n", + "assert 0.0 <= accuracy(np.array([1, 0]), np.array([1, 1])) <= 1.0\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "6b98babacb7242432d3e057f5c0cb603", + "grade": false, + "grade_id": "cell-6af2a738f8f3701f", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 1.3. 
Training\n", + "You will now implement the training process by putting together all the functions you have implemented so far.\n", + "\n", + "Here are the different steps of the training process:\n", + "- Compute the output of logistic regression (`y_hat`)\n", + "- Compute the loss of the model (`loss`)\n", + "- Compute the derivates w.r.t to the weights `dw` and the bias `db`\n", + "- Update `w` and `b`\n", + "\n", + "Recall that a gradient step is: \n", + "$$ \\mathbf{w} := \\mathbf{w} - \\alpha \\frac{\\partial J}{\\partial \\mathbf{w}} $$\n", + "\n", + "\n", + "$$b := b - \\alpha \\frac{\\partial J}{\\partial b} $$\n", + "where $J$ is the loss function and $\\alpha$ is the learning rate.\n", + "\n", + "**Task: Complete the following `train_logistic_regression()` function.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "72bd953eab7f4dfb5e83f14978c45ac8", + "grade": false, + "grade_id": "cell-740a96ad4a0a6142", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def train_logistic_regression(X: np.ndarray, \n", + " y: np.ndarray, \n", + " max_iters: int = 101, \n", + " alpha: float = 0.5, \n", + " loss_freq: int = 0) -> Tuple[np.ndarray, float, dict]:\n", + " \"\"\" Training function for binary class logistic regression using gradient descent\n", + " \n", + " Args:\n", + " X: Dataset of shape (N, d)\n", + " y: Labels of shape (N, )\n", + " max_iters: Maximum number of iterations. Default : 100\n", + " alpha: The learning rate of the gradient step. Default : 1\n", + " loss_freq : Prints the loss every `loss_freq` iterations. Default : 0\n", + " \n", + " Returns:\n", + " w: weights of shape (d, )\n", + " b: scalar\n", + " viz_d: dict used for visualizations\n", + " \"\"\"\n", + " \n", + " # Initialize weights\n", + " np.random.seed(0)\n", + " w = np.random.normal(0, 1, size=(X.shape[1], ))\n", + " b = 0\n", + " \n", + " # Initialize dict with lists to keep track of loss, accuracy, weight and bias evolution\n", + " logger = {'loss': [], \n", + " 'acc': [], \n", + " 'w': [],\n", + " 'b': []\n", + " }\n", + " \n", + " \n", + " for i in range(max_iters):\n", + " # Compute loss, dw, db and update w and b \n", + " ### START CODE HERE ###\n", + " loss = bce_loss(X, y, w, b)\n", + " dw, db = bce_gradient(X, y, w, b)\n", + " \n", + " w = w - alpha * dw\n", + " b = b - alpha * db\n", + " ### END CODE HERE ###\n", + " \n", + " # Keep track of parameter, loss and accuracy values for each iteration\n", + " logger['w'].append(w)\n", + " logger['b'].append(b)\n", + " logger['loss'].append(loss)\n", + " y_hat = logistic_output(X, w, b)\n", + " logger['acc'].append(accuracy(y, classify(y_hat)))\n", + " \n", + " if (loss_freq !=0) and i % loss_freq == 0:\n", + " print(f'Loss at iter {i}: {loss:.5f}')\n", + " \n", + " if (loss_freq != 0):\n", + " print('\\nFinal loss: {:.5f}'.format(logger['loss'][-1]))\n", + " \n", + " return w, b, logger" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "96afdd23ad3b819b300bdacf0304366f", + "grade": false, + "grade_id": "cell-426d1196f5585a43", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Now, run the following cell with the default hyperparameters `(max_iters=101, alpha=0.5)` to train your model parameters (`w` & `b`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "bcdeb6078cf2753a76c3668221c3b95a", + "grade": true, + "grade_id": "cell-afbf4cf924cb68a4", + "locked": true, + "points": 10, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "w, b, logger = train_logistic_regression(X_train_penguins, y_train_penguins, max_iters=101, alpha=0.5, loss_freq=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "74287ad4b43380128385c7791473f939", + "grade": false, + "grade_id": "cell-d15cf2a6e36eb5b7", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Expected output:**\n", + "\n", + "| | |\n", + "|------------|---------|\n", + "| **Final loss** | 0.10350 |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "7d7d196eadce0dac03d146014fcde93c", + "grade": false, + "grade_id": "cell-e09d611c113b07e0", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 1.4. Results" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f22822c63691eeaf68aa2a051ec899ab", + "grade": false, + "grade_id": "cell-0f28f624609e8906", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Model accuracy\n", + "Great, your model is trained! But how well does it perform? To find out, run the following cells to get the accuracy of your trained model on the train and test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train acc\n", + "y_hat = logistic_output(X_train_penguins, w, b)\n", + "acc = accuracy(y_train_penguins, classify(y_hat))\n", + "print(f'Train accuracy: {100 * acc:.2f}%')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test acc\n", + "y_hat = logistic_output(X_test_penguins, w, b)\n", + "acc = accuracy(y_test_penguins, classify(y_hat))\n", + "print(f'Test accuracy: {100*acc:.2f}%')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "8d930a5c6bfed464eaf027b425d56431", + "grade": false, + "grade_id": "cell-ab4bfc1e03c3c0c5", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Expected output:**\n", + "\n", + "| | |\n", + "|------------|---------|\n", + "| **Train accuracy** | 95.79% |\n", + "| **Test accuracy** | 97.56% |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "036402727a3902de26bbf42e27a58921", + "grade": false, + "grade_id": "cell-a42ae83845139239", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Visualization of the training\n", + "Let's observe how the training went. 
The first graph plots the evolution of the loss during the training while the second graph plots the evolution of the accuracy during the training. If everything was implemented correctly, you'll notice that as training goes on, the loss decreases and the training accuracy tends to increase." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the evolution of loss during training\n", + "def plot_loss(loss_list):\n", + " fig = plt.figure(figsize=(6, 6))\n", + " ax = fig.add_subplot(111)\n", + " step = np.arange(1, len(loss_list)+1)\n", + " plt.plot(step, loss_list)\n", + " plt.title('Evolution of the loss during the training')\n", + " plt.xlabel('iteration')\n", + " plt.ylabel('Training loss')\n", + " plt.show()\n", + "\n", + "plot_loss(logger[\"loss\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the evolution of accuracy during training\n", + "def plot_acc(acc_list):\n", + " fig = plt.figure(figsize=(6, 6))\n", + " ax = fig.add_subplot(111)\n", + " step = np.arange(1, len(acc_list)+1)\n", + " plt.plot(step, acc_list)\n", + " plt.title('Evolution of the accuracy during the training')\n", + " plt.xlabel('iteration')\n", + " plt.ylabel('Training accuracy')\n", + "\n", + "plot_acc(logger[\"acc\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "acbd16b5d8076747c3401df17e7f3168", + "grade": false, + "grade_id": "cell-3c1e75ffbae29782", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Plotting results" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "e5adf60d50fa30976d9dca47ab659ef2", + "grade": false, + "grade_id": "cell-fa0f390f50b91437", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "We also implemented functions in helpers.py which can help you visualize the decision boundary for this model, and visualize which points are correctly classified, and which aren't. This step is important in the process of developing your algorithm as it allows you to make sense of the mathematical result you obtained. 
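The helper functions used below do the plotting for you, but for intuition, here is a minimal sketch (assuming `X_train_penguins` is a NumPy array with exactly two columns and that training produced `w` and `b` as above): in 2D, the decision boundary of logistic regression is the straight line where $\mathbf{w}^T \boldsymbol{x} + b = 0$, i.e. where the predicted probability is exactly 0.5.

```python
import numpy as np
import matplotlib.pyplot as plt

# Sketch: draw the line w[0]*x1 + w[1]*x2 + b = 0 over the training points.
x1 = np.linspace(X_train_penguins[:, 0].min(), X_train_penguins[:, 0].max(), 100)
x2 = -(w[0] * x1 + b) / w[1]   # assumes w[1] != 0

plt.scatter(X_train_penguins[:, 0], X_train_penguins[:, 1], c=y_train_penguins, cmap='coolwarm')
plt.plot(x1, x2, 'k--', label='decision boundary')
plt.xlabel('Bill length (normalized)')
plt.ylabel('Flipper length (normalized)')
plt.legend()
plt.show()
```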
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class_names = list(label_map.values())\n", + "ax_titles = [\"Bill length (normalized)\", \"Flipper length (normalized)\"]\n", + "\n", + "helpers.plot_boundaries(X=X_train_penguins,\n", + " y=y_train_penguins, \n", + " w=w, b=b,\n", + " output_func=logistic_output, \n", + " class_names=class_names, \n", + " ax_titles=ax_titles,\n", + " train=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "helpers.plot_boundaries(X=X_test_penguins, \n", + " y=y_test_penguins, \n", + " w=w, \n", + " b=b, \n", + " output_func=logistic_output, \n", + " class_names=class_names, \n", + " ax_titles=ax_titles, \n", + " train=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "518ebcb5185d2b2a25dc1ed88b4f6721", + "grade": false, + "grade_id": "cell-ac7d37609dd641b4", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "If everything was implemented correctly, you'll notice that your model fits the training data with a good accuracy and also generalizes to the test data. Generalizing to new and unknown data points is the ultimate goal of any machine learning algorithm. In our case, the visualisation highlights this property." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "03049e5cf209e61bccc0f19f19268826", + "grade": false, + "grade_id": "cell-f84f855675584e9a", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Training visualization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "9780f14a873f7348480b58965e9d1006", + "grade": false, + "grade_id": "cell-ae3fbfb5e789d8c7", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "We alo implemented an interactive function which you can use to visualize the decision boundaries at different iterations. \n", + "Notice how the decision boundary gradually improves during training?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "helpers.interactive_boundaries(X_train_penguins, \n", + " y_train_penguins, \n", + " X_test_penguins, \n", + " y_test_penguins, \n", + " logger[\"w\"], \n", + " logger[\"b\"], \n", + " logistic_output, \n", + " class_names, \n", + " ax_titles, \n", + " total_steps=50)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "8620e8b5b98afec9497e67e7991ef9cf", + "grade": false, + "grade_id": "cell-726b08592101a09f", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Good job on completing the first part of this exercise! In the next parts, we'll implement a slightly modified version of logistic regression and train it on a different dataset. The functions implemented so far will prove to be very useful later on, so make sure that they are correct!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "824872de8d62f11a26c26a8eb44eca9e", + "grade": false, + "grade_id": "cell-52d83aaeea5761d8", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "# Part 2: Regularization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "44782d96bec1319d32f82d5359088bc3", + "grade": false, + "grade_id": "cell-d8798eda727da570", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "The second part of this exercise consists of adding regularization to improve logistic regression's performance on a higher dimensional dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f28c78851a21638a195b2c36882f06ed", + "grade": false, + "grade_id": "cell-326f23c9275b7914", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 2.1. Connectionist Bench Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "311224230374396a884d20c5abc2bb99", + "grade": false, + "grade_id": "cell-05b2d03e39ce0936", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "In this part, you'll use the Connectionist Bench Dataset, where the task is to train a model to discriminate between sonar signals bounced off a mine (metal cylinder) and those bounced off a roughly cylindrical rock. Each sample is composed of 60 features and a label: \"R\" if it is a rock and \"M\" if it is a mine (metal cylinder)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sonar = pd.read_csv(\"data/sonar.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Shape: {sonar.shape}\")\n", + "\n", + "sonar.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "a799882fb2d88558e8b2259b2da94053", + "grade": false, + "grade_id": "cell-02f0a990ec899b02", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "We use `helpers.preprocess_data()` to extract a training and test set from this data. The label \"M\" (mine) is mapped to 0 and the label \"R\" (rock) is mapped to 1." 
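Before training on this data, it can be worth glancing at the class balance. A quick optional sketch (assuming the label column is named `target`, as in the `preprocess_data` call below):

```python
# Count how many mines ("M") and rocks ("R") the sonar dataset contains.
print(sonar["target"].value_counts())
```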
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_sonar, y_train_sonar, X_test_sonar, y_test_sonar, feature_names, label_map = helpers.preprocess_data(sonar, label=\"target\", train_size=0.75, seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "label_map" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "50264535247cd7bbe08a18e7d8042704", + "grade": false, + "grade_id": "cell-4acc3330d529f05b", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "You also need to normalize the data, as was done in the previous part.\n", + "\n", + "**Task: Normalize `X_train_sonar` and `X_test_sonar`**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "02e9ae21865eaf4d5085400546208f04", + "grade": false, + "grade_id": "cell-f586fe01b0c8f047", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "# Normalize features of the training and test set using the mean and std of the training set features\n", + "# Use helpers.normalize()\n", + "### START CODE HERE ###\n", + "mean = X_train_sonar.mean(axis=0)\n", + "std = X_train_sonar.std(axis=0)\n", + "\n", + "X_train_sonar = (X_train_sonar - mean) / std\n", + "X_test_sonar = (X_test_sonar - mean) / std\n", + "### END CODE HERE ###" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "7e131c179a33fc9b6dfb7263c7a11784", + "grade": true, + "grade_id": "cell-463cbb9c12fe4e46", + "locked": true, + "points": 2, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "assert np.allclose(X_train_sonar.mean(axis=0), 0)\n", + "assert np.allclose(X_train_sonar.std(axis=0), 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show shapes\n", + "print('Training set shape:')\n", + "print(f'X: {X_train_sonar.shape}, y: {y_train_sonar.shape}')\n", + "\n", + "print('\\nTest set shape:')\n", + "print(f'X: {X_test_sonar.shape}, y: {y_test_sonar.shape}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "10fc5d39ccadcfc5eba2b2444d164acd", + "grade": false, + "grade_id": "cell-894756d945f8c7b7", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Now that you've processed the data, let's check how well logistic regression performs without regularization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "w, b, _ = train_logistic_regression(X_train_sonar, y_train_sonar, max_iters=1001, alpha=0.5, loss_freq=100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train acc\n", + "y_hat = logistic_output(X_train_sonar, w, b)\n", + "acc = accuracy(y_train_sonar, classify(y_hat))\n", + "print(f'Train accuracy: {100 * acc:.2f}%')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test acc\n", + "y_hat = logistic_output(X_test_sonar, w, b)\n", + "acc = accuracy(y_test_sonar, classify(y_hat))\n", + "print(f'Test accuracy: {100*acc:.2f}%')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "eb314a60dcc7f59fdb19b4e369a06333", + "grade": false, + "grade_id": "cell-73236922dc5e11f9", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 2.2. Penalized logistic regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "19eecf964deb7cadd1e523878abd1891", + "grade": false, + "grade_id": "cell-2396bd8a75a96a80", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Penalized binary cross-entropy loss" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "e5c881d1e21a4267bbd74ec4b6ba1abc", + "grade": false, + "grade_id": "cell-6b45823dd4942f23", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Logistic regression can overfit when there are too many parameters compared to training examples, as it can find weights for which the decision boundary perfectly separates all the training examples. When such overfitting occurs, the weights are often set to large values. One way to reduce such overfitting is to prevent weights from becoming so large, which can be done using L2 regularization, which consists of changing the training objective to penalize \"large\" weights. The name L2 regularization comes from the fact that weights are penalized using the L2-norm.\n", + "\n", + "We will call this new training objective (new loss function) the penalized binary-cross entropy loss (or PBCE), it is expressed as:\n", + "\n", + "$$\n", + "\\begin{align}\n", + "\\text{PBCE}(\\mathbf{w}, b) &= \\text{BCE}(\\mathbf{w}, b) + \\lambda \\|\\mathbf{w}\\|_{2}^{2} \\\\\n", + "&= [- \\frac{1}{N}\\sum^{N}_{i=1} y^{(i)} \\log(\\hat{y}^{(i)}) + (1-y^{(i)}) \\log(1- \\hat{y}^{(i)}) \\hspace{0.2cm}] + \\lambda\\|\\mathbf{w}\\|_{2}^{2}\n", + "\\end{align}$$\n", + "\n", + "As you can see, this loss function consists of adding a penalty term to BCE, which **penalizes the weights but not the bias term**. The hyper-parameter $\\lambda$ controls overfitting. The larger its value, the more the weights are penalized for being large, which makes the model less flexible." 
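In code, the penalty term is simply $\lambda$ times the sum of squared weights. A small sketch with made-up numbers, showing a few equivalent ways to write it:

```python
import numpy as np

w_example = np.array([0.5, -2.0, 1.5])       # made-up weights
lambda_ = 0.1

penalty = lambda_ * np.sum(w_example ** 2)   # lambda * ||w||_2^2
assert np.isclose(penalty, lambda_ * (w_example @ w_example))
assert np.isclose(penalty, lambda_ * np.linalg.norm(w_example) ** 2)
print(penalty)                               # 0.65
```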
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "1359182e8f7e50e99498c145a38757b0", + "grade": false, + "grade_id": "cell-0a9d69ed8d9e088c", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Task: Implement `penalized_bce_loss()`.**\n", + "\n", + "**Note:** `lambda` is a reserved keyword in Python (used for lambda expressions), so no variables can be named this way. In our function, we use `lambda_` as a variable name instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "0b5deab4bfaa03586497472980f9a65f", + "grade": false, + "grade_id": "cell-85ebdc0cd223d98d", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def penalized_bce_loss(X: np.ndarray, y: np.ndarray, w: np.ndarray, b: float, lambda_: float) -> float:\n", + " \"\"\" Penalized binary cross-entropy loss function\n", + " \n", + " Args:\n", + " X: Dataset of shape (N, d)\n", + " y: Labels of shape (N, )\n", + " w: Weights of logistic regression model of shape (d, )\n", + " b: bias, a scalar\n", + " lambda_: regularization coefficient (named this way as lambda is a reserved keyword in python)\n", + " \n", + " Returns:\n", + " float: binary cross-entropy loss.\n", + " \"\"\"\n", + " ### START CODE HERE ###\n", + " loss = bce_loss(X, y, w, b) + lambda_ * np.linalg.norm(w)**2\n", + " ### END CODE HERE ###\n", + " return loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "9a2bce2a16404bced40425d7175d1cdc", + "grade": true, + "grade_id": "cell-80d627a52390d834", + "locked": true, + "points": 5, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "X = X_train_sonar[:3]\n", + "y = y_train_sonar[:3]\n", + "w = np.ones(X.shape[1])\n", + "b = 1\n", + "\n", + "loss = penalized_bce_loss(X, y, w, b, lambda_=0.1)\n", + "assert(isinstance(loss, float))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "55a08303381c4c2f0cda2aa7a9bede2c", + "grade": false, + "grade_id": "cell-f8e5e022336e7350", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Penalized binary cross-entropy loss gradient" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "83e116e439a9aab883a12d9709aaf3ac", + "grade": false, + "grade_id": "cell-ac4bcb4b3ef034f2", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "In order to use the penalized binary cross-entropy loss, you need its gradient. 
This time, we won't give you the gradient formula: you'll need to figure it out yourself!\n",
+    "\n",
+    "**Task: Figure out the gradient for the penalized binary cross-entropy loss and implement `penalized_bce_gradient()`.**\n",
+    "\n",
+    "**Hint:** Here are two useful resources for matrix calculus:\n",
+    "- http://www.matrixcalculus.org/\n",
+    "- https://en.wikipedia.org/wiki/Matrix_calculus"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Solution\n",
+    "\n",
+    "We need to find the gradient of the PBCE:\n",
+    "$$\\text{PBCE}(\\mathbf{w}, b) = \\text{BCE}(\\mathbf{w}, b) + \\lambda \\|\\mathbf{w}\\|_{2}^{2}$$\n",
+    "\n",
+    "The gradient is a linear operator, so we can consider the gradient of the BCE term separately from\n",
+    "that of the penalization term:\n",
+    "\n",
+    "$$\n",
+    "\\frac{\\partial \\text{PBCE}(\\mathbf{w},b)}{\\partial \\mathbf{w}} =\n",
+    "  \\frac{\\partial \\text{BCE}(\\mathbf{w},b)}{\\partial \\mathbf{w}}\n",
+    "  + \\frac{\\partial}{\\partial \\mathbf{w}} \\lambda \\|\\mathbf{w}\\|_{2}^{2}\n",
+    "$$\n",
+    "\n",
+    "$$\n",
+    "\\frac{\\partial \\text{PBCE}(\\mathbf{w},b)}{\\partial b} =\n",
+    "  \\frac{\\partial \\text{BCE}(\\mathbf{w},b)}{\\partial b}\n",
+    "  + \\frac{\\partial}{\\partial b} \\lambda \\|\\mathbf{w}\\|_{2}^{2}\n",
+    "$$\n",
+    "\n",
+    "We already know the gradient of the BCE term from above:\n",
+    "$$\n",
+    "\\frac{\\partial \\text{BCE}(\\mathbf{w},b)}{\\partial \\mathbf{w}} = \n",
+    "\\frac{1}{N}\\sum^{N}_{i=1}\\boldsymbol{x}^{(i)} (\\hat{y}^{(i)} - y^{(i)})\n",
+    "$$\n",
+    "\n",
+    "$$\n",
+    "\\frac{\\partial \\text{BCE}(\\mathbf{w},b)}{\\partial b} = \n",
+    "\\frac{1}{N}\\sum^{N}_{i=1}(\\hat{y}^{(i)} - y^{(i)})\n",
+    "$$\n",
+    "\n",
+    "The gradient of the penalization term w.r.t. $\\mathbf{w}$ is simply:\n",
+    "$$\\frac{\\partial}{\\partial \\mathbf{w}} \\lambda \\|\\mathbf{w}\\|_{2}^{2} = 2 \\lambda \\mathbf{w}$$\n",
+    "\n",
+    "The penalization doesn't depend on the bias $b$, so the gradient of the penalization w.r.t. 
the\n", + "bias is 0:\n", + "$$\\frac{\\partial}{\\mathbf{b}} \\lambda \\|\\mathbf{w}\\|_{2}^{2} = 0$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "a155fe2378e850f0f8c68609a1bc18e0", + "grade": false, + "grade_id": "cell-6f675c8a2fd8074e", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def penalized_bce_gradient(X: np.ndarray, \n", + " y: np.ndarray, \n", + " w: np.ndarray, \n", + " b: float, \n", + " lambda_: float) -> Tuple[np.ndarray, float]:\n", + " \"\"\" Gradient of the penalized binary-cross entropy loss\n", + " \n", + " Args:\n", + " X: Dataset of shape (N, d)\n", + " y: Labels of shape (N, )\n", + " w: Weights of logistic regression model of shape (d, )\n", + " b: bias, a scalar\n", + " lambda_: regularization coefficient (named this way as lambda is a reserved keyword in python)\n", + " \n", + " Returns:\n", + " dw (np.ndarray) gradient of the loss with respect to w of shape (d, )\n", + " db (float) gradient of the loss with respect to b, a scalar\n", + " \"\"\"\n", + " ### START CODE HERE ###\n", + " dw, db = bce_gradient(X, y, w, b)\n", + " dw = dw + 2 * lambda_ * w\n", + " db = db\n", + " ### END CODE HERE ###\n", + " return dw, db" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "24e7f02d9c434d68cfe5a30f9288f033", + "grade": true, + "grade_id": "cell-9849547dd13fcd6a", + "locked": true, + "points": 10, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "X = X_train_sonar[:3]\n", + "y = y_train_sonar[:3]\n", + "w = np.ones(X.shape[1])\n", + "b = 1\n", + "\n", + "dw, db = penalized_bce_gradient(X, y, w, b, lambda_=0.1)\n", + "assert(dw.shape == w.shape)\n", + "assert(isinstance(db, float))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "728a73a4f3d952c1da5398e29c0c2630", + "grade": false, + "grade_id": "cell-988814e2d1d31b3d", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "da3ffa264a9303b59aeb0965f27c8146", + "grade": false, + "grade_id": "cell-338d8519584845d2", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Now that you've implemented the loss function and its gradient, you can use it to train your model.\n", + "\n", + "**Task: Implement `train_penalized_logistic_regression()`.**\n", + "\n", + "**Hint:** This training function is very similar to the one implemented in Part 1." 
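+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before moving on to the training loop, you can optionally sanity-check your gradient by comparing `penalized_bce_gradient()` with a central finite-difference approximation of `penalized_bce_loss()`. This cell is not graded, and the small batch, seed, and step size below are arbitrary choices."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional numerical gradient check (not graded)\n",
+    "X_check = X_train_sonar[:5]\n",
+    "y_check = y_train_sonar[:5]\n",
+    "w_check = np.random.RandomState(1).normal(size=X_check.shape[1])\n",
+    "b_check, lambda_check, eps = 0.3, 0.1, 1e-6\n",
+    "\n",
+    "# Analytical gradient\n",
+    "dw_an, db_an = penalized_bce_gradient(X_check, y_check, w_check, b_check, lambda_check)\n",
+    "\n",
+    "# Central finite differences w.r.t. each weight\n",
+    "dw_num = np.zeros_like(w_check)\n",
+    "for j in range(w_check.size):\n",
+    "    w_plus, w_minus = w_check.copy(), w_check.copy()\n",
+    "    w_plus[j] += eps\n",
+    "    w_minus[j] -= eps\n",
+    "    dw_num[j] = (penalized_bce_loss(X_check, y_check, w_plus, b_check, lambda_check)\n",
+    "                 - penalized_bce_loss(X_check, y_check, w_minus, b_check, lambda_check)) / (2 * eps)\n",
+    "\n",
+    "# Central finite difference w.r.t. the bias\n",
+    "db_num = (penalized_bce_loss(X_check, y_check, w_check, b_check + eps, lambda_check)\n",
+    "          - penalized_bce_loss(X_check, y_check, w_check, b_check - eps, lambda_check)) / (2 * eps)\n",
+    "\n",
+    "print('max |dw - dw_num|:', np.abs(dw_an - dw_num).max())\n",
+    "print('|db - db_num|    :', abs(db_an - db_num))"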
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "deletable": false,
+    "nbgrader": {
+     "cell_type": "code",
+     "checksum": "973eb9e380b2bc987866f7a32a044cc2",
+     "grade": false,
+     "grade_id": "cell-030119f67fac9804",
+     "locked": false,
+     "schema_version": 3,
+     "solution": true,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def train_penalized_logistic_regression(X: np.ndarray, \n",
+    "                                         y: np.ndarray, \n",
+    "                                         lambda_: float, \n",
+    "                                         max_iters: int = 1001, \n",
+    "                                         alpha: float = 0.5, \n",
+    "                                         loss_freq: int = 0) -> Tuple[np.ndarray, float, dict]:\n",
+    "    \"\"\" Training function for binary class penalized logistic regression using gradient descent\n",
+    "    \n",
+    "    Args:\n",
+    "        X: Dataset of shape (N, d)\n",
+    "        y: Labels of shape (N, )\n",
+    "        lambda_: regularization coefficient (named this way as lambda is a reserved keyword in python)\n",
+    "        max_iters: Maximum number of iterations\n",
+    "        alpha: The learning rate of the gradient step\n",
+    "        loss_freq: Prints the loss every `loss_freq` iterations (0 disables printing)\n",
+    "    \n",
+    "    Returns:\n",
+    "        w: weights of shape (d, )\n",
+    "        b: scalar\n",
+    "        logger: dict used for visualizations\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    # Initialize weights\n",
+    "    np.random.seed(0)\n",
+    "    w = np.random.normal(0, 1, size=(X.shape[1], ))\n",
+    "    b = 0\n",
+    "    \n",
+    "    # Initialize dict with lists to keep track of the loss and accuracy evolution\n",
+    "    logger = {'loss': [], \n",
+    "              'acc': [], \n",
+    "              }\n",
+    "    \n",
+    "    for i in range(max_iters):\n",
+    "        # Compute loss, dw, db and update w and b\n",
+    "        ### START CODE HERE ###\n",
+    "        loss = penalized_bce_loss(X, y, w, b, lambda_)\n",
+    "        dw, db = penalized_bce_gradient(X, y, w, b, lambda_)\n",
+    "        \n",
+    "        w = w - alpha * dw\n",
+    "        b = b - alpha * db\n",
+    "        ### END CODE HERE ###\n",
+    "        \n",
+    "        # Keep track of loss and accuracy values for each iteration\n",
+    "        logger['loss'].append(loss)\n",
+    "        \n",
+    "        y_hat = logistic_output(X, w, b)\n",
+    "        logger['acc'].append(accuracy(y, classify(y_hat)))\n",
+    "        \n",
+    "        if (loss_freq != 0) and i % loss_freq == 0:\n",
+    "            print(f'Loss at iter {i}: {loss:.5f}')\n",
+    "    \n",
+    "    if (loss_freq != 0):\n",
+    "        print('\\nFinal loss: {:.5f}'.format(logger['loss'][-1]))\n",
+    "    \n",
+    "    return w, b, logger"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": false,
+    "editable": false,
+    "nbgrader": {
+     "cell_type": "markdown",
+     "checksum": "5bb9b1ddbb80802a3a324039c5cd1002",
+     "grade": false,
+     "grade_id": "cell-6d4df3995d747ec2",
+     "locked": true,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "source": [
+    "Let's now check that there are no huge mistakes in your implementation by training your model for two different values of $\\lambda$ and checking the training accuracy. Given that regularization reduces overfitting, you should expect the training accuracy to decrease when $\\lambda$ increases."
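+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In addition to the accuracy checks below, you can optionally verify that the penalty really shrinks the weights: the L2 norm of the learned weight vector should be noticeably smaller with $\\lambda = 0.1$ than with $\\lambda = 0$ (this cell is not graded)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: regularization should shrink the learned weights (not graded)\n",
+    "w_unreg, _, _ = train_penalized_logistic_regression(X_train_sonar, y_train_sonar, lambda_=0)\n",
+    "w_reg, _, _ = train_penalized_logistic_regression(X_train_sonar, y_train_sonar, lambda_=0.1)\n",
+    "print(f'||w|| with lambda=0  : {np.linalg.norm(w_unreg):.2f}')\n",
+    "print(f'||w|| with lambda=0.1: {np.linalg.norm(w_reg):.2f}')"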
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "9a38641edd8d7f7c11cd1fb29b89edd9", + "grade": true, + "grade_id": "cell-901ce6821fb221df", + "locked": true, + "points": 3, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "w, b, _ = train_penalized_logistic_regression(X_train_sonar, y_train_sonar, lambda_=0, max_iters=1001, alpha=0.5, loss_freq=100)\n", + "y_hat = logistic_output(X_train_sonar, w, b)\n", + "acc = accuracy(y_train_sonar, classify(y_hat))\n", + "print(f'Train accuracy: {100*acc:.2f}%')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "w, b, _ = train_penalized_logistic_regression(X_train_sonar, y_train_sonar, lambda_=0.1, max_iters=1001, alpha=0.5, loss_freq=100)\n", + "y_hat = logistic_output(X_train_sonar, w, b)\n", + "acc = accuracy(y_train_sonar, classify(y_hat))\n", + "print(f'Train accuracy: {100*acc:.2f}%')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "65204b2051649647e09ebda185901c82", + "grade": false, + "grade_id": "cell-5a09569774f78f7b", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Now that you've implemented penalized logistic regression, you may wonder which value of $\\lambda$ to pick. This is what we'll try to figure out in part 3, using cross-validation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "662575b52f8087423cb56b980f6cdd5d", + "grade": false, + "grade_id": "cell-3a7c5434a60cdae6", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "# Part 3: Cross-validation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "b65e945967c5267becfd9a8e891fb438", + "grade": false, + "grade_id": "cell-e986c697c7149a78", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "In this class, you saw that a dataset is usually split into 3 parts: One training set, one validation set and one test set. The training set is used as training data, the validation set is used for tuning hyper-parameters and the test set is held out for final evaluation. However, by partitioning data this way, we reduce the number of samples available for training the model, and the results on the validation depend on a particular random choice for the training and validation sets. For datasets with a small amount of training examples, this can be especially problematic.\n", + "\n", + "\n", + "This is where cross-validation comes into play. With cross-validation, a test set will still be held out for final evaluation, but there is no need for a designated validation set. Here, you will implement a non-exhaustive cross-validation technique known as k-fold cross-validation." 
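+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To see concretely why relying on a single random split can be misleading, the short (ungraded) experiment below evaluates the same model on a few different random train/validation splits of `X_train_sonar`; the validation accuracy typically varies noticeably from split to split."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional illustration (not graded): the validation accuracy depends on which\n",
+    "# random split is used, which motivates cross-validation\n",
+    "n = X_train_sonar.shape[0]\n",
+    "n_val = n // 4\n",
+    "for seed in range(3):\n",
+    "    perm = np.random.RandomState(seed).permutation(n)\n",
+    "    val_idx, tr_idx = perm[:n_val], perm[n_val:]\n",
+    "    w_s, b_s, _ = train_penalized_logistic_regression(X_train_sonar[tr_idx], y_train_sonar[tr_idx], lambda_=0.1)\n",
+    "    acc_s = accuracy(y_train_sonar[val_idx], classify(logistic_output(X_train_sonar[val_idx], w_s, b_s)))\n",
+    "    print(f'Split {seed}: validation accuracy = {100 * acc_s:.2f}%')"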
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "7bb7901699127361cb9eb0bc6d38ceae", + "grade": false, + "grade_id": "cell-7d4c7380ddc963c2", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 3.1. k-Fold Cross-validation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "be877746cccc18683b023e38c1e13e57", + "grade": false, + "grade_id": "cell-d82878f7c4c982a1", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "In k-fold cross-validation, the training data is randomly partitioned into $k$ equal sized subsamples. Of the $k$ subsamples, a single subsample is used as the validation data, and the remaining $k − 1$ subsamples are used as training data. This process is repeated $k$ times, with each of the $k$ subsamples used exactly once as the validation data. The $k$ results are then averaged to produce a single estimation. \n", + "\n", + "This process is illustrated below:\n", + "\n", + "\"kfold\"\n", + "\n", + "We have implemented the function `k_fold_indices()` for you, which generates indices for k-fold cross-validation. You can see its implementation and an example usage in the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def k_fold_indices(num_examples: int, k: int = 4) -> List[Tuple[np.ndarray, np.ndarray]]:\n", + " \"\"\"Generates indices for k-fold cross-validation\n", + "\n", + " Args:\n", + " num_examples: Number of training examples\n", + " k: Number of folds\n", + "\n", + " Returns:\n", + " List of tuples containing the training indices and validation indices\n", + "\n", + " \"\"\"\n", + " indices = np.arange(num_examples)\n", + " split_size = num_examples // k\n", + " val_indices = [indices[k * split_size : (k + 1) * split_size] for k in range(k)]\n", + " both_indices = [(np.delete(indices, val_ind), val_ind) for val_ind in val_indices]\n", + " return both_indices\n", + "\n", + "# Example usage\n", + "for train_index, val_index in k_fold_indices(num_examples=8, k=4):\n", + " # Do something with the indices\n", + " print(f\"{train_index} {val_index}\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "66ee08aa5f173c1073458147244212ed", + "grade": false, + "grade_id": "cell-ba4885296a47fae1", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Task: With the help of `k_fold_indices()` and previously implemented functions, implement the function `cross_val_penalized_logistic_regression()` according to its documentation.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "4a4f64c94bb8cf423f635f0a03464c10", + "grade": false, + "grade_id": "cell-0f739fc8018fc2d6", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def cross_val_penalized_logistic_regression(X: np.ndarray,\n", + " y: np.ndarray,\n", + " lambda_: float = 0,\n", + " max_iters: int = 1001,\n", + " alpha: float = 0.5,\n", + " loss_freq: int = 0,\n", + " k: int = 4) -> float:\n", + " 
\"\"\"\n", + " Performs k-fold cross-validation for penalized logistic regression and returns the mean validation accuracy\n", + " \n", + " For each fold, train the model and get the validation accuracy. Then, return the mean of all validation accuracies\n", + "\n", + " Args:\n", + " X: Dataset of shape (N, d)\n", + " y: Labels of shape (N, )\n", + " lambda_: regularization coefficient (named this way as lambda is a reserved keyword in python)\n", + " max_iters: Maximum number of iterations\n", + " alpha: The learning rate of the gradient step\n", + " loss_freq : Prints the loss every `loss_freq` iterations\n", + " k: Number of folds\n", + "\n", + " Returns:\n", + " Mean validation accuracy\n", + "\n", + " \"\"\"\n", + " val_accs = []\n", + " \n", + " # Hint: Use a for-loop to iterate over all k-fold indices\n", + " ### START CODE HERE ###\n", + " N = X.shape[0]\n", + " for train_index, val_index in k_fold_indices(N, k):\n", + " X_train = X[train_index]\n", + " y_train = y[train_index]\n", + " X_val = X[val_index]\n", + " y_val = y[val_index]\n", + " w, b, _ = train_penalized_logistic_regression(X_train, y_train, lambda_, max_iters, alpha, loss_freq)\n", + " y_hat = classify(logistic_output(X_val, w, b))\n", + " val_accs.append(accuracy(y_val, y_hat))\n", + " ### END CODE HERE ###\n", + "\n", + " return np.mean(val_accs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "ae8dd73ebde8e617dd7710765239dcae", + "grade": true, + "grade_id": "cell-7374c5c917d19ae3", + "locked": true, + "points": 15, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "mean_cv_acc = cross_val_penalized_logistic_regression(X_train_sonar, y_train_sonar)\n", + "print(f\"Mean CV acc for default settings: {mean_cv_acc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "a6390bbc2a9de93b9c6f64cf75ddbbe6", + "grade": false, + "grade_id": "cell-7cb106f8ed2aba6f", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 3.2. Finding a good regularization parameter" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "5ce1ea5cf8c6a85598c8ae56214afd66", + "grade": false, + "grade_id": "cell-6397799822bc5810", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "You'll now use cross-validation to find a good $\\lambda$ for penalized logistic regression.\n", + "\n", + "**Task: Find the best value of $\\lambda$**\n", + "\n", + "To do so:\n", + "- Suppose that `max_iters` and `alpha` are set to the **default values** of `cross_val_penalized_logistic_regression()`, and that you can only modify the hyper-parameter `lambd`.\n", + "- Compute the 4-fold cross-validation accuracy for $\\lambda \\in \\{0, 0.001, 0.01, 0.1, 1\\}.$\n", + "- Set `best_cv_acc` to the best cross-validation accuracy obtained, and `best_lambda` to the $\\lambda$ associated with the best cross-validation accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "b05ede994a13ae3810945f4859d74619", + "grade": false, + "grade_id": "cell-50cb47cb492159f6", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "# Only search for values in this list, do not modify it\n", + "lambdas = [0, 0.001, 0.01, 0.1, 1]\n", + "k = 4\n", + "\n", + "# cv_accs should contain the mean cross-validation accuracy for each value of lambda\n", + "cv_accs = []\n", + "\n", + "### START CODE HERE ###\n", + "for lambda_ in lambdas:\n", + " cv_accs.append(\n", + " cross_val_penalized_logistic_regression(X_train_sonar, y_train_sonar, lambda_=lambda_)\n", + " )\n", + "### END CODE HERE ###\n", + "\n", + "for lambda_, acc in zip(lambdas, cv_accs):\n", + " print(f\"Lambda: {lambda_}\")\n", + " print(f\"Cross-val acc: {acc}\")\n", + " print() \n", + " \n", + "### START CODE HERE ###\n", + "best_cv_acc = np.max(cv_accs)\n", + "best_lambda = lambdas[np.argmax(cv_accs)]\n", + "### END CODE HERE ###" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "0d1658c5ff395125ff67de211f4ca50c", + "grade": true, + "grade_id": "cell-252967d41a7626da", + "locked": true, + "points": 5, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "print(f\"Best lambda: {best_lambda}\")\n", + "print(f\"Best CV acc: {best_cv_acc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "78e68d41caefca3c01838a7e8754407b", + "grade": false, + "grade_id": "cell-40fa5192703e964f", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Now that you've settled on a value for $\\lambda$, you can use it to train our model using our entire training set, and find out how well it performs on the test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "w, b, _ = train_penalized_logistic_regression(X_train_sonar, y_train_sonar, lambda_=best_lambda, max_iters=1001, alpha=0.5, loss_freq=100)\n", + "\n", + "# Test acc\n", + "print()\n", + "y_hat = logistic_output(X_test_sonar, w, b)\n", + "acc = accuracy(y_test_sonar, classify(y_hat))\n", + "print(f'Test accuracy: {100*acc:.2f}%')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f94f7089bf30660babc979e7602c7591", + "grade": false, + "grade_id": "cell-4323a262e8531393", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Assuming that penalized logistic regression, cross-validation and the hyperparameter search were implemented correctly, your test accuracy should be slightly higher than what was obtained for standard logistic regression in Part 2.1. While this is a modest increase, regularization can lead to much more significant increases in accuracy when working with complex machine learning models such as neural networks." 
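+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, to see the effect of the regularization strength at a glance, you can plot the cross-validation accuracies computed above against $\\lambda$ (this cell is not graded)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional visualization (not graded): mean cross-validation accuracy vs. lambda\n",
+    "fig, ax = plt.subplots(figsize=(6, 4))\n",
+    "ax.plot(range(len(lambdas)), cv_accs, marker='o')\n",
+    "ax.set_xticks(range(len(lambdas)))\n",
+    "ax.set_xticklabels([str(lam) for lam in lambdas])\n",
+    "ax.set_xlabel('lambda')\n",
+    "ax.set_ylabel('Mean 4-fold CV accuracy')\n",
+    "ax.set_title('Effect of L2 regularization strength')\n",
+    "plt.show()"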
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "toc-autonumbering": false, + "toc-showcode": false, + "toc-showmarkdowntxt": false, + "vscode": { + "interpreter": { + "hash": "05f41cfe409499c20c6ade0f680b207fd3596bcf0e119a0dff1094da52c67f55" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}