{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Online RNN network to predict first names of us babies" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import tensorflow as tf\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "from tensorflow.keras import activations\n", "import matplotlib.pyplot as plt\n", "from sklearn.model_selection import train_test_split\n", "import random\n", "\n", "import sys\n", "sys.path.append('src')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import csv\n", "\n", "with open('database/StateNames.csv', newline='') as csvfile:\n", " data = list(csv.reader(csvfile))\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Id', 'Name', 'Year', 'Gender', 'State', 'Count']\n", "1910\n", "['Mary', 'Annie', 'Anna', 'Margaret', 'Helen', 'Elsie', 'Lucy', 'Dorothy', 'Mary', 'Margaret', 'Ruth', 'Annie', 'Elizabeth', 'Helen', 'Mary', 'Elsie', 'Agnes', 'Anna', 'Helen', 'Louise', 'Jean', 'Ruth', 'Alice', 'Esther', 'Ethel', 'Margaret', 'Marie', 'Mary', 'Elizabeth', 'Margaret', 'Helen', 'Alice', 'Annie', 'Louise', 'Mary', 'Margaret', 'Alice', 'Annie', 'Elizabeth', 'Helen', 'Clara', 'Dorothy', 'Martha', 'Agnes', 'Esther', 'Frances', 'Julia', 'Lillian', 'Mildred', 'Pauline', 'Mary', 'Alice', 'Helen', 'Anna', 'Dorothy', 'Margaret', 'Ruth', 'Elizabeth', 'Frances', 'Katherine', 'Martha', 'Annie', 'Clara', 'Emma', 'Hazel', 'Julia', 'Marie', 'Minnie', 'Pauline', 'Mary', 'Helen', 'Dorothy', 'Frances', 'Ruth', 'Alice', 'Anna', 'Agnes', 'Annie', 'Julia', 'Lillian', 'Margaret', 'Daisy', 'Edna', 'Esther', 'Evelyn', 'Florence', 'Katherine', 'Louise', 'Lucy', 'Marie', 'Sally', 'Mary', 'Margaret', 'Helen', 'Frances', 'Alice', 'Olga', 'Ruth', 
'Clara']\n", "['14', '12', '10', '8', '7', '6', '6', '5', '12', '7', '7', '6', '6', '6', '9', '8', '7', '7', '7', '7', '6', '6', '5', '5', '5', '5', '5', '21', '9', '8', '7', '5', '5', '5', '22', '10', '8', '8', '7', '7', '6', '6', '6', '5', '5', '5', '5', '5', '5', '5', '23', '13', '12', '10', '10', '9', '9', '7', '6', '6', '6', '5', '5', '5', '5', '5', '5', '5', '5', '18', '12', '8', '8', '8', '7', '7', '6', '6', '6', '6', '6', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '21', '11', '10', '9', '7', '7', '7', '6']\n" ] } ], "source": [ "print(data[0])\n", "print(data[2][2])\n", "# data starts in the year 1910\n", "print([entry[1] for entry in data[1:100] if int(entry[2]) < 1940])\n", "print([entry[5] for entry in data[1:100] if int(entry[2]) < 1940])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Build prompt/answer training samples: each sample lists the person's\n", "# fields (name, gender, state) in a random order, each prefixed by a\n", "# randomly chosen synonym label, followed by the expected answer (the name).\n", "name_field = [\"name\", \"last name\", \"family name\", \"nimi\", \"nom\"]\n", "gender_field = [\"sex\", \"gender\", \"geschlecht\", \"man/woman\"]\n", "location_field = [\"place\", \"address\", \"city\", \"origin\"]\n", "\n", "fields = [name_field, gender_field, location_field]\n", "fields_nb = [1, 3, 4]  # CSV column index of name, gender, state\n", "order = [[1, 2, 3], [1, 3, 2], [2, 1, 3], [2, 3, 1], [3, 1, 2], [3, 2, 1]]\n", "\n", "x_data = []\n", "y_data = []\n", "\n", "for cnt in range(1, 10000):  # start at 1 to skip the CSV header row\n", "    question = \"\\n\"\n", "\n", "    order_loc = random.choice(order)  # random field order for this sample\n", "\n", "    for i in order_loc:\n", "        # random synonym as field label, then the actual field value\n", "        question += random.choice(fields[i-1])\n", "        question += \" \"\n", "        question += data[cnt][fields_nb[i-1]]\n", "        question += \"\\n\"\n", "\n", "    answer = data[cnt][fields_nb[0]] + \"\\n\"  # target: the name column\n", "\n", "    x_data.append(question + answer)\n", "    y_data.append(answer)\n", "\n", "with open('my_person_database.txt', 'w') as f:\n", "    for item in x_data:\n", "        f.write(\"%s\" % item)\n" ] }, { "cell_type": 
"code", "execution_count": 22, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading files\n", "Writing my_person_database.npz\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 
"C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", "\n", " 0%| | 0/1 [00:00 10:\n", " break\n", " \n", " one_hot_gen = to_one_hot([gen_name])\n", "\n", " x_gen, y_gen = to_sequence(one_hot_gen)\n", " \n", " shift = len(gen_name)\n", " if shift > x_gen.shape[1]:\n", " pos = -1\n", " else:\n", " pos = shift-(x_gen.shape[1]+1)\n", " start_seq = x_gen[pos]\n", " start_seq = np.expand_dims(start_seq, axis=0)\n", " # print(start_seq) \n", " \n", "for _ in range(10):\n", " generate_word()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start_seq = np.zeros((1,x_train.shape[1],27))\n", "# set every where spaces\n", "start_seq[:, 0] = 1\n", "gen_name = \"\"\n", "\n", "gen_vec = model.predict(start_seq)\n", "print(gen_vec)\n", "print(gen_vec.shape)\n", "gen_letter = from_one_hot(gen_vec, \"max\")\n", "print(gen_letter)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start_seq = np.zeros((1,x_train.shape[1],27))\n", "y_pred = model.predict(start_seq)\n", "print(from_one_hot(y_pred, \"proba\"))\n", "print(y_pred)\n", "print(start_seq.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_pred = model.predict(x_test[0:1])\n", "print(y_pred.shape)\n", "print(x_test.shape)\n", "print(x_test[0:1].shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# y_pred = model.predict(x_test)\n", "print(y_pred.shape)\n", "print(y_pred[1,:])\n", "print(np.argmax(y_pred[1,1:]))\n", "print([np.argmax(li[:]) for li in y_pred[:]])\n", "print([np.argmax(li[:]) for li in y_data[:]])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 
Convert one-hot encoded vectors back to strings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pred_num_letter = [np.argmax(li[:]) for li in y_pred[:]]\n", "pred_letter = [int_to_char[num_letter] for num_letter in pred_num_letter]\n", "\n", "print(pred_letter)\n", "\n", "pred_num_letter = [np.argmax(li[:]) for li in y_data[:]]\n", "pred_letter = [int_to_char[num_letter] for num_letter in pred_num_letter]\n", "\n", "print(pred_letter)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "tf.linalg.matmul(embedded_vector,tf.linalg.pinv(embedding_layer.weights[0]))\n", "\n", "# https://stackoverflow.com/questions/45773660/reverse-word-embeddings-in-keras-python" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.12" } }, "nbformat": 4, "nbformat_minor": 4 }