{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Online RNN network to predict first names of us babies" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import tensorflow as tf\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "from tensorflow.keras import activations\n", "import matplotlib.pyplot as plt\n", "from sklearn.model_selection import train_test_split\n", "import random\n", "\n", "import sys\n", "sys.path.append('src')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import csv\n", "\n", "with open('database/StateNames.csv', newline='') as csvfile:\n", " data = list(csv.reader(csvfile))\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Id', 'Name', 'Year', 'Gender', 'State', 'Count']\n", "1910\n", "['Mary', 'Annie', 'Anna', 'Margaret', 'Helen', 'Elsie', 'Lucy', 'Dorothy', 'Mary', 'Margaret', 'Ruth', 'Annie', 'Elizabeth', 'Helen', 'Mary', 'Elsie', 'Agnes', 'Anna', 'Helen', 'Louise', 'Jean', 'Ruth', 'Alice', 'Esther', 'Ethel', 'Margaret', 'Marie', 'Mary', 'Elizabeth', 'Margaret', 'Helen', 'Alice', 'Annie', 'Louise', 'Mary', 'Margaret', 'Alice', 'Annie', 'Elizabeth', 'Helen', 'Clara', 'Dorothy', 'Martha', 'Agnes', 'Esther', 'Frances', 'Julia', 'Lillian', 'Mildred', 'Pauline', 'Mary', 'Alice', 'Helen', 'Anna', 'Dorothy', 'Margaret', 'Ruth', 'Elizabeth', 'Frances', 'Katherine', 'Martha', 'Annie', 'Clara', 'Emma', 'Hazel', 'Julia', 'Marie', 'Minnie', 'Pauline', 'Mary', 'Helen', 'Dorothy', 'Frances', 'Ruth', 'Alice', 'Anna', 'Agnes', 'Annie', 'Julia', 'Lillian', 'Margaret', 'Daisy', 'Edna', 'Esther', 'Evelyn', 'Florence', 'Katherine', 'Louise', 'Lucy', 'Marie', 'Sally', 'Mary', 'Margaret', 'Helen', 'Frances', 'Alice', 'Olga', 'Ruth', 
'Clara']\n", "['14', '12', '10', '8', '7', '6', '6', '5', '12', '7', '7', '6', '6', '6', '9', '8', '7', '7', '7', '7', '6', '6', '5', '5', '5', '5', '5', '21', '9', '8', '7', '5', '5', '5', '22', '10', '8', '8', '7', '7', '6', '6', '6', '5', '5', '5', '5', '5', '5', '5', '23', '13', '12', '10', '10', '9', '9', '7', '6', '6', '6', '5', '5', '5', '5', '5', '5', '5', '5', '18', '12', '8', '8', '8', '7', '7', '6', '6', '6', '6', '6', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '21', '11', '10', '9', '7', '7', '7', '6']\n" ] } ], "source": [ "print(data[0])\n", "print(data[2][2])\n", "# data starts in the year 1910\n", "print([entry[1] for entry in data[1:100] if int(entry[2]) < 1940])\n", "print([entry[5] for entry in data[1:100] if int(entry[2]) < 1940])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Build prompt/answer training samples: each sample lists the person's\n", "# fields (name, gender, state) in a random order, each prefixed by a\n", "# randomly chosen synonym label, followed by the expected answer (the name).\n", "name_field = [\"name\", \"last name\", \"family name\", \"nimi\", \"nom\"]\n", "gender_field = [\"sex\", \"gender\", \"geschlecht\", \"man/woman\"]\n", "location_field = [\"place\", \"address\", \"city\", \"origin\"]\n", "\n", "fields = [name_field, gender_field, location_field]\n", "fields_nb = [1, 3, 4]  # CSV column index of name, gender, state\n", "order = [[1, 2, 3], [1, 3, 2], [2, 1, 3], [2, 3, 1], [3, 1, 2], [3, 2, 1]]\n", "\n", "x_data = []\n", "y_data = []\n", "\n", "for cnt in range(1, 10000):  # start at 1 to skip the CSV header row\n", "    question = \"\\n\"\n", "\n", "    order_loc = random.choice(order)  # random field order for this sample\n", "\n", "    for i in order_loc:\n", "        # random synonym as field label, then the actual field value\n", "        question += random.choice(fields[i-1])\n", "        question += \" \"\n", "        question += data[cnt][fields_nb[i-1]]\n", "        question += \"\\n\"\n", "\n", "    answer = data[cnt][fields_nb[0]] + \"\\n\"  # target: the name column\n", "\n", "    x_data.append(question + answer)\n", "    y_data.append(answer)\n", "\n", "with open('my_person_database.txt', 'w') as f:\n", "    for item in x_data:\n", "        f.write(\"%s\" % item)\n" ] }, { "cell_type": 
"code", "execution_count": 22, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading files\n", "Writing my_person_database.npz\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", "C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 
"C:\\Users\\gantenbe\\Anaconda3\\envs\\gpt2\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", "\n", " 0%| | 0/1 [00:00 10:\n", " break\n", " \n", " one_hot_gen = to_one_hot([gen_name])\n", "\n", " x_gen, y_gen = to_sequence(one_hot_gen)\n", " \n", " shift = len(gen_name)\n", " if shift > x_gen.shape[1]:\n", " pos = -1\n", " else:\n", " pos = shift-(x_gen.shape[1]+1)\n", " start_seq = x_gen[pos]\n", " start_seq = np.expand_dims(start_seq, axis=0)\n", " # print(start_seq) \n", " \n", "for _ in range(10):\n", " generate_word()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start_seq = np.zeros((1,x_train.shape[1],27))\n", "# set every where spaces\n", "start_seq[:, 0] = 1\n", "gen_name = \"\"\n", "\n", "gen_vec = model.predict(start_seq)\n", "print(gen_vec)\n", "print(gen_vec.shape)\n", "gen_letter = from_one_hot(gen_vec, \"max\")\n", "print(gen_letter)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start_seq = np.zeros((1,x_train.shape[1],27))\n", "y_pred = model.predict(start_seq)\n", "print(from_one_hot(y_pred, \"proba\"))\n", "print(y_pred)\n", "print(start_seq.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_pred = model.predict(x_test[0:1])\n", "print(y_pred.shape)\n", "print(x_test.shape)\n", "print(x_test[0:1].shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# y_pred = model.predict(x_test)\n", "print(y_pred.shape)\n", "print(y_pred[1,:])\n", "print(np.argmax(y_pred[1,1:]))\n", "print([np.argmax(li[:]) for li in y_pred[:]])\n", "print([np.argmax(li[:]) for li in y_data[:]])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 
Convert one-hot encoded vectors back to strings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pred_num_letter = [np.argmax(li[:]) for li in y_pred[:]]\n", "pred_letter = [int_to_char[num_letter] for num_letter in pred_num_letter]\n", "\n", "print(pred_letter)\n", "\n", "pred_num_letter = [np.argmax(li[:]) for li in y_data[:]]\n", "pred_letter = [int_to_char[num_letter] for num_letter in pred_num_letter]\n", "\n", "print(pred_letter)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "tf.linalg.matmul(embedded_vector,tf.linalg.pinv(embedding_layer.weights[0]))\n", "\n", "# https://stackoverflow.com/questions/45773660/reverse-word-embeddings-in-keras-python" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.12" } }, "nbformat": 4, "nbformat_minor": 4 }