97 | 97 | },
98 | 98 | "outputs": [],
99 | 99 | "source": [
100 |     | - "import torch \n",
    | 100 | + "import torch\n",
101 | 101 | "\n",
102 | 102 | "# Check if CUDA is available\n",
103 | 103 | "device = torch.device('cpu')\n",

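This hunk only drops the trailing space after `import torch`. The CUDA check announced by the comment continues past the hunk's edge; its usual completion (a sketch, not necessarily the exact cell contents) is:

```python
import torch

# Check if CUDA is available; fall back to the CPU otherwise
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')
print(f"Using device = {device}")
```
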
138 | 138 | },
139 | 139 | "outputs": [],
140 | 140 | "source": [
141 |     | - "import string \n",
    | 141 | + "import string\n",
142 | 142 | "import unicodedata\n",
143 | 143 | "\n",
144 |     | - "allowed_characters = string.ascii_letters + \" .,;'\"\n",
145 |     | - "n_letters = len(allowed_characters) \n",
    | 144 | + "# We can use \"_\" to represent an out-of-vocabulary character, that is, any character we are not handling in our model\n",
    | 145 | + "allowed_characters = string.ascii_letters + \" .,;'\" + \"_\"\n",
    | 146 | + "n_letters = len(allowed_characters)\n",
146 | 147 | "\n",
147 |     | - "# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427 \n",
    | 148 | + "# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427\n",
148 | 149 | "def unicodeToAscii(s):\n",
149 | 150 | "    return ''.join(\n",
150 | 151 | "        c for c in unicodedata.normalize('NFD', s)\n",

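This is the substantive change in the diff: a `"_"` slot is appended to the vocabulary, so `n_letters` grows from 57 to 58. The tail of `unicodeToAscii` falls outside the hunk; the sketch below completes it the way the linked Stack Overflow answer does, which may differ in detail from the actual cell:

```python
import string
import unicodedata

allowed_characters = string.ascii_letters + " .,;'" + "_"
n_letters = len(allowed_characters)

def unicodeToAscii(s):
    # NFD-decompose, then keep only characters the model knows about;
    # this drops the combining marks (category 'Mn') left over from accents
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in allowed_characters
    )

print(n_letters)                    # 58 = 52 letters + " .,;'" + "_"
print(unicodeToAscii('Ślusàrski'))  # Slusarski
```
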
203 | 204 | "source": [
204 | 205 | "# Find letter index from all_letters, e.g. \"a\" = 0\n",
205 | 206 | "def letterToIndex(letter):\n",
206 |     | - "    return allowed_characters.find(letter)\n",
    | 207 | + "    # return our out-of-vocabulary character if we encounter a letter unknown to our model\n",
    | 208 | + "    if letter not in allowed_characters:\n",
    | 209 | + "        return allowed_characters.find(\"_\")\n",
    | 210 | + "    else:\n",
    | 211 | + "        return allowed_characters.find(letter)\n",
207 | 212 | "\n",
208 | 213 | "# Turn a line into a <line_length x 1 x n_letters>,\n",
209 | 214 | "# or an array of one-hot letter vectors\n",

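`letterToIndex` now routes unknown characters to the `"_"` slot. This matters because `str.find` returns -1 for a missing character, and with a one-hot builder such as the sketch below, index -1 would silently set the *last* vocabulary position instead of failing. A quick check, assuming the definitions above; the tutorial's `lineToTensor` body is not shown in this hunk, so this version is only a plausible reconstruction of the `<line_length x 1 x n_letters>` comment:

```python
import torch

# a minimal lineToTensor matching the shape described above (an assumption,
# not a quote from the notebook)
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

print(letterToIndex('a'))         # 0
print(letterToIndex('ñ'))         # 57 -- unknown characters map to "_"
print(lineToTensor('Ahn').shape)  # torch.Size([3, 1, 58])
```
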
261 | 266 | "from io import open\n",
262 | 267 | "import glob\n",
263 | 268 | "import os\n",
264 |     | - "import time \n",
    | 269 | + "import time\n",
265 | 270 | "\n",
266 | 271 | "import torch\n",
267 | 272 | "from torch.utils.data import Dataset\n",

270 | 275 | "\n",
271 | 276 | "    def __init__(self, data_dir):\n",
272 | 277 | "        self.data_dir = data_dir #for provenance of the dataset\n",
273 |     | - "        self.load_time = time.localtime #for provenance of the dataset \n",
    | 278 | + "        self.load_time = time.localtime #for provenance of the dataset\n",
274 | 279 | "        labels_set = set() #set of all classes\n",
275 | 280 | "\n",
276 | 281 | "        self.data = []\n",
277 | 282 | "        self.data_tensors = []\n",
278 |     | - "        self.labels = [] \n",
279 |     | - "        self.labels_tensors = [] \n",
    | 283 | + "        self.labels = []\n",
    | 284 | + "        self.labels_tensors = []\n",
280 | 285 | "\n",
281 | 286 | "        #read all the ``.txt`` files in the specified directory\n",
282 |     | - "        text_files = glob.glob(os.path.join(data_dir, '*.txt')) \n",
    | 287 | + "        text_files = glob.glob(os.path.join(data_dir, '*.txt'))\n",
283 | 288 | "        for filename in text_files:\n",
284 | 289 | "            label = os.path.splitext(os.path.basename(filename))[0]\n",
285 | 290 | "            labels_set.add(label)\n",
286 | 291 | "            lines = open(filename, encoding='utf-8').read().strip().split('\\n')\n",
287 |     | - "            for name in lines: \n",
    | 292 | + "            for name in lines:\n",
288 | 293 | "                self.data.append(name)\n",
289 | 294 | "                self.data_tensors.append(lineToTensor(name))\n",
290 | 295 | "                self.labels.append(label)\n",
291 | 296 | "\n",
292 |     | - "        #Cache the tensor representation of the labels \n",
    | 297 | + "        #Cache the tensor representation of the labels\n",
293 | 298 | "        self.labels_uniq = list(labels_set)\n",
294 | 299 | "        for idx in range(len(self.labels)):\n",
295 | 300 | "            temp_tensor = torch.tensor([self.labels_uniq.index(self.labels[idx])], dtype=torch.long)\n",

302 | 307 | "        data_item = self.data[idx]\n",
303 | 308 | "        data_label = self.labels[idx]\n",
304 | 309 | "        data_tensor = self.data_tensors[idx]\n",
305 |     | - "        label_tensor = self.labels_tensors[idx] \n",
    | 310 | + "        label_tensor = self.labels_tensors[idx]\n",
306 | 311 | "\n",
307 | 312 | "        return label_tensor, data_tensor, data_label, data_item"
308 | 313 | ]

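`__getitem__` returns a four-tuple, tensors first, so training code can unpack everything in one line. A hedged usage sketch; the directory name, the `NamesDataset` class name, and the split fractions follow the published tutorial rather than anything visible in this diff, and `len(alldata)` assumes the usual `__len__` implementation:

```python
import torch

# hypothetical usage: "data/names" holds one <Language>.txt file per class
alldata = NamesDataset("data/names")
print(f"loaded {len(alldata)} items of data")
print(alldata[0])  # (label_tensor, data_tensor, data_label, data_item)

# an 85/15 train/test split, seeded for reproducibility
train_set, test_set = torch.utils.data.random_split(
    alldata, [.85, .15], generator=torch.Generator().manual_seed(2024))
```
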
402 | 407 | "        self.rnn = nn.RNN(input_size, hidden_size)\n",
403 | 408 | "        self.h2o = nn.Linear(hidden_size, output_size)\n",
404 | 409 | "        self.softmax = nn.LogSoftmax(dim=1)\n",
405 |     | - " \n",
    | 410 | + "\n",
406 | 411 | "    def forward(self, line_tensor):\n",
407 | 412 | "        rnn_out, hidden = self.rnn(line_tensor)\n",
408 | 413 | "        output = self.h2o(hidden[0])\n",

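Why `hidden[0]` rather than `rnn_out`: `nn.RNN` returns the per-step outputs plus a final hidden state of shape `(num_layers, batch, hidden_size)`, so `[0]` selects the single layer's final state, i.e. a summary of the whole name. A shape check with the tutorial's sizes:

```python
import torch
import torch.nn as nn

rnn_layer = nn.RNN(58, 128)      # input_size=58, hidden_size=128
name = torch.zeros(6, 1, 58)     # a 6-letter name as <6 x 1 x 58>
rnn_out, hidden = rnn_layer(name)
print(rnn_out.shape)    # torch.Size([6, 1, 128]) -- one output per letter
print(hidden.shape)     # torch.Size([1, 1, 128]) -- final state, one layer
print(hidden[0].shape)  # torch.Size([1, 128])    -- what h2o receives
```
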
415 | 420 | "cell_type": "markdown",
416 | 421 | "metadata": {},
417 | 422 | "source": [
418 |     | - "We can then create an RNN with 57 input nodes, 128 hidden nodes, and 18\n",
    | 423 | + "We can then create an RNN with 58 input nodes, 128 hidden nodes, and 18\n",
419 | 424 | "outputs:\n"
420 | 425 | ]
421 | 426 | },

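The 57→58 fix keeps the markdown in sync with the enlarged vocabulary: 52 ASCII letters, the five characters in `" .,;'"`, and the new `"_"`. The creation cell itself is not part of this diff; assuming the module above is named `CharRNN` as in the published tutorial, it would look like:

```python
n_hidden = 128
rnn = CharRNN(n_letters, n_hidden, len(alldata.labels_uniq))  # 58 -> 128 -> 18
print(rnn)
```
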
456 | 461 | "\n",
457 | 462 | "input = lineToTensor('Albert')\n",
458 | 463 | "output = rnn(input) #this is equivalent to ``output = rnn.forward(input)``\n",
459 |     | - "print(output) \n",
    | 464 | + "print(output)\n",
460 | 465 | "print(label_from_output(output, alldata.labels_uniq))"
461 | 466 | ]
462 | 467 | },

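`label_from_output` is called here but defined in a cell these hunks skip. A standard argmax-style definition (an assumption, not a quote from the notebook):

```python
def label_from_output(output, output_labels):
    # take the class with the highest log-probability
    top_n, top_i = output.topk(1)
    label_i = top_i[0].item()
    return output_labels[label_i], label_i
```
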
494 | 499 | },
495 | 500 | "outputs": [],
496 | 501 | "source": [
497 |     | - "import random \n",
498 |     | - "import numpy as np \n",
    | 502 | + "import random\n",
    | 503 | + "import numpy as np\n",
499 | 504 | "\n",
500 | 505 | "def train(rnn, training_data, n_epoch = 10, n_batch_size = 64, report_every = 50, learning_rate = 0.2, criterion = nn.NLLLoss()):\n",
501 | 506 | "    \"\"\"\n",

504 | 509 | "    # Keep track of losses for plotting\n",
505 | 510 | "    current_loss = 0\n",
506 | 511 | "    all_losses = []\n",
507 |     | - "    rnn.train() \n",
    | 512 | + "    rnn.train()\n",
508 | 513 | "    optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)\n",
509 | 514 | "\n",
510 | 515 | "    start = time.time()\n",
511 | 516 | "    print(f\"training on data set with n = {len(training_data)}\")\n",
512 | 517 | "\n",
513 |     | - "    for iter in range(1, n_epoch + 1): \n",
514 |     | - "        rnn.zero_grad() # clear the gradients \n",
    | 518 | + "    for iter in range(1, n_epoch + 1):\n",
    | 519 | + "        rnn.zero_grad() # clear the gradients\n",
515 | 520 | "\n",
516 | 521 | "        # create some minibatches\n",
517 | 522 | "        # we cannot use dataloaders because each of our names is a different length\n",
518 | 523 | "        batches = list(range(len(training_data)))\n",
519 | 524 | "        random.shuffle(batches)\n",
520 | 525 | "        batches = np.array_split(batches, len(batches) //n_batch_size )\n",
521 | 526 | "\n",
522 |     | - "        for idx, batch in enumerate(batches): \n",
    | 527 | + "        for idx, batch in enumerate(batches):\n",
523 | 528 | "            batch_loss = 0\n",
524 | 529 | "            for i in batch: #for each example in this batch\n",
525 | 530 | "                (label_tensor, text_tensor, label, text) = training_data[i]\n",

534 | 539 | "            optimizer.zero_grad()\n",
535 | 540 | "\n",
536 | 541 | "            current_loss += batch_loss.item() / len(batch)\n",
537 |     | - " \n",
    | 542 | + "\n",
538 | 543 | "        all_losses.append(current_loss / len(batches) )\n",
539 | 544 | "        if iter % report_every == 0:\n",
540 | 545 | "            print(f\"{iter} ({iter / n_epoch:.0%}): \\t average batch loss = {all_losses[-1]}\")\n",
541 | 546 | "        current_loss = 0\n",
542 |     | - " \n",
    | 547 | + "\n",
543 | 548 | "    return all_losses"
544 | 549 | ]
545 | 550 | },

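A hedged call sketch; the hyperparameters are illustrative (the signature above defaults to `n_epoch=10`, `learning_rate=0.2`), and plotting the returned losses assumes matplotlib as elsewhere in the tutorial:

```python
import matplotlib.pyplot as plt

# hypothetical hyperparameters for a quicker, chattier run
all_losses = train(rnn, train_set, n_epoch=27, learning_rate=0.15, report_every=5)

plt.figure()
plt.plot(all_losses)  # one point per epoch: the average batch loss
plt.show()
```
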
617 | 622 | "source": [
618 | 623 | "def evaluate(rnn, testing_data, classes):\n",
619 | 624 | "    confusion = torch.zeros(len(classes), len(classes))\n",
620 |     | - " \n",
    | 625 | + "\n",
621 | 626 | "    rnn.eval() #set to eval mode\n",
622 | 627 | "    with torch.no_grad(): # do not record the gradients during eval phase\n",
623 | 628 | "        for i in range(len(testing_data)):\n",
624 | 629 | "            (label_tensor, text_tensor, label, text) = testing_data[i]\n",
625 |     | - "            output = rnn(text_tensor) \n",
    | 630 | + "            output = rnn(text_tensor)\n",
626 | 631 | "            guess, guess_i = label_from_output(output, classes)\n",
627 | 632 | "            label_i = classes.index(label)\n",
628 | 633 | "            confusion[label_i][guess_i] += 1\n",

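The hunk stops mid-function; in the published tutorial the confusion counts are then row-normalized and drawn with `matshow`. A standalone sketch of the normalization step on a toy 3-class matrix:

```python
import torch

# toy confusion matrix: rows = true class, columns = guessed class
confusion = torch.tensor([[8., 1., 1.],
                          [2., 6., 2.],
                          [0., 3., 7.]])
# normalize each row so counts become per-class rates
confusion = confusion / confusion.sum(1, keepdim=True)
print(confusion[0])  # tensor([0.8000, 0.1000, 0.1000]) -- class 0 is 80% correct
```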