Skip to content

Commit 446323c

Browse files
author
egor
committed
Checked seq2seq part of pipeline locally
Signed-off-by: egor <[email protected]>
1 parent 14a07a9 commit 446323c

File tree

1 file changed

+52
-45
lines changed

1 file changed

+52
-45
lines changed

notebooks/Name suggestion.ipynb

Lines changed: 52 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
},
2828
{
2929
"cell_type": "code",
30-
"execution_count": 8,
30+
"execution_count": null,
3131
"metadata": {},
3232
"outputs": [],
3333
"source": [
@@ -61,6 +61,8 @@
6161
" MODEL_CONFIG = [\"model\", \"config.yml\"] \n",
6262
" MODEL_PRETRAINED = [\"pretrained\", \"ckpt-25000\"]\n",
6363
" ENC_VAL_NAMES_PRED = [\"val.bpe.pred.tgt\"]\n",
64+
" SAMPLE_ENC_VAL_BODIES = [\"sample_val.bpe.src\"]\n",
65+
" SAMPLE_ENC_VAL_NAMES = [\"sample_val.bpe.tgt\"]\n",
6466
"\n",
6567
" \n",
6668
"class Dirs(DirsABC, Enum):\n",
@@ -278,13 +280,6 @@
278280
" - Y label, a name of the function.\n"
279281
]
280282
},
281-
{
282-
"cell_type": "code",
283-
"execution_count": null,
284-
"metadata": {},
285-
"outputs": [],
286-
"source": []
287-
},
288283
{
289284
"cell_type": "code",
290285
"execution_count": null,
@@ -383,7 +378,9 @@
383378
"execution_count": null,
384379
"metadata": {},
385380
"outputs": [],
386-
"source": []
381+
"source": [
382+
"import pandas as pd"
383+
]
387384
},
388385
{
389386
"cell_type": "code",
@@ -491,26 +488,6 @@
491488
"Get vector representation using the vocabulary from the trained BPE tokenizer, in the format compatible with [OpenNMT](http://opennmt.net/OpenNMT-tf/data.html#vocabulary)."
492489
]
493490
},
494-
{
495-
"cell_type": "markdown",
496-
"metadata": {},
497-
"source": [
498-
"## Save the vocabulary on disk\n",
499-
"\n",
500-
"We'll need only one file, as the same vocabulary will be used for both, identifiers and function names. Different vocabularies can be used without any change to the model e.g the sub-words (BPE) only for identifers and char for the function names."
501-
]
502-
},
503-
{
504-
"cell_type": "code",
505-
"execution_count": null,
506-
"metadata": {},
507-
"outputs": [],
508-
"source": [
509-
"with open(run.path(Files.VOCABULARY), \"w\") as vocab_fd:\n",
510-
" for i in range(vocab_size + 5):\n",
511-
" vocab_fd.write(str(i) + \"\\n\")"
512-
]
513-
},
514491
{
515492
"cell_type": "markdown",
516493
"metadata": {},
@@ -537,7 +514,7 @@
537514
"bpe_encode(run.path(Files.TRAIN_BODIES), run.path(Files.ENC_TRAIN_BODIES))\n",
538515
"bpe_encode(run.path(Files.TRAIN_NAMES), run.path(Files.ENC_TRAIN_NAMES))\n",
539516
"bpe_encode(run.path(Files.VAL_BODIES), run.path(Files.ENC_VAL_BODIES))\n",
540-
"bpe_encode(run.path(Files.VAL_BODIES), run.path(Files.ENC_VAL_NAMES))"
517+
"bpe_encode(run.path(Files.VAL_NAMES), run.path(Files.ENC_VAL_NAMES))"
541518
]
542519
},
543520
{
@@ -557,7 +534,7 @@
557534
"metadata": {},
558535
"outputs": [],
559536
"source": [
560-
"# TODO: src_vocab_loc, tgt_vocab_loc\n",
537+
"import os\n",
561538
"\n",
562539
"# approach requires to provide vocabularies\n",
563540
"# so launch these commands\n",
@@ -567,6 +544,7 @@
567544
" input_text)\n",
568545
"\n",
569546
"if not os.path.exists(run.path(Files.SRC_VOCABULARY)):\n",
547+
" print(\"Generating vocabularies\")\n",
570548
" # in case of pretrained model we reuse vocabulary\n",
571549
" cmd = generate_build_vocab(save_vocab_loc=run.path(Files.SRC_VOCABULARY),\n",
572550
" input_text=run.path(Files.ENC_TRAIN_BODIES),\n",
@@ -585,7 +563,6 @@
585563
"metadata": {},
586564
"outputs": [],
587565
"source": [
588-
"\n",
589566
"model_dir = run.path(Dirs.MODEL_RUN)\n",
590567
"\n",
591568
"# prepare config file for model\n",
@@ -614,7 +591,7 @@
614591
"train:\n",
615592
" # (optional when batch_type=tokens) If not set, the training will search the largest\n",
616593
" # possible batch size.\n",
617-
" batch_size: 256\n",
594+
" batch_size: 32\n",
618595
"\n",
619596
"eval:\n",
620597
" # (optional) The batch size to use (default: 32).\n",
@@ -647,6 +624,15 @@
647624
" f.write(yaml_content)"
648625
]
649626
},
627+
{
628+
"cell_type": "markdown",
629+
"metadata": {},
630+
"source": [
631+
"### small GPU vs CPU comparison:\n",
632+
"* CPU with 4 cores: `source words/s = 104, target words/s = 34`\n",
633+
"* 1080 GPU: `source words/s = 6959, target words/s = 1434`"
634+
]
635+
},
650636
{
651637
"cell_type": "code",
652638
"execution_count": null,
@@ -697,17 +683,28 @@
697683
"metadata": {},
698684
"outputs": [],
699685
"source": [
700-
"bpe_val_predictions = \"val.pred.tgt\"\n",
701-
"\n",
686+
"# limit number of samples to process\n",
687+
"!head -50 {run.path(Files.ENC_VAL_BODIES)} > {run.path(Files.SAMPLE_ENC_VAL_BODIES)}\n",
688+
"!head -50 {run.path(Files.ENC_VAL_NAMES)} > {run.path(Files.SAMPLE_ENC_VAL_NAMES)}"
689+
]
690+
},
691+
{
692+
"cell_type": "code",
693+
"execution_count": null,
694+
"metadata": {},
695+
"outputs": [],
696+
"source": [
702697
"predict_cmd = \"\"\"onmt-main \\\n",
703-
"--config %s --auto_config \\\n",
698+
"--config %s --auto_config --model_type LuongAttention \\\n",
699+
"--checkpoint_path %s \\\n",
704700
"infer \\\n",
705701
"--features_file %s \\\n",
706-
"--predictions_file %s \\\n",
707-
"--checkpoint_path %s\"\"\" % (config_yaml, \n",
708-
" run.path(Files.ENC_VAL_BODIES), \n",
702+
"--predictions_file %s\n",
703+
"\"\"\" % (config_yaml, pretrained_model,\n",
704+
" run.path(Files.SAMPLE_ENC_VAL_BODIES), \n",
709705
" run.path(Files.ENC_VAL_NAMES_PRED),\n",
710-
" pretrained_model)"
706+
" )\n",
707+
"! {predict_cmd}"
711708
]
712709
},
713710
{
@@ -731,7 +728,7 @@
731728
"outputs": [],
732729
"source": [
733730
"gt_ids = []\n",
734-
"with open(run.path(Files.ENC_VAL_NAMES), \"r\") as f:\n",
731+
"with open(run.path(Files.SAMPLE_ENC_VAL_NAMES), \"r\") as f:\n",
735732
" for i, line in enumerate(f.readlines()):\n",
736733
" gt_ids.append(list(map(int, line.split())))\n",
737734
"gt_val_function_names = bpe.decode(gt_ids)"
@@ -750,10 +747,20 @@
750747
"metadata": {},
751748
"outputs": [],
752749
"source": [
753-
"for i, (a, b) in enumerate(zip(gt_function_names, predicted_function_names)):\n",
754-
" if i == 100:\n",
755-
" break\n",
756-
" print(\"%s | %s\" % (a, b)) "
750+
"for gt_name, pred_name in zip(gt_val_function_names, pred_val_function_names):\n",
751+
" print(\"%s | %s\" % (gt_name, pred_name)) "
752+
]
753+
},
754+
{
755+
"cell_type": "markdown",
756+
"metadata": {},
757+
"source": [
758+
"# Results may be not so good because a lot of context information is missing\n",
759+
"* roles of identifiers\n",
760+
"* structural information was removed\n",
761+
"* arguments to function\n",
762+
"\n",
763+
"and so on. There are a bunch of improvements possible, like [code2vec](https://github.com/tech-srl/code2vec) and many more."
757764
]
758765
}
759766
],

0 commit comments

Comments
 (0)