|
27 | 27 | }, |
28 | 28 | { |
29 | 29 | "cell_type": "code", |
30 | | - "execution_count": null, |
| 30 | + "execution_count": 8, |
31 | 31 | "metadata": {}, |
32 | 32 | "outputs": [], |
33 | 33 | "source": [ |
|
58 | 58 | " ENC_VAL_NAMES = [\"val.bpe.tgt\"]\n", |
59 | 59 | " TGT_VOCABULARY = [\"tgt.vocab\"]\n", |
60 | 60 | " SRC_VOCABULARY = [\"src.vocab\"]\n", |
| 61 | + " MODEL_CONFIG = [\"model\", \"config.yml\"] \n", |
| 62 | + " MODEL_PRETRAINED = [\"pretrained\", \"ckpt-25000\"]\n", |
| 63 | + " ENC_VAL_NAMES_PRED = [\"val.bpe.pred.tgt\"]\n", |
61 | 64 | "\n", |
62 | 65 | " \n", |
63 | 66 | "class Dirs(DirsABC, Enum):\n", |
64 | 67 | " TF_MODELS = [\"tf\", \"models\"]\n", |
| 68 | + " MODEL_RUN = [\"model\", \"run\"]\n", |
65 | 69 | "\n", |
66 | 70 | "run = Run(\"name-suggestion\", \"java-full\")\n", |
67 | 71 | "\n", |
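The new `MODEL_CONFIG`, `MODEL_PRETRAINED`, `ENC_VAL_NAMES_PRED` and `MODEL_RUN` entries follow the notebook's path-registry pattern: each enum member stores a list of path components, and `run.path(...)` resolves them against the run's base directory. A minimal sketch of that pattern, assuming the real `Run` and `DirsABC`-style helpers behave roughly like this (the `data/<task>/<dataset>` layout below is an assumption, not the notebook's actual implementation):

```python
# Hypothetical sketch of the Run / path-registry pattern; the real helpers are
# imported earlier in the notebook and may differ in detail.
import os
from enum import Enum

class Run:
    def __init__(self, task: str, dataset: str, base_dir: str = "data"):
        # assumed layout: every artifact of a run lives under <base_dir>/<task>/<dataset>/
        self.base = os.path.join(base_dir, task, dataset)

    def path(self, entry: Enum) -> str:
        # an enum member's value is a list of path components,
        # e.g. Files.MODEL_CONFIG = ["model", "config.yml"]
        return os.path.join(self.base, *entry.value)
```

Under these assumptions, `run.path(Files.MODEL_CONFIG)` would resolve to something like `data/name-suggestion/java-full/model/config.yml`, so the artifacts added in this diff (pretrained checkpoint, run directory, predictions file) are addressed the same way as the existing BPE files.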
|
263 | 267 | "highlight_function_name_and_identifiers(run.path(Files.FUNCTIONS), 3)" |
264 | 268 | ] |
265 | 269 | }, |
266 | | - { |
267 | | - "cell_type": "code", |
268 | | - "execution_count": null, |
269 | | - "metadata": {}, |
270 | | - "outputs": [], |
271 | | - "source": [] |
272 | | - }, |
273 | 270 | { |
274 | 271 | "cell_type": "markdown", |
275 | 272 | "metadata": {}, |
|
281 | 278 | " - Y lable, a name of the function.\n" |
282 | 279 | ] |
283 | 280 | }, |
| 281 | + { |
| 282 | + "cell_type": "code", |
| 283 | + "execution_count": null, |
| 284 | + "metadata": {}, |
| 285 | + "outputs": [], |
| 286 | + "source": [] |
| 287 | + }, |
284 | 288 | { |
285 | 289 | "cell_type": "code", |
286 | 290 | "execution_count": null, |
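To make the data description in the markdown cell above concrete, here is a hypothetical single (X, Y) pair, assuming, as the surrounding cells suggest, that X is the sequence of identifiers extracted from a function body and Y is the function name; the Java method and the exact normalization are illustrative only:

```python
# Hypothetical training pair (not taken from the dataset); the notebook's exact
# splitting/lowercasing rules are not visible in this diff.
# Java method: public String readFileName(File file) { return file.getName(); }
x_identifiers = "file getName"      # identifiers found in the body
y_function_name = "readFileName"    # the name the model learns to predict
```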
|
341 | 345 | "extract_functions_parallel(run.path(Files.FUNCTIONS))" |
342 | 346 | ] |
343 | 347 | }, |
344 | | - { |
345 | | - "cell_type": "code", |
346 | | - "execution_count": null, |
347 | | - "metadata": {}, |
348 | | - "outputs": [], |
349 | | - "source": [] |
350 | | - }, |
351 | 348 | { |
352 | 349 | "cell_type": "markdown", |
353 | 350 | "metadata": {}, |
|
381 | 378 | "We are going to use a sing vocabulary for both, identifiers and function names. In order to do so, we will need to train BPE tokenizer on a file that contains all identifiers and function names in plain text." |
382 | 379 | ] |
383 | 380 | }, |
| 381 | + { |
| 382 | + "cell_type": "code", |
| 383 | + "execution_count": null, |
| 384 | + "metadata": {}, |
| 385 | + "outputs": [], |
| 386 | + "source": [] |
| 387 | + }, |
384 | 388 | { |
385 | 389 | "cell_type": "code", |
386 | 390 | "execution_count": null, |
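The BPE training itself happens elsewhere in the notebook and is not shown in this diff; as a minimal sketch of 'train a BPE tokenizer on a file that contains all identifiers and function names', assuming YouTokenToMe (consistent with the `bpe.decode(...)` calls on lists of ids later in this diff), with placeholder file names and vocabulary size:

```python
# Sketch only: assumes YouTokenToMe; paths and vocab size are illustrative.
import youtokentome as yttm

combined_text = "all_identifiers_and_names.txt"  # identifiers + function names, one sample per line
bpe_model = "bpe.model"

# a single BPE model shared by the source (identifier) and target (name) sides
yttm.BPE.train(data=combined_text, model=bpe_model, vocab_size=16000)

bpe = yttm.BPE(model=bpe_model)
ids = bpe.encode(["get file name"], output_type=yttm.OutputType.ID)
print(bpe.decode(ids))  # round-trips back to the original string
```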
|
562 | 566 | " save_vocab_loc,\n", |
563 | 567 | " input_text)\n", |
564 | 568 | "\n", |
565 | | - "src_vocab_loc = os.path.join(bpe_base_dir, \"src.vocab\")\n", |
566 | | - "print(generate_build_vocab(save_vocab_loc=src_vocab_loc,\n", |
567 | | - " input_text=train_bodies_bpe_loc,\n", |
568 | | - " vocab_size=vocab_size + 10))\n", |
569 | | - "tgt_vocab_loc = os.path.join(bpe_base_dir, \"tgt.vocab\")\n", |
570 | | - "print(generate_build_vocab(save_vocab_loc=tgt_vocab_loc,\n", |
571 | | - " input_text=train_names_bpe_loc,\n", |
572 | | - " vocab_size=vocab_size + 10))" |
| 569 | + "if not os.path.exists(run.path(Files.SRC_VOCABULARY)):\n", |
| 570 | + " # when using a pretrained model, we reuse its vocabulary\n", |
| 571 | + " cmd = generate_build_vocab(save_vocab_loc=run.path(Files.SRC_VOCABULARY),\n", |
| 572 | + " input_text=run.path(Files.ENC_TRAIN_BODIES),\n", |
| 573 | + " vocab_size=vocab_size + 10)\n", |
| 574 | + " ! {cmd}\n", |
| 575 | + "\n", |
| 576 | + " cmd = generate_build_vocab(save_vocab_loc=run.path(Files.TGT_VOCABULARY),\n", |
| 577 | + " input_text=run.path(Files.ENC_TRAIN_NAMES),\n", |
| 578 | + " vocab_size=vocab_size + 10)\n", |
| 579 | + " ! {cmd}" |
573 | 580 | ] |
574 | 581 | }, |
575 | 582 | { |
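`generate_build_vocab` is defined earlier in the notebook (only the tail of its argument list is visible at the top of this hunk). A plausible sketch, under the assumption that it simply formats OpenNMT-tf's `onmt-build-vocab` command, is:

```python
# Assumption: the helper wraps OpenNMT-tf's onmt-build-vocab CLI; the real
# definition lives earlier in the notebook and may pass extra flags.
def generate_build_vocab(save_vocab_loc: str, input_text: str, vocab_size: int) -> str:
    # onmt-build-vocab writes one token per line, most frequent first
    return "onmt-build-vocab --size %d --save_vocab %s %s" % (
        vocab_size, save_vocab_loc, input_text)
```

Guarding the cell with `os.path.exists(run.path(Files.SRC_VOCABULARY))` keeps existing vocabularies intact, which matters when reusing the pretrained checkpoint: its embedding matrices are tied to the original token ordering.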
|
578 | 585 | "metadata": {}, |
579 | 586 | "outputs": [], |
580 | 587 | "source": [ |
581 | | - "base_train_dir = os.path.join(bpe_base_dir, \"seq2seq\")\n", |
582 | | - "os.makedirs(base_train_dir, exist_ok=True)\n", |
583 | | - "model_dir = os.path.join(base_train_dir, \"run/\")\n", |
| 588 | + "\n", |
| 589 | + "model_dir = run.path(Dirs.MODEL_RUN)\n", |
584 | 590 | "\n", |
585 | 591 | "# prepare config file for model\n", |
586 | | - "config_yaml = os.path.join(base_train_dir, \"config.yml\")\n", |
| 592 | + "config_yaml = run.path(Files.MODEL_CONFIG)\n", |
587 | 593 | "# this directory will contain evaluation results of the model, checkpoints and so on\n", |
588 | 594 | "yaml_content = \"model_dir: %s \\n\" % model_dir\n", |
589 | 595 | "\n", |
|
596 | 602 | " eval_labels_file: %s\n", |
597 | 603 | " source_vocabulary: %s\n", |
598 | 604 | " target_vocabulary: %s\n", |
599 | | - "\"\"\" % (train_bodies_bpe_loc, train_names_bpe_loc,\n", |
600 | | - " val_bodies_bpe_loc, val_names_bpe_loc,\n", |
601 | | - " src_vocab_loc, tgt_vocab_loc)\n", |
| 605 | + "\"\"\" % (run.path(Files.ENC_TRAIN_BODIES), \n", |
| 606 | + " run.path(Files.ENC_TRAIN_NAMES),\n", |
| 607 | + " run.path(Files.ENC_VAL_BODIES), \n", |
| 608 | + " run.path(Files.ENC_VAL_NAMES),\n", |
| 609 | + " run.path(Files.SRC_VOCABULARY), \n", |
| 610 | + " run.path(Files.TGT_VOCABULARY))\n", |
602 | 611 | "\n", |
603 | 612 | "# other useful configurations\n", |
604 | 613 | "yaml_content += \"\"\"\n", |
|
645 | 654 | "outputs": [], |
646 | 655 | "source": [ |
647 | 656 | "# how to launch training\n", |
648 | | - "train_cmd = \"\"\"\n", |
649 | | - "onmt-main --model_type LuongAttention \\\n", |
650 | | - "--config %s --auto_config train --with_eval\"\"\" % config_yaml\n", |
651 | | - "print(train_cmd)\n", |
| 657 | + "GPU_USE = False\n", |
| 658 | + "if not GPU_USE:\n", |
| 659 | + " train_cmd = \"\"\"\n", |
| 660 | + " onmt-main --model_type LuongAttention \\\n", |
| 661 | + " --config %s --auto_config train --with_eval\"\"\" % config_yaml\n", |
| 662 | + " ! {train_cmd}\n", |
652 | 663 | "\n", |
653 | 664 | "# in case of GPU you can specify CUDA_VISIBLE_DEVICES & number of GPUs to use\n", |
654 | | - "cmd_gpu = \"\"\"\n", |
655 | | - "CUDA_VISIBLE_DEVICES=%s onmt-main --model_type LuongAttention \\\n", |
656 | | - "--config %s --auto_config train --with_eval --num_gpus %s\"\"\" % (\"0,1\", config_yaml, 2)" |
657 | | - ] |
658 | | - }, |
659 | | - { |
660 | | - "cell_type": "code", |
661 | | - "execution_count": 1, |
662 | | - "metadata": {}, |
663 | | - "outputs": [ |
664 | | - { |
665 | | - "name": "stdout", |
666 | | - "output_type": "stream", |
667 | | - "text": [ |
668 | | - "alex.ipynb\t\t jupyter-server-config.json requirements-tf.txt\r\n", |
669 | | - "base_egor.ipynb\t\t Makefile\t\t\t requirements.txt\r\n", |
670 | | - "Dockerfile\t\t notebooks\t\t\t src.vocab\r\n", |
671 | | - "docs\t\t\t pretrained.zip\t\t sshuttle.pid\r\n", |
672 | | - "images\t\t\t README.md\t\t\t tgt.vocab\r\n", |
673 | | - "jupyter-notebook-config.json requirements-bigartm.txt\r\n" |
674 | | - ] |
675 | | - } |
676 | | - ], |
677 | | - "source": [ |
678 | | - "!ls" |
| 665 | + "if GPU_USE:\n", |
| 666 | + " cmd_gpu = \"\"\"\n", |
| 667 | + " CUDA_VISIBLE_DEVICES=%s onmt-main --model_type LuongAttention \\\n", |
| 668 | + " --config %s --auto_config train --with_eval --num_gpus %s\"\"\" % (\"0,1\", config_yaml, 2)\n", |
| 669 | + " ! {cmd_gpu}" |
679 | 670 | ] |
680 | 671 | }, |
681 | 672 | { |
|
694 | 685 | "metadata": {}, |
695 | 686 | "outputs": [], |
696 | 687 | "source": [ |
697 | | - "bpe_val_predictions = os.path.join(base_dir, \"val.pred.tgt\")\n", |
698 | | - "pretrained_model = os.path.join(base_dir, \"pretrained/model\")\n", |
| 688 | + "# you have to specify location of pretrained model\n", |
| 689 | + "pretrained_model = None\n", |
| 690 | + "if pretrained_model is None:\n", |
| 691 | + " pretrained_model = run.path(Files.MODEL_PRETRAINED)" |
| 692 | + ] |
| 693 | + }, |
| 694 | + { |
| 695 | + "cell_type": "code", |
| 696 | + "execution_count": null, |
| 697 | + "metadata": {}, |
| 698 | + "outputs": [], |
| 699 | + "source": [ |
| 700 | + "bpe_val_predictions = \"val.pred.tgt\"\n", |
| 701 | + "\n", |
699 | 702 | "predict_cmd = \"\"\"onmt-main \\\n", |
700 | 703 | "--config %s --auto_config \\\n", |
701 | 704 | "infer \\\n", |
702 | 705 | "--features_file %s \\\n", |
703 | 706 | "--predictions_file %s \\\n", |
704 | | - "--checkpoint_path run/baseline/avg/ckpt-5000\"\"\" % (config_yaml, val_bodies_bpe_loc, bpe_val_predictions)" |
| 707 | + "--checkpoint_path %s\"\"\" % (config_yaml, \n", |
| 708 | + " run.path(Files.ENC_VAL_BODIES), \n", |
| 709 | + " run.path(Files.ENC_VAL_NAMES_PRED),\n", |
| 710 | + " pretrained_model)" |
705 | 711 | ] |
706 | 712 | }, |
707 | 713 | { |
|
711 | 717 | "outputs": [], |
712 | 718 | "source": [ |
713 | 719 | "pred_ids = []\n", |
714 | | - "with open(bpe_val_predictions, \"r\") as f:\n", |
| 720 | + "with open(run.path(Files.ENC_VAL_NAMES_PRED), \"r\") as f:\n", |
715 | 721 | " for line in f.readlines():\n", |
716 | 722 | " pred_ids.append(list(map(int, line.split())))\n", |
717 | 723 | "\n", |
|
725 | 731 | "outputs": [], |
726 | 732 | "source": [ |
727 | 733 | "gt_ids = []\n", |
728 | | - "with open(val_names_bpe_loc, \"r\") as f:\n", |
| 734 | + "with open(run.path(Files.ENC_VAL_NAMES), \"r\") as f:\n", |
729 | 735 | " for i, line in enumerate(f.readlines()):\n", |
730 | 736 | " gt_ids.append(list(map(int, line.split())))\n", |
731 | 737 | "gt_val_function_names = bpe.decode(gt_ids)" |
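With the ground truth decoded, the predictions can be decoded and compared the same way; `pred_val_function_names` below is a hypothetical name mirroring the ground-truth cell, and exact match is only the simplest possible metric:

```python
# Minimal comparison sketch; pred_val_function_names is a hypothetical name
# for bpe.decode(pred_ids), mirroring gt_val_function_names above.
pred_val_function_names = bpe.decode(pred_ids)

exact = sum(p.strip() == g.strip()
            for p, g in zip(pred_val_function_names, gt_val_function_names))
print("exact-match accuracy: %.3f" % (exact / len(gt_val_function_names)))
```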
|