|
56 | 56 | " ENC_TRAIN_NAMES = [\"train.bpe.tgt\"]\n", |
57 | 57 | " ENC_VAL_BODIES = [\"val.bpe.src\"]\n", |
58 | 58 | " ENC_VAL_NAMES = [\"val.bpe.tgt\"]\n", |
59 | | - " VOCABULARY = [\"vocab.txt\"]\n", |
| 59 | + " TGT_VOCABULARY = [\"tgt.vocab\"]\n", |
| 60 | + " SRC_VOCABULARY = [\"src.vocab\"]\n", |
60 | 61 | "\n", |
61 | 62 | " \n", |
62 | 63 | "class Dirs(DirsABC, Enum):\n", |
|
535 | 536 | "bpe_encode(run.path(Files.VAL_BODIES), run.path(Files.ENC_VAL_NAMES))" |
536 | 537 | ] |
537 | 538 | }, |
538 | | - { |
539 | | - "cell_type": "code", |
540 | | - "execution_count": null, |
541 | | - "metadata": {}, |
542 | | - "outputs": [], |
543 | | - "source": [] |
544 | | - }, |
545 | | - { |
546 | | - "cell_type": "code", |
547 | | - "execution_count": null, |
548 | | - "metadata": {}, |
549 | | - "outputs": [], |
550 | | - "source": [] |
551 | | - }, |
552 | | - { |
553 | | - "cell_type": "markdown", |
554 | | - "metadata": {}, |
555 | | - "source": [ |
556 | | - "# Train seq2seq model" |
557 | | - ] |
558 | | - }, |
559 | | - { |
560 | | - "cell_type": "code", |
561 | | - "execution_count": null, |
562 | | - "metadata": {}, |
563 | | - "outputs": [], |
564 | | - "source": [ |
565 | | - "\"\"\"-data /ssd/devfest2019-workshop/opennmt_format_input -save_model /ssd/devfest2019-workshop/transformer_bpe \\\n", |
566 | | - " -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 \\\n", |
567 | | - " -encoder_type transformer -decoder_type transformer -position_encoding \\\n", |
568 | | - " -train_steps 200000 -max_generator_batches 2 -dropout 0.1 \\\n", |
569 | | - " -batch_size 4096 -batch_type tokens -normalization tokens -accum_count 2 \\\n", |
570 | | - " -optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 2 \\\n", |
571 | | - " -max_grad_norm 0 -param_init 0 -param_init_glorot \\\n", |
572 | | - " -label_smoothing 0.1 -valid_steps 10000 -save_checkpoint_steps 10000 \\\n", |
573 | | - " -world_size 4 -gpu_ranks 0 1 2 3 \"\"\"" |
574 | | - ] |
575 | | - }, |
576 | | - { |
577 | | - "cell_type": "code", |
578 | | - "execution_count": null, |
579 | | - "metadata": {}, |
580 | | - "outputs": [], |
581 | | - "source": [ |
582 | | - "# preprocess\n", |
583 | | - "!echo -train_src {train_bodies_bpe_loc} \\\n", |
584 | | - " -train_tgt {train_names_bpe_loc} \\\n", |
585 | | - " -valid_src {val_bodies_bpe_loc} \\\n", |
586 | | - " -valid_tgt {val_names_bpe_loc} \\\n", |
587 | | - " -save_data /ssd/devfest2019-workshop/opennmt_format_input" |
588 | | - ] |
589 | | - }, |
590 | 539 | { |
591 | 540 | "cell_type": "markdown", |
592 | 541 | "metadata": {}, |
593 | 542 | "source": [ |
594 | | - "# Train transformer with openNMT-tf" |
| 543 | + "# Train seq2seq model\n", |
| 544 | + "\n", |
| 545 | + "* we will use `openNMT-tf`\n", |
| 546 | + "* prepare vocabularies (we will use functionality to train translation model from identifiers to function names)\n", |
| 547 | + "* train model" |
595 | 548 | ] |
596 | 549 | }, |
597 | 550 | { |
|
600 | 553 | "metadata": {}, |
601 | 554 | "outputs": [], |
602 | 555 | "source": [ |
| 556 | + "# TODO: src_vocab_loc, tgt_vocab_loc\n", |
| 557 | + "\n", |
| 558 | + "# approach requires to provide vocabularies\n", |
| 559 | + "# so launch these commands\n", |
603 | 560 | "def generate_build_vocab(save_vocab_loc, input_text, vocab_size=vocab_size):\n", |
604 | 561 | " return \"onmt-build-vocab --size %s --save_vocab %s %s\" % (vocab_size, \n", |
605 | 562 | " save_vocab_loc,\n", |
606 | 563 | " input_text)\n", |
607 | 564 | "\n", |
608 | | - "print(generate_build_vocab(save_vocab_loc=\"bpe_input/src.vocab\",\n", |
609 | | - " input_text=\"bpe_input/train.src\",\n", |
| 565 | + "src_vocab_loc = os.path.join(bpe_base_dir, \"src.vocab\")\n", |
| 566 | + "print(generate_build_vocab(save_vocab_loc=src_vocab_loc,\n", |
| 567 | + " input_text=train_bodies_bpe_loc,\n", |
610 | 568 | " vocab_size=vocab_size + 10))\n", |
611 | | - "print(generate_build_vocab(save_vocab_loc=\"bpe_input/tgt.vocab\",\n", |
612 | | - " input_text=\"bpe_input/train.tgt\",\n", |
| 569 | + "tgt_vocab_loc = os.path.join(bpe_base_dir, \"tgt.vocab\")\n", |
| 570 | + "print(generate_build_vocab(save_vocab_loc=tgt_vocab_loc,\n", |
| 571 | + " input_text=train_names_bpe_loc,\n", |
613 | 572 | " vocab_size=vocab_size + 10))" |
614 | 573 | ] |
615 | 574 | }, |
|
619 | 578 | "metadata": {}, |
620 | 579 | "outputs": [], |
621 | 580 | "source": [ |
622 | | - "yaml_content = \"\"\"\n", |
623 | | - "model_dir: run/\n", |
624 | | - "\n", |
625 | | - "data:\n", |
626 | | - " train_features_file: bpe_input/train.src\n", |
627 | | - " train_labels_file: bpe_input/train.tgt\n", |
628 | | - " eval_features_file: bpe_input/val.src\n", |
629 | | - " eval_labels_file: bpe_input/val.tgt\n", |
630 | | - " source_vocabulary: bpe_input/src.vocab\n", |
631 | | - " target_vocabulary: bpe_input/tgt.vocab\n", |
| 581 | + "base_train_dir = os.path.join(bpe_base_dir, \"seq2seq\")\n", |
| 582 | + "os.makedirs(base_train_dir, exist_ok=True)\n", |
| 583 | + "model_dir = os.path.join(base_train_dir, \"run/\")\n", |
632 | 584 | "\n", |
| 585 | + "# prepare config file for model\n", |
| 586 | + "config_yaml = os.path.join(base_train_dir, \"config.yml\")\n", |
| 587 | + "# this directory will contain evaluation results of the model, checkpoints and so on\n", |
| 588 | + "yaml_content = \"model_dir: %s \\n\" % model_dir\n", |
633 | 589 | "\n", |
| 590 | + "# describe where data is located\n", |
| 591 | + "yaml_content += \"\"\"\n", |
| 592 | + "data:\n", |
| 593 | + " train_features_file: %s\n", |
| 594 | + " train_labels_file: %s\n", |
| 595 | + " eval_features_file: %s\n", |
| 596 | + " eval_labels_file: %s\n", |
| 597 | + " source_vocabulary: %s\n", |
| 598 | + " target_vocabulary: %s\n", |
| 599 | + "\"\"\" % (train_bodies_bpe_loc, train_names_bpe_loc,\n", |
| 600 | + " val_bodies_bpe_loc, val_names_bpe_loc,\n", |
| 601 | + " src_vocab_loc, tgt_vocab_loc)\n", |
| 602 | + "\n", |
| 603 | + "# other useful configurations\n", |
| 604 | + "yaml_content += \"\"\"\n", |
634 | 605 | "train:\n", |
635 | 606 | " # (optional when batch_type=tokens) If not set, the training will search the largest\n", |
636 | 607 | " # possible batch size.\n", |
|
662 | 633 | " min_improvement: 0.01\n", |
663 | 634 | " steps: 2\n", |
664 | 635 | "\"\"\"\n", |
665 | | - "config_yaml = \"openNMT_tf_train_data.yml\"\n", |
| 636 | + "\n", |
666 | 637 | "with open(config_yaml, \"w\") as f:\n", |
667 | 638 | " f.write(yaml_content)" |
668 | 639 | ] |
|
673 | 644 | "metadata": {}, |
674 | 645 | "outputs": [], |
675 | 646 | "source": [ |
676 | | - "!cp openNMT_tf_train_data.yml /ssd/devfest2019-workshop/\n" |
| 647 | + "# how to launch training\n", |
| 648 | + "train_cmd = \"\"\"\n", |
| 649 | + "onmt-main --model_type LuongAttention \\\n", |
| 650 | + "--config %s --auto_config train --with_eval\"\"\" % config_yaml\n", |
| 651 | + "print(train_cmd)\n", |
| 652 | + "\n", |
| 653 | + "# in case of GPU you can specify CUDA_VISIBLE_DEVICES & number of GPUs to use\n", |
| 654 | + "cmd_gpu = \"\"\"\n", |
| 655 | + "CUDA_VISIBLE_DEVICES=%s onmt-main --model_type LuongAttention \\\n", |
| 656 | + "--config %s --auto_config train --with_eval --num_gpus %s\"\"\" % (\"0,1\", config_yaml, 2)" |
677 | 657 | ] |
678 | 658 | }, |
679 | 659 | { |
680 | 660 | "cell_type": "code", |
681 | | - "execution_count": null, |
| 661 | + "execution_count": 1, |
682 | 662 | "metadata": {}, |
683 | | - "outputs": [], |
| 663 | + "outputs": [ |
| 664 | + { |
| 665 | + "name": "stdout", |
| 666 | + "output_type": "stream", |
| 667 | + "text": [ |
| 668 | + "alex.ipynb\t\t jupyter-server-config.json requirements-tf.txt\r\n", |
| 669 | + "base_egor.ipynb\t\t Makefile\t\t\t requirements.txt\r\n", |
| 670 | + "Dockerfile\t\t notebooks\t\t\t src.vocab\r\n", |
| 671 | + "docs\t\t\t pretrained.zip\t\t sshuttle.pid\r\n", |
| 672 | + "images\t\t\t README.md\t\t\t tgt.vocab\r\n", |
| 673 | + "jupyter-notebook-config.json requirements-bigartm.txt\r\n" |
| 674 | + ] |
| 675 | + } |
| 676 | + ], |
684 | 677 | "source": [ |
685 | | - "cmd = \"CUDA_VISIBLE_DEVICES=0,1,2,3 onmt-main --model_type Transformer --config openNMT_tf_train_data.yml \\\n", |
686 | | - "--auto_config train --with_eval --num_gpus 4\"" |
| 678 | + "!ls" |
687 | 679 | ] |
688 | 680 | }, |
689 | 681 | { |
690 | 682 | "cell_type": "markdown", |
691 | 683 | "metadata": {}, |
692 | 684 | "source": [ |
693 | | - "# Predict" |
| 685 | + "# Predict\n", |
| 686 | + "* we will use pretrained on several GPUs model to save time\n", |
| 687 | + "* predictions will be saved to file \n", |
| 688 | + "* predicted BPE ids will be converted back to text" |
694 | 689 | ] |
695 | 690 | }, |
696 | 691 | { |
|
699 | 694 | "metadata": {}, |
700 | 695 | "outputs": [], |
701 | 696 | "source": [ |
702 | | - "\"\"\"onmt-main \\\n", |
703 | | - " --config openNMT_tf_train_data.yml --auto_config \\\n", |
704 | | - " average_checkpoints \\\n", |
705 | | - " --output_dir run/baseline/avg \\\n", |
706 | | - " --max_count 5\"\"\"" |
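 |    697 | +    "# build the inference command: predictions for the validation bodies are written to bpe_val_predictions\n",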
| 697 | + "bpe_val_predictions = os.path.join(base_dir, \"val.pred.tgt\")\n", |
| 698 | + "pretrained_model = os.path.join(base_dir, \"pretrained/model\")\n", |
| 699 | + "predict_cmd = \"\"\"onmt-main \\\n", |
| 700 | + "--config %s --auto_config \\\n", |
| 701 | + "infer \\\n", |
| 702 | + "--features_file %s \\\n", |
| 703 | + "--predictions_file %s \\\n", |
| 704 | + "--checkpoint_path run/baseline/avg/ckpt-5000\"\"\" % (config_yaml, val_bodies_bpe_loc, bpe_val_predictions)" |
707 | 705 | ] |
708 | 706 | }, |
709 | 707 | { |
|
712 | 710 | "metadata": {}, |
713 | 711 | "outputs": [], |
714 | 712 | "source": [ |
715 | | - "!onmt-main --config openNMT_tf_train_data.yml --auto_config infer --features_file bpe_input/val.src --predictions_file predictions/val.pred.tgt" |
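 |    713 | +    "# read the predicted BPE ids produced by onmt-main infer\n",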
| 713 | + "pred_ids = []\n", |
| 714 | + "with open(bpe_val_predictions, \"r\") as f:\n", |
| 715 | + " for line in f.readlines():\n", |
| 716 | + " pred_ids.append(list(map(int, line.split())))\n", |
| 717 | + "\n", |
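 |    718 | +    "# convert the BPE ids back into readable function names\n",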
| 718 | + "pred_val_function_names = bpe.decode(pred_ids)" |
| 719 | + ] |
| 720 | + }, |
| 721 | + { |
| 722 | + "cell_type": "code", |
| 723 | + "execution_count": null, |
| 724 | + "metadata": {}, |
| 725 | + "outputs": [], |
| 726 | + "source": [ |
| 727 | + "gt_ids = []\n", |
| 728 | + "with open(val_names_bpe_loc, \"r\") as f:\n", |
| 729 | + " for i, line in enumerate(f.readlines()):\n", |
| 730 | + " gt_ids.append(list(map(int, line.split())))\n", |
| 731 | + "gt_val_function_names = bpe.decode(gt_ids)" |
| 732 | + ] |
| 733 | + }, |
| 734 | + { |
| 735 | + "cell_type": "markdown", |
| 736 | + "metadata": {}, |
| 737 | + "source": [ |
| 738 | + "# And finally let's see the results!" |
716 | 739 | ] |
717 | 740 | }, |
718 | 741 | { |
|
721 | 744 | "metadata": {}, |
722 | 745 | "outputs": [], |
723 | 746 | "source": [ |
724 | | - "\"\"\"onmt-main \\\n", |
725 | | - " --config openNMT_tf_train_data.yml --auto_config \\\n", |
726 | | - " --checkpoint_path run/baseline/avg/ckpt-5000 \\\n", |
727 | | - " infer --features_file text_input/val.src --predictions_file predictions/val.pred.tgt\"\"\"" |
| 747 | + "for i, (a, b) in enumerate(zip(gt_function_names, predicted_function_names)):\n", |
| 748 | + " if i == 100:\n", |
| 749 | + " break\n", |
| 750 | + " print(\"%s | %s\" % (a, b)) " |
728 | 751 | ] |
729 | 752 | } |
730 | 753 | ], |
|
744 | 767 | "name": "python", |
745 | 768 | "nbconvert_exporter": "python", |
746 | 769 | "pygments_lexer": "ipython3", |
747 | | - "version": "3.6.8" |
| 770 | + "version": "3.6.7" |
748 | 771 | } |
749 | 772 | }, |
750 | 773 | "nbformat": 4, |
|