example 4 and 5

saransh-mehta · saransh-mehta · commit a41cf6918146 · 2020-06-12T07:06:39.000Z
diff --git a/examples/ner_pos_tagging/coNLL_data/coNLL_testa.txt b/examples/ner_pos_tagging/coNLL_data/coNLL_testa.txt
diff --git a/examples/ner_pos_tagging/coNLL_data/coNLL_testb.txt b/examples/ner_pos_tagging/coNLL_data/coNLL_testb.txt
diff --git a/examples/ner_pos_tagging/coNLL_data/coNLL_train.txt b/examples/ner_pos_tagging/coNLL_data/coNLL_train.txt
diff --git a/examples/ner_pos_tagging/ner_pos_tagging_conll.ipynb b/examples/ner_pos_tagging/ner_pos_tagging_conll.ipynb
@@ -0,0 +1,107 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step - 1: Transforming data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../../data_transformations.py \\\n",
+    "    --transform_file 'transform_file_conll.yml'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step - 2: Data Preparation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../../data_preparation.py \\\n",
+    "    --task_file 'tasks_file_conll.yml' \\\n",
+    "    --data_dir '../../data' \\\n",
+    "    --max_seq_len 50"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step - 3: Running Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../../train.py \\\n",
+    "    --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
+    "    --task_file 'tasks_file_conll.yml' \\\n",
+    "    --out_dir 'conll_ner_pos_bert_base' \\\n",
+    "    --epochs 5 \\\n",
+    "    --train_batch_size 32 \\\n",
+    "    --eval_batch_size 32 \\\n",
+    "    --grad_accumulation_steps 1 \\\n",
+    "    --log_per_updates 50 \\\n",
+    "    --eval_while_train True \\\n",
+    "    --test_while_train True \\\n",
+    "    --max_seq_len 50 \\\n",
+    "    --silent True "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step - 4: Inferring"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(1, '../../')\n",
+    "from infer_pipeline import inferPipeline"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/ner_pos_tagging/tasks_file_conll.yml b/examples/ner_pos_tagging/tasks_file_conll.yml
@@ -0,0 +1,31 @@
+conllner:
+  model_type: BERT
+  config_name: bert-base-uncased
+  dropout_prob: 0.2
+  label_map_or_file: ../../data/ner_coNLL_train_label_map.joblib
+  metrics:
+  - seqeval_f1_score
+  - seqeval_precision
+  - seqeval_recall
+  loss_type: NERLoss
+  task_type: NER
+  file_names:
+  - ner_coNLL_train.tsv
+  - ner_coNLL_testa.tsv
+  - ner_coNLL_testb.tsv
+
+conllpos:
+    model_type: BERT
+    config_name: bert-base-uncased
+    dropout_prob: 0.2
+    label_map_or_file: ../../data/pos_coNLL_train_label_map.joblib
+    metrics:
+    - seqeval_f1_score
+    - seqeval_precision
+    - seqeval_recall
+    loss_type: NERLoss
+    task_type: NER
+    file_names:
+    - pos_coNLL_train.tsv
+    - pos_coNLL_testa.tsv
+    - pos_coNLL_testb.tsv
diff --git a/examples/ner_pos_tagging/transform_file_conll.yml b/examples/ner_pos_tagging/transform_file_conll.yml
@@ -0,0 +1,8 @@
+transform1:
+  transform_func: coNLL_ner_pos_to_tsv
+  read_file_names:
+    - coNLL_train.txt
+    - coNLL_testa.txt
+    - coNLL_testb.txt
+  read_dir: coNLL_data
+  save_dir: ../../data
diff --git a/examples/query_pair_similarity/query_similarity_qqp.ipynb b/examples/query_pair_similarity/query_similarity_qqp.ipynb
@@ -0,0 +1,134 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!wget qim.fs.quoracdn.net/quora_duplicate_questions.tsv -P qqp_data/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step - 1: Data Transformations\n",
+    "\n",
+    "Defining transform file\n",
+    "\n",
+    "```\n",
+    "sample_transform:\n",
+    "  transform_func: qqp_query_similarity_to_tsv\n",
+    "  read_file_names:\n",
+    "    - quora_duplicate_questions.tsv\n",
+    "  read_dir : qqp_data\n",
+    "  save_dir: ../../data\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../../data_transformations.py \\\n",
+    "    --transform_file 'transform_file_qqp.yml'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step - 2: Data Preparation\n",
+    "\n",
+    "Defining task file for query similarity detection with QQP data\n",
+    "\n",
+    "```\n",
+    "querysimilarity:\n",
+    "    model_type: BERT\n",
+    "    config_name: bert-base-uncased\n",
+    "    dropout_prob: 0.2\n",
+    "    metrics:\n",
+    "    - classification_accuracy\n",
+    "    loss_type: CrossEntropyLoss\n",
+    "    class_num: 2\n",
+    "    task_type: SentencePairClassification\n",
+    "    file_names:\n",
+    "    - qqp_query_similarity_train.tsv\n",
+    "    - qqp_query_similarity_dev.tsv\n",
+    "    - qqp_query_similarity_test.tsv\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../../data_preparation.py \\\n",
+    "    --task_file 'tasks_file_qqp.yml' \\\n",
+    "    --data_dir '../../data' \\\n",
+    "    --max_seq_len 200"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step - 3: Running Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../../train.py \\\n",
+    "    --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
+    "    --task_file 'tasks_file_qqp.yml' \\\n",
+    "    --out_dir 'qqp_query_similarity_bert_base' \\\n",
+    "    --epochs 3 \\\n",
+    "    --train_batch_size 8 \\\n",
+    "    --eval_batch_size 16 \\\n",
+    "    --grad_accumulation_steps 2 \\\n",
+    "    --log_per_updates 50 \\\n",
+    "    --eval_while_train True \\\n",
+    "    --test_while_train True \\\n",
+    "    --max_seq_len 200 \\\n",
+    "    --silent True "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/query_pair_similarity/tasks_file_qqp.yml b/examples/query_pair_similarity/tasks_file_qqp.yml
@@ -0,0 +1,13 @@
+querysimilarity:
+    model_type: BERT
+    config_name: bert-base-uncased
+    dropout_prob: 0.2
+    metrics:
+    - classification_accuracy
+    loss_type: CrossEntropyLoss
+    class_num: 2
+    task_type: SentencePairClassification
+    file_names:
+    - qqp_query_similarity_train.tsv
+    - qqp_query_similarity_dev.tsv
+    - qqp_query_similarity_test.tsv
diff --git a/examples/query_pair_similarity/transform_file_qqp.yml b/examples/query_pair_similarity/transform_file_qqp.yml
@@ -0,0 +1,6 @@
+sample_transform:
+  transform_func: qqp_query_similarity_to_tsv
+  read_file_names:
+    - quora_duplicate_questions.tsv
+  read_dir : qqp_data
+  save_dir: ../../data