Feat/topic classifier (#1584)

dilyararimovna · web-flow · commit d39818c1359a · 2022-10-31T12:05:39.000+03:00
diff --git a/deeppavlov/configs/classifiers/topics_distilbert_base_uncased.json b/deeppavlov/configs/classifiers/topics_distilbert_base_uncased.json
@@ -0,0 +1,174 @@
+{
+  "dataset_reader": {
+    "class_name": "basic_classification_reader",
+    "class_sep": ";",
+    "x": "text",
+    "y": "topic",
+    "data_path": "{DOWNLOADS_PATH}/dp_topics_downsampled_data/",
+    "train": "train.csv",
+    "valid": "valid.csv"
+  },
+  "dataset_iterator": {
+    "class_name": "basic_classification_iterator",
+    "seed": 42
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "in_y": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": true,
+        "max_seq_length": 128,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "bert_features"
+        ]
+      },
+      {
+        "id": "classes_vocab",
+        "class_name": "simple_vocab",
+        "fit_on": [
+          "y"
+        ],
+        "save_path": "{MODEL_PATH}/classes.dict",
+        "load_path": "{MODEL_PATH}/classes.dict",
+        "in": [
+          "y"
+        ],
+        "out": [
+          "y_ids"
+        ]
+      },
+      {
+        "in": [
+          "y_ids"
+        ],
+        "out": [
+          "y_onehot"
+        ],
+        "class_name": "one_hotter",
+        "id": "my_one_hotter",
+        "depth": "#classes_vocab.len",
+        "single_vector": true
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "one_hot_labels": true,
+        "n_classes": "#classes_vocab.len",
+        "return_probas": true,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "multilabel": true,
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 1e-05
+        },
+        "learning_rate_drop_patience": 5,
+        "learning_rate_drop_div": 2.0,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "y_onehot"
+        ],
+        "out": [
+          "y_pred_probas"
+        ]
+      },
+      {
+        "in": "y_pred_probas",
+        "out": "y_pred_ids",
+        "class_name": "proba2labels",
+        "max_proba": false,
+        "confidence_threshold": 0.5
+      },
+      {
+        "in": "y_pred_ids",
+        "out": "y_pred_labels",
+        "ref": "classes_vocab"
+      },
+      {
+        "ref": "my_one_hotter",
+        "in": "y_pred_ids",
+        "out": "y_pred_onehot"
+      }
+    ],
+    "out": [
+      "y_pred_labels"
+    ]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      {
+        "name": "f1_macro",
+        "inputs": [
+          "y_onehot",
+          "y_pred_onehot"
+        ]
+      },
+      {
+        "name": "f1_weighted",
+        "inputs": [
+          "y_onehot",
+          "y_pred_onehot"
+        ]
+      },
+      {
+        "name": "accuracy",
+        "inputs": [
+          "y",
+          "y_pred_labels"
+        ]
+      },
+      {
+        "name": "roc_auc",
+        "inputs": [
+          "y_onehot",
+          "y_pred_probas"
+        ]
+      }
+    ],
+    "validation_patience": 10,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "log_every_n_batches": 100,
+    "show_examples": false,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/logs",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "TRANSFORMER": "distilbert-base-uncased",
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/classifiers/topic_distilbert_base_v0"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/datasets/dp_topics_downsampled_dataset_v0.tar.gz",
+        "subdir": "{DOWNLOADS_PATH}"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/topic_distilbert_base_v0.tar.gz",
+        "subdir": "{MODELS_PATH}/classifiers"
+      }
+    ]
+  }
+}
diff --git a/docs/features/models/classifiers.rst b/docs/features/models/classifiers.rst
@@ -234,19 +234,21 @@ the floating point labels are converted to integer labels according to the inter
 corresponding to `very negative`, `negative`, `neutral`, `positive`, `very positive` classes.
 
 
-+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Task             | Dataset            | Lang | Model                                                                                           | Metric      | Valid  | Test   | Downloads |
-+==================+====================+======+=================================================================================================+=============+========+========+===========+
-| Insult detection | `Insults`_         | En   | :config:`English BERT <classifiers/insults_kaggle_bert.json>`                                   | ROC-AUC     | 0.9327 | 0.8602 |  1.1 Gb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Sentiment        |`SST`_              |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`       | Accuracy    | 0.6293 | 0.6626 |  1.1 Gb   |
-+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Sentiment        |`Twitter mokoron`_  | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`               | F1-macro    | 0.9965 | 0.9961 |  6.2 Gb   |
-+                  +--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-|                  |`RuSentiment`_      |      | :config:`Multi-language BERT <classifiers/rusentiment_bert.json>`                               | F1-weighted | 0.6787 | 0.7005 |  1.3 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                     |             | 0.739  | 0.7724 |  1.5 Gb   |
-+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
++------------------+----------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------------+--------------+-----------+
+| Task             | Dataset              | Lang | Model                                                                                           | Metric      | Valid        | Test         | Downloads |
++==================+======================+======+=================================================================================================+=============+==============+==============+===========+
+| Insult detection | `Insults`_           | En   | :config:`English BERT <classifiers/insults_kaggle_bert.json>`                                   | ROC-AUC     | 0.9327       | 0.8602       |  1.1 Gb   |
++------------------+----------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------------+--------------+-----------+
+| Sentiment        |`SST`_                |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`       | Accuracy    | 0.6293       | 0.6626       |  1.1 Gb   |
++------------------+----------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------------+--------------+-----------+
+| Sentiment        |`Twitter mokoron`_    | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`               | F1-macro    | 0.9965       | 0.9961       |  6.2 Gb   |
++                  +----------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------------+--------------+-----------+
+|                  |`RuSentiment`_        |      | :config:`Multilingual BERT <classifiers/rusentiment_bert.json>`                                 | F1-weighted | 0.6787       | 0.7005       |  1.3 Gb   |
++                  +                      +      +-------------------------------------------------------------------------------------------------+             +--------------+--------------+-----------+
+|                  |                      |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                     |             | 0.739        | 0.7724       |  1.5 Gb   |
++------------------+----------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------------+--------------+-----------+
+| Topics           | `DeepPavlov Topics`_ | En   | :config:`DistilBERT base uncased <classifiers/topics_distilbert_base_uncased.json>`             | F1-w / F1-m | 0.877/0.830  | 0.878/0.831  |  0.7 Gb   |
++------------------+----------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------------+--------------+-----------+
 
 .. _`DSTC 2`: http://camdial.org/~mh521/dstc/
 .. _`Insults`: https://www.kaggle.com/c/detecting-insults-in-social-commentary
@@ -257,6 +259,7 @@ corresponding to `very negative`, `negative`, `neutral`, `positive`, `very posit
 .. _`Yahoo-L31`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l
 .. _`Yahoo-L6`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l
 .. _`SST`: https://nlp.stanford.edu/sentiment/index.html
+.. _`DeepPavlov Topics`: https://deeppavlov.ai/datasets/topics
 
 GLUE Benchmark
 --------------
diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py
@@ -118,7 +118,8 @@
         ("classifiers/glue/glue_rte_roberta_mnli.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
         ("classifiers/superglue/superglue_copa_roberta.json", "classifiers", ('TI',)): [LIST_ARGUMENTS_INFER_CHECK],
         ("classifiers/superglue/superglue_boolq_roberta_mnli.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
-        ("classifiers/superglue/superglue_record_roberta.json", "classifiers", ('TI',)): [RECORD_ARGUMENTS_INFER_CHECK]
+        ("classifiers/superglue/superglue_record_roberta.json", "classifiers", ('TI',)): [RECORD_ARGUMENTS_INFER_CHECK],
+        ("classifiers/topics_distilbert_base_uncased.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK]
     },
     "distil": {
         ("classifiers/paraphraser_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],