[UTC] Support single-label and multiple-label classification (#5083)

LemonNoel · web-flow · commit 18287d3a8543 · 2023-03-07T10:41:09.000+08:00
diff --git a/applications/zero_shot_text_classification/README.md b/applications/zero_shot_text_classification/README.md
@@ -153,6 +153,7 @@ python -u -m paddle.distributed.launch --gpus "0,1" run_train.py \
 该示例代码中由于设置了参数 `--do_eval`，因此在训练完会自动进行评估。
 
 可配置参数说明：
+* `single_label`: 每条样本是否只预测一个标签。默认为`False`，表示多标签分类。
 * `device`: 训练设备，可选择 'cpu'、'gpu' 其中的一种；默认为 GPU 训练。
 * `logging_steps`: 训练过程中日志打印的间隔 steps 数，默认10。
 * `save_steps`: 训练过程中保存模型 checkpoint 的间隔 steps 数，默认100。
@@ -199,6 +200,7 @@ python run_eval.py \
 - `test_path`: 进行评估的测试集文件。
 - `per_device_eval_batch_size`: 批处理大小，请结合机器情况进行调整，默认为16。
 - `max_seq_len`: 文本最大切分长度，输入超过最大长度时会对输入文本进行自动切分，默认为512。
+- `single_label`: 每条样本是否只预测一个标签。默认为`False`，表示多标签分类。
 
 <a name="定制模型一键预测"></a>
 
diff --git a/applications/zero_shot_text_classification/label_studio.py b/applications/zero_shot_text_classification/label_studio.py
@@ -54,24 +54,28 @@ def convert_utc_examples(self, raw_examples):
         utc_examples = []
         for example in raw_examples:
             raw_text = example["data"]["text"].split(self.text_separator)
-            raw_label = example["annotations"][0]["result"][0]["value"]["choices"][0]
             if len(raw_text) < 1:
                 continue
             elif len(raw_text) == 1:
                 raw_text.append("")
             elif len(raw_text) > 2:
                 raw_text = ["".join(raw_text[:-1]), raw_text[-1]]
-            if raw_label not in self.options:
-                raise ValueError(
-                    f"Label `{raw_label}` not found in label candidates `options`. Please recheck the data."
-                )
+
+            label_list = []
+            for raw_label in example["annotations"][0]["result"][0]["value"]["choices"]:
+                if raw_label not in self.options:
+                    raise ValueError(
+                        f"Label `{raw_label}` not found in label candidates `options`. Please recheck the data."
+                    )
+                label_list.append(np.where(np.array(self.options) == raw_label)[0].tolist()[0])
+
             utc_examples.append(
                 {
                     "text_a": raw_text[0],
                     "text_b": raw_text[1],
                     "question": "",
                     "choices": self.options,
-                    "labels": np.where(np.array(self.options) == raw_label)[0].tolist()[0],
+                    "labels": label_list,
                 }
             )
         return utc_examples
diff --git a/applications/zero_shot_text_classification/label_studio_text.md b/applications/zero_shot_text_classification/label_studio_text.md
@@ -83,6 +83,12 @@ label-studio start
 
 项目创建后，可在Setting/Labeling Interface中继续配置标签，详见[项目创建](#label)
 
+默认模式为单标签多分类数据标注。对于多标签多分类数据标注，需要将`choice`的值由`single`改为`multiple`。
+
+<div align="center">
+    <img src=https://user-images.githubusercontent.com/25607475/222630045-8d6eebf7-572f-43d2-b7a1-24bf21a47fad.png />
+</div>
+
 <a name="24"></a>
 
 #### 2.4 任务标注
diff --git a/applications/zero_shot_text_classification/label_studio_text_en.md b/applications/zero_shot_text_classification/label_studio_text_en.md
@@ -84,6 +84,12 @@ You can continue to import local txt format data after project creation. See mor
 
 After project creation, you can add/delete labels in Setting/Labeling Interface just as in [Project Creation](#label)
 
+Label Studio uses single-label annotation by default. For multi-label annotation, change the value of `choice` from `single` to `multiple` in the `Code` tab.
+
+<div align="center">
+    <img src=https://user-images.githubusercontent.com/25607475/222630045-8d6eebf7-572f-43d2-b7a1-24bf21a47fad.png />
+</div>
+
 <a name="24"></a>
 
 #### 2.4 Task annotation
diff --git a/applications/zero_shot_text_classification/run_eval.py b/applications/zero_shot_text_classification/run_eval.py
@@ -17,6 +17,7 @@
 from dataclasses import dataclass, field
 
 import paddle
+from paddle.metric import Accuracy
 from sklearn.metrics import f1_score
 from utils import UTCLoss, read_local_dataset
 
@@ -35,6 +36,7 @@
 class DataArguments:
     test_path: str = field(default="./data/test.txt", metadata={"help": "Test dataset file name."})
     threshold: float = field(default=0.5, metadata={"help": "The threshold to produce predictions."})
+    single_label: bool = field(default=False, metadata={"help": "Predict exactly one label per sample."})  # bool, not str: the string "False" would be truthy
 
 
 @dataclass
@@ -71,6 +73,18 @@ def main():
         prompt_model.set_state_dict(model_state)
 
     # Define the metric function.
+    def compute_metrics_single_label(eval_preds):
+        labels = paddle.to_tensor(eval_preds.label_ids, dtype="int64")
+        preds = paddle.to_tensor(eval_preds.predictions)
+        preds = paddle.nn.functional.softmax(preds, axis=-1)
+        labels = paddle.argmax(labels, axis=-1)
+        # Score the softmax outputs against the class indices produced by the argmax above.
+        metric = Accuracy()
+        correct = metric.compute(preds, labels)
+        metric.update(correct)
+        acc = metric.accumulate()
+        return {"accuracy": acc}
+
     def compute_metrics(eval_preds):
         labels = paddle.to_tensor(eval_preds.label_ids, dtype="int64")
         preds = paddle.to_tensor(eval_preds.predictions)
@@ -92,7 +106,7 @@ def compute_metrics(eval_preds):
         train_dataset=None,
         eval_dataset=None,
         callbacks=None,
-        compute_metrics=compute_metrics,
+        compute_metrics=compute_metrics_single_label if data_args.single_label else compute_metrics,
     )
 
     if data_args.test_path is not None:
@@ -102,12 +116,20 @@ def compute_metrics(eval_preds):
             json.dump(test_ret.metrics, fp)
 
         with open(os.path.join(training_args.output_dir, "test_predictions.json"), "w", encoding="utf-8") as fp:
-            preds = paddle.nn.functional.sigmoid(paddle.to_tensor(test_ret.predictions))
-            for index, pred in enumerate(preds):
-                result = {"id": index}
-                result["labels"] = paddle.where(pred > data_args.threshold)[0].tolist()
-                result["probs"] = pred[pred > data_args.threshold].tolist()
-                fp.write(json.dumps(result, ensure_ascii=False) + "\n")
+            if data_args.single_label:
+                preds = paddle.nn.functional.softmax(paddle.to_tensor(test_ret.predictions), axis=-1)
+                for index, pred in enumerate(preds):
+                    result = {"id": index}
+                    result["labels"] = paddle.argmax(pred).item()
+                    result["probs"] = pred[result["labels"]].item()
+                    fp.write(json.dumps(result, ensure_ascii=False) + "\n")
+            else:
+                preds = paddle.nn.functional.sigmoid(paddle.to_tensor(test_ret.predictions))
+                for index, pred in enumerate(preds):
+                    result = {"id": index}
+                    result["labels"] = paddle.where(pred > data_args.threshold)[0].tolist()
+                    result["probs"] = pred[pred > data_args.threshold].tolist()
+                    fp.write(json.dumps(result, ensure_ascii=False) + "\n")
 
 
 if __name__ == "__main__":
diff --git a/applications/zero_shot_text_classification/run_train.py b/applications/zero_shot_text_classification/run_train.py
@@ -15,6 +15,7 @@
 from dataclasses import dataclass, field
 
 import paddle
+from paddle.metric import Accuracy
 from paddle.static import InputSpec
 from sklearn.metrics import f1_score
 from utils import UTCLoss, read_local_dataset
@@ -39,6 +40,7 @@ class DataArguments:
     train_file: str = field(default="train.txt", metadata={"help": "Train dataset file name."})
     dev_file: str = field(default="dev.txt", metadata={"help": "Dev dataset file name."})
     threshold: float = field(default=0.5, metadata={"help": "The threshold to produce predictions."})
+    single_label: bool = field(default=False, metadata={"help": "Predict exactly one label per sample."})  # bool, not str: the string "False" would be truthy
 
 
 @dataclass
@@ -92,10 +94,20 @@ def main():
     )
 
     # Define the metric function.
-    def compute_metrics(eval_preds):
+    def compute_metrics_single_label(eval_preds):
         labels = paddle.to_tensor(eval_preds.label_ids, dtype="int64")
         preds = paddle.to_tensor(eval_preds.predictions)
+        preds = paddle.nn.functional.softmax(preds, axis=-1)
+        labels = paddle.argmax(labels, axis=-1)
+        metric = Accuracy()
+        correct = metric.compute(preds, labels)
+        metric.update(correct)
+        acc = metric.accumulate()
+        return {"accuracy": acc}
 
+    def compute_metrics(eval_preds):
+        labels = paddle.to_tensor(eval_preds.label_ids, dtype="int64")
+        preds = paddle.to_tensor(eval_preds.predictions)
         preds = paddle.nn.functional.sigmoid(preds)
         preds = preds[labels != -100].numpy()
         labels = labels[labels != -100].numpy()
@@ -113,7 +125,7 @@ def compute_metrics(eval_preds):
         train_dataset=train_ds,
         eval_dataset=dev_ds,
         callbacks=None,
-        compute_metrics=compute_metrics,
+        compute_metrics=compute_metrics_single_label if data_args.single_label else compute_metrics,
     )
 
     # Training.