Update ChnSentiCorp and LCQMC datasets to the Qianyan format (#482)

smallv0221 · web-flow · commit 07d414a41dc9 · 2021-06-03T19:07:01.000+08:00
* Update ChnSentiCorp and LCQMC datasets to the Qianyan format

* Update the MD5 checksums for the new archives and split files
diff --git a/docs/data_prepare/dataset_list.md b/docs/data_prepare/dataset_list.md
@@ -69,6 +69,11 @@ PaddleNLP提供了以下数据集的快速读取API，实际使用时请根据
 | ----  | --------- | ------ |
 |  [BSTC](https://aistudio.baidu.com/aistudio/competition/detail/44/) | 千言数据集：机器同传，包括transcription_translation和asr | `paddlenlp.datasets.load_dataset('bstc', 'asr')`|
 
+## 对话系统
+
+| 数据集名称  | 简介 | 调用方法 |
+| ----  | --------- | ------ |
+|  [DuConv](https://aistudio.baidu.com/aistudio/competition/detail/48/) | 千言数据集：开放域对话，中文知识型对话数据集 | `paddlenlp.datasets.load_dataset('duconv')`|
 
 ## 文本生成
 
diff --git a/paddlenlp/datasets/chnsenticorp.py b/paddlenlp/datasets/chnsenticorp.py
@@ -31,19 +31,19 @@ class ChnSentiCorp(DatasetBuilder):
 
     """
 
-    URL = "https://bj.bcebos.com/paddlehub-dataset/chnsenticorp.tar.gz"
-    MD5 = "fbb3217aeac76a2840d2d5cd19688b07"
+    URL = "https://dataset-bj.cdn.bcebos.com/qianyan/ChnSentiCorp.zip"
+    MD5 = "7ef61b08ad10fbddf2ba97613f071561"
     META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
     SPLITS = {
         'train': META_INFO(
-            os.path.join('chnsenticorp', 'train.tsv'),
+            os.path.join('ChnSentiCorp', 'ChnSentiCorp', 'train.tsv'),
             '689360c4a4a9ce8d8719ed500ae80907'),
         'dev': META_INFO(
-            os.path.join('chnsenticorp', 'dev.tsv'),
-            '05e4b02561c2a327833e05bbe8156cec'),
+            os.path.join('ChnSentiCorp', 'ChnSentiCorp', 'dev.tsv'),
+            '20c77cc2371634731a367996b097ec0a'),
         'test': META_INFO(
-            os.path.join('chnsenticorp', 'test.tsv'),
-            '917dfc6fbce596bb01a91abaa6c86f9e'),
+            os.path.join('ChnSentiCorp', 'ChnSentiCorp', 'test.tsv'),
+            '9b4dc7d1e4ada48c645b7e938592f49c'),
     }
 
     def _get_data(self, mode, **kwargs):
@@ -57,7 +57,7 @@ def _get_data(self, mode, **kwargs):
 
         return fullname
 
-    def _read(self, filename):
+    def _read(self, filename, split):
         """Reads data."""
         with open(filename, 'r', encoding='utf-8') as f:
             head = None
@@ -66,8 +66,15 @@ def _read(self, filename):
                 if not head:
                     head = data
                 else:
-                    label, text = data
-                    yield {"text": text, "label": label}
+                    if split == 'train':
+                        label, text = data
+                        yield {"text": text, "label": label, "qid": ''}
+                    elif split == 'dev':
+                        qid, label, text = data
+                        yield {"text": text, "label": label, "qid": qid}
+                    elif split == 'test':
+                        qid, text = data
+                        yield {"text": text, "label": '', "qid": qid}
 
     def get_labels(self):
         """
diff --git a/paddlenlp/datasets/lcqmc.py b/paddlenlp/datasets/lcqmc.py
@@ -31,19 +31,19 @@ class LCQMC(DatasetBuilder):
 
     """
 
-    URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
-    MD5 = "62a7ba36f786a82ae59bbde0b0a9af0c"
+    URL = "https://dataset-bj.cdn.bcebos.com/qianyan/lcqmc.zip"
+    MD5 = "7069fa0cffbd2110845869c61f83814a"
     META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
     SPLITS = {
         'train': META_INFO(
-            os.path.join('lcqmc', 'train.tsv'),
-            '2193c022439b038ac12c0ae918b211a1'),
+            os.path.join('lcqmc', 'lcqmc', 'train.tsv'),
+            '479d94fe575981f236319f2a5b8b3c03'),
         'dev': META_INFO(
-            os.path.join('lcqmc', 'dev.tsv'),
-            'c5dcba253cb4105d914964fd8b3c0e94'),
+            os.path.join('lcqmc', 'lcqmc', 'dev.tsv'),
+            '089329fb44ef26155baef9c9c8c823ba'),
         'test': META_INFO(
-            os.path.join('lcqmc', 'test.tsv'),
-            '8f4b71e15e67696cc9e112a459ec42bd'),
+            os.path.join('lcqmc', 'lcqmc', 'test.tsv'),
+            'a4a483f2f871d57e0f3894fca0d0f8f0'),
     }
 
     def _get_data(self, mode, **kwargs):
@@ -59,14 +59,14 @@ def _get_data(self, mode, **kwargs):
     def _read(self, filename):
         """Reads data."""
         with open(filename, 'r', encoding='utf-8') as f:
-            head = None
             for line in f:
                 data = line.strip().split("\t")
-                if not head:
-                    head = data
-                else:
+                if len(data) == 3:
                     query, title, label = data
                     yield {"query": query, "title": title, "label": label}
+                else:
+                    query, title = data
+                    yield {"query": query, "title": title, "label": ''}
 
     def get_labels(self):
         """