[CI] Fix data and datasets tests (#4215)

sijunhe · web-flow · commit dd792d6d0152 · 2022-12-26T10:45:01.000+08:00
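* Add tests/data and tests/datasets to the pytest testpaths in pyproject.toml

* Import test helpers from tests.common_test and tests.testing_utils instead of the top-level common_test and util modules

* Drop np.bool and np.str from the type check in tests/common_test.py (the builtin bool and str entries already cover them)

* Load a local 10-example tnews fixture in the sampler tests instead of the imdb dataset, and shrink the expected sizes accordingly

* Mark the imdb train/test dataset test classes as @slow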
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,8 @@ exclude = ['.flake8']
 minversion = "6.0"
 addopts = "-ra -q"
 testpaths = [
+    "tests/data",
+    "tests/datasets",
     "tests/transformers",
     "tests/prompt",
     "tests/taskflow",
diff --git a/tests/common_test.py b/tests/common_test.py
@@ -11,11 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
 import unittest
-import paddle
 import warnings
 
+import numpy as np
+import paddle
+
 __all__ = ["CommonTest", "CpuCommonTest"]
 
 
@@ -92,7 +93,7 @@ def _check_output_impl(self, result, expected_result, rtol, atol, equal=True):
         error_msg = "Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}"
         if result_t in [list, tuple]:
             result_t = get_container_type(result)
-        if result_t in [str, int, bool, set, np.bool, np.int32, np.int64, np.str]:
+        if result_t in [str, int, bool, set, bool, np.int32, np.int64]:
             assertForNormalType(
                 result,
                 expected_result,
diff --git a/tests/data/test_collate.py b/tests/data/test_collate.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import unittest
+
 import numpy as np
 
-from paddlenlp.data import Stack, Pad, Tuple, Dict
-from common_test import CpuCommonTest
-import util
-import unittest
+from paddlenlp.data import Dict, Pad, Stack, Tuple
+from tests import testing_utils
+from tests.common_test import CpuCommonTest
 
 
 class TestStack(CpuCommonTest):
@@ -84,7 +85,7 @@ def test_tuple(self):
     def test_tuple_list(self):
         self._test_impl(False)
 
-    @util.assert_raises
+    @testing_utils.assert_raises
     def test_empty_fn(self):
         Tuple([Stack()], Pad(axis=0, pad_val=0))
 
diff --git a/tests/data/test_sampler.py b/tests/data/test_sampler.py
@@ -12,34 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
 import os
+import unittest
 
 from paddlenlp.data import SamplerHelper
 from paddlenlp.datasets import load_dataset
-
-from common_test import CpuCommonTest
-import util
-import unittest
+from tests.common_test import CpuCommonTest
+from tests.testing_utils import assert_raises, get_tests_dir
 
 
 def cmp(x, y):
     return -1 if x < y else 1 if x > y else 0
 
 
 class TestSampler(CpuCommonTest):
-    def setUp(self):
-        self.config["path"] = "imdb"
-        self.config["splits"] = "train"
-        self.train_ds = load_dataset(**self.config)
+    @classmethod
+    def setUpClass(cls):
+        fixture_path = get_tests_dir(os.path.join("fixtures", "dummy"))
+        cls.train_ds = load_dataset("clue", "tnews", data_files=[os.path.join(fixture_path, "tnews", "train.json")])
 
     def test_length(self):
         train_batch_sampler = SamplerHelper(self.train_ds)
-        self.check_output_equal(len(train_batch_sampler), 25000)
+        self.check_output_equal(len(train_batch_sampler), 10)
         self.check_output_equal(len(train_batch_sampler), train_batch_sampler.length)
 
-        train_batch_sampler.length = 20
-        self.check_output_equal(len(train_batch_sampler), 20)
+        train_batch_sampler.length = 5
+        self.check_output_equal(len(train_batch_sampler), 5)
 
     def test_iter1(self):
         train_ds_len = len(self.train_ds)
@@ -63,23 +61,15 @@ def test_list(self):
     def test_shuffle_no_buffer_size(self):
         train_batch_sampler = SamplerHelper(self.train_ds)
         shuffle_sampler = train_batch_sampler.shuffle(seed=102)
-        expected_result = {
-            0: 5189,
-            12000: 11777,
-            24999: 10496,
-        }
+        expected_result = {0: 4, 1: 9}
         for i, sample in enumerate(shuffle_sampler):
             if i in expected_result.keys():
                 self.check_output_equal(sample, expected_result[i])
 
     def test_shuffle_buffer_size(self):
         train_batch_sampler = SamplerHelper(self.train_ds)
         shuffle_sampler = train_batch_sampler.shuffle(buffer_size=10, seed=102)
-        expected_result = {
-            0: 4,
-            12000: 12003,
-            24999: 24997,
-        }
+        expected_result = {0: 4, 1: 9}
         for i, sample in enumerate(shuffle_sampler):
             if i in expected_result.keys():
                 self.check_output_equal(sample, expected_result[i])
@@ -88,12 +78,12 @@ def test_sort_buffer_size(self):
         train_ds_len = len(self.train_ds)
         ds_iter = iter(range(train_ds_len - 1, -1, -1))
         train_batch_sampler = SamplerHelper(self.train_ds, ds_iter)
-        sort_sampler = train_batch_sampler.sort(cmp=lambda x, y, dataset: cmp(x, y), buffer_size=12500)
+        sort_sampler = train_batch_sampler.sort(cmp=lambda x, y, dataset: cmp(x, y), buffer_size=5)
         for i, sample in enumerate(sort_sampler):
-            if i < 12500:
-                self.check_output_equal(i + 12500, sample)
+            if i < 5:
+                self.check_output_equal(i + 5, sample)
             else:
-                self.check_output_equal(i - 12500, sample)
+                self.check_output_equal(i - 5, sample)
 
     def test_sort_no_buffer_size(self):
         train_ds_len = len(self.train_ds)
@@ -111,14 +101,16 @@ def test_batch(self):
             for j, minibatch in enumerate(sample):
                 self.check_output_equal(i * batch_size + j, minibatch)
 
-    @util.assert_raises(ValueError)
+    @assert_raises(ValueError)
     def test_batch_oversize(self):
         train_batch_sampler = SamplerHelper(self.train_ds)
         batch_size = 3
-        key = lambda size_so_far, minibatch_len: max(size_so_far, minibatch_len)
-        batch_size_fn = lambda new, count, sofar, data_source: len(data_source)
 
-        batch_sampler = train_batch_sampler.batch(batch_size, key=key, batch_size_fn=batch_size_fn)
+        batch_sampler = train_batch_sampler.batch(
+            batch_size,
+            key=lambda size_so_far, minibatch_len: max(size_so_far, minibatch_len),
+            batch_size_fn=lambda new, count, sofar, data_source: len(data_source),
+        )
         for i, sample in enumerate(batch_sampler):
             for j, minibatch in enumerate(sample):
                 self.check_output_equal(i * batch_size + j, minibatch)
@@ -143,8 +135,9 @@ def test_apply(self):
         train_ds_len = len(self.train_ds)
         ds_iter = iter(range(train_ds_len - 1, -1, -1))
         train_batch_sampler = SamplerHelper(self.train_ds, ds_iter)
-        fn = lambda sampler: SamplerHelper.sort(sampler, cmp=lambda x, y, dataset: cmp(x, y))
-        apply_sampler = train_batch_sampler.apply(fn)
+        apply_sampler = train_batch_sampler.apply(
+            lambda sampler: SamplerHelper.sort(sampler, cmp=lambda x, y, dataset: cmp(x, y))
+        )
         for i, sample in enumerate(apply_sampler):
             self.check_output_equal(i, sample)
 
diff --git a/tests/data/test_tokenizer.py b/tests/data/test_tokenizer.py
@@ -12,13 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-import os
+import unittest
 
 from paddlenlp.data import JiebaTokenizer, Vocab
-from common_test import CpuCommonTest
-from util import create_test_data
-import unittest
+from tests.common_test import CpuCommonTest
+from tests.testing_utils import create_test_data
 
 
 class TestJiebaTokenizer(CpuCommonTest):
diff --git a/tests/data/test_vocab.py b/tests/data/test_vocab.py
@@ -12,14 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-import os
+import unittest
+from collections import Counter
 
 from paddlenlp.data import Vocab
-from common_test import CpuCommonTest
-from collections import Counter
-import util
-import unittest
+from tests import testing_utils
+from tests.common_test import CpuCommonTest
 
 
 class TestVocab(CpuCommonTest):
@@ -36,30 +34,30 @@ def create_counter(self):
     def setUp(self):
         self.create_counter()
 
-    @util.assert_raises(ValueError)
+    @testing_utils.assert_raises(ValueError)
     def test_invalid_specail_token(self):
         Vocab(wrong_kwarg="")
 
-    @util.assert_raises(ValueError)
+    @testing_utils.assert_raises(ValueError)
     def test_invalid_identifier(self):
         Vocab(counter=self.counter, _special_token="")
 
-    @util.assert_raises(ValueError)
+    @testing_utils.assert_raises(ValueError)
     def test_sort_index_value_error1(self):
         token_to_idx = {"一万七千多": 1, "一万七千余": 2, "IP地址": 3}
-        vocab = Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)
+        Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)
 
-    @util.assert_raises(ValueError)
+    @testing_utils.assert_raises(ValueError)
     def test_sort_index_value_error2(self):
         token_to_idx = {"一万七千多": 1, "一万七千余": 2, "一万七千": 2}
         Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)
 
-    @util.assert_raises(ValueError)
+    @testing_utils.assert_raises(ValueError)
     def test_sort_index_value_error3(self):
         token_to_idx = {"一万七千多": -1, "一万七千余": 2, "一万七千": 3}
         Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)
 
-    @util.assert_raises(ValueError)
+    @testing_utils.assert_raises(ValueError)
     def test_to_token_excess_size(self):
         token_to_idx = {"一万七千多": 1, "一万七千余": 2, "一万万": 3}
         vocab = Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)
diff --git a/tests/dataset/test_imdb.py b/tests/dataset/test_imdb.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,37 +11,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
-import os
 import unittest
-from paddlenlp.datasets import load_dataset
 
-from common_test import CpuCommonTest
-import util
-import unittest
+from paddlenlp.datasets import load_dataset
+from tests.common_test import CpuCommonTest
+from tests.testing_utils import assert_raises, slow
 
 
 def get_examples(mode="train"):
     examples = {
         "train": (
-            "I loved this movie since I was 7 and I saw it on the opening day "
-            "It was so touching and beautiful I strongly recommend seeing for "
-            "all Its a movie to watch with your family by farbr br My MPAA rating "
-            "PG13 for thematic elements prolonged scenes of disastor nuditysexuality "
-            "and some language",
+            "I loved this movie since I was 7 and I saw it on the opening day. "
+            "It was so touching and beautiful. I strongly recommend seeing for all. "
+            "It's a movie to watch with your family by far.<br /><br />"
+            "My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, "
+            "nudity/sexuality and some language.",
             1,
         ),
         "test": (
-            "Felix in Hollywood is a great film The version I viewed was very well "
-            "restored which is sometimes a problem with these silent era animated films "
-            "It has some of Hollywoods most famous stars making cameo animated "
-            "appearances A must for any silent film or animation enthusiast",
+            "Felix in Hollywood is a great film. The version I viewed was very well restored, "
+            "which is sometimes a problem with these silent era animated films. It has some of "
+            "Hollywood's most famous stars making cameo animated appearances. A must for any "
+            "silent film or animation enthusiast.",
             1,
         ),
     }
     return examples[mode]
 
 
+@slow
 class TestImdbTrainSet(CpuCommonTest):
     def setUp(self):
         self.config["path_or_read_func"] = "imdb"
@@ -56,6 +54,7 @@ def test_train_set(self):
         self.check_output_equal(expected_label, train_ds[36]["label"])
 
 
+@slow
 class TestImdbTestSet(CpuCommonTest):
     def setUp(self):
         self.config["path_or_read_func"] = "imdb"
@@ -70,6 +69,7 @@ def test_test_set(self):
         self.check_output_equal(expected_label, test_ds[23]["label"])
 
 
+@slow
 class TestImdbTrainTestSet(CpuCommonTest):
     def setUp(self):
         self.config["path_or_read_func"] = "imdb"
@@ -96,7 +96,7 @@ class TestImdbNoSplitDataFiles(CpuCommonTest):
     def setUp(self):
         self.config["path_or_read_func"] = "imdb"
 
-    @util.assert_raises
+    @assert_raises
     def test_no_split_datafiles(self):
         load_dataset(**self.config)
 
diff --git a/tests/fixtures/dummy/tnews/dev.json b/tests/fixtures/dummy/tnews/dev.json
@@ -0,0 +1,10 @@
+{"label": "102", "label_desc": "news_entertainment", "sentence": "江疏影甜甜圈自拍，迷之角度竟这么好看，美吸引一切事物", "keywords": "江疏影,美少女,经纪人,甜甜圈"}
+{"label": "110", "label_desc": "news_military", "sentence": "以色列大规模空袭开始！伊朗多个军事目标遭遇打击，誓言对等反击", "keywords": "伊朗,圣城军,叙利亚,以色列国防军,以色列"}
+{"label": "104", "label_desc": "news_finance", "sentence": "出栏一头猪亏损300元，究竟谁能笑到最后！", "keywords": "商品猪,养猪,猪价,仔猪,饲料"}
+{"label": "109", "label_desc": "news_tech", "sentence": "以前很火的巴铁为何现在只字不提？", "keywords": ""}
+{"label": "112", "label_desc": "news_travel", "sentence": "作为一名酒店从业人员，你经历过房客哪些特别没有素质的行为？", "keywords": ""}
+{"label": "101", "label_desc": "news_culture", "sentence": "走进荀子的世界 触摸二千年前的心灵温度", "keywords": "荀子导读,韩非子,荀卿,深切著明,稷下学宫,稷下学史,劝学,荀子,中国哲学史,儒家,风俗通义,史记·孟子荀卿列传,中国哲学,大略,成相"}
+{"label": "109", "label_desc": "news_tech", "sentence": "图解：全要素 多领域 高效益 天津智能科技军民融合发展", "keywords": "高效益,天津"}
+{"label": "104", "label_desc": "news_finance", "sentence": "区块链投资心得，能做到就不会亏钱", "keywords": "机会主义,盲人摸象,比特币,区块链,张大千"}
+{"label": "106", "label_desc": "news_house", "sentence": "你家拆迁，要钱还是要房？答案一目了然", "keywords": "房价,房产,货币化安置,三四线城市,买房"}
+{"label": "106", "label_desc": "news_house", "sentence": "军嫂探亲拧包入住，部队家属临时来队房标准有了规定，全面落实！", "keywords": "包入住,热水器,空房子"}
diff --git a/tests/fixtures/dummy/tnews/train.json b/tests/fixtures/dummy/tnews/train.json
@@ -0,0 +1,10 @@
+{"label": "108", "label_desc": "news_edu", "sentence": "上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？", "keywords": ""}
+{"label": "104", "label_desc": "news_finance", "sentence": "商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告", "keywords": "商赢环球股份有限公司,年度报告,商赢环球,赢环球股份有限公司,事后审核问询函,上海证券交易所"}
+{"label": "106", "label_desc": "news_house", "sentence": "通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？", "keywords": ""}
+{"label": "112", "label_desc": "news_travel", "sentence": "2018年去俄罗斯看世界杯得花多少钱？", "keywords": "莫斯科,贝加尔湖,世界杯,俄罗斯,Hour"}
+{"label": "109", "label_desc": "news_tech", "sentence": "剃须刀的个性革新，雷明登天猫定制版新品首发", "keywords": "剃须刀,绝地求生,定制版,战狼2,红海行动,天猫定制版三防,雷明登,维克托"}
+{"label": "103", "label_desc": "news_sports", "sentence": "再次证明了“无敌是多么寂寞”——逆天的中国乒乓球队！", "keywords": "世乒赛,张怡宁,许昕,兵乓球,乒乓球"}
+{"label": "109", "label_desc": "news_tech", "sentence": "三农盾SACC-全球首个推出：互联网+区块链+农产品的电商平台", "keywords": "湖南省,区块链,物联网,集中化,SACC三农盾"}
+{"label": "116", "label_desc": "news_game", "sentence": "重做or新英雄？其实重做对暴雪来说同样重要", "keywords": "暴雪,重做,新英雄,黑百合,英雄联盟"}
+{"label": "103", "label_desc": "news_sports", "sentence": "如何在商业活动中不受人欺骗？", "keywords": ""}
+{"label": "101", "label_desc": "news_culture", "sentence": "87版红楼梦最温柔的四个丫鬟，娶谁都是一生的福气", "keywords": "欧阳奋强,贾宝玉,花袭人,红楼梦,平儿"}