Skip to content

Commit dd792d6

Browse files
authored
[CI] Fix data and datasets tests (#4215)
* changes
* add data, datasets
* add dataset
* remove data
1 parent 28a7ef1 commit dd792d6

File tree

9 files changed: 89 additions (+89) and 75 deletions (-75)

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ exclude = ['.flake8']
1212
minversion = "6.0"
1313
addopts = "-ra -q"
1414
testpaths = [
15+
"tests/data",
16+
"tests/datasets",
1517
"tests/transformers",
1618
"tests/prompt",
1719
"tests/taskflow",

tests/common_test.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
import numpy as np
1514
import unittest
16-
import paddle
1715
import warnings
1816

17+
import numpy as np
18+
import paddle
19+
1920
__all__ = ["CommonTest", "CpuCommonTest"]
2021

2122

@@ -92,7 +93,7 @@ def _check_output_impl(self, result, expected_result, rtol, atol, equal=True):
9293
error_msg = "Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}"
9394
if result_t in [list, tuple]:
9495
result_t = get_container_type(result)
95-
if result_t in [str, int, bool, set, np.bool, np.int32, np.int64, np.str]:
96+
if result_t in [str, int, bool, set, bool, np.int32, np.int64]:
9697
assertForNormalType(
9798
result,
9899
expected_result,

tests/data/test_collate.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import unittest
16+
1517
import numpy as np
1618

17-
from paddlenlp.data import Stack, Pad, Tuple, Dict
18-
from common_test import CpuCommonTest
19-
import util
20-
import unittest
19+
from paddlenlp.data import Dict, Pad, Stack, Tuple
20+
from tests import testing_utils
21+
from tests.common_test import CpuCommonTest
2122

2223

2324
class TestStack(CpuCommonTest):
@@ -84,7 +85,7 @@ def test_tuple(self):
8485
def test_tuple_list(self):
8586
self._test_impl(False)
8687

87-
@util.assert_raises
88+
@testing_utils.assert_raises
8889
def test_empty_fn(self):
8990
Tuple([Stack()], Pad(axis=0, pad_val=0))
9091

tests/data/test_sampler.py

Lines changed: 25 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,34 +12,32 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import numpy as np
1615
import os
16+
import unittest
1717

1818
from paddlenlp.data import SamplerHelper
1919
from paddlenlp.datasets import load_dataset
20-
21-
from common_test import CpuCommonTest
22-
import util
23-
import unittest
20+
from tests.common_test import CpuCommonTest
21+
from tests.testing_utils import assert_raises, get_tests_dir
2422

2523

2624
def cmp(x, y):
2725
return -1 if x < y else 1 if x > y else 0
2826

2927

3028
class TestSampler(CpuCommonTest):
31-
def setUp(self):
32-
self.config["path"] = "imdb"
33-
self.config["splits"] = "train"
34-
self.train_ds = load_dataset(**self.config)
29+
@classmethod
30+
def setUpClass(cls):
31+
fixture_path = get_tests_dir(os.path.join("fixtures", "dummy"))
32+
cls.train_ds = load_dataset("clue", "tnews", data_files=[os.path.join(fixture_path, "tnews", "train.json")])
3533

3634
def test_length(self):
3735
train_batch_sampler = SamplerHelper(self.train_ds)
38-
self.check_output_equal(len(train_batch_sampler), 25000)
36+
self.check_output_equal(len(train_batch_sampler), 10)
3937
self.check_output_equal(len(train_batch_sampler), train_batch_sampler.length)
4038

41-
train_batch_sampler.length = 20
42-
self.check_output_equal(len(train_batch_sampler), 20)
39+
train_batch_sampler.length = 5
40+
self.check_output_equal(len(train_batch_sampler), 5)
4341

4442
def test_iter1(self):
4543
train_ds_len = len(self.train_ds)
@@ -63,23 +61,15 @@ def test_list(self):
6361
def test_shuffle_no_buffer_size(self):
6462
train_batch_sampler = SamplerHelper(self.train_ds)
6563
shuffle_sampler = train_batch_sampler.shuffle(seed=102)
66-
expected_result = {
67-
0: 5189,
68-
12000: 11777,
69-
24999: 10496,
70-
}
64+
expected_result = {0: 4, 1: 9}
7165
for i, sample in enumerate(shuffle_sampler):
7266
if i in expected_result.keys():
7367
self.check_output_equal(sample, expected_result[i])
7468

7569
def test_shuffle_buffer_size(self):
7670
train_batch_sampler = SamplerHelper(self.train_ds)
7771
shuffle_sampler = train_batch_sampler.shuffle(buffer_size=10, seed=102)
78-
expected_result = {
79-
0: 4,
80-
12000: 12003,
81-
24999: 24997,
82-
}
72+
expected_result = {0: 4, 1: 9}
8373
for i, sample in enumerate(shuffle_sampler):
8474
if i in expected_result.keys():
8575
self.check_output_equal(sample, expected_result[i])
@@ -88,12 +78,12 @@ def test_sort_buffer_size(self):
8878
train_ds_len = len(self.train_ds)
8979
ds_iter = iter(range(train_ds_len - 1, -1, -1))
9080
train_batch_sampler = SamplerHelper(self.train_ds, ds_iter)
91-
sort_sampler = train_batch_sampler.sort(cmp=lambda x, y, dataset: cmp(x, y), buffer_size=12500)
81+
sort_sampler = train_batch_sampler.sort(cmp=lambda x, y, dataset: cmp(x, y), buffer_size=5)
9282
for i, sample in enumerate(sort_sampler):
93-
if i < 12500:
94-
self.check_output_equal(i + 12500, sample)
83+
if i < 5:
84+
self.check_output_equal(i + 5, sample)
9585
else:
96-
self.check_output_equal(i - 12500, sample)
86+
self.check_output_equal(i - 5, sample)
9787

9888
def test_sort_no_buffer_size(self):
9989
train_ds_len = len(self.train_ds)
@@ -111,14 +101,16 @@ def test_batch(self):
111101
for j, minibatch in enumerate(sample):
112102
self.check_output_equal(i * batch_size + j, minibatch)
113103

114-
@util.assert_raises(ValueError)
104+
@assert_raises(ValueError)
115105
def test_batch_oversize(self):
116106
train_batch_sampler = SamplerHelper(self.train_ds)
117107
batch_size = 3
118-
key = lambda size_so_far, minibatch_len: max(size_so_far, minibatch_len)
119-
batch_size_fn = lambda new, count, sofar, data_source: len(data_source)
120108

121-
batch_sampler = train_batch_sampler.batch(batch_size, key=key, batch_size_fn=batch_size_fn)
109+
batch_sampler = train_batch_sampler.batch(
110+
batch_size,
111+
key=lambda size_so_far, minibatch_len: max(size_so_far, minibatch_len),
112+
batch_size_fn=lambda new, count, sofar, data_source: len(data_source),
113+
)
122114
for i, sample in enumerate(batch_sampler):
123115
for j, minibatch in enumerate(sample):
124116
self.check_output_equal(i * batch_size + j, minibatch)
@@ -143,8 +135,9 @@ def test_apply(self):
143135
train_ds_len = len(self.train_ds)
144136
ds_iter = iter(range(train_ds_len - 1, -1, -1))
145137
train_batch_sampler = SamplerHelper(self.train_ds, ds_iter)
146-
fn = lambda sampler: SamplerHelper.sort(sampler, cmp=lambda x, y, dataset: cmp(x, y))
147-
apply_sampler = train_batch_sampler.apply(fn)
138+
apply_sampler = train_batch_sampler.apply(
139+
lambda sampler: SamplerHelper.sort(sampler, cmp=lambda x, y, dataset: cmp(x, y))
140+
)
148141
for i, sample in enumerate(apply_sampler):
149142
self.check_output_equal(i, sample)
150143

tests/data/test_tokenizer.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,11 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import numpy as np
16-
import os
15+
import unittest
1716

1817
from paddlenlp.data import JiebaTokenizer, Vocab
19-
from common_test import CpuCommonTest
20-
from util import create_test_data
21-
import unittest
18+
from tests.common_test import CpuCommonTest
19+
from tests.testing_utils import create_test_data
2220

2321

2422
class TestJiebaTokenizer(CpuCommonTest):

tests/data/test_vocab.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,12 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import numpy as np
16-
import os
15+
import unittest
16+
from collections import Counter
1717

1818
from paddlenlp.data import Vocab
19-
from common_test import CpuCommonTest
20-
from collections import Counter
21-
import util
22-
import unittest
19+
from tests import testing_utils
20+
from tests.common_test import CpuCommonTest
2321

2422

2523
class TestVocab(CpuCommonTest):
@@ -36,30 +34,30 @@ def create_counter(self):
3634
def setUp(self):
3735
self.create_counter()
3836

39-
@util.assert_raises(ValueError)
37+
@testing_utils.assert_raises(ValueError)
4038
def test_invalid_specail_token(self):
4139
Vocab(wrong_kwarg="")
4240

43-
@util.assert_raises(ValueError)
41+
@testing_utils.assert_raises(ValueError)
4442
def test_invalid_identifier(self):
4543
Vocab(counter=self.counter, _special_token="")
4644

47-
@util.assert_raises(ValueError)
45+
@testing_utils.assert_raises(ValueError)
4846
def test_sort_index_value_error1(self):
4947
token_to_idx = {"一万七千多": 1, "一万七千余": 2, "IP地址": 3}
50-
vocab = Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)
48+
Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)
5149

52-
@util.assert_raises(ValueError)
50+
@testing_utils.assert_raises(ValueError)
5351
def test_sort_index_value_error2(self):
5452
token_to_idx = {"一万七千多": 1, "一万七千余": 2, "一万七千": 2}
5553
Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)
5654

57-
@util.assert_raises(ValueError)
55+
@testing_utils.assert_raises(ValueError)
5856
def test_sort_index_value_error3(self):
5957
token_to_idx = {"一万七千多": -1, "一万七千余": 2, "一万七千": 3}
6058
Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)
6159

62-
@util.assert_raises(ValueError)
60+
@testing_utils.assert_raises(ValueError)
6361
def test_to_token_excess_size(self):
6462
token_to_idx = {"一万七千多": 1, "一万七千余": 2, "一万万": 3}
6563
vocab = Vocab(counter=self.counter, unk_token="[UNK]", token_to_idx=token_to_idx)

tests/dataset/test_imdb.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
1+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -11,37 +11,35 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
import numpy as np
15-
import os
1614
import unittest
17-
from paddlenlp.datasets import load_dataset
1815

19-
from common_test import CpuCommonTest
20-
import util
21-
import unittest
16+
from paddlenlp.datasets import load_dataset
17+
from tests.common_test import CpuCommonTest
18+
from tests.testing_utils import assert_raises, slow
2219

2320

2421
def get_examples(mode="train"):
2522
examples = {
2623
"train": (
27-
"I loved this movie since I was 7 and I saw it on the opening day "
28-
"It was so touching and beautiful I strongly recommend seeing for "
29-
"all Its a movie to watch with your family by farbr br My MPAA rating "
30-
"PG13 for thematic elements prolonged scenes of disastor nuditysexuality "
31-
"and some language",
24+
"I loved this movie since I was 7 and I saw it on the opening day. "
25+
"It was so touching and beautiful. I strongly recommend seeing for all. "
26+
"It's a movie to watch with your family by far.<br /><br />"
27+
"My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, "
28+
"nudity/sexuality and some language.",
3229
1,
3330
),
3431
"test": (
35-
"Felix in Hollywood is a great film The version I viewed was very well "
36-
"restored which is sometimes a problem with these silent era animated films "
37-
"It has some of Hollywoods most famous stars making cameo animated "
38-
"appearances A must for any silent film or animation enthusiast",
32+
"Felix in Hollywood is a great film. The version I viewed was very well restored, "
33+
"which is sometimes a problem with these silent era animated films. It has some of "
34+
"Hollywood's most famous stars making cameo animated appearances. A must for any "
35+
"silent film or animation enthusiast.",
3936
1,
4037
),
4138
}
4239
return examples[mode]
4340

4441

42+
@slow
4543
class TestImdbTrainSet(CpuCommonTest):
4644
def setUp(self):
4745
self.config["path_or_read_func"] = "imdb"
@@ -56,6 +54,7 @@ def test_train_set(self):
5654
self.check_output_equal(expected_label, train_ds[36]["label"])
5755

5856

57+
@slow
5958
class TestImdbTestSet(CpuCommonTest):
6059
def setUp(self):
6160
self.config["path_or_read_func"] = "imdb"
@@ -70,6 +69,7 @@ def test_test_set(self):
7069
self.check_output_equal(expected_label, test_ds[23]["label"])
7170

7271

72+
@slow
7373
class TestImdbTrainTestSet(CpuCommonTest):
7474
def setUp(self):
7575
self.config["path_or_read_func"] = "imdb"
@@ -96,7 +96,7 @@ class TestImdbNoSplitDataFiles(CpuCommonTest):
9696
def setUp(self):
9797
self.config["path_or_read_func"] = "imdb"
9898

99-
@util.assert_raises
99+
@assert_raises
100100
def test_no_split_datafiles(self):
101101
load_dataset(**self.config)
102102

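(File name not captured in this extract; presumably a newly added JSON-lines test fixture under tests/fixtures/dummy/tnews/ — confirm against the commit.)
Lines changed: 11 additions & 0 deletions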
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
{"label": "102", "label_desc": "news_entertainment", "sentence": "江疏影甜甜圈自拍,迷之角度竟这么好看,美吸引一切事物", "keywords": "江疏影,美少女,经纪人,甜甜圈"}
3+
{"label": "110", "label_desc": "news_military", "sentence": "以色列大规模空袭开始!伊朗多个军事目标遭遇打击,誓言对等反击", "keywords": "伊朗,圣城军,叙利亚,以色列国防军,以色列"}
4+
{"label": "104", "label_desc": "news_finance", "sentence": "出栏一头猪亏损300元,究竟谁能笑到最后!", "keywords": "商品猪,养猪,猪价,仔猪,饲料"}
5+
{"label": "109", "label_desc": "news_tech", "sentence": "以前很火的巴铁为何现在只字不提?", "keywords": ""}
6+
{"label": "112", "label_desc": "news_travel", "sentence": "作为一名酒店从业人员,你经历过房客哪些特别没有素质的行为?", "keywords": ""}
7+
{"label": "101", "label_desc": "news_culture", "sentence": "走进荀子的世界 触摸二千年前的心灵温度", "keywords": "荀子导读,韩非子,荀卿,深切著明,稷下学宫,稷下学史,劝学,荀子,中国哲学史,儒家,风俗通义,史记·孟子荀卿列传,中国哲学,大略,成相"}
8+
{"label": "109", "label_desc": "news_tech", "sentence": "图解:全要素 多领域 高效益 天津智能科技军民融合发展", "keywords": "高效益,天津"}
9+
{"label": "104", "label_desc": "news_finance", "sentence": "区块链投资心得,能做到就不会亏钱", "keywords": "机会主义,盲人摸象,比特币,区块链,张大千"}
10+
{"label": "106", "label_desc": "news_house", "sentence": "你家拆迁,要钱还是要房?答案一目了然", "keywords": "房价,房产,货币化安置,三四线城市,买房"}
11+
{"label": "106", "label_desc": "news_house", "sentence": "军嫂探亲拧包入住,部队家属临时来队房标准有了规定,全面落实!", "keywords": "包入住,热水器,空房子"}
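(File name not captured in this extract; presumably a newly added JSON-lines test fixture under tests/fixtures/dummy/tnews/ — confirm against the commit.)
Lines changed: 10 additions & 0 deletions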
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{"label": "108", "label_desc": "news_edu", "sentence": "上课时学生手机响个不停,老师一怒之下把手机摔了,家长拿发票让老师赔,大家怎么看待这种事?", "keywords": ""}
2+
{"label": "104", "label_desc": "news_finance", "sentence": "商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告", "keywords": "商赢环球股份有限公司,年度报告,商赢环球,赢环球股份有限公司,事后审核问询函,上海证券交易所"}
3+
{"label": "106", "label_desc": "news_house", "sentence": "通过中介公司买了二手房,首付都付了,现在卖家不想卖了。怎么处理?", "keywords": ""}
4+
{"label": "112", "label_desc": "news_travel", "sentence": "2018年去俄罗斯看世界杯得花多少钱?", "keywords": "莫斯科,贝加尔湖,世界杯,俄罗斯,Hour"}
5+
{"label": "109", "label_desc": "news_tech", "sentence": "剃须刀的个性革新,雷明登天猫定制版新品首发", "keywords": "剃须刀,绝地求生,定制版,战狼2,红海行动,天猫定制版三防,雷明登,维克托"}
6+
{"label": "103", "label_desc": "news_sports", "sentence": "再次证明了“无敌是多么寂寞”——逆天的中国乒乓球队!", "keywords": "世乒赛,张怡宁,许昕,兵乓球,乒乓球"}
7+
{"label": "109", "label_desc": "news_tech", "sentence": "三农盾SACC-全球首个推出:互联网+区块链+农产品的电商平台", "keywords": "湖南省,区块链,物联网,集中化,SACC三农盾"}
8+
{"label": "116", "label_desc": "news_game", "sentence": "重做or新英雄?其实重做对暴雪来说同样重要", "keywords": "暴雪,重做,新英雄,黑百合,英雄联盟"}
9+
{"label": "103", "label_desc": "news_sports", "sentence": "如何在商业活动中不受人欺骗?", "keywords": ""}
10+
{"label": "101", "label_desc": "news_culture", "sentence": "87版红楼梦最温柔的四个丫鬟,娶谁都是一生的福气", "keywords": "欧阳奋强,贾宝玉,花袭人,红楼梦,平儿"}

0 commit comments

Comments
 (0)