Skip to content

Commit 6b1fa66

Browse files
dyan-dyZeyuChensmallv0221
authored
modified bq_corpus.py (#562)
Co-authored-by: Zeyu Chen <[email protected]> Co-authored-by: smallv0221 <[email protected]>
1 parent 480a33d commit 6b1fa66

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

paddlenlp/datasets/bq_corpus.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import collections
2+
import json
3+
import os
4+
5+
from paddle.dataset.common import md5file
6+
from paddle.utils.download import get_path_from_url
7+
from paddlenlp.utils.env import DATA_HOME
8+
from . import DatasetBuilder
9+
10+
# Public names exported by this module on a wildcard import.
__all__ = ['BQCorpus']
11+
12+
13+
class BQCorpus(DatasetBuilder):
    """
    BQCorpus: the largest dataset available for the banking and finance
    sector.

    Contributed by frozenfish123@Wuhan University.

    Every example produced by this builder is a dict with the keys
    ``sentence1``, ``sentence2`` and ``label``.
    """
    lazy = False
    # Archive holding every split, and the MD5 checksum passed to the
    # downloader together with it.
    URL = "https://dataset-bj.cdn.bcebos.com/qianyan/bq_corpus.zip"
    MD5 = "abe6c480b96cb705b4d24bd522848009"
    # (path relative to the dataset root, expected MD5 of that file)
    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
    SPLITS = {
        'train': META_INFO(
            os.path.join('BQCorpus', 'train.tsv'),
            'd37683e9ee778ee2f4326033b654adb9'),
        'dev': META_INFO(
            os.path.join('BQCorpus', 'dev.tsv'),
            '8a71f2a69453646921e9ee1aa457d1e4'),
        'test': META_INFO(
            os.path.join('BQCorpus', 'test.tsv'),
            'c797995baa248b144ceaa4018b191e52'),
    }

    def _get_data(self, mode, **kwargs):
        """
        Check the local copy of a split and download the dataset if needed.

        Args:
            mode (str): Key of ``SPLITS`` ('train', 'dev' or 'test').
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            str: Path of the split file under ``DATA_HOME/<class name>``.

        Raises:
            KeyError: If ``mode`` is not a key of ``SPLITS``.
        """
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, data_hash = self.SPLITS[mode]
        fullname = os.path.join(default_root, filename)
        # Fetch the archive when the file is missing or its checksum does not
        # match the one recorded in SPLITS.
        if not os.path.exists(fullname) or (
                data_hash and md5file(fullname) != data_hash):
            get_path_from_url(self.URL, default_root, self.MD5)

        return fullname

    def _read(self, filename):
        """
        Read a tab-separated split file and yield one example per row.

        The first line of the file is always skipped as a header row.
        NOTE(review): confirm the data files really start with a header;
        otherwise the first example is silently dropped.

        Blank lines are ignored. Rows with two columns (no label) are
        yielded with an empty-string label.

        Args:
            filename (str): Path of the ``.tsv`` file to read.

        Yields:
            dict: ``{"sentence1": str, "sentence2": str, "label": str}``.

        Raises:
            ValueError: If a row has neither two nor three columns.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            # Drop the header row (a no-op on an empty file).
            next(f, None)
            for line_no, line in enumerate(f, start=2):
                line = line.strip()
                if not line:
                    # Tolerate blank rows, e.g. a trailing newline at EOF.
                    continue
                fields = line.split("\t")
                if len(fields) == 3:
                    sentence1, sentence2, label = fields
                elif len(fields) == 2:
                    # Unlabeled row: keep the example, leave the label empty.
                    sentence1, sentence2 = fields
                    label = ""
                else:
                    raise ValueError(
                        "%s:%d: expected 2 or 3 tab-separated fields, got %d"
                        % (filename, line_no, len(fields)))
                yield {
                    "sentence1": sentence1,
                    "sentence2": sentence2,
                    "label": label
                }

    def get_labels(self):
        """
        Return labels of the BQCorpus object.
        """
        return ["0", "1"]

0 commit comments

Comments
 (0)