Skip to content

Commit 7153318

Browse files
committed
[cblue] add datasets for sequence classification
1 parent 2986c93 commit 7153318

File tree

3 files changed

+284
-2
lines changed

3 files changed

+284
-2
lines changed

paddlenlp/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,4 @@
3939
from .nlpcc13_evsam05_hit import *
4040
from .xnli_cn import *
4141
from .xnli import *
42+
from .cblue import *

paddlenlp/datasets/cblue.py

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
import collections
2+
import json
3+
import os
4+
5+
import pandas as pd
6+
7+
from paddle.dataset.common import md5file
8+
from paddle.utils.download import get_path_from_url
9+
from paddlenlp.utils.env import DATA_HOME
10+
from . import DatasetBuilder
11+
12+
13+
class CBLUE(DatasetBuilder):
14+
'''
15+
The Chinese Biomedical Language Understanding Evaluation (CBLUE) benchmark
16+
is a collection of natural language understanding tasks including named
17+
entity recognition, information extraction, clinical diagnosis normalization
18+
and single-sentence/sentence-pair classification.
19+
From https://github.com/CBLUEbenchmark/CBLUE
20+
21+
CMeEE:
22+
The Chinese Medical Named Entity Recognition is first released in CHIP20204.
23+
Given a pre-defined schema, the task is to identify and extract entities
24+
from the given sentence and classify them into nine categories: disease,
25+
clinical manifestations, drugs, medical equipment, medical procedures,
26+
body, medical examinations, microorganisms, and department.
27+
28+
CMeIE:
29+
The Chinese Medical Information Extraction is also released in CHIP2020.
30+
The task is aimed at identifying both entities and relations in a sentence
31+
following the schema constraints. There are 53 relations defined in the dataset,
32+
including 10 synonymous sub-relationships and 43 other sub-relationships.
33+
34+
CHIP-CDN:
35+
The CHIP Clinical Diagnosis Normalization dataset aims to standardize
36+
the terms from the final diagnoses of Chinese electronic medical records.
37+
38+
CHIP-CTC:
39+
The CHIP Clinical Trial Classification dataset aims at classifying
40+
clinical trials eligibility criteria.
41+
42+
CHIP-STS:
43+
The CHIP Semantic Textual Similarity dataset consists of question pairs
44+
related to 5 different diseases and aims to determine sentence similarity.
45+
46+
KUAKE-QIC:
47+
The KUAKE Query Intent Classification dataset is used to classify queries
48+
of search engines into one of 11 medical intent categories, including
49+
diagnosis, etiology analysis, treatment plan, medical advice, test result
50+
analysis, disease description, consequence prediction, precautions, intended
51+
effects, treatment fees, and others.
52+
53+
KUAKE-QTR:
54+
The KUAKE Query Title Relevance dataset is used to estimate the
55+
relevance of the title of a query document.
56+
57+
KUAKE-QQR:
58+
The KUAKE Query-Query Relevance dataset is used to evaluate the
59+
relevance of the content expressed in two queries.
60+
'''
61+
62+
# Per-task download and parsing configuration, keyed by task name.
#   'url'    - archive to download for the task.
#   'md5'    - checksum of that archive.
#   'splits' - maps a split name to
#              [path relative to the data root, md5 of the file, input keys],
#              where the input keys name the text field(s) of each example.
#   'labels' - either the label list itself, or the name of a file inside
#              the task directory that holds the labels.
BUILDER_CONFIGS = {
    'CMeEE': {
        'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CMeEE.zip',
        'md5': '2f21afc5d95918346b673f84eecd06b1',
        'splits': {
            'train': [
                os.path.join('CMeEE', 'CMeEE_train.json'),
                '725b34819dd49a0ce028c37e4ad0a73b', ['text']
            ],
            'dev': [
                os.path.join('CMeEE', 'CMeEE_dev.json'),
                '42778760dcce7b9ada6e290f7b2a59c2', ['text']
            ],
            'test': [
                os.path.join('CMeEE', 'CMeEE_test.json'),
                # The input key is 'text', the same as for train/dev.
                'c45b3b3d79ca29776e3d9f009b7d6ee5', ['text']
            ]
        },
        'labels': ['dis', 'sym', 'pro', 'equ', 'dru', 'ite', 'bod', 'dep', 'mic']
    },
    'CMeIE': {
        'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CMeIE.zip',
        'md5': '444569dfc31580c8cfa18843d0a1bd59',
        'splits': {
            'train': [
                os.path.join('CMeIE', 'CMeIE_train.json'),
                'd27a7d4f0f5326018db66f64ac63780c', ['text']
            ],
            'dev': [
                os.path.join('CMeIE', 'CMeIE_dev.json'),
                '54203d1e775a2f07aaea30b61b93ca2f', ['text']
            ],
            'test': [
                os.path.join('CMeIE', 'CMeIE_test.json'),
                '8ac74722e9448fdc76132206582b9a06', ['text']
            ]
        },
        'labels': '53_schema.json'
    },
    'CHIP-CDN': {
        'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-CDN.zip',
        'md5': 'e378d6bfe6740aadfb197ca352db3427',
        'splits': {
            'train': [
                os.path.join('CHIP-CDN', 'CHIP-CDN_train.json'),
                '2940ff04e91f52722f10010e5cbc1f18', ['text']
            ],
            'dev': [
                os.path.join('CHIP-CDN', 'CHIP-CDN_dev.json'),
                'c718cdd36f913deb11a1a0b46de51015', ['text']
            ],
            'test': [
                os.path.join('CHIP-CDN', 'CHIP-CDN_test.json'),
                '8dbe229a23af30bd7c3c5bdcdf156314', ['text']
            ]
        },
        'labels': '国际疾病分类 ICD-10北京临床版v601.xlsx'
    },
    'CHIP-CTC': {
        'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-CTC.zip',
        'md5': '43d804211d46f9374c18ab13d6984f29',
        'splits': {
            'train': [
                os.path.join('CHIP-CTC', 'CHIP-CTC_train.json'),
                '098ac22cafe7446393d941612f906531', ['text']
            ],
            'dev': [
                os.path.join('CHIP-CTC', 'CHIP-CTC_dev.json'),
                'b48d52fd686bea286de1a3b123398483', ['text']
            ],
            'test': [
                os.path.join('CHIP-CTC', 'CHIP-CTC_test.json'),
                '6a5f0f20f8f85f727d9ef1ea09f939d9', ['text']
            ]
        },
        'labels': 'category.xlsx'
    },
    'CHIP-STS': {
        'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-STS.zip',
        'md5': '4d4db5ef14336e3179e4e1f3c1cc2621',
        'splits': {
            'train': [
                os.path.join('CHIP-STS', 'CHIP-STS_train.json'),
                'c6150e2628f107cf2657feb4ed2ba65b',
                ['text1', 'text2']
            ],
            'dev': [
                os.path.join('CHIP-STS', 'CHIP-STS_dev.json'),
                '2813ecc0222ef8e4612296776e54639d',
                ['text1', 'text2']
            ],
            'test': [
                os.path.join('CHIP-STS', 'CHIP-STS_test.json'),
                '44394681097024aa922e4e33fa651360',
                ['text1', 'text2']
            ]
        },
        'labels': ['0', '1']
    },
    'KUAKE-QIC': {
        'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/KUAKE-QIC.zip',
        'md5': '7661e3a6b5daf4ee025ba407669788d8',
        'splits': {
            'train': [
                os.path.join('KUAKE-QIC', 'KUAKE-QIC_train.json'),
                'fc7e359decfcf7b1316e7833acc97b8a', ['query']
            ],
            'dev': [
                os.path.join('KUAKE-QIC', 'KUAKE-QIC_dev.json'),
                '2fd1f4131916239d89b213cc9860c1c6', ['query']
            ],
            'test': [
                os.path.join('KUAKE-QIC', 'KUAKE-QIC_test.json'),
                '337dc7f3cdc77b1a21b534ecb3142a6b', ['query']
            ]
        },
        'labels': ['病情诊断', '治疗方案', '病因分析', '指标解读',
                   '就医建议', '疾病表述', '后果表述',
                   '注意事项', '功效作用', '医疗费用', '其他']
    },
    'KUAKE-QTR': {
        'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/KUAKE-QTR.zip',
        'md5': 'a59686c2b489ac64ff6f0f029c1df068',
        'splits': {
            'train': [
                os.path.join('KUAKE-QTR', 'KUAKE-QTR_train.json'),
                '7197f9ca963f337fc81ce6c8a1c97dc4', ['query', 'title']
            ],
            'dev': [
                os.path.join('KUAKE-QTR', 'KUAKE-QTR_dev.json'),
                'e6c480aa46ef2dd04290afe165cdfa9a', ['query', 'title']
            ],
            'test': [
                os.path.join('KUAKE-QTR', 'KUAKE-QTR_test.json'),
                '4ccfcf83eef0563b16914d5455d225a5', ['query', 'title']
            ]
        },
        'labels': ['0', '1', '2', '3']
    },
    'KUAKE-QQR': {
        'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/KUAKE-QQR.zip',
        'md5': 'b7fdeed0ae56e450d7cf3aa7c0b19e20',
        'splits': {
            'train': [
                os.path.join('KUAKE-QQR', 'KUAKE-QQR_train.json'),
                'f667e31610acf3f107369310b78d56a9',
                ('query1', 'query2')
            ],
            'dev': [
                os.path.join('KUAKE-QQR', 'KUAKE-QQR_dev.json'),
                '597354382a806b8168a705584f4f6887',
                ('query1', 'query2')
            ],
            'test': [
                os.path.join('KUAKE-QQR', 'KUAKE-QQR_test.json'),
                '2d257135c6e1651d24a84496dd50c658',
                ('query1', 'query2')
            ]
        },
        'labels': ['0', '1', '2']
    }
}
224+
225+
226+
def _get_data(self, mode, **kwargs):
    """
    Returns the local path of the data file for the split `mode`.

    The archive of the current task is fetched with `get_path_from_url`
    when the split file is absent, or when a checksum is configured for
    the file and the file on disk does not match it.
    """
    task_config = self.BUILDER_CONFIGS[self.name]
    data_root = os.path.join(DATA_HOME, self.__class__.__name__)
    relative_path, expected_md5, _ = task_config['splits'][mode]
    local_path = os.path.join(data_root, relative_path)

    is_missing = not os.path.exists(local_path)
    # The checksum is only computed for a file that exists.
    if is_missing or (expected_md5 and md5file(local_path) != expected_md5):
        get_path_from_url(task_config['url'], data_root, task_config['md5'])

    return local_path
236+
237+
238+
def _read(self, filename, split):
239+
if self.name == 'CMeIE':
240+
pass
241+
elif self.name == 'CMeEE':
242+
pass
243+
else:
244+
_, _, input_keys = self.BUILDER_CONFIGS[self.name]['splits'][split]
245+
with open(filename, 'r', encoding='utf-8') as f:
246+
data_list = json.load(f)
247+
for data in data_list:
248+
if data.get('normalized_result', None):
249+
data['labels'] = [x.strip('"') for x in data['normalized_result'].split('##')]
250+
data.pop('normalized_result')
251+
data['text_a'] = data[input_keys[0]]
252+
data.pop(input_keys[0])
253+
if len(input_keys) > 1:
254+
data['text_b'] = data[input_keys[1]]
255+
data.pop(input_keys[1])
256+
yield data
257+
258+
259+
def get_labels(self):
    """
    Returns labels of the CBLUE task.

    When the task configuration stores the labels as a list, that list is
    returned directly. When it stores a file name, the labels are read
    from the task directory under the data root:

    - 'CHIP-CDN': column 1 of the first '.xlsx' file found in the task
      directory (read without a header row).
    - 'CHIP-CTC': the 'Label Name' column of the configured spreadsheet.
    - 'CMeIE': one JSON object per non-blank line of the configured file.

    Any other task configured with a file name yields None.
    """
    labels = self.BUILDER_CONFIGS[self.name]['labels']
    if not isinstance(labels, str):
        return labels

    # From here on `labels` is the name of a file holding the labels.
    default_root = os.path.join(DATA_HOME, self.__class__.__name__)
    label_dir = os.path.join(default_root, self.name)

    if self.name == 'CHIP-CDN':
        # The spreadsheet is located by extension, not by the configured name.
        name = [x for x in os.listdir(label_dir) if x.endswith('.xlsx')][0]
        label_table = pd.read_excel(os.path.join(label_dir, name), header=None)
        return label_table[1].values
    elif self.name == 'CHIP-CTC':
        label_table = pd.read_excel(os.path.join(label_dir, labels))
        return label_table['Label Name'].values
    elif self.name == 'CMeIE':
        # Keep the file name and the parsed result in separate variables:
        # joining the path with the result list raises a TypeError.
        schema_path = os.path.join(label_dir, labels)
        with open(schema_path, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]
    return None

paddlenlp/metrics/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,6 @@
1616
from .chunk import ChunkEvaluator
1717
from .bleu import BLEU, BLEUForDuReader
1818
from .rouge import RougeL, RougeLForDuReader, RougeN, Rouge1, Rouge2
19-
from .glue import AccuracyAndF1, Mcc, PearsonAndSpearman
19+
from .glue import AccuracyAndF1, Mcc, PearsonAndSpearman, MultiLabelsMetric
2020
from .distinct import Distinct
21-
from .sighan import DetectionF1, CorrectionF1
21+
from .sighan import DetectionF1, CorrectionF1

0 commit comments

Comments
 (0)