|
| 1 | +import collections |
| 2 | +import json |
| 3 | +import os |
| 4 | + |
| 5 | +import pandas as pd |
| 6 | + |
| 7 | +from paddle.dataset.common import md5file |
| 8 | +from paddle.utils.download import get_path_from_url |
| 9 | +from paddlenlp.utils.env import DATA_HOME |
| 10 | +from . import DatasetBuilder |
| 11 | + |
| 12 | + |
class CBLUE(DatasetBuilder):
    """
    The Chinese Biomedical Language Understanding Evaluation (CBLUE) benchmark
    is a collection of natural language understanding tasks including named
    entity recognition, information extraction, clinical diagnosis normalization
    and single-sentence/sentence-pair classification.
    From https://github.com/CBLUEbenchmark/CBLUE

    CMeEE:
        The Chinese Medical Named Entity Recognition is first released in CHIP2020.
        Given a pre-defined schema, the task is to identify and extract entities
        from the given sentence and classify them into nine categories: disease,
        clinical manifestations, drugs, medical equipment, medical procedures,
        body, medical examinations, microorganisms, and department.

    CMeIE:
        The Chinese Medical Information Extraction is also released in CHIP2020.
        The task is aimed at identifying both entities and relations in a sentence
        following the schema constraints. There are 53 relations defined in the dataset,
        including 10 synonymous sub-relationships and 43 other sub-relationships.

    CHIP-CDN:
        The CHIP Clinical Diagnosis Normalization dataset aims to standardize
        the terms from the final diagnoses of Chinese electronic medical records.

    CHIP-CTC:
        The CHIP Clinical Trial Classification dataset aims at classifying
        clinical trials eligibility criteria.

    CHIP-STS:
        The CHIP Semantic Textual Similarity dataset consists of question pairs
        related to 5 different diseases and aims to determine sentence similarity.

    KUAKE-QIC:
        The KUAKE Query Intent Classification dataset is used to classify queries
        of search engines into one of 11 medical intent categories, including
        diagnosis, etiology analysis, treatment plan, medical advice, test result
        analysis, disease description, consequence prediction, precautions, intended
        effects, treatment fees, and others.

    KUAKE-QTR:
        The KUAKE Query Title Relevance dataset is used to estimate the
        relevance of the title of a query document.

    KUAKE-QQR:
        The KUAKE Query-Query Relevance dataset is used to evaluate the
        relevance of the content expressed in two queries.
    """

    # Per-task configuration, keyed by task name (``self.name``):
    #   'url'    - location of the task archive
    #   'md5'    - checksum handed to ``get_path_from_url`` for the archive
    #   'splits' - split name -> [relative file path, file md5, input keys];
    #              the input keys name the record fields that ``_read``
    #              renames to 'text_a' (and 'text_b' when there are two)
    #   'labels' - either the label list itself, or the name of a label
    #              file that ``get_labels`` loads from the task directory
    BUILDER_CONFIGS = {
        'CMeEE': {
            'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CMeEE.zip',
            'md5': '2f21afc5d95918346b673f84eecd06b1',
            'splits': {
                'train': [
                    os.path.join('CMeEE', 'CMeEE_train.json'),
                    '725b34819dd49a0ce028c37e4ad0a73b', ['text']
                ],
                'dev': [
                    os.path.join('CMeEE', 'CMeEE_dev.json'),
                    '42778760dcce7b9ada6e290f7b2a59c2', ['text']
                ],
                'test': [
                    os.path.join('CMeEE', 'CMeEE_test.json'),
                    'c45b3b3d79ca29776e3d9f009b7d6ee5', ['text']
                ]
            },
            'labels': ['dis', 'sym', 'pro', 'equ', 'dru', 'ite', 'bod', 'dep', 'mic']
        },
        'CMeIE': {
            'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CMeIE.zip',
            'md5': '444569dfc31580c8cfa18843d0a1bd59',
            'splits': {
                'train': [
                    os.path.join('CMeIE', 'CMeIE_train.json'),
                    'd27a7d4f0f5326018db66f64ac63780c', ['text']
                ],
                'dev': [
                    os.path.join('CMeIE', 'CMeIE_dev.json'),
                    '54203d1e775a2f07aaea30b61b93ca2f', ['text']
                ],
                'test': [
                    os.path.join('CMeIE', 'CMeIE_test.json'),
                    '8ac74722e9448fdc76132206582b9a06', ['text']
                ]
            },
            'labels': '53_schema.json'
        },
        'CHIP-CDN': {
            'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-CDN.zip',
            'md5': 'e378d6bfe6740aadfb197ca352db3427',
            'splits': {
                'train': [
                    os.path.join('CHIP-CDN', 'CHIP-CDN_train.json'),
                    '2940ff04e91f52722f10010e5cbc1f18', ['text']
                ],
                'dev': [
                    os.path.join('CHIP-CDN', 'CHIP-CDN_dev.json'),
                    'c718cdd36f913deb11a1a0b46de51015', ['text']
                ],
                'test': [
                    os.path.join('CHIP-CDN', 'CHIP-CDN_test.json'),
                    '8dbe229a23af30bd7c3c5bdcdf156314', ['text']
                ]
            },
            'labels': '国际疾病分类 ICD-10北京临床版v601.xlsx'
        },
        'CHIP-CTC': {
            'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-CTC.zip',
            'md5': '43d804211d46f9374c18ab13d6984f29',
            'splits': {
                'train': [
                    os.path.join('CHIP-CTC', 'CHIP-CTC_train.json'),
                    '098ac22cafe7446393d941612f906531', ['text']
                ],
                'dev': [
                    os.path.join('CHIP-CTC', 'CHIP-CTC_dev.json'),
                    'b48d52fd686bea286de1a3b123398483', ['text']
                ],
                'test': [
                    os.path.join('CHIP-CTC', 'CHIP-CTC_test.json'),
                    '6a5f0f20f8f85f727d9ef1ea09f939d9', ['text']
                ]
            },
            'labels': 'category.xlsx'
        },
        'CHIP-STS': {
            'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-STS.zip',
            'md5': '4d4db5ef14336e3179e4e1f3c1cc2621',
            'splits': {
                'train': [
                    os.path.join('CHIP-STS', 'CHIP-STS_train.json'),
                    'c6150e2628f107cf2657feb4ed2ba65b',
                    ['text1', 'text2']
                ],
                'dev': [
                    os.path.join('CHIP-STS', 'CHIP-STS_dev.json'),
                    '2813ecc0222ef8e4612296776e54639d',
                    ['text1', 'text2']
                ],
                'test': [
                    os.path.join('CHIP-STS', 'CHIP-STS_test.json'),
                    '44394681097024aa922e4e33fa651360',
                    ['text1', 'text2']
                ]
            },
            'labels': ['0', '1']
        },
        'KUAKE-QIC': {
            'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/KUAKE-QIC.zip',
            'md5': '7661e3a6b5daf4ee025ba407669788d8',
            'splits': {
                'train': [
                    os.path.join('KUAKE-QIC', 'KUAKE-QIC_train.json'),
                    'fc7e359decfcf7b1316e7833acc97b8a', ['query']
                ],
                'dev': [
                    os.path.join('KUAKE-QIC', 'KUAKE-QIC_dev.json'),
                    '2fd1f4131916239d89b213cc9860c1c6', ['query']
                ],
                'test': [
                    os.path.join('KUAKE-QIC', 'KUAKE-QIC_test.json'),
                    '337dc7f3cdc77b1a21b534ecb3142a6b', ['query']
                ]
            },
            'labels': ['病情诊断', '治疗方案', '病因分析', '指标解读',
                       '就医建议', '疾病表述', '后果表述',
                       '注意事项', '功效作用', '医疗费用', '其他']
        },
        'KUAKE-QTR': {
            'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/KUAKE-QTR.zip',
            'md5': 'a59686c2b489ac64ff6f0f029c1df068',
            'splits': {
                'train': [
                    os.path.join('KUAKE-QTR', 'KUAKE-QTR_train.json'),
                    '7197f9ca963f337fc81ce6c8a1c97dc4', ['query', 'title']
                ],
                'dev': [
                    os.path.join('KUAKE-QTR', 'KUAKE-QTR_dev.json'),
                    'e6c480aa46ef2dd04290afe165cdfa9a', ['query', 'title']
                ],
                'test': [
                    os.path.join('KUAKE-QTR', 'KUAKE-QTR_test.json'),
                    '4ccfcf83eef0563b16914d5455d225a5', ['query', 'title']
                ]
            },
            'labels': ['0', '1', '2', '3']
        },
        'KUAKE-QQR': {
            'url': 'https://paddlenlp.bj.bcebos.com/datasets/cblue/KUAKE-QQR.zip',
            'md5': 'b7fdeed0ae56e450d7cf3aa7c0b19e20',
            'splits': {
                'train': [
                    os.path.join('KUAKE-QQR', 'KUAKE-QQR_train.json'),
                    'f667e31610acf3f107369310b78d56a9',
                    ('query1', 'query2')
                ],
                'dev': [
                    os.path.join('KUAKE-QQR', 'KUAKE-QQR_dev.json'),
                    '597354382a806b8168a705584f4f6887',
                    ('query1', 'query2')
                ],
                'test': [
                    os.path.join('KUAKE-QQR', 'KUAKE-QQR_test.json'),
                    '2d257135c6e1651d24a84496dd50c658',
                    ('query1', 'query2')
                ]
            },
            'labels': ['0', '1', '2']
        }
    }

    def _get_data(self, mode, **kwargs):
        """
        Returns the local path of the data file for split `mode`.

        The path is `DATA_HOME/<class name>/<relative split path>`. When that
        file is missing, or its md5 differs from the configured one,
        `get_path_from_url` is called with the task archive url, the root
        directory and the archive md5 before the path is returned.

        Args:
            mode (str): Split name; a key of the task's 'splits' config.
            **kwargs: Accepted and ignored.

        Returns:
            str: Full path of the split file.
        """
        builder_config = self.BUILDER_CONFIGS[self.name]
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, data_hash, _ = builder_config['splits'][mode]
        fullname = os.path.join(default_root, filename)

        missing = not os.path.exists(fullname)
        # An empty/None hash disables the checksum comparison.
        if missing or (data_hash and md5file(fullname) != data_hash):
            get_path_from_url(builder_config['url'], default_root,
                              builder_config['md5'])
        return fullname

    def _read(self, filename, split):
        """
        Yields the records of `filename` as dicts, one per example.

        For every task except CMeIE and CMeEE the file is loaded as a JSON
        list and each record is modified in place before being yielded:
        a non-empty 'normalized_result' field is replaced by 'labels'
        (the value split on '##', with surrounding '"' stripped from each
        part), and the configured input keys are renamed to 'text_a' and,
        when there are two of them, 'text_b'.

        Args:
            filename (str): Path of the JSON file to read.
            split (str): Split name used to look up the input keys.
        """
        if self.name in ('CMeIE', 'CMeEE'):
            # Reading is not implemented for these two tasks: nothing is
            # yielded for them.
            return

        _, _, input_keys = self.BUILDER_CONFIGS[self.name]['splits'][split]
        with open(filename, 'r', encoding='utf-8') as f:
            data_list = json.load(f)

        for data in data_list:
            if data.get('normalized_result'):
                normalized = data.pop('normalized_result')
                data['labels'] = [x.strip('"') for x in normalized.split('##')]
            data['text_a'] = data.pop(input_keys[0])
            if len(input_keys) > 1:
                data['text_b'] = data.pop(input_keys[1])
            yield data

    def get_labels(self):
        """
        Returns labels of the CBLUE task.

        When the configured 'labels' entry is a list it is returned as is.
        When it is a file name, the labels are loaded from
        `DATA_HOME/<class name>/<task name>/`:

        - CHIP-CDN: column 1 of the first '.xlsx' file found in that
          directory, read without a header row (returned as `.values`).
        - CHIP-CTC: the 'Label Name' column of the configured excel file
          (returned as `.values`).
        - CMeIE: a list with one parsed JSON object per non-blank line of
          the configured schema file.
        """
        labels = self.BUILDER_CONFIGS[self.name]['labels']
        if not isinstance(labels, str):
            return labels

        label_file = labels
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        label_dir = os.path.join(default_root, self.name)

        if self.name == 'CHIP-CDN':
            # The excel file is located by extension instead of by the
            # configured name.
            # NOTE(review): presumably because the non-ASCII file name may
            # not survive archive extraction unchanged -- confirm.
            name = [x for x in os.listdir(label_dir) if x.endswith('.xlsx')][0]
            table = pd.read_excel(os.path.join(label_dir, name), header=None)
            return table[1].values

        if self.name == 'CHIP-CTC':
            table = pd.read_excel(os.path.join(label_dir, label_file))
            return table['Label Name'].values

        if self.name == 'CMeIE':
            # Keep the file name in `label_file`: rebinding `labels` to the
            # result list before building the path would pass a list to
            # os.path.join and raise TypeError.
            schema = []
            with open(os.path.join(label_dir, label_file), 'r',
                      encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        schema.append(json.loads(line))
            return schema

        # No loader is defined for any other file-backed label set.
        return None
0 commit comments