|
| 1 | +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +import collections |
| 16 | +import os |
| 17 | + |
| 18 | +from paddle.dataset.common import md5file |
| 19 | +from paddle.utils.download import get_path_from_url |
| 20 | +from paddlenlp.utils.env import DATA_HOME |
| 21 | +from . import DatasetBuilder |
| 22 | + |
| 23 | + |
class Conll2002(DatasetBuilder):
    """
    Dataset builder for the CoNLL-2002 named entity recognition shared task.

    Two configurations are available, selected through ``self.name``:
    ``'es'`` (files ``esp.*``) and ``'nl'`` (files ``ned.*``). Each example
    produced by this builder carries the tokens of one sentence together with
    their part-of-speech tags and named-entity tags.

    Named entities are phrases that contain the names of persons, organizations,
    locations, times and quantities. Example: [PER Wolff] , currently a journalist
    in [LOC Argentina] , played with [PER Del Bosque] in the final years of the
    seventies in [ORG Real Madrid] .
    The shared task of CoNLL-2002 concerns language-independent named entity
    recognition. It concentrates on four types of named entities: persons,
    locations, organizations and names of miscellaneous entities that do not
    belong to the previous three groups. The participants of the shared task
    were offered training and test data for at least two languages, to be used
    for developing a named-entity recognition system that includes a machine
    learning component. Information sources other than the training data may be
    used in this shared task; methods that can use additional unannotated data
    for improving their performance (for example co-training) are of special
    interest.
    For more details see https://www.clips.uantwerpen.be/conll2002/ner/
    and https://www.aclweb.org/anthology/W02-2024/
    """
    # Description of one downloadable split: local file name, download URL and
    # the expected MD5 checksum of the file.
    META_INFO = collections.namedtuple('META_INFO', ('file', 'url', 'md5'))
    # Common URL prefix of every split file.
    BASE_URL = 'https://bj.bcebos.com/paddlenlp/datasets/conll2002/'
    # Per-configuration settings: the available splits and the part-of-speech
    # tag inventory returned by `get_labels`.
    BUILDER_CONFIGS = {
        'es': {
            'splits': {
                'train': META_INFO('esp.train', BASE_URL + 'esp.train',
                                   'c8c6b342371b9de2f83a93767d352c17'),
                'dev': META_INFO('esp.testa', BASE_URL + 'esp.testa',
                                 'de0578160dde26ec68cc580595587dde'),
                'test': META_INFO('esp.testb', BASE_URL + 'esp.testb',
                                  'c8d35f340685a2ce6559ee90d78f9e37')
            },
            'pos_tags': [
                "AO",
                "AQ",
                "CC",
                "CS",
                "DA",
                "DE",
                "DD",
                "DI",
                "DN",
                "DP",
                "DT",
                "Faa",
                "Fat",
                "Fc",
                "Fd",
                "Fe",
                "Fg",
                "Fh",
                "Fia",
                "Fit",
                "Fp",
                "Fpa",
                "Fpt",
                "Fs",
                "Ft",
                "Fx",
                "Fz",
                "I",
                "NC",
                "NP",
                "P0",
                "PD",
                "PI",
                "PN",
                "PP",
                "PR",
                "PT",
                "PX",
                "RG",
                "RN",
                "SP",
                "VAI",
                "VAM",
                "VAN",
                "VAP",
                "VAS",
                "VMG",
                "VMI",
                "VMM",
                "VMN",
                "VMP",
                "VMS",
                "VSG",
                "VSI",
                "VSM",
                "VSN",
                "VSP",
                "VSS",
                "Y",
                "Z",
            ]
        },
        'nl': {
            'splits': {
                'train': META_INFO('ned.train', BASE_URL + 'ned.train',
                                   'b6189d04eb34597d2a98ca5cec477605'),
                'dev': META_INFO('ned.testa', BASE_URL + 'ned.testa',
                                 '626900497823fdbc4f84335518cb85ce'),
                'test': META_INFO('ned.testb', BASE_URL + 'ned.testb',
                                  'c37de92da20c68c6418a73dd42e322dc')
            },
            'pos_tags': [
                "Adj", "Adv", "Art", "Conj", "Int", "Misc", "N", "Num", "Prep",
                "Pron", "Punc", "V"
            ]
        }
    }

    def _get_data(self, mode, **kwargs):
        """
        Returns the local path of the file for split `mode`, downloading it
        first when needed.

        Args:
            mode (str): Key of the split in the selected configuration's
                'splits' mapping ('train', 'dev' or 'test').
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            str: Path of the split file under `DATA_HOME/<class name>`.

        Raises:
            KeyError: If `self.name` is not a key of `BUILDER_CONFIGS` or
                `mode` is not one of its splits.
        """
        builder_config = self.BUILDER_CONFIGS[self.name]
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        meta = builder_config['splits'][mode]
        fullname = os.path.join(default_root, meta.file)
        # Fetch the file when it is missing, or when a checksum is configured
        # and the file on disk does not match it.
        if not os.path.exists(fullname) or (meta.md5 and
                                            md5file(fullname) != meta.md5):
            get_path_from_url(meta.url, default_root, meta.md5)
        return fullname

    def _read(self, filename, *args):
        """
        Reads a CoNLL-2002 file and yields one example per sentence.

        Each data line holds three space separated fields: token,
        part-of-speech tag and named-entity tag. Blank (or whitespace-only)
        lines and lines starting with "-DOCSTART-" separate sentences.

        Args:
            filename (str): Path of the UTF-8 encoded file to read.
            *args: Accepted for interface compatibility; not used here.

        Yields:
            dict: A dict with the keys "tokens", "ner_tags" and "pos_tags",
                each mapping to a list of strings of equal length. Empty
                sentences are never yielded.

        Raises:
            IndexError: If a data line has fewer than three fields.
        """
        with open(filename, 'r', encoding="utf-8") as f:
            tokens = []
            ner_tags = []
            pos_tags = []
            # Iterate over the file object directly so lines are read lazily
            # instead of loading the whole file into memory.
            for line in f:
                if line.startswith("-DOCSTART-") or not line.strip():
                    # Sentence boundary: emit what has been collected so far.
                    if tokens:
                        yield {
                            "tokens": tokens,
                            "ner_tags": ner_tags,
                            "pos_tags": pos_tags
                        }
                        tokens = []
                        ner_tags = []
                        pos_tags = []
                    continue
                # conll2002 tokens are space separated
                splits = line.split(" ")
                tokens.append(splits[0])
                pos_tags.append(splits[1])
                # The last field still carries the line terminator.
                ner_tags.append(splits[2].rstrip())
            # Last example: only present when the file does not end with a
            # separator line; skip it otherwise to avoid an empty example.
            if tokens:
                yield {
                    "tokens": tokens,
                    "ner_tags": ner_tags,
                    "pos_tags": pos_tags
                }

    def get_labels(self):
        """
        Returns labels of ner tags and pos tags.

        Returns:
            tuple: A pair `(ner_labels, pos_labels)`. `ner_labels` is the list
                of named-entity tags; `pos_labels` is the 'pos_tags' list of
                the configuration selected by `self.name`.
        """
        ner_labels = [
            "O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC",
            "B-MISC", "I-MISC"
        ]
        return ner_labels, self.BUILDER_CONFIGS[self.name]['pos_tags']
0 commit comments