Commit 7b455cc

gongenlei and Zeyu Chen authored

feat: add conll2002 dataset (#1561)
Co-authored-by: Zeyu Chen <[email protected]>

1 parent 22a497c commit 7b455cc

File tree

3 files changed, +172 -0 lines changed

docs/data_prepare/dataset_list.md

Lines changed: 1 addition & 0 deletions

@@ -65,6 +65,7 @@ PaddleNLP provides quick-loading APIs for the following datasets; when using them, please …
 | ---- | --------- | ------ |
 | [MSRA_NER](https://github.com/lemonhu/NER-BERT-pytorch/tree/master/data/msra) | MSRA named entity recognition dataset | `paddlenlp.datasets.load_dataset('msra_ner')` |
 | [People's Daily](https://github.com/OYE93/Chinese-NLP-Corpus/tree/master/NER/People's%20Daily) | People's Daily named entity recognition dataset | `paddlenlp.datasets.load_dataset('peoples_daily_ner')` |
+| [CoNLL-2002](https://www.aclweb.org/anthology/W02-2024/) | Spanish and Dutch named entity recognition dataset | `paddlenlp.datasets.load_dataset('conll2002', 'es')` |
 
 
 ## Machine Translation
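
For reference, a minimal usage sketch of the new entry. This assumes a PaddleNLP version containing this commit; the `splits` argument follows the convention used by other PaddleNLP dataset builders, and the second positional argument selects the language configuration ('es' or 'nl', per the builder below).

from paddlenlp.datasets import load_dataset

# Load the Spanish ('es') configuration added by this commit; 'nl' selects Dutch.
train_ds, dev_ds, test_ds = load_dataset(
    'conll2002', 'es', splits=('train', 'dev', 'test'))

# Each example is a dict with 'tokens', 'ner_tags' and 'pos_tags' lists,
# as yielded by Conll2002._read (see paddlenlp/datasets/conll2002.py below).
print(train_ds[0])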

paddlenlp/datasets/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -39,3 +39,4 @@
 from .nlpcc13_evsam05_hit import *
 from .xnli_cn import *
 from .xnli import *
+from .conll2002 import *

paddlenlp/datasets/conll2002.py

Lines changed: 170 additions & 0 deletions

@@ -0,0 +1,170 @@ (new file)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import os

from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url
from paddlenlp.utils.env import DATA_HOME
from . import DatasetBuilder


class Conll2002(DatasetBuilder):
    """
    Named entities are phrases that contain the names of persons, organizations,
    locations, times and quantities. Example: [PER Wolff], currently a journalist
    in [LOC Argentina], played with [PER Del Bosque] in the final years of the
    seventies in [ORG Real Madrid].
    The shared task of CoNLL-2002 concerns language-independent named entity
    recognition. We will concentrate on four types of named entities: persons,
    locations, organizations and names of miscellaneous entities that do not
    belong to the previous three groups. The participants of the shared task
    will be offered training and test data for at least two languages. They
    will use the data for developing a named-entity recognition system that
    includes a machine learning component. Information sources other than the
    training data may be used in this shared task. We are especially interested
    in methods that can use additional unannotated data for improving their
    performance (for example co-training).
    For more details see https://www.clips.uantwerpen.be/conll2002/ner/
    and https://www.aclweb.org/anthology/W02-2024/
    """
    META_INFO = collections.namedtuple('META_INFO', ('file', 'url', 'md5'))
    BASE_URL = 'https://bj.bcebos.com/paddlenlp/datasets/conll2002/'
    BUILDER_CONFIGS = {
        'es': {
            'splits': {
                'train': META_INFO('esp.train', BASE_URL + 'esp.train',
                                   'c8c6b342371b9de2f83a93767d352c17'),
                'dev': META_INFO('esp.testa', BASE_URL + 'esp.testa',
                                 'de0578160dde26ec68cc580595587dde'),
                'test': META_INFO('esp.testb', BASE_URL + 'esp.testb',
                                  'c8d35f340685a2ce6559ee90d78f9e37')
            },
            'pos_tags': [
                "AO", "AQ", "CC", "CS", "DA", "DE", "DD", "DI", "DN", "DP",
                "DT", "Faa", "Fat", "Fc", "Fd", "Fe", "Fg", "Fh", "Fia",
                "Fit", "Fp", "Fpa", "Fpt", "Fs", "Ft", "Fx", "Fz", "I",
                "NC", "NP", "P0", "PD", "PI", "PN", "PP", "PR", "PT", "PX",
                "RG", "RN", "SP", "VAI", "VAM", "VAN", "VAP", "VAS", "VMG",
                "VMI", "VMM", "VMN", "VMP", "VMS", "VSG", "VSI", "VSM",
                "VSN", "VSP", "VSS", "Y", "Z"
            ]
        },
        'nl': {
            'splits': {
                'train': META_INFO('ned.train', BASE_URL + 'ned.train',
                                   'b6189d04eb34597d2a98ca5cec477605'),
                'dev': META_INFO('ned.testa', BASE_URL + 'ned.testa',
                                 '626900497823fdbc4f84335518cb85ce'),
                'test': META_INFO('ned.testb', BASE_URL + 'ned.testb',
                                  'c37de92da20c68c6418a73dd42e322dc')
            },
            'pos_tags': [
                "Adj", "Adv", "Art", "Conj", "Int", "Misc", "N", "Num", "Prep",
                "Pron", "Punc", "V"
            ]
        }
    }

    def _get_data(self, mode, **kwargs):
        builder_config = self.BUILDER_CONFIGS[self.name]
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, url, data_hash = builder_config['splits'][mode]
        fullname = os.path.join(default_root, filename)
        if not os.path.exists(fullname) or (data_hash and
                                            not md5file(fullname) == data_hash):
            get_path_from_url(url, default_root, data_hash)
        return fullname

    def _read(self, filename, *args):
        with open(filename, 'r', encoding="utf-8") as f:
            tokens = []
            ner_tags = []
            pos_tags = []
            for line in f.readlines():
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if tokens:
                        yield {
                            "tokens": tokens,
                            "ner_tags": ner_tags,
                            "pos_tags": pos_tags
                        }
                        tokens = []
                        ner_tags = []
                        pos_tags = []
                else:
                    # conll2002 tokens are space separated
                    splits = line.split(" ")
                    tokens.append(splits[0])
                    pos_tags.append(splits[1])
                    ner_tags.append(splits[2].rstrip())
            # last example
            yield {"tokens": tokens, "ner_tags": ner_tags, "pos_tags": pos_tags}

    def get_labels(self):
        """
        Returns labels of ner tags and pos tags.
        """
        return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], \
            self.BUILDER_CONFIGS[self.name]['pos_tags']
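
To make the sentence-splitting in `_read` concrete, here is a small self-contained sketch that reruns the same logic on an inline snippet in the three-column CoNLL-2002 layout (token, POS tag, NER tag per line; blank lines separate sentences). The input lines and their tag values are made up for illustration only.

# Illustrative input in CoNLL-2002 layout: "token POS NER" per line,
# blank line between sentences (values here are invented, not from the corpus).
lines = [
    "Wolff NP B-PER\n",
    "jugó VMI O\n",
    "\n",
    "Argentina NP B-LOC\n",
]

tokens, pos_tags, ner_tags = [], [], []
for line in lines:
    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
        # A blank (or -DOCSTART-) line closes the current sentence.
        if tokens:
            print({"tokens": tokens, "ner_tags": ner_tags, "pos_tags": pos_tags})
            tokens, pos_tags, ner_tags = [], [], []
    else:
        splits = line.split(" ")             # columns are space separated
        tokens.append(splits[0])
        pos_tags.append(splits[1])
        ner_tags.append(splits[2].rstrip())  # strip the trailing newline
if tokens:  # flush the final sentence, mirroring the "last example" yield above
    print({"tokens": tokens, "ner_tags": ner_tags, "pos_tags": pos_tags})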
