Commit 84e8026

Backup DuConv for hf dataset (#2924)
* backup duconv
* update license
* typo
1 parent 8292c71

4 files changed: +164 -14 lines changed

examples/dialogue/unified_transformer/finetune.py

Lines changed: 2 additions & 2 deletions
@@ -10,8 +10,8 @@
 from paddle.optimizer.lr import NoamDecay
 from paddle.optimizer import AdamW
 
-from paddlenlp.datasets import load_dataset
 from paddlenlp.transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer
+from datasets import load_dataset
 
 from utils import print_args, set_seed, create_data_loader
 
@@ -67,7 +67,7 @@ def train(args):
     if world_size > 1:
         model = paddle.DataParallel(model)
 
-    train_ds, dev_ds = load_dataset('duconv', splits=('train', 'dev'))
+    train_ds, dev_ds = load_dataset('duconv', split=('train', 'dev'))
     train_ds, train_data_loader = create_data_loader(train_ds, tokenizer, args,
                                                      'train')
     dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args, 'dev')
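For context, the switch here is from `paddlenlp.datasets.load_dataset` (keyword `splits`) to Hugging Face `datasets.load_dataset` (keyword `split`). With the builder script added in this commit saved as `duconv.py` in the working directory, passing a sequence of split names returns one `Dataset` per name; a minimal sketch (the printed output is illustrative, not from this commit):

    from datasets import load_dataset

    # 'duconv' resolves to the local duconv.py builder script added below.
    # A sequence of split names yields one Dataset object per name.
    train_ds, dev_ds = load_dataset('duconv', split=['train', 'dev'])
    print(train_ds.column_names)
    # ['id', 'goal', 'knowledge', 'conversation', 'history', 'response']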

examples/dialogue/unified_transformer/infer.py

Lines changed: 2 additions & 2 deletions
@@ -2,9 +2,9 @@
 import argparse
 
 import paddle
-from paddlenlp.datasets import load_dataset
 from paddlenlp.transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer
 from paddlenlp.metrics import BLEU, Distinct
+from datasets import load_dataset
 
 from utils import print_args, set_seed, create_data_loader, select_response
 
@@ -76,7 +76,7 @@ def infer(args):
     tokenizer = UnifiedTransformerTokenizer.from_pretrained(
         args.model_name_or_path)
 
-    test_ds = load_dataset('duconv', splits='test_1')
+    test_ds = load_dataset('duconv', split='test_1')
     test_ds, test_data_loader = create_data_loader(test_ds, tokenizer, args,
                                                    'test')
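Note that `'test_1'` is not a canonical split name; it works because the builder script below declares a `SplitGenerator` named exactly `test_1` (and likewise `test_2`). A single split name returns a single `Dataset`, e.g.:

    from datasets import load_dataset

    # Custom split names are allowed as long as the builder defines them.
    test_ds = load_dataset('duconv', split='test_1')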

examples/dialogue/unified_transformer/utils.py

Lines changed: 25 additions & 10 deletions
@@ -34,16 +34,24 @@ def preprocess_examples(examples, mode='train'):
     """
     if mode == 'test':
         return examples
-    new_examples = []
-    for example in examples:
-        conversation = example['conversation']
+    new_examples = {}
+    goal = []
+    knowledge = []
+    history = []
+    response = []
+
+    conv = examples['conversation']
+    for index, conversation in enumerate(conv):
         for i in range(0, len(conversation), 2):
-            new_examples.append({
-                'goal': example['goal'],
-                'knowledge': example['knowledge'],
-                'history': conversation[:i],
-                'response': conversation[i]
-            })
+            goal.append(examples['goal'][index])
+            knowledge.append(examples['knowledge'][index])
+            history.append(conversation[:i])
+            response.append(conversation[i])
+    new_examples["goal"] = goal
+    new_examples["knowledge"] = knowledge
+    new_examples["history"] = history
+    new_examples["response"] = response
+
     return new_examples
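In the batched form, `examples` arrives as a dict of columns and must be returned as a dict of equal-length columns; each conversation is unrolled so that every even-indexed utterance becomes a response and the turns before it become the history. A self-contained sketch of the same transform with a made-up single-dialogue batch (hypothetical data, not from DuConv):

    # Toy batch in the column-oriented layout a batched map receives.
    examples = {
        'goal': [[['chat', 'movie']]],
        'knowledge': [[['movie', 'rating', '8.0']]],
        'conversation': [['hi', 'hello', 'seen it?', 'yes']],
    }

    goal, knowledge, history, response = [], [], [], []
    for index, conversation in enumerate(examples['conversation']):
        for i in range(0, len(conversation), 2):
            goal.append(examples['goal'][index])
            knowledge.append(examples['knowledge'][index])
            history.append(conversation[:i])  # turns before the response
            response.append(conversation[i])  # every even-indexed turn
    # history  -> [[], ['hi', 'hello']]
    # response -> ['hi', 'seen it?']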

@@ -145,7 +153,14 @@ def create_data_loader(dataset, tokenizer, args, mode):
                           max_response_len=args.max_response_len,
                           max_knowledge_len=args.max_knowledge_len,
                           mode=mode)
-    dataset = dataset.map(trans_func1, batched=True).map(trans_func2, lazy=True)
+    remove_columns = None
+    if mode in ["train", "dev"]:
+        remove_columns = ["id", "conversation"]
+
+    dataset = dataset.map(trans_func1,
+                          batched=True,
+                          batch_size=None,
+                          remove_columns=remove_columns).map(trans_func2)
     if mode == 'train':
         batch_sampler = DistributedBatchSampler(dataset,
                                                 batch_size=args.batch_size,
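Two details of the new `map` call matter: `batch_size=None` hands the whole split to the batched function as a single batch, and `remove_columns` drops `id` and `conversation` for train/dev, which is needed because a batched function that changes the number of rows cannot keep input columns of the old length (in test mode the batch passes through unchanged, so nothing is removed). A hedged sketch of the equivalent standalone call:

    # Assumes `dataset` is a datasets.Dataset and preprocess_examples is the
    # batched flattener shown above; column names follow this diff.
    flat = dataset.map(preprocess_examples,
                       batched=True,
                       batch_size=None,  # one batch == the entire split
                       remove_columns=['id', 'conversation'])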
examples/dialogue/unified_transformer/duconv.py

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
+# coding=utf-8
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+
+import json
+import os
+
+import datasets
+from datasets.tasks import QuestionAnsweringExtractive
+
+logger = datasets.logging.get_logger(__name__)
+
+_DESCRIPTION = """\
+DuConv is a Chinese conversation \
+dataset, designed to evaluate dialogue models.
+"""
+
+_URL = "https://bj.bcebos.com/paddlenlp/datasets/DuConv.zip"
+
+
+class DuconvConfig(datasets.BuilderConfig):
+    """BuilderConfig for Duconv."""
+
+    def __init__(self, **kwargs):
+        """BuilderConfig for Duconv.
+
+        Args:
+          **kwargs: keyword arguments forwarded to super.
+        """
+        super(DuconvConfig, self).__init__(**kwargs)
+
+
+class Duconv(datasets.GeneratorBasedBuilder):
+    BUILDER_CONFIGS = [
+        DuconvConfig(
+            name="DuConv",
+            version=datasets.Version("1.0.0", ""),
+            description=_DESCRIPTION,
+        ),
+    ]
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features({
+                "id":
+                datasets.Value("string"),
+                "goal":
+                datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+                "knowledge":
+                datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+                "conversation":
+                datasets.Sequence(datasets.Value("string")),
+                "history":
+                datasets.Sequence(datasets.Value("string")),
+                "response":
+                datasets.Value("string"),
+            }),
+            # No default supervised_keys (as we have to pass both question
+            # and context as input).
+            supervised_keys=None,
+            homepage="https://arxiv.org/pdf/1906.05572.pdf",
+        )
+
+    def _split_generators(self, dl_manager):
+        dl_dir = dl_manager.download_and_extract(_URL)
+
+        return [
+            datasets.SplitGenerator(name="train",
+                                    gen_kwargs={
+                                        "filepath":
+                                        os.path.join(dl_dir, 'DuConv',
+                                                     'train.txt'),
+                                    }),
+            datasets.SplitGenerator(name="dev",
+                                    gen_kwargs={
+                                        "filepath":
+                                        os.path.join(dl_dir, 'DuConv',
+                                                     'dev.txt'),
+                                    }),
+            datasets.SplitGenerator(name="test_1",
+                                    gen_kwargs={
+                                        "filepath":
+                                        os.path.join(dl_dir, 'DuConv',
+                                                     'test_1.txt'),
+                                    }),
+            datasets.SplitGenerator(name="test_2",
+                                    gen_kwargs={
+                                        "filepath":
+                                        os.path.join(dl_dir, 'DuConv',
+                                                     'test_2.txt'),
+                                    }),
+        ]
+
+    def _generate_examples(self, filepath):
+        """This function returns the examples in the raw (text) form."""
+        logger.info("generating examples from = %s", filepath)
+        key = 0
+        with open(filepath, 'r', encoding="utf-8") as fin:
+            for line in fin:
+                duconv = json.loads(line)
+
+                goal = duconv["goal"] if "goal" in duconv.keys() else [[]]
+                knowledge = duconv["knowledge"] if "knowledge" in duconv.keys(
+                ) else [[]]
+                conversation = duconv[
+                    "conversation"] if "conversation" in duconv.keys() else []
+                history = duconv["history"] if "history" in duconv.keys(
+                ) else []
+                response = duconv["response"] if "response" in duconv.keys(
+                ) else ""
+
+                yield key, {
+                    "id": str(key),
+                    "goal": goal,
+                    "knowledge": knowledge,
+                    "conversation": conversation,
+                    "history": history,
+                    "response": response,
+                }
+                key += 1
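With the script above saved as `duconv.py` next to `finetune.py` and `infer.py`, the loaders shown earlier work end to end: the first call downloads and extracts `DuConv.zip`, runs `_generate_examples` over each `*.txt` file of JSON lines, and caches the result as Arrow files. A quick smoke test (output values depend on the downloaded data):

    from datasets import load_dataset

    ds = load_dataset('duconv', split='train')
    print(ds.features)  # id, goal, knowledge, conversation, history, response
    print(ds[0]['conversation'][:2])  # first two utterances of the first dialogue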
