
Commit 6308169

[prompt] add maskedlmverbalizer & fix collate_fn (#3889)
1 parent 930a986 commit 6308169

3 files changed: +91 additions, -11 deletions


docs/advanced_guide/prompt.md

Lines changed: 17 additions & 1 deletion

@@ -355,7 +355,7 @@ template = AutoTemplate.create_from(prompt="{'prefix': None, 'encoder': 'mlp', '

 ### Discrete label word mapping

-``ManualVerbalizer`` supports building the label word mapping for ``{'mask'}``, supports multiple ``{'mask'}`` tokens, and acts directly on the ``AutoMaskedLM`` model structure. When the prediction word for a label is longer than ``1`` token, the mean is taken by default.
+``ManualVerbalizer`` supports building the label word mapping for ``{'mask'}``; a single label may map to multiple words of different lengths, and the verbalizer acts directly on the ``AutoMaskedLM`` model structure. When the prediction word for a label is longer than ``1`` token, the mean is taken by default; when a label corresponds to multiple `{'mask'}` tokens, the default behavior is equivalent to using a single `{'mask'}`.

 **API usage**

@@ -373,6 +373,22 @@ verbalizer = ManualVerbalizer(tokenizer=tokenizer,
 - ``label_words`` : the mapping dictionary from original labels to prediction words.
 - ``tokenizer`` : the tokenizer of the pretrained model, used to encode the prediction words.

+``MaskedLMVerbalizer`` also supports building the label word mapping for ``{'mask'}``. The mapped word corresponds token by token to the `{'mask'}` tokens in the template, so the length of the mapped word should match the number of `{'mask'}` tokens. If a label maps to multiple words in the defined mapping, only the first mapped word takes effect. In a custom `compute_metric` function, call `verbalizer.aggregate_multiple_mask` first to merge the multiple `{'mask'}` predictions before computing the metric; the product aggregation is used by default.
+
+**API usage**
+
+```python
+from paddlenlp.prompt import MaskedLMVerbalizer
+from paddlenlp.transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
+verbalizer = MaskedLMVerbalizer(tokenizer=tokenizer,
+                                label_words={'负向': '不', '正向': '很'})
+```
+
+The initialization parameters are defined as follows:
+
+- ``label_words`` : the mapping dictionary from original labels to prediction words.
+- ``tokenizer`` : the tokenizer of the pretrained model, used to encode the prediction words.
+
 ### Continuous label word mapping

 The label word mapping classifier ``SoftVerbalizer`` modifies the structure of the original ``AutoMaskedLM`` model, replacing the pretrained model's last "hidden layer to vocabulary" mapping with a "hidden layer to label" mapping. The parameters of this layer are initialized from the word embeddings of the prediction words in the label word mapping; if a prediction word is longer than ``1`` token, the mean of its token embeddings is used for initialization. The currently supported pretrained models are ``ErnieForMaskedLM``, ``BertForMaskedLM``, ``AlbertForMaskedLM``, and ``RobertaForMaskedLM``. It can be used to implement the WARP algorithm.
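
The new documentation above asks custom `compute_metric` functions to merge the multiple `{'mask'}` predictions with `verbalizer.aggregate_multiple_mask` before scoring. Below is a minimal sketch of such a callback; the `(predictions, labels)` layout of `eval_preds`, the extra `verbalizer` argument (e.g. bound via `functools.partial`), and the accuracy metric are assumptions for illustration, not part of this commit.

```python
# Hypothetical sketch of a metric callback for multiple {'mask'} tokens.
import paddle


def compute_metric(eval_preds, verbalizer):
    predictions, labels = eval_preds
    # predictions: [batch_size, num_masks, vocab_size] scores at each {'mask'}.
    predictions = paddle.to_tensor(predictions)
    # Merge the per-{'mask'} scores into one score per label; "product" is
    # the default aggregation of MaskedLMVerbalizer.aggregate_multiple_mask.
    label_scores = verbalizer.aggregate_multiple_mask(predictions, atype="product")
    # Assumes labels are class indices in the same order as the verbalizer labels.
    preds = paddle.argmax(label_scores, axis=-1).numpy()
    accuracy = float((preds == labels).mean())
    return {"accuracy": accuracy}
```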

paddlenlp/prompt/prompt_utils.py

Lines changed: 3 additions & 1 deletion

@@ -96,8 +96,10 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
                         length = len(value)
                         new_values[index][0, :length, :length] = value
                     values = new_values
-                elif key != "labels":
+                elif key in ("soft_token_ids", "encoder_ids"):
                     for index, value in enumerate(values):
                         values[index] = value + [0] * (max_length - len(value))
+                elif key != "labels":
+                    continue
                 batch[key] = self._convert_to_tensors(values)
         return batch
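
To see what the new branches do in isolation, here is a self-contained sketch of the padding logic after this fix. `pad_extra_keys` is a hypothetical helper, not the actual `PromptDataCollatorWithPadding` method, and the real collator's `attention_mask` handling is omitted: only `soft_token_ids` and `encoder_ids` are zero-padded to `max_length`, `labels` pass through unchanged, and any other unexpected key is now skipped instead of being zero-padded.

```python
# Illustrative sketch of the collate branch logic after this fix.
from typing import Any, Dict, List


def pad_extra_keys(features: List[Dict[str, Any]], max_length: int) -> Dict[str, list]:
    batch = {}
    for key in features[0]:
        values = [feature[key] for feature in features]
        if key in ("soft_token_ids", "encoder_ids"):
            # Pad variable-length id lists with zeros up to max_length.
            values = [value + [0] * (max_length - len(value)) for value in values]
        elif key != "labels":
            # Unknown keys are dropped rather than (incorrectly) zero-padded.
            continue
        batch[key] = values
    return batch


print(pad_extra_keys(
    [{"soft_token_ids": [1, 2], "labels": 0, "token_type_ids": [0, 0]}],
    max_length=4))
# -> {'soft_token_ids': [[1, 2, 0, 0]], 'labels': [0]}
```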

paddlenlp/prompt/verbalizer.py

Lines changed: 71 additions & 9 deletions

@@ -27,7 +27,9 @@
 from paddlenlp.transformers import PretrainedTokenizer, PretrainedModel
 from paddlenlp.utils.log import logger

-__all__ = ["Verbalizer", "ManualVerbalizer", "SoftVerbalizer"]
+__all__ = [
+    "Verbalizer", "ManualVerbalizer", "SoftVerbalizer", "MaskedLMVerbalizer"
+]

 # Verbalizer used to be saved in a file.
 VERBALIZER_CONFIG_FILE = "verbalizer_config.json"

@@ -263,9 +265,11 @@ class ManualVerbalizer(Verbalizer):
         An instance of PretrainedTokenizer for label word tokenization.
     """

-    def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer):
+    def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer,
+                 **kwargs):
         super(ManualVerbalizer, self).__init__(label_words=label_words,
-                                               tokenizer=tokenizer)
+                                               tokenizer=tokenizer,
+                                               **kwargs)

     def create_parameters(self):
         return None

@@ -292,10 +296,7 @@ def aggregate_multiple_mask(self, outputs: Tensor, atype: str = None):
                              "tokens.".format(atype))
         return outputs

-    def process_outputs(self,
-                        outputs: Tensor,
-                        masked_positions: Tensor = None,
-                        **kwargs):
+    def process_outputs(self, outputs: Tensor, masked_positions: Tensor = None):
         """
         Process outputs over the vocabulary, including the following steps:

@@ -364,10 +365,11 @@ class SoftVerbalizer(Verbalizer):
     LAST_LINEAR = ["AlbertForMaskedLM", "RobertaForMaskedLM"]

     def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer,
-                 model: PretrainedModel):
+                 model: PretrainedModel, **kwargs):
         super(SoftVerbalizer, self).__init__(label_words=label_words,
                                              tokenizer=tokenizer,
-                                             model=model)
+                                             model=model,
+                                             **kwargs)
         del self.model
         setattr(model, self.head_name[0], MaskedLMIdentity())

@@ -472,3 +474,63 @@ def _create_init_weight(self, weight, is_bias=False):
                                      axis=1).reshape(word_shape)
         weight = self.aggregate(weight, token_mask, aggr_type)
         return weight
+
+
+class MaskedLMVerbalizer(Verbalizer):
+    """
+    MaskedLMVerbalizer defines mapping from labels to words manually and supports
+    multiple masks corresponding to multiple tokens in words.
+
+    Args:
+        label_words (`dict`):
+            Define the mapping from labels to a single word. Only the first word
+            is used if multiple words are defined.
+        tokenizer (`PretrainedTokenizer`):
+            An instance of PretrainedTokenizer for label word tokenization.
+    """
+
+    def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer,
+                 **kwargs):
+        super(MaskedLMVerbalizer, self).__init__(label_words=label_words,
+                                                 tokenizer=tokenizer,
+                                                 **kwargs)
+
+    def create_parameters(self):
+        return None
+
+    def aggregate_multiple_mask(self, outputs: Tensor, atype: str = "product"):
+        assert outputs.ndim == 3
+        token_ids = self.token_ids[:, 0, :].T
+        batch_size, num_token, num_pred = outputs.shape
+        results = paddle.index_select(outputs[:, 0, :], token_ids[0], axis=1)
+        if atype == "first":
+            return results
+
+        for index in range(1, num_token):
+            sub_results = paddle.index_select(outputs[:, index, :],
+                                              token_ids[index],
+                                              axis=1)
+            if atype in ("mean", "sum"):
+                results += sub_results
+            elif atype == "product":
+                results *= sub_results
+            elif atype == "max":
+                results = paddle.stack([results, sub_results], axis=-1)
+                results = results.max(axis=-1)
+            else:
+                raise ValueError(
+                    "Strategy {} is not supported to aggregate multiple "
+                    "tokens.".format(atype))
+        if atype == "mean":
+            results = results / num_token
+        return results
+
+    def process_outputs(self, outputs: Tensor, masked_positions: Tensor = None):
+        if masked_positions is None:
+            return outputs
+
+        batch_size, _, num_pred = outputs.shape
+        outputs = outputs.reshape([-1, num_pred])
+        outputs = paddle.gather(outputs, masked_positions)
+        outputs = outputs.reshape([batch_size, -1, num_pred])
+        return outputs
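
A quick usage sketch of the new `aggregate_multiple_mask` on dummy scores; the random outputs and shapes are made up for illustration, and it assumes the `ernie-3.0-base-zh` tokenizer can be loaded.

```python
# Hypothetical usage sketch, not part of this commit.
import paddle
from paddlenlp.prompt import MaskedLMVerbalizer
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
verbalizer = MaskedLMVerbalizer(tokenizer=tokenizer,
                                label_words={"负向": "不", "正向": "很"})

# Fake scores over the vocabulary at each {'mask'} position:
# [batch_size, num_masks, vocab_size], here 2 examples and one {'mask'}.
outputs = paddle.rand([2, 1, tokenizer.vocab_size])
# Merge the per-{'mask'} scores into one score per label (default: product).
label_scores = verbalizer.aggregate_multiple_mask(outputs, atype="product")
print(label_scores.shape)  # [2, 2]: one score per label for each example
```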
