-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Best Practice
骑马小猫 edited this page Nov 3, 2022
·
7 revisions
这里将介绍使用 PaddleNLP 过程中的最佳实践方法,形式包括但不限于代码片段和 GitHub 仓库,也欢迎大家贡献自己的实践方法。
- 文本处理
from paddlenlp.transformers import AutoTokenizer

# Padding strategies: the same sentence is encoded twice with the same
# `max_length`, differing only in the `padding` argument.
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-nano-zh")
text = "您好,欢迎使用PaddleNLP"

# NOTE: the keyword is `return_token_type_ids` (plural). A misspelled name is
# not a declared parameter, so the request would not take effect.

# padding=True: the single input keeps its natural length of 13 ids here,
# i.e. nothing is padded up to max_length.
result = tokenizer(
    text,
    max_length=30,
    padding=True,
    return_token_type_ids=True,
    return_tensors="pd",
)
assert result["input_ids"].shape == [1, 13]

# padding="max_length": the output is padded up to max_length (30).
result = tokenizer(
    text,
    max_length=30,
    padding="max_length",
    return_token_type_ids=True,
    return_tensors="pd",
)
assert result["input_ids"].shape == [1, 30]
- 空格处理
from paddlenlp.transformers import AutoTokenizer

# Whitespace handling: shows how to keep a space as its own position in the
# encoded output instead of having it dropped during tokenization.
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-nano-zh")

# Encoding the raw string yields only two ids: the space between the two
# characters does not produce a token.
result = tokenizer("您 好", return_tensors="pd", add_special_tokens=False)
assert result["input_ids"].shape == [1, 2]

# Passing the text as a list of single characters (including the space) with
# is_split_into_words=False yields three ids, one per list element, so the
# space keeps its own position.
# NOTE: the keyword is `is_split_into_words`; `split_into_words` is not a
# declared tokenizer parameter.
result = tokenizer.encode(
    list("您 好"),
    padding=True,
    is_split_into_words=False,
    return_tensors="pd",
    add_special_tokens=False,
)
assert result["input_ids"].shape == [1, 3]
# State-dict key checking: compare a base BERT model against the state dict
# of a downstream (token-classification) model built from the same config.
from paddlenlp.transformers import (
    BertConfig,
    BertModel,
    BertForTokenClassification,
)
from paddlenlp.utils.converter import Converter, StateDictKeysChecker

# Both models share one default configuration so their backbones line up.
config = BertConfig()
bert_model = BertModel(config)
bert_for_token_model = BertForTokenClassification(config)

# base-downstream: the base model is checked against the downstream weights.
downstream_state_dict = Converter.get_model_state_dict(bert_for_token_model)
checker = StateDictKeysChecker(bert_model, downstream_state_dict)

# Two keys in the downstream state dict are not expected by the base model
# (presumably the classification head's weight and bias; confirm if needed).
unexpected_keys = checker.get_unexpected_keys()
assert len(unexpected_keys) == 2

# The checker reports no mismatched keys for this pair.
mismatched_keys = checker.get_mismatched_keys()
assert len(mismatched_keys) == 0