43
43
from paddlenlp import Taskflow
44
44
45
45
ddp = Taskflow("dependency_parsing")
46
- ddp("百度是一家高科技公司 ")
46
+ ddp("三亚是一座美丽的城市 ")
47
47
'''
48
- [{'word': ['百度 ', '是', '一家 ', '高科技 ', '公司' ], 'head': ['2', '0', '5', '5', '2' ], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}]
48
+ [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}]
49
49
'''
50
- ddp(["百度是一家高科技公司", "他送了一本书"])
51
- '''
52
- [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': ['2', '0', '5', '5', '2'], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': ['2', '0', '2', '5', '2'], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
50
+ ddp(["三亚是一座美丽的城市", "他送了一本书"])
53
51
'''
52
+ [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
53
+ '''
54
54
55
55
ddp = Taskflow("dependency_parsing", prob=True, use_pos=True)
56
- ddp("百度是一家高科技公司 ")
56
+ ddp("三亚是一座美丽的城市 ")
57
57
'''
58
- [{'word': ['百度 ', '是', '一家 ', '高科技', '公司' ], 'postag ': ['ORG', 'v', 'm', 'n', 'n' ], 'head ': ['2 ', '0 ', '5 ', '5', '2' ], 'deprel ': ['SBV ', 'HED ', 'ATT ', 'ATT', 'VOB' ], 'prob': [1.0, 1.0, 1.0, 1.0, 1.0]}]
58
+ [{'word': ['三亚', '是', '一座', '美丽的城市'], 'head': [2, 0, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'VOB'], 'postag': ['LOC', 'v', 'm', 'n'], 'prob': [1.0, 1.0, 1.0, 1.0]}]
59
59
'''
60
60
61
61
ddp = Taskflow("dependency_parsing", model="ddparser-ernie-1.0")
62
- ddp("百度是一家高科技公司 ")
62
+ ddp("三亚是一座美丽的城市 ")
63
63
'''
64
- [{'word': ['百度 ', '是', '一家 ', '高科技 ', '公司' ], 'head': ['2', '0', '5', '5', '2' ], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}]
64
+ [{'word': ['三亚 ', '是', '一座 ', '美丽 ', '的', '城市' ], 'head': [2, 0, 6, 6, 4, 2 ], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT ', 'VOB']}]
65
65
'''
66
66
67
67
ddp = Taskflow("dependency_parsing", model="ddparser-ernie-gram-zh")
68
- ddp("百度是一家高科技公司 ")
68
+ ddp("三亚是一座美丽的城市 ")
69
69
'''
70
- [{'word': ['百度 ', '是', '一家 ', '高科技 ', '公司' ], 'head': ['2', '0', '5', '5', '2' ], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}]
70
+ [{'word': ['三亚 ', '是', '一座 ', '美丽 ', '的', '城市' ], 'head': [2, 0, 6, 6, 4, 2 ], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT ', 'VOB']}]
71
71
'''
72
72
73
+ # 已分词输入
74
+ ddp = Taskflow("dependency_parsing", segmented=True)
75
+ ddp.from_segments([["三亚", "是", "一座", "美丽", "的", "城市"]])
76
+ '''
77
+ [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}]
78
+ '''
79
+ ddp.from_segments([['三亚', '是', '一座', '美丽', '的', '城市'], ['他', '送', '了', '一本', '书']])
80
+ '''
81
+ [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
82
+ '''
73
83
"""
74
84
75
85
@@ -83,7 +93,7 @@ class DDParserTask(Task):
83
93
prob(bool): Whether to return the probability of predicted heads.
84
94
use_pos(bool): Whether to return the postag.
85
95
batch_size(int): Number of examples in a batch.
86
- return_visual(bool): If set True, the result will contain the dependency visualization.
96
+ return_visual(bool): If True, the result will contain the dependency visualization.
87
97
kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
88
98
"""
89
99
@@ -141,12 +151,27 @@ def __init__(self,
141
151
142
152
self .use_cuda = use_cuda
143
153
self .lac = LAC (mode = "lac" if self .use_pos else "seg" ,
144
- use_cuda = self .use_cuda )
154
+ use_cuda = self .use_cuda )
145
155
if self .static_mode :
146
156
self ._get_inference_model ()
147
157
else :
148
158
self ._construct_model (model )
149
159
160
+ def _check_segmented_words (self , inputs ):
161
+ inputs = inputs [0 ]
162
+ if not all ([isinstance (i , list ) and i and all (i ) for i in inputs ]):
163
+ raise TypeError ("Invalid input format." )
164
+ return inputs
165
+
166
def from_segments(self, segmented_words):
    """
    Run dependency parsing on text that has already been segmented.

    The input is validated by `_check_segmented_words`, turned into model
    batches by `_preprocess_words`, fed through `_run_model`, and the raw
    model output is converted by `_postprocess`.

    Args:
        segmented_words: The pre-segmented input, in the form accepted by
            `_check_segmented_words`.

    Returns:
        Whatever `_postprocess` returns for the model outputs.
    """
    words = self._check_segmented_words(segmented_words)
    model_inputs = self._preprocess_words({'words': words})
    return self._postprocess(self._run_model(model_inputs))
174
+
150
175
def _construct_input_spec (self ):
151
176
"""
152
177
Construct the input spec for the convert dygraph model to static model.
@@ -182,55 +207,61 @@ def _construct_tokenizer(self, model):
182
207
"""
183
208
return None
184
209
210
def _preprocess_words(self, inputs):
    """
    Build the model batches for already-segmented sentences.

    Each entry of ``inputs['words']`` is wrapped as ``{"FORM": words}`` and
    passed to `convert_example` together with the word and relation vocabs.
    The converted examples are then grouped into slices of
    ``self.batch_size``; every slice is padded field by field and only the
    first padded field is kept, after being passed through `flat_words`.

    Args:
        inputs (dict): Must contain the key ``'words'``.

    Returns:
        dict: The same ``inputs`` object, with the list of batches stored
        under ``'data_loader'``.
    """

    def batchify_fn(batch):
        # Regroup the samples field by field, then pad every field.
        return [pad_sequence(field) for field in zip(*batch)]

    examples = [
        convert_example(
            {"FORM": words}, vocabs=[self.word_vocab, self.rel_vocab])
        for words in inputs['words']
    ]

    data_loader = []
    for start in range(0, len(examples), self.batch_size):
        chunk = examples[start:start + self.batch_size]
        # Only the first padded field is fed to the model.
        data_loader.append(flat_words(batchify_fn(chunk)[0]))

    inputs['data_loader'] = data_loader
    return inputs
233
+
185
234
def _preprocess (self , inputs ):
186
235
"""
187
236
Transform the raw text to the model inputs, two steps involved:
188
237
1) Transform the raw text to token ids.
189
238
2) Generate the other model inputs from the raw text and token ids.
190
239
"""
191
- inputs = self . _check_input_text ( inputs )
240
+
192
241
# Get the config from the kwargs
193
242
num_workers = self .kwargs [
194
243
'num_workers' ] if 'num_workers' in self .kwargs else 0
195
244
lazy_load = self .kwargs [
196
245
'lazy_load' ] if 'lazy_load' in self .kwargs else False
197
246
247
+ outputs = {}
248
+
198
249
lac_results = []
199
250
position = 0
200
251
252
+ inputs = self ._check_input_text (inputs )
201
253
while position < len (inputs ):
202
254
lac_results += self .lac .run (inputs [position :position +
203
- self .batch_size ])
255
+ self .batch_size ])
204
256
position += self .batch_size
205
257
206
- outputs = {}
207
258
if not self .use_pos :
208
259
outputs ['words' ] = lac_results
209
260
else :
210
261
outputs ['words' ], outputs [
211
262
'postags' ] = [raw for raw in zip (* lac_results )]
212
263
213
- examples = []
214
- for text in outputs ['words' ]:
215
- example = {"FORM" : text , }
216
- example = convert_example (
217
- example ,
218
- vocabs = [self .word_vocab , self .rel_vocab ], )
219
- examples .append (example )
220
-
221
- batches = [
222
- examples [idx :idx + self .batch_size ]
223
- for idx in range (0 , len (examples ), self .batch_size )
224
- ]
225
-
226
- def batchify_fn (batch ):
227
- raw_batch = [raw for raw in zip (* batch )]
228
- batch = [pad_sequence (data ) for data in raw_batch ]
229
- return batch
230
-
231
- batches = [flat_words (batchify_fn (batch )[0 ]) for batch in batches ]
232
-
233
- outputs ['data_loader' ] = batches
264
+ outputs = self ._preprocess_words (outputs )
234
265
return outputs
235
266
236
267
def _run_model (self , inputs ):
0 commit comments