@@ -261,7 +261,6 @@ def _split_long_text_input(self, input_texts, max_text_len):
261
261
If the text length is greater than max_text_len (e.g. 512), this function splits the long text into shorter texts.
262
262
"""
263
263
short_input_texts = []
264
- short_input_texts_lens = []
265
264
for text in input_texts :
266
265
if len (text ) <= max_text_len :
267
266
short_input_texts .append (text )
@@ -279,13 +278,35 @@ def _split_long_text_input(self, input_texts, max_text_len):
279
278
]
280
279
short_input_texts .extend (temp_text_list )
281
280
else :
282
- count = 0
283
- for temp_text in temp_text_list :
284
- if len (temp_text ) + count < lens :
285
- temp_text = text [:len (temp_text ) + count + 1 ]
286
- count += len (temp_text )
281
+ list_len = len (temp_text_list )
282
+ start = 0
283
+ end = 0
284
+ for i in range (0 , list_len ):
285
+ if len (temp_text_list [i ]) + 1 >= max_text_len :
286
+ if start != end :
287
+ short_input_texts .extend (
288
+ self ._split_long_text_input (
289
+ [text [start :end ]], max_text_len ))
290
+ short_input_texts .extend (
291
+ self ._split_long_text_input ([
292
+ text [end :end + len (temp_text_list [i ]) + 1 ]
293
+ ], max_text_len ))
294
+ start = end + len (temp_text_list [i ]) + 1
295
+ end = start
296
+ else :
297
+ if start + len (temp_text_list [
298
+ i ]) + 1 > max_text_len :
299
+ short_input_texts .extend (
300
+ self ._split_long_text_input (
301
+ [text [start :end ]], max_text_len ))
302
+ start = end
303
+ end = end + len (temp_text_list [i ]) + 1
304
+ else :
305
+ end = len (temp_text_list [i ]) + 1
306
+ if start != end :
287
307
short_input_texts .extend (
288
- self ._split_long_text2short_text_list ([temp_text ]))
308
+ self ._split_long_text_input ([text [start :end ]],
309
+ max_text_len ))
289
310
return short_input_texts
290
311
291
312
def _concat_short_text_reuslts (self , input_texts , results ):
@@ -318,7 +339,6 @@ def _concat_short_text_reuslts(self, input_texts, results):
318
339
pred_words = result ['items' ]
319
340
pred_words = self ._reset_offset (pred_words )
320
341
result ['items' ] = pred_words
321
-
322
342
return concat_results
323
343
324
344
def _preprocess_text (self , input_texts ):
@@ -333,7 +353,7 @@ def _preprocess_text(self, input_texts):
333
353
lazy_load = self .kwargs [
334
354
'lazy_load' ] if 'lazy_load' in self .kwargs else False
335
355
336
- max_seq_length = 128
356
+ max_seq_length = 512
337
357
if 'max_position_embedding' in self .kwargs :
338
358
max_seq_length = self .kwargs ['max_position_embedding' ]
339
359
infer_data = []
@@ -533,7 +553,7 @@ def _postprocess(self, inputs):
533
553
"""
534
554
results = self ._decode (inputs ['short_input_texts' ],
535
555
inputs ['all_pred_tags' ])
536
- resulte = self ._concat_short_text_reuslts (inputs ['inputs' ], results )
556
+ results = self ._concat_short_text_reuslts (inputs ['inputs' ], results )
537
557
if self .linking is True :
538
558
for res in results :
539
559
self ._term_linking (res )
0 commit comments