2525from appbuilder .core .component import Component
2626from appbuilder .core .components .general_ocr .model import *
2727from appbuilder .core .message import Message
28+ from appbuilder .core .constants import COMPONENT_SUPPORT_FILE_NUMBER
2829from appbuilder .utils .trace .tracer_wrapper import components_run_trace , components_run_stream_trace
2930
3031
@@ -87,7 +88,7 @@ class GeneralOCR(Component):
8788 "type" : "object" ,
8889 "description" : "需要识别的PDF文件的对应页码,key为pdf_names中的文件名,value为对应的页码,当 pdf_file 参数有效时,识别传入页码的对应页面内容,若不传入,则默认识别第 1 页" ,
8990 "additionalProperties" : {
90- "type" : "integer "
91+ "type" : "string "
9192 },
9293 "default" : {}
9394 },
@@ -272,37 +273,71 @@ def tool_eval(
272273
273274 """
274275 traceid = kwargs .get ("_sys_traceid" , "" )
276+ if not img_names and not img_urls and not pdf_names and not pdf_urls :
277+ raise InvalidRequestArgumentError (request_id = traceid ,
278+ message = "img_names\img_urls\pdf_names\pdf_urls can not both be empty" )
275279 sys_file_urls = kwargs .get ("_sys_file_urls" , {})
276280 support_pdf_type = ["pdf" ]
277281 support_img_type = ["png" , "jpg" , "jpeg" , "webp" , "heic" , "tif" , "tiff" , "dcm" , "mha" , "nii.gz" ]
278282 img_map = {}
279283 pdf_map = {}
284+ unsupported_files = []
285+ unknown_files = []
280286 img_names = [os .path .basename (name ) for name in img_names ]
281287 pdf_names = [os .path .basename (name ) for name in pdf_names ]
282- for file_name , file_url in sys_file_urls .items ():
283- if file_url in img_urls or file_name in img_names :
284- img_map [file_name ] = file_url
285- elif file_name in pdf_names or file_url in pdf_urls :
286- pdf_map [file_name ] = {"url" : file_url , "page_num" : pdf_file_num .get (file_name , "1" )}
288+
289+ for img_name in img_names :
290+ if len (img_map ) >= COMPONENT_SUPPORT_FILE_NUMBER :
291+ break
292+ file_type = img_name .split ("." )[- 1 ].lower ()
293+ if img_name in sys_file_urls :
294+ if file_type in support_img_type :
295+ img_map [img_name ] = sys_file_urls .get (img_name , "" )
296+ else :
297+ unsupported_files .append (img_name )
287298 else :
288- file_type = file_name .split ("." )[- 1 ].lower ()
299+ unknown_files .append (img_name )
300+
301+ for pdf_name in pdf_names :
302+ if len (img_map ) + len (pdf_map ) >= 10 :
303+ break
304+ file_type = pdf_name .split ("." )[- 1 ].lower ()
305+ if pdf_name in sys_file_urls :
289306 if file_type in support_pdf_type :
290- pdf_map [file_name ] = {"url" : file_url , "page_num" : pdf_file_num .get (file_name , "1" )}
291- elif file_type in support_img_type :
292- img_map [file_name ] = file_url
307+ pdf_map [pdf_name ] = {"url" : sys_file_urls .get (pdf_name , "" ), "page_num" : pdf_file_num .get (pdf_name , "1" )}
308+ else :
309+ unsupported_files .append (pdf_name )
310+ else :
311+ unknown_files .append (pdf_name )
293312
294313 for img_url in img_urls :
314+ if len (img_map ) + len (pdf_map ) >= 10 :
315+ break
316+ if img_url in list (sys_file_urls .values ()):
317+ continue
295318 file_name = img_url .split ("/" )[- 1 ].split ("?" )[0 ]
296319 file_type = file_name .split ("." )[- 1 ].lower ()
297- if file_type in support_img_type and file_name not in img_map :
298- img_map [file_name ] = img_url
320+ if file_type in support_img_type :
321+ img_map [img_url ] = img_url
322+ else :
323+ unsupported_files .append (img_url )
299324
300- for pdf_url in pdf_urls :
325+ for pdf_url in pdf_urls :
326+ if len (img_map ) + len (pdf_map ) >= 10 :
327+ break
328+ if pdf_url in list (sys_file_urls .values ()):
329+ continue
301330 file_name = pdf_url .split ("/" )[- 1 ].split ("?" )[0 ]
302331 file_type = file_name .split ("." )[- 1 ].lower ()
303- if file_type in support_pdf_type and file_name not in pdf_map :
304- pdf_map [file_name ] = {"url" : pdf_url , "page_num" : pdf_file_num .get (file_name , "1" )}
305-
332+ if file_type in support_pdf_type :
333+ pdf_map [pdf_url ] = {"url" : pdf_url , "page_num" : pdf_file_num .get (file_name , "1" )}
334+ else :
335+ unsupported_files .append (pdf_url )
336+
337+ if not img_map and not pdf_map :
338+ raise InvalidRequestArgumentError (
339+ f"request format error, file url does not exist" )
340+
306341 if img_map :
307342 for img_name , img_url in img_map .items ():
308343 try :
@@ -313,9 +348,9 @@ def tool_eval(
313348 result_response , raw_data = self ._recognize (req , request_id = traceid )
314349 result = proto .Message .to_dict (result_response )
315350 results = {
316- f"{ img_name } 识别结果 " : " \n " .join (item ["words" ] for item in result ["words_result" ])
351+ f"{ img_name } " : " \n " .join (item ["words" ] for item in result ["words_result" ])
317352 }
318- res = json .dumps (results , ensure_ascii = False , indent = 4 )
353+ res = json .dumps (results , ensure_ascii = False )
319354 yield self .create_output (type = "text" , text = res , raw_data = raw_data , visible_scope = "llm" )
320355 yield self .create_output (type = "text" , text = "" , raw_data = raw_data , visible_scope = "user" )
321356 except Exception as e :
@@ -336,15 +371,51 @@ def tool_eval(
336371 result_response , raw_data = self ._recognize (req , request_id = traceid )
337372 result = proto .Message .to_dict (result_response )
338373 results = {
339- f"{ pdf_name } 识别结果 " : " \n " .join (item ["words" ] for item in result ["words_result" ])
374+ f"{ pdf_name } " : " \n " .join (item ["words" ] for item in result ["words_result" ])
340375 }
341- res = json .dumps (results , ensure_ascii = False , indent = 4 )
376+ res = json .dumps (results , ensure_ascii = False )
342377 yield self .create_output (type = "text" , text = res , raw_data = raw_data , visible_scope = "llm" )
343378 yield self .create_output (type = "text" , text = "" , raw_data = raw_data , visible_scope = "user" )
344379 except Exception as e :
345380 logging .warning (f"{ pdf_name } ocr failed with exception: { e } " )
346381 continue
347-
348- if not img_map and not pdf_map :
349- raise InvalidRequestArgumentError (
350- f"request format error, file url does not exist" )
382+
383+ for file_name in unknown_files :
384+ results = {
385+ f"{ file_name } " : "无法获取url,请确认是否上传成功"
386+ }
387+ res = json .dumps (results , ensure_ascii = False )
388+ llm_result = self .create_output (
389+ type = "text" ,
390+ visible_scope = "llm" ,
391+ text = res ,
392+ name = "llm_text"
393+ )
394+ yield llm_result
395+ user_result = self .create_output (
396+ type = "text" ,
397+ visible_scope = "user" ,
398+ text = "" ,
399+ name = "user_text"
400+ )
401+ yield user_result
402+
403+ for file_name in unsupported_files :
404+ results = {
405+ f"{ file_name } " : "不支持的文件类型,请确认是否为图片或者pdf文件"
406+ }
407+ res = json .dumps (results , ensure_ascii = False )
408+ llm_result = self .create_output (
409+ type = "text" ,
410+ visible_scope = "llm" ,
411+ text = res ,
412+ name = "llm_text"
413+ )
414+ yield llm_result
415+ user_result = self .create_output (
416+ type = "text" ,
417+ visible_scope = "user" ,
418+ text = "" ,
419+ name = "user_text"
420+ )
421+ yield user_result
0 commit comments