File tree Expand file tree Collapse file tree 1 file changed +23
-0
lines changed
runtime/datamate-python/app/module/generation/service Expand file tree Collapse file tree 1 file changed +23
-0
lines changed Original file line number Diff line number Diff line change 11import asyncio
22import json
3+ import random
4+ import re
35import uuid
46
57from langchain_core .language_models import BaseChatModel
@@ -36,6 +38,13 @@ def _filter_docs(split_docs, chunk_size):
3638 return filtered_docs
3739
3840
def extract_img_urls(doc) -> list:
    """Extract the image addresses referenced by Markdown image tags in *doc*.

    Matches inline Markdown images of the form ``![alt](target)``; the alt
    text may be empty (``![](target)``) or any text without a closing
    bracket.

    Args:
        doc: Markdown text to scan. ``None`` or an empty string is treated
            as a document without images.

    Returns:
        The captured link targets, in order of appearance. Empty list when
        no image tag is found.
    """
    # Guard against missing text so callers do not hit a TypeError in ``re``.
    if not doc:
        return []
    # ``[^\]]*`` accepts optional alt text; the original pattern only matched
    # tags with an empty alt (``![](...)``) and silently skipped the rest.
    pattern = r"!\[[^\]]*\]\((.*?)\)"
    return re.findall(pattern, doc)
47+
3948class GenerationService :
4049 def __init__ (self , db : AsyncSession ):
4150 self .db = db
@@ -226,6 +235,15 @@ async def _process_single_chunk_qa(
226235
227236 已经进入后续流程的任务(例如其它协程正在生成答案)允许自然执行完。
228237 """
238+ # 随机决定是否对当前 chunk 进行 QA 生成
239+ if random .random () > question_cfg .temperature :
240+ logger .info (
241+ f"Skip QA generation for chunk_index={ chunk .chunk_index } in file_task={ file_task .id } due to random decision."
242+ )
243+ # 更新文件任务的 processed_chunks 计数
244+ await self ._increment_processed_chunks (file_task .id , 1 )
245+ return False
246+
229247 # 如果没有全局上限配置,维持原有行为
230248 if max_qa_pairs is not None and max_qa_pairs > 0 :
231249 from sqlalchemy import func
@@ -411,6 +429,11 @@ async def process_single_question(question: str):
411429 base_obj ["instruction" ] = question
412430 data_obj = base_obj
413431
432+ # 提取图片URL
433+ img_urls = extract_img_urls (chunk_text )
434+ if img_urls :
435+ data_obj ["img_urls" ] = img_urls
436+
414437 record = SynthesisData (
415438 id = str (uuid .uuid4 ()),
416439 data = data_obj ,
You can’t perform that action at this time.
0 commit comments