Skip to content

Commit 85eb5a9

Browse files
authored
feat(generation_service): add image URL extraction and random QA generation logic (#182)
1 parent ab4523b commit 85eb5a9

File tree

1 file changed

+23
-0
lines changed

1 file changed

+23
-0
lines changed

runtime/datamate-python/app/module/generation/service/generation_service.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import asyncio
22
import json
3+
import random
4+
import re
35
import uuid
46

57
from langchain_core.language_models import BaseChatModel
@@ -36,6 +38,13 @@ def _filter_docs(split_docs, chunk_size):
3638
return filtered_docs
3739

3840

41+
def extract_img_urls(doc):
    """Extract the image addresses referenced by Markdown image tags in *doc*.

    Matches both ``![](url)`` and ``![alt text](url)`` and returns whatever
    sits between the parentheses, in order of appearance. Plain links such as
    ``[text](url)`` (no leading ``!``) are ignored.

    Args:
        doc: Markdown text to scan. ``None`` or an empty string is treated
            as a document without images.

    Returns:
        A list of the captured address strings; empty when nothing matches.
    """
    if not doc:
        # Nothing to scan; also avoids the TypeError re.findall raises on None.
        return []
    # `[^\]]*` accepts any (possibly empty) alt text, so images written as
    # `![caption](url)` are no longer skipped; the lazy group stops at the
    # first closing parenthesis, exactly as before.
    pattern = r"!\[[^\]]*\]\((.*?)\)"
    return re.findall(pattern, doc)
47+
3948
class GenerationService:
4049
def __init__(self, db: AsyncSession):
4150
self.db = db
@@ -226,6 +235,15 @@ async def _process_single_chunk_qa(
226235
227236
已经进入后续流程的任务(例如其它协程正在生成答案)允许自然执行完。
228237
"""
238+
# 随机决定是否对当前 chunk 进行 QA 生成
239+
if random.random() > question_cfg.temperature:
240+
logger.info(
241+
f"Skip QA generation for chunk_index={chunk.chunk_index} in file_task={file_task.id} due to random decision."
242+
)
243+
# 更新文件任务的 processed_chunks 计数
244+
await self._increment_processed_chunks(file_task.id, 1)
245+
return False
246+
229247
# 如果没有全局上限配置,维持原有行为
230248
if max_qa_pairs is not None and max_qa_pairs > 0:
231249
from sqlalchemy import func
@@ -411,6 +429,11 @@ async def process_single_question(question: str):
411429
base_obj["instruction"] = question
412430
data_obj = base_obj
413431

432+
# 提取图片URL
433+
img_urls = extract_img_urls(chunk_text)
434+
if img_urls:
435+
data_obj["img_urls"] = img_urls
436+
414437
record = SynthesisData(
415438
id=str(uuid.uuid4()),
416439
data=data_obj,

0 commit comments

Comments
 (0)