fix(generation_service): ensure processed chunks are incremented regardless of question generation success

Dallas98 · Dallas98 · commit 81d0ed895d4b · 2025-12-18T00:49:46.000+08:00
diff --git a/runtime/datamate-python/app/module/generation/service/generation_service.py b/runtime/datamate-python/app/module/generation/service/generation_service.py
@@ -202,9 +202,20 @@ async def _process_single_chunk_qa(
         chunk_index = chunk.chunk_index
         chunk_text = chunk.chunk_content or ""
         if not chunk_text.strip():
-            logger.warning(f"Empty chunk text for file_task={file_task.id}, chunk_index={chunk_index}")
+            logger.warning(
+                f"Empty chunk text for file_task={file_task.id}, chunk_index={chunk_index}"
+            )
+            # 无论成功或失败，均视为该 chunk 已处理完成
+            try:
+                await self._increment_processed_chunks(file_task.id, 1)
+            except Exception as e:
+                logger.exception(
+                    f"Failed to increment processed_chunks for file_task={file_task.id}, chunk_index={chunk_index}: {e}"
+                )
             return False
 
+        success_any = False
+
         # 1. 生成问题
         try:
             questions = await self._generate_questions_for_one_chunk(
@@ -216,31 +227,30 @@ async def _process_single_chunk_qa(
             logger.error(
                 f"Generate questions failed for file_task={file_task.id}, chunk_index={chunk_index}: {e}"
             )
-            return False
+            questions = []
 
         if not questions:
             logger.info(
                 f"No questions generated for file_task={file_task.id}, chunk_index={chunk_index}"
             )
-            return False
-
-        # 2. 针对每个问题生成答案并入库
-        success_any = await self._generate_answers_for_one_chunk(
-            file_task=file_task,
-            chunk=chunk,
-            questions=questions,
-            answer_cfg=answer_cfg,
-            answer_chat=answer_chat,
-        )
+        else:
+            # 2. 针对每个问题生成答案并入库
+            qa_success = await self._generate_answers_for_one_chunk(
+                file_task=file_task,
+                chunk=chunk,
+                questions=questions,
+                answer_cfg=answer_cfg,
+                answer_chat=answer_chat,
+            )
+            success_any = bool(qa_success)
 
-        # 每次处理完一个chunk，若至少生成一条QA，则安全更新已处理的chunk数量，避免并发冲突
-        if success_any:
-            try:
-                await self._increment_processed_chunks(file_task.id, 1)
-            except Exception as e:
-                logger.exception(
-                    f"Failed to increment processed_chunks for file_task={file_task.id}, chunk_index={chunk_index}: {e}"
-                )
+        # 无论本 chunk 处理是否成功，都增加 processed_chunks 计数，避免任务长时间卡住
+        try:
+            await self._increment_processed_chunks(file_task.id, 1)
+        except Exception as e:
+            logger.exception(
+                f"Failed to increment processed_chunks for file_task={file_task.id}, chunk_index={chunk_index}: {e}"
+            )
 
         return success_any
 
diff --git a/runtime/datamate-python/app/module/generation/service/prompt.py b/runtime/datamate-python/app/module/generation/service/prompt.py
@@ -25,8 +25,8 @@
 1. 所有问题必须严格依据原文内容，不得添加外部信息或假设情境。
 2. 问题需覆盖文本的不同主题、层级或视角，避免集中于单一片段。
 3. 禁止输出与材料元信息相关的问题（如作者、章节、目录等）。
-4. 问题不得包含“报告/文章/文献/表格中提到”等表述，需自然流畅。
-5. 输出不少于 {{number}} 个问题，且保持格式一致。
+4. 提问时请假设没有相应的文章可供参考，因此不要在问题中使用“这个”或“这些”等指示代词，也不得包含“报告/文章/文献/表格中提到”等表述。
+5. 输出不少于 {{number}} 个问题，问题语言与原文主要语言保持一致。
 
 ## Output Format:
 - 使用合法的 JSON 数组，仅包含字符串元素。
@@ -38,7 +38,7 @@
 
 ## Output Example:
 ```
-["人工智能伦理框架应包含哪些核心要素？", "民法典对个人数据保护有哪些新规定？"]
+["人工智能伦理框架应包含哪些核心要素？", "民法典对个人数据保护有哪些新规定？"]
 ```
 
 ## Text to Analyze: