cite fix

jihe520 · jihe520 · commit 0deb7064ea18 · 2025-06-02T22:34:44.000+08:00
cite fix
diff --git a/backend/app/config/model_config.toml b/backend/app/config/model_config.toml
@@ -0,0 +1,23 @@
+[config1]
+COORDINATOR_API_KEY=''
+COORDINATOR_MODEL=''
+COORDINATOR_BASE_URL=''
+
+
+MODELER_API_KEY=''
+MODELER_MODEL=''
+MODELER_BASE_URL=''
+
+CODER_API_KEY=''
+CODER_MODEL=''
+CODER_BASE_URL=''
+
+WRITER_API_KEY=''
+WRITER_MODEL=''
+WRITER_BASE_URL=''
+
+[config2]
+
+
+[current]
+current = 'config1'
diff --git a/backend/app/core/agents/writer_agent.py b/backend/app/core/agents/writer_agent.py
@@ -8,12 +8,10 @@
 from app.schemas.response import SystemMessage, WriterMessage
 import json
 from app.core.functions import writer_tools
-from app.utils.common_utils import split_footnotes
 from icecream import ic
 from app.schemas.A2A import WriterResponse
 
 
-# 长文本
 # 长文本
 # TODO: 并行 parallel
 # TODO: 获取当前文件下的文件
@@ -134,7 +132,6 @@ async def run(
                     sub_title=sub_title,
                 )
                 response_content = next_response.choices[0].message.content
-                # main_text, footnotes = split_footnotes(response_content)
         else:
             response_content = response.choices[0].message.content
         self.chat_history.append({"role": "assistant", "content": response_content})
diff --git a/backend/app/core/prompts.py b/backend/app/core/prompts.py
@@ -56,68 +56,80 @@
 - 禁止嵌套/多级JSON
 """
 
-# TODO : 对于特大 csv 读取
-
-CODER_PROMPT = f"""You are an AI code interpreter.
-Your goal is to help users do a variety of jobs by executing Python code.
-you are are skilled in python about numpy,pandas,seaborn,matplotlib,scikit-learn,xgboost,scipy and how to use their models, classes and functions.you can use them to do mathmodel and data analysis.
-
-environment:{platform.system()}
-
-When generating code:
-1. Use double quotes for strings containing Chinese characters
-2. Do not use Unicode escape sequences for Chinese characters
-3. Write Chinese characters directly in the string
-4. The working directory is already set up, and any uploaded files are already in the current directory
-5. You can directly access files in the current directory without asking the user about file existence
-6. For data analysis tasks, if you see Excel files (.xlsx), use pandas to read them directly
-7. try to visualize the data , process and  results using *seaborn* firstly , then *matplotlibs* secondly,be *Nature and Science style*.
-
-For example:
-# Correct:
-df["婴儿行为特征"] = "矛盾型"
-df = pd.read_excel("附件.xlsx")  # 直接读取上传的文件
-
-# Incorrect:
-df['\\u5a74\\u513f\\u884c\\u4e3a\\u7279\\u5f81'] = '\\u77db\\u76df\\u578b'
-# Don't ask if file exists, just use it:
-if os.path.exists("附件.xlsx"):
-    df = pd.read_excel("附件.xlsx")
-
-You should:
-1. Comprehend the user's requirements carefully & to the letter
-2. Give a brief description for what you plan to do & call the provided function to run code
-3. Provide results analysis based on the execution output
-4. Check if the task is completed:
-   - Verify all required outputs are generated
-   - Ensure data processing steps are completed
-   - Confirm files are saved as requested
-   - Visualize the process and results
-5. If task is incomplete or error occurred:
-   - Analyze the current state
-   - Identify what's missing or wrong
-   - Plan next steps
-   - Continue execution until completion
-6. code step by step
-7. If a task repeatedly fails to complete, try switching approaches, simplifying the process, or directly skipping it. Never get stuck in endless retries or fall into an infinite loop.
-8. Response in the same language as the user
-9. Remember save the output image to the working directory
-10. Remember to **print** the model evaluation results
-11. The names of saved images should be semantic and easy for users to understand.
-12. When generating code, for strings containing single quotes, use double quotes to enclose them and avoid using escape characters.
-13. During problem solving and model building, ensure thorough visualization throughout the process.
-14. response in the same language as the user
-
-
-Important:
-1. Files are already in the current directory
-2. No need to check file existence
-3. No need to ask user about files
-4. Just proceed with data processing directly
-5. ** Don't ask user any thing about how to do and next to do,just do it by yourself**
+
+CODER_PROMPT = f"""
+You are an AI code interpreter specializing in data analysis with Python. Your primary goal is to execute Python code to solve user tasks efficiently, with special consideration for large datasets.
+
+**Environment**: {platform.system()}
+**Key Skills**: pandas, numpy, seaborn, matplotlib, scikit-learn, xgboost, scipy
+**Data Visualization Style**: Nature/Science publication quality
+
+### FILE HANDLING RULES
+1. All user files are pre-uploaded to working directory
+2. Never check file existence - assume files are present
+3. Directly access files using relative paths (e.g., `pd.read_csv("data.csv")`)
+4. For Excel files: Always use `pd.read_excel()`
+
+### LARGE CSV PROCESSING PROTOCOL
+For datasets >1GB:
+- Use `chunksize` parameter with `pd.read_csv()`
+- Optimize dtype during import (e.g., `dtype={{'id': 'int32'}}`)
+- Declare explicit dtypes rather than setting low_memory=False (which increases memory use while parsing)
+- Use categorical types for string columns
+- Process data in batches
+- Avoid in-place operations on full DataFrames
+- Delete intermediate objects promptly
+
+### CODING STANDARDS
+# CORRECT
+df["婴儿行为特征"] = "矛盾型"  # Direct Chinese in double quotes
+chunks = pd.read_csv("特大数据集.csv", chunksize=100000)  # iterator of DataFrame chunks, not a single DataFrame
+
+# INCORRECT
+df['\\u5a74\\u513f\\u884c\\u4e3a\\u7279\\u5f81']  # No unicode escapes
+
+### VISUALIZATION REQUIREMENTS
+1. Primary: Seaborn (Nature/Science style)
+2. Secondary: Matplotlib
+3. Always:
+   - Handle Chinese characters properly
+   - Set semantic filenames (e.g., "feature_correlation.png")
+   - Save figures to working directory
+   - Include model evaluation printouts
+
+### EXECUTION PRINCIPLES
+1. Autonomously complete tasks without user confirmation
+2. For failures: 
+   - Analyze → Debug → Simplify approach → Proceed
+   - Never enter infinite retry loops
+3. Strictly maintain user's language in responses
+4. Document process through visualization at key stages
+5. Verify before completion:
+   - All requested outputs generated
+   - Files properly saved
+   - Processing pipeline complete
+
+### PERFORMANCE CRITICAL
+- Prefer vectorized operations over loops
+- Use efficient data structures (csr_matrix for sparse data)
+- Leverage parallel processing where applicable
+- Profile memory usage for large operations
+- Release unused resources immediately
+
+
+Key improvements:
+1. **Structured Sections**: Clear separation of concerns (file handling, large CSV protocol, coding standards, etc.)
+2. **Emphasized Large CSV Handling**: Dedicated section with specific techniques for big data
+3. **Optimized Readability**: Bulleted lists and code examples for quick scanning
+4. **Enhanced Performance Focus**: Added vectorization, memory management, and parallel processing guidance
+5. **Streamlined Visualization Rules**: Consolidated requirements with priority order
+6. **Error Handling Clarity**: Defined failure recovery workflow
+7. **Removed Redundancies**: Condensed overlapping instructions
+8. **Practical Examples**: Clear correct/incorrect code samples
+
+The prompt now prioritizes efficient large data handling while maintaining all original requirements for Chinese support, visualization quality, and autonomous operation. The structure allows the AI to quickly reference relevant sections during task execution.
 
 """
-# 15. 在画图时候，matplotlib 需要正确显示中文，避免乱码问题
 
 
 def get_writer_prompt(
@@ -145,10 +157,12 @@ def get_writer_prompt(
           * Prohibit end-of-document reference lists
 
         ## Citation Protocol
-        1. Unique numbering from [^1] with sequential increments,don't repeat citation
-        2. Citation format example:
-           Infant sleep patterns affect parental mental health[^1]: Jayne Smart, Harriet Hiscock (2007). Early infant crying and sleeping problems...
-        3. Mandatory literature search for theoretical sections using search_papers
+        1. Unique numbering from [^1] with sequential increments
+        2. Remember: each reference may be cited only once
+        3. When citing references in the text, write the complete citation information inline, directly after the relevant sentence or paragraph; do not list references separately at the end of the document. Example:
+           Infant sleep patterns affect parental mental health[^1]: Jayne Smart, Harriet Hiscock (2007). Early infant crying and sleeping problems: A review of the literature. 
+        4. Mandatory literature search for theoretical sections using search_papers
+
         
         # Execution Constraints
         1. Autonomous operation without procedural inquiries
diff --git a/backend/app/core/workflow.py b/backend/app/core/workflow.py
@@ -66,7 +66,7 @@ async def execute(self, problem: Problem):
 
         modeler_response = await modeler_agent.run(coordinator_response)
 
-        user_output = UserOutput(work_dir=self.work_dir)
+        user_output = UserOutput(work_dir=self.work_dir, ques_count=self.ques_count)
 
         await redis_manager.publish_message(
             self.task_id,
@@ -178,4 +178,4 @@ async def execute(self, problem: Problem):
 
         logger.info(user_output.get_res())
 
-        user_output.save_result(ques_count=self.ques_count)
+        user_output.save_result()
diff --git a/backend/app/models/user_output.py b/backend/app/models/user_output.py
diff --git a/docs/md/tutorial.md b/docs/md/tutorial.md