Commit 7448d36

Enhance placeholder tag cleanup and gap analysis
Added a robust cleanup function to remove all research placeholder tags from final reports. Improved gap analysis to prioritize placeholder tags and updated search logic to address high-priority gaps first. Increased default max_iterations and max_sources for more thorough research. Updated final report synthesis to ensure no placeholder tags remain.
1 parent d19eec2 commit 7448d36

File tree

2 files changed: +102 -11 lines changed

optillm/plugins/deep_research/research_engine.py

Lines changed: 98 additions & 7 deletions
```diff
@@ -61,6 +61,60 @@ def clean_reasoning_tags(text: str) -> str:
     return cleaned_text
 
 
+def cleanup_placeholder_tags(text: str) -> str:
+    """
+    Remove any remaining placeholder tags from the final report.
+
+    This is a final cleanup step to ensure no incomplete research tags remain
+    in the published report.
+
+    Args:
+        text: Research report text
+
+    Returns:
+        Text with all placeholder tags removed
+    """
+    if not text:
+        return text
+
+    # Patterns for research placeholder tags
+    placeholder_patterns = [
+        r'\[NEEDS RESEARCH[^\]]*\]',
+        r'\[SOURCE NEEDED[^\]]*\]',
+        r'\[RESEARCH NEEDED[^\]]*\]',
+        r'\[CITATION NEEDED[^\]]*\]',
+        r'\[MORE RESEARCH NEEDED[^\]]*\]',
+        r'\[REQUIRES INVESTIGATION[^\]]*\]',
+        r'\[TO BE RESEARCHED[^\]]*\]',
+        r'\[VERIFY[^\]]*\]',
+        r'\[CHECK[^\]]*\]',
+    ]
+
+    cleaned_text = text
+    for pattern in placeholder_patterns:
+        # Remove the placeholder tags
+        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)
+
+    # Also remove any lines that were entirely placeholder-based
+    lines = cleaned_text.split('\n')
+    filtered_lines = []
+
+    for line in lines:
+        # Skip lines that are mostly just removed placeholders (now empty or just punctuation)
+        stripped = line.strip()
+        if stripped and not re.match(r'^[\s\-\*\.\,\;\:]*$', stripped):
+            filtered_lines.append(line)
+        elif not stripped:  # Keep empty lines for formatting
+            filtered_lines.append(line)
+
+    # Rejoin and clean up extra whitespace
+    result = '\n'.join(filtered_lines)
+    result = re.sub(r'\n\s*\n\s*\n+', '\n\n', result)  # Collapse multiple empty lines to double
+    result = result.strip()
+
+    return result
+
+
 class DeepResearcher:
     """
     Implementation of Test-Time Diffusion Deep Researcher (TTD-DR) algorithm
```
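
For a quick sense of what the new helper does, here is an illustrative snippet exercising it; the import path follows the file shown above, but the snippet itself is not part of the commit and the draft text is made up:

```python
# Illustrative only, not part of the commit.
from optillm.plugins.deep_research.research_engine import cleanup_placeholder_tags

draft = (
    "Surface codes remain the leading approach. [CITATION NEEDED]\n"
    "- [SOURCE NEEDED]\n"
    "Logical qubit demos accelerated in 2024. [NEEDS RESEARCH: vendor claims]"
)

print(cleanup_placeholder_tags(draft))
# The tags are stripped, and the middle line (reduced to "- " after removal)
# is dropped; the two remaining sentences are joined by a single newline.
```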
```diff
@@ -71,7 +125,7 @@ class DeepResearcher:
     Based on: https://arxiv.org/abs/2507.16075v1
     """
 
-    def __init__(self, client, model: str, max_iterations: int = 5, max_sources: int = 10):
+    def __init__(self, client, model: str, max_iterations: int = 8, max_sources: int = 15):
         self.client = client
         self.model = model
         self.max_iterations = max_iterations
```
```diff
@@ -99,6 +153,21 @@ def __init__(self, client, model: str, max_iterations: int = 5, max_sources: int
         }
         self.gap_analysis_history = []  # Track identified gaps over time
 
+    def cleanup_placeholder_tags(self, text: str) -> str:
+        """
+        Remove any remaining placeholder tags from the final report.
+
+        This is a final cleanup step to ensure no incomplete research tags remain
+        in the published report.
+
+        Args:
+            text: Research report text
+
+        Returns:
+            Text with all placeholder tags removed
+        """
+        return cleanup_placeholder_tags(text)
+
     def decompose_query(self, system_prompt: str, initial_query: str) -> List[str]:
         """
         Decompose complex research query into focused sub-queries
```
```diff
@@ -394,26 +463,33 @@ def analyze_draft_gaps(self, current_draft: str, original_query: str) -> List[Di
         """
         gap_analysis_prompt = f"""
         Analyze the following research draft to identify specific gaps and areas that need external research.
+        Pay special attention to any placeholder tags like [NEEDS RESEARCH], [SOURCE NEEDED], etc.
 
         Original Query: {original_query}
 
         Current Draft:
         {current_draft}
 
+        PRIORITY ANALYSIS:
+        1. First, identify any [NEEDS RESEARCH], [SOURCE NEEDED], [CITATION NEEDED] or similar placeholder tags
+        2. Then identify other substantial gaps in content, evidence, or depth
+
        For each gap you identify, provide:
        1. SECTION: Which section has the gap
-        2. GAP_TYPE: [MISSING_INFO, OUTDATED_INFO, NEEDS_EVIDENCE, LACKS_DEPTH, NEEDS_EXAMPLES]
+        2. GAP_TYPE: [PLACEHOLDER_TAG, MISSING_INFO, OUTDATED_INFO, NEEDS_EVIDENCE, LACKS_DEPTH, NEEDS_EXAMPLES]
        3. SPECIFIC_NEED: Exactly what information is needed
        4. SEARCH_QUERY: A specific search query to address this gap
+        5. PRIORITY: [HIGH, MEDIUM, LOW] - HIGH for placeholder tags that need immediate resolution
 
        Format each gap as:
        GAP_ID: [number]
        SECTION: [section name]
        GAP_TYPE: [type]
        SPECIFIC_NEED: [what's missing]
        SEARCH_QUERY: [search query to find this info]
+        PRIORITY: [priority level]
 
-        Identify 3-5 most critical gaps.
+        Identify 3-6 most critical gaps, prioritizing any placeholder tags that need resolution.
        """
 
        try:
```
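
The commit does not touch the code that parses this response, but a hypothetical parser for the requested block format only needs one more key for the new PRIORITY field. The sketch below is not part of the commit; note that the lowercased keys line up with the `gap.get('priority', ...)` lookups in the next hunk:

```python
# A sketch only: the real parsing logic lives elsewhere in analyze_draft_gaps.
import re
from typing import Dict, List

def parse_gaps(response_text: str) -> List[Dict[str, str]]:
    gaps = []
    # Each chunk after a "GAP_ID: <n>" marker describes one gap
    for chunk in re.split(r'GAP_ID:\s*\d+', response_text)[1:]:
        gap = {}
        for key in ('SECTION', 'GAP_TYPE', 'SPECIFIC_NEED', 'SEARCH_QUERY', 'PRIORITY'):
            match = re.search(rf'{key}:\s*(.+)', chunk)
            if match:
                # Lowercased keys match the gap.get('priority', ...) lookups
                gap[key.lower()] = match.group(1).strip()
        if gap:
            gaps.append(gap)
    return gaps
```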
```diff
@@ -468,10 +544,17 @@ def analyze_draft_gaps(self, current_draft: str, original_query: str) -> List[Di
     def perform_gap_targeted_search(self, gaps: List[Dict[str, str]]) -> str:
         """
         Perform targeted searches based on identified gaps in the current draft
+        Prioritizes HIGH priority gaps (placeholder tags) first
         """
         all_results = []
 
-        for gap in gaps:
+        # Sort gaps by priority - HIGH priority first (placeholder tags)
+        sorted_gaps = sorted(gaps, key=lambda g: (
+            0 if g.get('priority', '').upper() == 'HIGH' else
+            1 if g.get('priority', '').upper() == 'MEDIUM' else 2
+        ))
+
+        for gap in sorted_gaps:
             search_query = gap.get('search_query', '')
             if not search_query:
                 continue
```
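
The sort key maps HIGH to 0, MEDIUM to 1, and anything else (including a missing priority field) to 2, so placeholder-tag gaps are searched first, and since `sorted` is stable, the model's ordering is preserved within each tier. A standalone illustration with made-up gaps:

```python
# Priorities are matched case-insensitively via .upper(); a missing field sorts last.
gaps = [
    {'search_query': 'surface code error thresholds', 'priority': 'medium'},
    {'search_query': 'TTD-DR benchmark results'},  # no priority: sorts last
    {'search_query': 'resolve [NEEDS RESEARCH] on 2024 demos', 'priority': 'HIGH'},
]

sorted_gaps = sorted(gaps, key=lambda g: (
    0 if g.get('priority', '').upper() == 'HIGH' else
    1 if g.get('priority', '').upper() == 'MEDIUM' else 2
))

print([g['search_query'] for g in sorted_gaps])
# ['resolve [NEEDS RESEARCH] on 2024 demos',
#  'surface code error thresholds',
#  'TTD-DR benchmark results']
```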
```diff
@@ -807,7 +890,8 @@ def research(self, system_prompt: str, initial_query: str) -> Tuple[str, int]:
             print(f"   - Quality scores: Completeness={completeness:.2f}, Improvement={improvement:.2f}")
 
             # Terminate if high quality achieved or minimal improvement
-            if completeness > 0.85 or improvement < 0.05:
+            # More lenient termination to ensure complete research
+            if completeness > 0.9 or (improvement < 0.03 and completeness > 0.7):
                 print("   - Quality threshold reached, research complete")
                 break
```
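
The practical effect of the new predicate is that a stalled draft no longer terminates unless it is already reasonably complete. A side-by-side sketch of the two conditions, with illustrative values:

```python
# Illustrative values, not from the codebase: the old predicate stopped a
# stalled draft regardless of completeness; the new one requires at least
# 70% completeness before treating a small improvement as convergence.
def should_stop_old(completeness: float, improvement: float) -> bool:
    return completeness > 0.85 or improvement < 0.05

def should_stop_new(completeness: float, improvement: float) -> bool:
    return completeness > 0.9 or (improvement < 0.03 and completeness > 0.7)

print(should_stop_old(0.60, 0.04))  # True: stopped despite a 60%-complete draft
print(should_stop_new(0.60, 0.04))  # False: keeps iterating
print(should_stop_new(0.92, 0.04))  # True: high completeness still terminates
```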

```diff
@@ -839,8 +923,11 @@ def finalize_research_report(self, system_prompt: str, original_query: str, fina
     3. Add a compelling title and executive summary
     4. Ensure smooth transitions between sections
     5. Add conclusion that directly addresses the original query
-    6. Remove any remaining [NEEDS RESEARCH] tags
-    7. Polish language and style for clarity and impact
+    6. **CRITICAL**: Remove ALL [NEEDS RESEARCH], [SOURCE NEEDED], and similar placeholder tags
+    7. Replace any remaining placeholders with actual content or remove incomplete sections
+    8. Polish language and style for clarity and impact
+
+    **IMPORTANT**: The final report must NOT contain any [NEEDS RESEARCH], [SOURCE NEEDED], [RESEARCH NEEDED], [CITATION NEEDED], or similar placeholder tags. If any placeholders remain, replace them with available information or remove the incomplete statements.
 
     Return the final polished research report.
     """
```
```diff
@@ -858,6 +945,10 @@ def finalize_research_report(self, system_prompt: str, original_query: str, fina
 
         polished_report = response.choices[0].message.content.strip()
         polished_report = clean_reasoning_tags(polished_report)
+
+        # Final cleanup: Remove any remaining placeholder tags
+        polished_report = self.cleanup_placeholder_tags(polished_report)
+
         self.total_tokens += response.usage.completion_tokens
 
         # Add references section
```
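
Since the LLM is only instructed to drop the tags, the programmatic `cleanup_placeholder_tags` pass above is the actual guarantee. A hypothetical sanity check, not part of the commit, that one could run on the finalized report:

```python
import re

def assert_no_placeholders(report: str) -> None:
    """Hypothetical post-condition check, not part of the commit."""
    leftovers = re.findall(
        r'\[(?:NEEDS RESEARCH|SOURCE NEEDED|RESEARCH NEEDED|CITATION NEEDED)[^\]]*\]',
        report,
        flags=re.IGNORECASE,
    )
    assert not leftovers, f"Placeholder tags survived finalization: {leftovers}"
```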

optillm/plugins/deep_research_plugin.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -32,16 +32,16 @@ def run(system_prompt: str, initial_query: str, client, model: str, request_conf
         client: OpenAI client for LLM calls
         model: Model name to use for synthesis
         request_config: Optional configuration dict with keys:
-            - max_iterations: Maximum research iterations (default: 5)
-            - max_sources: Maximum web sources per search (default: 10)
+            - max_iterations: Maximum research iterations (default: 8)
+            - max_sources: Maximum web sources per search (default: 15)
 
     Returns:
         Tuple of (comprehensive_research_response, total_completion_tokens)
     """
     # Parse configuration
     config = request_config or {}
-    max_iterations = config.get("max_iterations", 5)
-    max_sources = config.get("max_sources", 10)
+    max_iterations = config.get("max_iterations", 8)  # Increased to 8 for thorough research
+    max_sources = config.get("max_sources", 15)  # Increased to 15 for comprehensive coverage
 
     # Validate inputs
     if not initial_query.strip():
```
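
Callers can still override the new defaults through `request_config`. A hedged usage sketch: it assumes an OpenAI-compatible endpoint, and the query and model name are placeholders:

```python
from openai import OpenAI
from optillm.plugins.deep_research_plugin import run

client = OpenAI()  # reads OPENAI_API_KEY from the environment

report, completion_tokens = run(
    system_prompt="You are a thorough research assistant.",
    initial_query="What changed in quantum error correction during 2024?",
    client=client,
    model="gpt-4o-mini",  # any model your endpoint accepts
    request_config={"max_iterations": 4, "max_sources": 8},  # override the 8/15 defaults
)
print(f"{completion_tokens} completion tokens used")
```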
