
Commit c704a56

feat: 1. support max_tokens and filter_model parameters; 2. support LLM-based filtering of interactive elements and text; 3. support dynamic planning of recovery steps
1 parent 3084ddb commit c704a56

File tree

9 files changed, +723 -61 lines changed

config/config.yaml.example

Lines changed: 3 additions & 1 deletion
@@ -20,11 +20,13 @@ test_config: # Test configuration
     enabled: False
 
 llm_config: # LLM configuration, currently only supports OpenAI SDK compatible format
-  model: gpt-4.1-2025-04-14 # Recommended
+  model: gpt-4.1-2025-04-14 # Primary model for Stage 2 test planning (Recommended)
+  filter_model: gpt-4o-mini # Lightweight model for Stage 1 element filtering (cost-effective)
   api_key: your_api_key
   base_url: https://api.example.com/v1
   temperature: 0.1 # Optional, default 0.1
   # top_p: 0.9 # Optional, if not set, this parameter will not be passed
+  # max_tokens: 8192 # Optional, maximum output tokens (supports generating more test cases)
 
 browser_config:
   viewport: {"width": 1280, "height": 720}

webqa-agent.py

Lines changed: 9 additions & 0 deletions
@@ -149,9 +149,11 @@ def validate_and_build_llm_config(cfg):
     api_key = os.getenv("OPENAI_API_KEY") or llm_cfg_raw.get("api_key", "")
     base_url = os.getenv("OPENAI_BASE_URL") or llm_cfg_raw.get("base_url", "")
     model = llm_cfg_raw.get("model", "gpt-4o-mini")
+    filter_model = llm_cfg_raw.get("filter_model", model)  # For two-stage architecture, defaults to primary model
     # Sampling configuration: default temperature is 0.1; top_p not set by default
     temperature = llm_cfg_raw.get("temperature", 0.1)
     top_p = llm_cfg_raw.get("top_p")
+    max_tokens = llm_cfg_raw.get("max_tokens")  # Optional: maximum output tokens
 
     # Validate required fields
     if not api_key:
@@ -168,12 +170,15 @@ def validate_and_build_llm_config(cfg):
     llm_config = {
         "api": "openai",
         "model": model,
+        "filter_model": filter_model,
         "api_key": api_key,
         "base_url": base_url,
         "temperature": temperature,
     }
     if top_p is not None:
         llm_config["top_p"] = top_p
+    if max_tokens is not None:
+        llm_config["max_tokens"] = max_tokens
 
     # Show configuration source (hide sensitive information)
     api_key_masked = f"{api_key[:8]}...{api_key[-4:]}" if len(api_key) > 12 else "***"
@@ -184,9 +189,13 @@ def validate_and_build_llm_config(cfg):
     print(f" - API Key: {api_key_masked} ({'Environment variable' if env_api_key else 'Config file'})")
     print(f" - Base URL: {base_url} ({'Environment variable' if env_base_url else 'Config file/Default'})")
     print(f" - Model: {model}")
+    if filter_model != model:
+        print(f" - Filter Model: {filter_model} (for two-stage architecture)")
     print(f" - Temperature: {temperature}")
     if top_p is not None:
         print(f" - Top_p: {top_p}")
+    if max_tokens is not None:
+        print(f" - Max Tokens: {max_tokens}")
 
     return llm_config
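
A hedged usage sketch of the precedence implemented above: environment variables win over config-file values for api_key, filter_model falls back to the primary model, and optional keys (top_p, max_tokens) are only added when set. It assumes the function reads the nested llm_config section as in config.yaml.example; the inline cfg dict and the environment value are illustrative only.

import os

cfg = {
    "llm_config": {
        "model": "gpt-4.1-2025-04-14",
        "filter_model": "gpt-4o-mini",
        "api_key": "your_api_key",
        "base_url": "https://api.example.com/v1",
        "temperature": 0.1,
    }
}
os.environ["OPENAI_API_KEY"] = "sk-from-env"        # hypothetical key
llm_config = validate_and_build_llm_config(cfg)
assert llm_config["api_key"] == "sk-from-env"       # env var wins over config file
assert llm_config["filter_model"] == "gpt-4o-mini"  # explicit filter_model is kept
assert "max_tokens" not in llm_config               # optional key omitted when unset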

webqa_agent/actions/action_executor.py

Lines changed: 14 additions & 2 deletions
@@ -1,4 +1,5 @@
 import asyncio
+import inspect
 import logging
 from typing import Dict, List, Optional
 
@@ -42,9 +43,20 @@ async def execute(self, action):
             logging.error(f"Unknown action type: {action_type}")
             return False
 
-            # Execute the action
+            # Execute the action with introspection to handle different method signatures
             logging.debug(f"Executing action: {action_type}")
-            return await execute_func(action)
+
+            # Use introspection to check if method accepts action parameter
+            sig = inspect.signature(execute_func)
+            params = list(sig.parameters.keys())
+
+            # If method only has 'self' parameter (no additional params), call without action
+            # Note: bound methods don't show 'self' in signature, so empty params means no action param
+            if len(params) == 0:
+                logging.debug(f"Calling {action_type} without action parameter (method signature has no parameters)")
+                return await execute_func()
+            else:
+                return await execute_func(action)
 
         except Exception as e:
             logging.error(f"Action execution failed: {str(e)}")

webqa_agent/actions/action_handler.py

Lines changed: 2 additions & 2 deletions
@@ -97,7 +97,7 @@ async def go_to_page(self, page: Page, url: str, cookies=None):
         await self.page.goto(url=url, wait_until='domcontentloaded')
         await self.page.wait_for_load_state('networkidle', timeout=60000)
 
-    async def smart_navigate_to_page(self, page: Page, url: str, cookies=None) -> bool:
+    async def smart_navigate_to_page(self, page: Page, url: str, cookies=None) -> bool | None:
         """Smart navigation to target page, avoiding redundant navigation.
 
         Args:
@@ -141,7 +141,7 @@ def normalize_url(u):
 
         if current_normalized == target_normalized:
             logging.debug('Already on target page (normalized match), skipping navigation')
-            return False
+            return None  # Return None to indicate "already on page, no navigation needed"
 
         # More flexible URL matching: if domain is same and path is similar, also consider as match
         def extract_domain(u):
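
A hedged caller-side sketch of the new bool | None contract: only the None case (already on the target page) appears in this diff, so the True/False interpretations below are inferred; handler, page, and url are assumed to be in scope.

import logging

async def ensure_on_target_page(handler, page, url):
    result = await handler.smart_navigate_to_page(page, url)
    if result is None:
        logging.debug("Already on target page; no navigation performed")
    elif result:
        logging.debug("Navigated to target page")
    else:
        logging.warning("Navigation did not succeed; a recovery step may be needed")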

webqa_agent/crawler/deep_crawler.py

Lines changed: 98 additions & 0 deletions
@@ -539,6 +539,104 @@ def dump_json(node: Dict[str, Any], path: Path) -> None:
         with open(path, 'w', encoding='utf-8') as f:
             json.dump(node, f, ensure_ascii=False, indent=2)
 
+    @staticmethod
+    def smart_truncate_page_text(
+        text_array: List[str],
+        max_tokens: int = 3000,
+        strategy: str = "head_tail_sample"
+    ) -> Dict[str, Any]:
+        """
+        Intelligently truncate page text while preserving semantic completeness.
+
+        Based on 2024 research on semantic chunking and context preservation:
+        - Avoids "lost-in-the-middle" problem
+        - Preserves page structure (head, middle sample, tail)
+        - Maintains overall context and flow
+
+        Args:
+            text_array: Original text array from get_text()
+            max_tokens: Maximum token budget (default: 3000)
+            strategy: Truncation strategy (currently supports "head_tail_sample")
+
+        Returns:
+            Dict containing:
+            - summary: Overview of the truncation
+            - text_content: Sampled text segments
+            - coverage: Coverage ratio (selected/total)
+            - estimated_tokens: Estimated token count
+        """
+        if not text_array:
+            return {
+                "summary": "No text content found",
+                "text_content": [],
+                "coverage": "0/0 (0%)",
+                "estimated_tokens": 0,
+                "strategy_used": strategy
+            }
+
+        total_items = len(text_array)
+        # Conservative estimate: 1 token ≈ 2 chars (mixed Chinese/English)
+        char_budget = max_tokens * 2
+
+        if strategy == "head_tail_sample":
+            result_parts = []
+            current_chars = 0
+
+            # Keep head 30% (navigation, titles, important info)
+            keep_head = int(total_items * 0.3)
+            for item in text_array[:keep_head]:
+                if current_chars + len(item) > char_budget * 0.5:
+                    break
+                result_parts.append(item)
+                current_chars += len(item)
+
+            # Middle sampling (max 20 samples to maintain page flow)
+            middle_start = keep_head
+            middle_end = max(keep_head, total_items - int(total_items * 0.1))
+            middle_section = text_array[middle_start:middle_end]
+
+            if middle_section:
+                sample_rate = max(1, len(middle_section) // 20)
+                for item in middle_section[::sample_rate]:
+                    if current_chars + len(item) > char_budget * 0.8:
+                        break
+                    result_parts.append(item)
+                    current_chars += len(item)
+
+            # Keep tail 10% (footer, contact, legal info)
+            keep_tail = int(total_items * 0.1)
+            for item in text_array[-keep_tail:] if keep_tail > 0 else []:
+                if current_chars + len(item) > char_budget:
+                    break
+                result_parts.append(item)
+                current_chars += len(item)
+
+            return {
+                "summary": f"Intelligently sampled {len(result_parts)} from {total_items} text segments",
+                "text_content": result_parts,
+                "coverage": f"{len(result_parts)}/{total_items} ({len(result_parts)/total_items*100:.1f}%)",
+                "estimated_tokens": current_chars // 2,
+                "strategy_used": strategy
+            }
+
+        else:
+            # Fallback: simple truncation
+            result = []
+            chars = 0
+            for item in text_array:
+                if chars + len(item) > char_budget:
+                    break
+                result.append(item)
+                chars += len(item)
+
+            return {
+                "summary": f"Simple truncation: {len(result)}/{total_items} items",
+                "text_content": result,
+                "coverage": f"{len(result)}/{total_items} ({len(result)/total_items*100:.1f}%)",
+                "estimated_tokens": chars // 2,
+                "strategy_used": "simple_truncate"
+            }
+
     # ------------------------------------------------------------------------
     # VISUAL DEBUGGING METHODS
     # ------------------------------------------------------------------------
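
A usage sketch for the new helper, assuming DeepCrawler is the class hosting this staticmethod and substituting a synthetic list for real get_text() output.

# Synthetic stand-in for the text array normally produced by get_text()
page_text = [f"Paragraph {i}: " + "lorem ipsum dolor sit amet " * 10 for i in range(400)]

truncated = DeepCrawler.smart_truncate_page_text(page_text, max_tokens=3000)
print(truncated["summary"])            # e.g. "Intelligently sampled N from 400 text segments"
print(truncated["coverage"])           # selected/total ratio with percentage
print(truncated["estimated_tokens"])   # chars // 2, never above the 3000-token budget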

webqa_agent/llm/llm_api.py

Lines changed: 15 additions & 6 deletions
@@ -9,6 +9,7 @@ def __init__(self, llm_config) -> None:
         self.llm_config = llm_config
         self.api_type = self.llm_config.get("api")
         self.model = self.llm_config.get("model")
+        self.filter_model = self.llm_config.get("filter_model", self.model)  # For two-stage architecture
         self.client = None
         self._client = None  # httpx client
 
@@ -32,8 +33,10 @@ async def _get_client(self):
             self._client = httpx.AsyncClient(timeout=60.0)
         return self._client
 
-    async def get_llm_response(self, system_prompt, prompt, images=None, temperature=None, top_p=None):
-        model_input = {"model": self.model, "api_type": self.api_type}
+    async def get_llm_response(self, system_prompt, prompt, images=None, temperature=None, top_p=None, max_tokens=None, model_override=None):
+        # Allow temporary model override for two-stage architecture (e.g., lightweight model for filtering)
+        actual_model = model_override or self.model
+        model_input = {"model": actual_model, "api_type": self.api_type}
         if self.api_type == "openai" and self.client is None:
             await self.initialize()
 
@@ -50,8 +53,9 @@ async def get_llm_response(self, system_prompt, prompt, images=None, temperature
                 temperature if temperature is not None else self.llm_config.get("temperature", 0.1)
             )
             resolved_top_p = top_p if top_p is not None else self.llm_config.get("top_p", None)
-            logging.debug(f"Resolved temperature: {resolved_temperature}, top_p: {resolved_top_p}")
-            result = await self._call_openai(messages, resolved_temperature, resolved_top_p)
+            resolved_max_tokens = max_tokens if max_tokens is not None else self.llm_config.get("max_tokens", None)
+            logging.debug(f"Resolved temperature: {resolved_temperature}, top_p: {resolved_top_p}, max_tokens: {resolved_max_tokens}")
+            result = await self._call_openai(messages, resolved_temperature, resolved_top_p, resolved_max_tokens, actual_model)
 
             return result
         except Exception as e:
@@ -84,10 +88,13 @@ def _handle_images_openai(self, messages, images):
             logging.error(f"Error while handling images for OpenAI: {e}")
             raise ValueError(f"Failed to process images for OpenAI. Error: {e}")
 
-    async def _call_openai(self, messages, temperature=None, top_p=None):
+    async def _call_openai(self, messages, temperature=None, top_p=None, max_tokens=None, model=None):
         try:
+            # Use provided model or fallback to config model
+            actual_model = model or self.llm_config.get("model")
+
             create_kwargs = {
-                "model": self.llm_config.get("model"),
+                "model": actual_model,
                 "messages": messages,
                 "timeout": 60,
             }
@@ -96,6 +103,8 @@ async def _call_openai(self, messages, temperature=None, top_p=None):
                 create_kwargs["temperature"] = temperature
             if top_p is not None:
                 create_kwargs["top_p"] = top_p
+            if max_tokens is not None:
+                create_kwargs["max_tokens"] = max_tokens
 
             completion = await self.client.chat.completions.create(**create_kwargs)
             content = completion.choices[0].message.content
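
A hedged sketch of the two-stage split these parameters enable: Stage 1 element filtering runs on the lightweight filter_model via model_override, Stage 2 planning runs on the primary model. The LLMAPI class name and both prompts are assumptions, not taken from this diff.

async def filter_then_plan(llm_config, elements_json: str) -> str:
    llm = LLMAPI(llm_config)  # class name assumed; the OpenAI client initializes lazily on first call
    # Stage 1: cheap element filtering on filter_model, with a tight output cap
    filtered = await llm.get_llm_response(
        system_prompt="Filter the interactive elements relevant to testing.",
        prompt=elements_json,
        model_override=llm.filter_model,
        max_tokens=2048,
    )
    # Stage 2: test planning on the primary model; config-level max_tokens (if set) applies
    return await llm.get_llm_response(
        system_prompt="Plan test cases for the filtered elements and page text.",
        prompt=filtered,
    )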
