
Commit c704a56

feat: 1. support max_tokens and filter_model parameters; 2. support LLM-based filtering of interactive elements and text; 3. support dynamic planning of recovery steps
1 parent 3084ddb commit c704a56

File tree

9 files changed, +723 -61 lines changed

config/config.yaml.example

Lines changed: 3 additions & 1 deletion
@@ -20,11 +20,13 @@ test_config: # Test configuration
     enabled: False
 
 llm_config: # LLM configuration, currently only supports OpenAI SDK compatible format
-  model: gpt-4.1-2025-04-14 # Recommended
+  model: gpt-4.1-2025-04-14 # Primary model for Stage 2 test planning (Recommended)
+  filter_model: gpt-4o-mini # Lightweight model for Stage 1 element filtering (cost-effective)
   api_key: your_api_key
   base_url: https://api.example.com/v1
   temperature: 0.1 # Optional, default 0.1
   # top_p: 0.9 # Optional, if not set, this parameter will not be passed
+  # max_tokens: 8192 # Optional, maximum output tokens (supports generating more test cases)
 
 browser_config:
   viewport: {"width": 1280, "height": 720}

webqa-agent.py

Lines changed: 9 additions & 0 deletions
@@ -149,9 +149,11 @@ def validate_and_build_llm_config(cfg):
     api_key = os.getenv("OPENAI_API_KEY") or llm_cfg_raw.get("api_key", "")
     base_url = os.getenv("OPENAI_BASE_URL") or llm_cfg_raw.get("base_url", "")
     model = llm_cfg_raw.get("model", "gpt-4o-mini")
+    filter_model = llm_cfg_raw.get("filter_model", model)  # For two-stage architecture, defaults to primary model
     # Sampling configuration: default temperature is 0.1; top_p not set by default
     temperature = llm_cfg_raw.get("temperature", 0.1)
     top_p = llm_cfg_raw.get("top_p")
+    max_tokens = llm_cfg_raw.get("max_tokens")  # Optional: maximum output tokens
 
     # Validate required fields
     if not api_key:
@@ -168,12 +170,15 @@ def validate_and_build_llm_config(cfg):
     llm_config = {
         "api": "openai",
         "model": model,
+        "filter_model": filter_model,
         "api_key": api_key,
         "base_url": base_url,
         "temperature": temperature,
     }
     if top_p is not None:
         llm_config["top_p"] = top_p
+    if max_tokens is not None:
+        llm_config["max_tokens"] = max_tokens
 
     # Show configuration source (hide sensitive information)
     api_key_masked = f"{api_key[:8]}...{api_key[-4:]}" if len(api_key) > 12 else "***"
@@ -184,9 +189,13 @@ def validate_and_build_llm_config(cfg):
     print(f" - API Key: {api_key_masked} ({'Environment variable' if env_api_key else 'Config file'})")
     print(f" - Base URL: {base_url} ({'Environment variable' if env_base_url else 'Config file/Default'})")
     print(f" - Model: {model}")
+    if filter_model != model:
+        print(f" - Filter Model: {filter_model} (for two-stage architecture)")
     print(f" - Temperature: {temperature}")
     if top_p is not None:
         print(f" - Top_p: {top_p}")
+    if max_tokens is not None:
+        print(f" - Max Tokens: {max_tokens}")
 
     return llm_config
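
A hedged usage sketch of the precedence implemented above: environment variables win over config-file values for api_key, filter_model falls back to the primary model, and optional keys (top_p, max_tokens) are only added when set. It assumes the function reads the nested llm_config section as in config.yaml.example; the inline cfg dict and the environment value are illustrative only.

import os

cfg = {
    "llm_config": {
        "model": "gpt-4.1-2025-04-14",
        "filter_model": "gpt-4o-mini",
        "api_key": "your_api_key",
        "base_url": "https://api.example.com/v1",
        "temperature": 0.1,
    }
}
os.environ["OPENAI_API_KEY"] = "sk-from-env"        # hypothetical key
llm_config = validate_and_build_llm_config(cfg)
assert llm_config["api_key"] == "sk-from-env"       # env var wins over config file
assert llm_config["filter_model"] == "gpt-4o-mini"  # explicit filter_model is kept
assert "max_tokens" not in llm_config               # optional key omitted when unset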

webqa_agent/actions/action_executor.py

Lines changed: 14 additions & 2 deletions
@@ -1,4 +1,5 @@
 import asyncio
+import inspect
 import logging
 from typing import Dict, List, Optional
 
@@ -42,9 +43,20 @@ async def execute(self, action):
             logging.error(f"Unknown action type: {action_type}")
             return False
 
-            # Execute the action
+            # Execute the action with introspection to handle different method signatures
             logging.debug(f"Executing action: {action_type}")
-            return await execute_func(action)
+
+            # Use introspection to check if method accepts action parameter
+            sig = inspect.signature(execute_func)
+            params = list(sig.parameters.keys())
+
+            # If method only has 'self' parameter (no additional params), call without action
+            # Note: bound methods don't show 'self' in signature, so empty params means no action param
+            if len(params) == 0:
+                logging.debug(f"Calling {action_type} without action parameter (method signature has no parameters)")
+                return await execute_func()
+            else:
+                return await execute_func(action)
 
         except Exception as e:
             logging.error(f"Action execution failed: {str(e)}")

webqa_agent/actions/action_handler.py

Lines changed: 2 additions & 2 deletions
@@ -97,7 +97,7 @@ async def go_to_page(self, page: Page, url: str, cookies=None):
         await self.page.goto(url=url, wait_until='domcontentloaded')
         await self.page.wait_for_load_state('networkidle', timeout=60000)
 
-    async def smart_navigate_to_page(self, page: Page, url: str, cookies=None) -> bool:
+    async def smart_navigate_to_page(self, page: Page, url: str, cookies=None) -> bool | None:
         """Smart navigation to target page, avoiding redundant navigation.
 
         Args:
@@ -141,7 +141,7 @@ def normalize_url(u):
 
         if current_normalized == target_normalized:
             logging.debug('Already on target page (normalized match), skipping navigation')
-            return False
+            return None  # Return None to indicate "already on page, no navigation needed"
 
         # More flexible URL matching: if domain is same and path is similar, also consider as match
         def extract_domain(u):
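
A hedged caller-side sketch of the new bool | None contract: only the None case (already on the target page) appears in this diff, so the True/False interpretations below are inferred; handler, page, and url are assumed to be in scope.

import logging

async def ensure_on_target_page(handler, page, url):
    result = await handler.smart_navigate_to_page(page, url)
    if result is None:
        logging.debug("Already on target page; no navigation performed")
    elif result:
        logging.debug("Navigated to target page")
    else:
        logging.warning("Navigation did not succeed; a recovery step may be needed")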

webqa_agent/crawler/deep_crawler.py

Lines changed: 98 additions & 0 deletions
@@ -539,6 +539,104 @@ def dump_json(node: Dict[str, Any], path: Path) -> None:
         with open(path, 'w', encoding='utf-8') as f:
             json.dump(node, f, ensure_ascii=False, indent=2)
 
+    @staticmethod
+    def smart_truncate_page_text(
+        text_array: List[str],
+        max_tokens: int = 3000,
+        strategy: str = "head_tail_sample"
+    ) -> Dict[str, Any]:
+        """
+        Intelligently truncate page text while preserving semantic completeness.
+
+        Based on 2024 research on semantic chunking and context preservation:
+        - Avoids "lost-in-the-middle" problem
+        - Preserves page structure (head, middle sample, tail)
+        - Maintains overall context and flow
+
+        Args:
+            text_array: Original text array from get_text()
+            max_tokens: Maximum token budget (default: 3000)
+            strategy: Truncation strategy (currently supports "head_tail_sample")
+
+        Returns:
+            Dict containing:
+            - summary: Overview of the truncation
+            - text_content: Sampled text segments
+            - coverage: Coverage ratio (selected/total)
+            - estimated_tokens: Estimated token count
+        """
+        if not text_array:
+            return {
+                "summary": "No text content found",
+                "text_content": [],
+                "coverage": "0/0 (0%)",
+                "estimated_tokens": 0,
+                "strategy_used": strategy
+            }
+
+        total_items = len(text_array)
+        # Conservative estimate: 1 token ≈ 2 chars (mixed Chinese/English)
+        char_budget = max_tokens * 2
+
+        if strategy == "head_tail_sample":
+            result_parts = []
+            current_chars = 0
+
+            # Keep head 30% (navigation, titles, important info)
+            keep_head = int(total_items * 0.3)
+            for item in text_array[:keep_head]:
+                if current_chars + len(item) > char_budget * 0.5:
+                    break
+                result_parts.append(item)
+                current_chars += len(item)
+
+            # Middle sampling (max 20 samples to maintain page flow)
+            middle_start = keep_head
+            middle_end = max(keep_head, total_items - int(total_items * 0.1))
+            middle_section = text_array[middle_start:middle_end]
+
+            if middle_section:
+                sample_rate = max(1, len(middle_section) // 20)
+                for item in middle_section[::sample_rate]:
+                    if current_chars + len(item) > char_budget * 0.8:
+                        break
+                    result_parts.append(item)
+                    current_chars += len(item)
+
+            # Keep tail 10% (footer, contact, legal info)
+            keep_tail = int(total_items * 0.1)
+            for item in text_array[-keep_tail:] if keep_tail > 0 else []:
+                if current_chars + len(item) > char_budget:
+                    break
+                result_parts.append(item)
+                current_chars += len(item)
+
+            return {
+                "summary": f"Intelligently sampled {len(result_parts)} from {total_items} text segments",
+                "text_content": result_parts,
+                "coverage": f"{len(result_parts)}/{total_items} ({len(result_parts)/total_items*100:.1f}%)",
+                "estimated_tokens": current_chars // 2,
+                "strategy_used": strategy
+            }
+
+        else:
+            # Fallback: simple truncation
+            result = []
+            chars = 0
+            for item in text_array:
+                if chars + len(item) > char_budget:
+                    break
+                result.append(item)
+                chars += len(item)
+
+            return {
+                "summary": f"Simple truncation: {len(result)}/{total_items} items",
+                "text_content": result,
+                "coverage": f"{len(result)}/{total_items} ({len(result)/total_items*100:.1f}%)",
+                "estimated_tokens": chars // 2,
+                "strategy_used": "simple_truncate"
+            }
+
     # ------------------------------------------------------------------------
     # VISUAL DEBUGGING METHODS
     # ------------------------------------------------------------------------
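
A usage sketch for the new helper, assuming DeepCrawler is the class hosting this staticmethod and substituting a synthetic list for real get_text() output.

# Synthetic stand-in for the text array normally produced by get_text()
page_text = [f"Paragraph {i}: " + "lorem ipsum dolor sit amet " * 10 for i in range(400)]

truncated = DeepCrawler.smart_truncate_page_text(page_text, max_tokens=3000)
print(truncated["summary"])            # e.g. "Intelligently sampled N from 400 text segments"
print(truncated["coverage"])           # selected/total ratio with percentage
print(truncated["estimated_tokens"])   # chars // 2, never above the 3000-token budget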

webqa_agent/llm/llm_api.py

Lines changed: 15 additions & 6 deletions
@@ -9,6 +9,7 @@ def __init__(self, llm_config) -> None:
         self.llm_config = llm_config
         self.api_type = self.llm_config.get("api")
         self.model = self.llm_config.get("model")
+        self.filter_model = self.llm_config.get("filter_model", self.model)  # For two-stage architecture
         self.client = None
         self._client = None  # httpx client
 
@@ -32,8 +33,10 @@ async def _get_client(self):
             self._client = httpx.AsyncClient(timeout=60.0)
         return self._client
 
-    async def get_llm_response(self, system_prompt, prompt, images=None, temperature=None, top_p=None):
-        model_input = {"model": self.model, "api_type": self.api_type}
+    async def get_llm_response(self, system_prompt, prompt, images=None, temperature=None, top_p=None, max_tokens=None, model_override=None):
+        # Allow temporary model override for two-stage architecture (e.g., lightweight model for filtering)
+        actual_model = model_override or self.model
+        model_input = {"model": actual_model, "api_type": self.api_type}
         if self.api_type == "openai" and self.client is None:
             await self.initialize()
 
@@ -50,8 +53,9 @@ async def get_llm_response(self, system_prompt, prompt, images=None, temperature
                 temperature if temperature is not None else self.llm_config.get("temperature", 0.1)
             )
             resolved_top_p = top_p if top_p is not None else self.llm_config.get("top_p", None)
-            logging.debug(f"Resolved temperature: {resolved_temperature}, top_p: {resolved_top_p}")
-            result = await self._call_openai(messages, resolved_temperature, resolved_top_p)
+            resolved_max_tokens = max_tokens if max_tokens is not None else self.llm_config.get("max_tokens", None)
+            logging.debug(f"Resolved temperature: {resolved_temperature}, top_p: {resolved_top_p}, max_tokens: {resolved_max_tokens}")
+            result = await self._call_openai(messages, resolved_temperature, resolved_top_p, resolved_max_tokens, actual_model)
 
             return result
         except Exception as e:
@@ -84,10 +88,13 @@ def _handle_images_openai(self, messages, images):
             logging.error(f"Error while handling images for OpenAI: {e}")
             raise ValueError(f"Failed to process images for OpenAI. Error: {e}")
 
-    async def _call_openai(self, messages, temperature=None, top_p=None):
+    async def _call_openai(self, messages, temperature=None, top_p=None, max_tokens=None, model=None):
         try:
+            # Use provided model or fallback to config model
+            actual_model = model or self.llm_config.get("model")
+
             create_kwargs = {
-                "model": self.llm_config.get("model"),
+                "model": actual_model,
                 "messages": messages,
                 "timeout": 60,
             }
@@ -96,6 +103,8 @@ async def _call_openai(self, messages, temperature=None, top_p=None):
                 create_kwargs["temperature"] = temperature
             if top_p is not None:
                 create_kwargs["top_p"] = top_p
+            if max_tokens is not None:
+                create_kwargs["max_tokens"] = max_tokens
 
             completion = await self.client.chat.completions.create(**create_kwargs)
             content = completion.choices[0].message.content
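
A hedged sketch of the two-stage split these parameters enable: Stage 1 element filtering runs on the lightweight filter_model via model_override, Stage 2 planning runs on the primary model. The LLMAPI class name and both prompts are assumptions, not taken from this diff.

async def filter_then_plan(llm_config, elements_json: str) -> str:
    llm = LLMAPI(llm_config)  # class name assumed; the OpenAI client initializes lazily on first call
    # Stage 1: cheap element filtering on filter_model, with a tight output cap
    filtered = await llm.get_llm_response(
        system_prompt="Filter the interactive elements relevant to testing.",
        prompt=elements_json,
        model_override=llm.filter_model,
        max_tokens=2048,
    )
    # Stage 2: test planning on the primary model; config-level max_tokens (if set) applies
    return await llm.get_llm_response(
        system_prompt="Plan test cases for the filtered elements and page text.",
        prompt=filtered,
    )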
