diff --git a/code/chapter16/README.md b/code/chapter16/README.md
new file mode 100644
index 0000000..3d8ef40
--- /dev/null
+++ b/code/chapter16/README.md
@@ -0,0 +1,81 @@
+# Universal Hello-Agents (Qwen) — Complete Teaching Edition
+
+This is a minimal runnable project built on the **Hello-Agents** framework, using a **single agent + multiple tools** design.
+The agent (UniversalAgent) registers and calls two tools through the ToolRegistry (a minimal interface sketch follows the list):
+- `browser_search` (smart web search with multiple engines and content extraction)
+- `terminal_exec` (restricted terminal command execution with a whitelist policy)
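+
+## Tool Interface at a Glance
+
+Both tools expose the same minimal interface: a `name`, a `description`, a `get_parameters()` schema, and a `run(parameters)` method. The sketch below is illustrative only (the plain-dict registry stands in for the framework's ToolRegistry and is not the real Hello-Agents API); it shows how a single agent can dispatch to whichever registered tool the LLM selects:
+
+```python
+# Minimal dispatch sketch (illustrative, not the actual framework code)
+from tools.browser_tool import BrowserTool
+
+registry = {}                  # stand-in for the framework's ToolRegistry
+for tool in (BrowserTool(),):  # terminal_exec would be registered the same way
+    registry[tool.name] = tool
+
+# The agent extracts a tool name and arguments from the LLM's output,
+# then dispatches:
+print(registry["browser_search"].run({"input": "LangChain framework"}))
+```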
+
+## 🌐 Browser Search Tool Features
+
+### Multi-Engine Support
+- **DuckDuckGo**: stable HTML-endpoint scraping
+- **Startpage**: privacy-focused front end for Google results
+- **Qwant**: independent European search engine
+- **Searx.xyz**: open-source metasearch engine
+
+### Smart Features
+- **Fast responses**: a uniform 8-second timeout per engine avoids long waits
+- **Silent failover**: failed engines are skipped quickly for a smoother experience
+- **Graceful degradation**: a search-suggestion fallback guarantees the tool always returns something
+- **Result quality checks**: multi-layer filtering keeps results relevant
+- **Smart content extraction**: a 5-level selector strategy pulls out each page's main content
+
+## Directory Layout
+```
+hello_agent_demo/
+├── tools/
+│   ├── browser_tool.py
+│   ├── terminal_tool.py
+├── agent_universal.py
+├── main.py
+├── config.py           # configuration file (tool settings)
+├── config.example.py   # configuration template
+├── .env                # environment variables (LLM settings)
+├── requirements.txt
+├── CONFIG_GUIDE.md     # configuration guide
+└── next_steps.md
+```
+
+## Quick Start
+1. Download and unpack this package.
+2. Edit the `.env` file and replace `LLM_API_KEY` with your real API key (never share the key with others).
+3. **Configure tool parameters** (optional): edit `config.py` to adjust tool behavior
+   - `TERMINAL_SECURITY_MODE`: terminal tool security mode ("strict" or "warning")
+   - `BROWSER_SEARCH_LIMIT`: number of search results
+4. Use a virtual environment and install the dependencies:
+   ```bash
+   python -m venv venv
+   source venv/bin/activate  # Windows: venv\Scripts\activate
+   pip install -r requirements.txt
+   ```
+5. Run:
+   ```bash
+   python main.py
+   ```
+6. Enter a task at the interactive prompt, for example:
+   - `search for the LangChain framework`
+   - `execute pwd`
+
+## Configuration
+
+The project manages tool settings centrally in `config.py`. The main options:
+
+### Terminal Tool Security Mode
+```python
+# config.py
+TERMINAL_SECURITY_MODE = "strict"  # or "warning"
+```
+- **strict**: dangerous commands are rejected outright (recommended for production)
+- **warning**: dangerous commands only trigger a warning (convenient during development)
+
+See [CONFIG_GUIDE.md](./CONFIG_GUIDE.md) for details.
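+
+### How the Whitelist Might Look
+
+`terminal_tool.py` is not included in this diff, so the snippet below is only a sketch of the strict/warning behavior described above; the command set and helper name are made up for illustration:
+
+```python
+# Hypothetical sketch of the whitelist check (names are illustrative)
+ALLOWED_COMMANDS = {"pwd", "ls", "echo", "date"}
+
+def check_command(cmd: str, mode: str = "strict") -> bool:
+    program = cmd.strip().split()[0] if cmd.strip() else ""
+    if program in ALLOWED_COMMANDS:
+        return True
+    if mode == "warning":
+        print(f"⚠️ '{program}' is not whitelisted; running anyway (warning mode)")
+        return True
+    return False  # strict mode: refuse to execute
+```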
AppleWebKit/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" + } + + response = requests.get( + instance['url'], + params=params, + headers=headers, + timeout=instance['timeout'] + ) + + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + return self._extract_duckduckgo_results_from_soup(soup, limit) + + return None + + def _try_startpage_instance(self, instance, query, limit): + """尝试Startpage实例""" + params = { + 'query': query, + 'cat': 'web', + 'pl': 'ext-ff', + 'extVersion': '1.3.0' + } + + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" + } + + response = requests.get( + instance['url'], + params=params, + headers=headers, + timeout=instance['timeout'] + ) + + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + return self._extract_startpage_results(soup, limit) + + return None + + def _try_qwant_instance(self, instance, query, limit): + """尝试Qwant实例""" + params = { + 'q': query, + 't': 'web', + 'locale': 'zh_CN' + } + + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" + } + + response = requests.get( + instance['url'], + params=params, + headers=headers, + timeout=instance['timeout'] + ) + + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + return self._extract_qwant_results(soup, limit) + + return None + + def _extract_duckduckgo_results_from_soup(self, soup, limit): + """从DuckDuckGo HTML中提取结果""" + results = [] + + # 查找搜索结果 + result_divs = soup.find_all('div', class_='result') + + for div in result_divs[:limit]: + title_elem = div.find('a', class_='result__a') + snippet_elem = div.find('a', class_='result__snippet') + + if title_elem: + title = self._clean_text(title_elem.get_text()) + url = title_elem.get('href', '') + snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else '' + + if self._is_valid_result(title, url): + results.append({ + 'title': title, + 'url': url, + 'snippet': snippet[:200], + 'source': 'DuckDuckGo' + }) + + return results + + def _extract_startpage_results(self, soup, limit): + """从Startpage HTML中提取结果""" + results = [] + + # 查找搜索结果 + result_divs = soup.find_all('div', class_='w-gl__result') + + for div in result_divs[:limit]: + title_elem = div.find('h3') + link_elem = title_elem.find('a') if title_elem else None + snippet_elem = div.find('p', class_='w-gl__description') + + if link_elem: + title = self._clean_text(link_elem.get_text()) + url = link_elem.get('href', '') + snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else '' + + if self._is_valid_result(title, url): + results.append({ + 'title': title, + 'url': url, + 'snippet': snippet[:200], + 'source': 'Startpage' + }) + + return results + + def _extract_qwant_results(self, soup, limit): + """从Qwant HTML中提取结果""" + results = [] + + # 查找搜索结果 + result_divs = soup.find_all('div', class_='result') + + for div in result_divs[:limit]: + title_elem = div.find('a', class_='result--web') + snippet_elem = div.find('p', class_='result__desc') + + if title_elem: + title = self._clean_text(title_elem.get_text()) + url = title_elem.get('href', '') + snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else '' + + if self._is_valid_result(title, url): + results.append({ + 
diff --git a/code/chapter16/text/test_browser.py b/code/chapter16/text/test_browser.py
new file mode 100644
index 0000000..af7ec68
--- /dev/null
+++ b/code/chapter16/text/test_browser.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+"""
+Standalone test for browser_tool
+"""
+import sys
+import os
+# The tools/ package lives one directory above this test file,
+# so add the parent directory to the import path.
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from tools.browser_tool import BrowserTool
+
+def test_browser_tool():
+    print("🔍 Testing BrowserTool standalone")
+
+    # Create a tool instance
+    browser = BrowserTool()
+
+    print(f"Tool name: {browser.name}")
+    print(f"Tool description: {browser.description}")
+    print(f"Parameter schema: {browser.get_parameters()}")
+
+    # Run a test search
+    test_query = "best food in Changsha"
+    print(f"\n🧪 Test search: {test_query}")
+
+    try:
+        # Call the tool directly
+        result = browser.run({"input": test_query})
+        print("✅ Tool call succeeded")
+        print(f"Result length: {len(result)} characters")
+        print(f"Result preview: {result[:500]}...")
+
+        return result
+
+    except Exception as e:
+        print(f"❌ Tool call failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+if __name__ == "__main__":
+    test_browser_tool()
diff --git a/code/chapter16/tools/browser_tool.py b/code/chapter16/tools/browser_tool.py
new file mode 100644
index 0000000..abe0014
--- /dev/null
+++ b/code/chapter16/tools/browser_tool.py
@@ -0,0 +1,708 @@
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import quote_plus
+import time
+import re
+
+class BrowserTool:
+    name = "browser_search"
+    description = "Perform a web search (multiple engines and content extraction supported)"
+
+    def __init__(self):
+        # Remember the last query so fallback suggestions can reuse it
+        self.last_query = ""
+
+    def get_parameters(self):
+        return {
+            "input": {"type": "str", "description": "search keywords", "required": True}
+        }
+
+    def _is_valid_result(self, title, url):
+        """Validate a search result."""
+        if not title or len(title.strip()) < 3:
+            return False
+
+        # Filter navigation links and other meaningless entries
+        skip_keywords = [
+            "next", "previous", "more", "about", "help", "settings",
+            "privacy", "terms", "feedback", "donate", "install",
+            "download", "login", "register", "sign in", "sign up"
+        ]
+
+        title_lower = title.lower()
+        if any(keyword in title_lower for keyword in skip_keywords):
+            return False
+
+        # Filter ads and promoted links (Chinese keywords kept for Chinese pages)
+        ad_indicators = ["ad", "sponsored", "promotion", "广告", "推广"]
+        if any(indicator in title_lower for indicator in ad_indicators):
+            return False
+
+        return True
+
+    def _clean_text(self, text):
+        """Clean up text content."""
+        if not text:
+            return ""
+
+        # Collapse runs of whitespace
+        text = re.sub(r'\s+', ' ', text.strip())
+
+        # Strip special characters (CJK characters and common punctuation are kept)
+        text = re.sub(r'[^\w\s\u4e00-\u9fff.,!?;:()[\]{}"\'-]', '', text)
+
+        return text[:200]  # limit length
+
+    def _search_searx(self, query, limit=5):
+        """Try several search-engine instances in turn (stable set)."""
+        # Four engines for which this module has extraction logic below; the
+        # scrapers depend on each site's current markup and may need updating.
+        search_instances = [
+            {
+                "name": "DuckDuckGo",
+                "url": "https://duckduckgo.com/html/",
+                "timeout": 8,
+                "type": "duckduckgo"
+            },
+            {
+                "name": "Startpage",
+                "url": "https://www.startpage.com/do/search",
+                "timeout": 8,
+                "type": "startpage"
+            },
+            {
+                "name": "Qwant",
+                "url": "https://www.qwant.com/",
+                "timeout": 8,
+                "type": "qwant"
+            },
+            {
+                "name": "Searx.xyz",
+                "url": "https://searx.xyz/search",
+                "timeout": 8,
+                "type": "searx"
+            }
+        ]
+
+        for instance in search_instances:
+            try:
+                print(f"🔍 {instance['name']}...")
+                result = self._try_search_instance(instance, query, limit)
+                if result:
+                    print(f"✅ Found {len(result)} results")
+                    return result, True
+
+            except Exception:
+                continue  # fail silently and move on to the next engine
+
+        # Degrade quickly to search suggestions
+        print("🔗 Falling back to search suggestions")
+        return self._get_search_suggestions(query), True
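+
+    # Each search helper below returns a list of result dicts shaped like the
+    # following (values are illustrative, not real output):
+    #   {'title': 'Example page', 'url': 'https://example.com',
+    #    'snippet': 'short text preview', 'source': 'DuckDuckGo'}
+    # run() formats these dicts for display, and _enhance_search_results() may
+    # add an 'enhanced' flag plus a fetched content preview.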
+
+    def _try_search_instance(self, instance, query, limit):
+        """Dispatch to the right handler for a single search-engine instance."""
+        if instance['type'] == 'searx':
+            return self._try_searx_instance(instance, query, limit)
+        elif instance['type'] == 'duckduckgo':
+            return self._try_duckduckgo_instance(instance, query, limit)
+        elif instance['type'] == 'startpage':
+            return self._try_startpage_instance(instance, query, limit)
+        elif instance['type'] == 'qwant':
+            return self._try_qwant_instance(instance, query, limit)
+        else:
+            return None
+
+    def _try_searx_instance(self, instance, query, limit):
+        """Query a Searx instance (JSON API)."""
+        params = {
+            'q': query,
+            'format': 'json',
+            'engines': 'google,bing,duckduckgo',
+            'language': 'zh-CN'
+        }
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+            "Accept": "application/json, text/plain, */*"
+        }
+
+        response = requests.get(
+            instance['url'],
+            params=params,
+            headers=headers,
+            timeout=instance['timeout']
+        )
+
+        if response.status_code == 200:
+            try:
+                data = response.json()
+                results = []
+
+                for item in data.get('results', [])[:limit]:
+                    title = self._clean_text(item.get('title', ''))
+                    url = item.get('url', '')
+                    content = item.get('content', '')
+
+                    if self._is_valid_result(title, url):
+                        results.append({
+                            'title': title,
+                            'url': url,
+                            'snippet': self._clean_text(content)[:200],
+                            'source': f"{instance['name']}/{item.get('engine', 'unknown')}"
+                        })
+
+                return results
+            except ValueError:
+                # Response body was not valid JSON
+                return None
+
+        return None
+
+    def _try_duckduckgo_instance(self, instance, query, limit):
+        """Query DuckDuckGo's HTML endpoint."""
+        params = {
+            'q': query,
+            'kl': 'cn-zh'
+        }
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+        }
+
+        response = requests.get(
+            instance['url'],
+            params=params,
+            headers=headers,
+            timeout=instance['timeout']
+        )
+
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.text, 'html.parser')
+            return self._extract_duckduckgo_results_from_soup(soup, limit)
+
+        return None
+
+    def _try_startpage_instance(self, instance, query, limit):
+        """Query a Startpage instance."""
+        params = {
+            'query': query,
+            'cat': 'web',
+            'pl': 'ext-ff',
+            'extVersion': '1.3.0'
+        }
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+        }
+
+        response = requests.get(
+            instance['url'],
+            params=params,
+            headers=headers,
+            timeout=instance['timeout']
+        )
+
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.text, 'html.parser')
+            return self._extract_startpage_results(soup, limit)
+
+        return None
+
+    def _try_qwant_instance(self, instance, query, limit):
+        """Query a Qwant instance (the main site is JavaScript-heavy, so this
+        scraper may return nothing; the silent failover handles that case)."""
+        params = {
+            'q': query,
+            't': 'web',
+            'locale': 'zh_CN'
+        }
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+        }
+
+        response = requests.get(
+            instance['url'],
+            params=params,
+            headers=headers,
+            timeout=instance['timeout']
+        )
+
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.text, 'html.parser')
+            return self._extract_qwant_results(soup, limit)
+
+        return None
+
+    def _extract_duckduckgo_results_from_soup(self, soup, limit):
+        """Extract results from DuckDuckGo's HTML."""
+        results = []
+
+        # Find the search result blocks
+        result_divs = soup.find_all('div', class_='result')
+
+        for div in result_divs[:limit]:
+            title_elem = div.find('a', class_='result__a')
+            snippet_elem = div.find('a', class_='result__snippet')
+
+            if title_elem:
+                title = self._clean_text(title_elem.get_text())
+                url = title_elem.get('href', '')
+                snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else ''
+
+                if self._is_valid_result(title, url):
+                    results.append({
+                        'title': title,
+                        'url': url,
+                        'snippet': snippet[:200],
+                        'source': 'DuckDuckGo'
+                    })
+
+        return results
+
+    def _extract_startpage_results(self, soup, limit):
+        """Extract results from Startpage's HTML."""
+        results = []
+
+        # Find the search result blocks
+        result_divs = soup.find_all('div', class_='w-gl__result')
+
+        for div in result_divs[:limit]:
+            title_elem = div.find('h3')
+            link_elem = title_elem.find('a') if title_elem else None
+            snippet_elem = div.find('p', class_='w-gl__description')
+
+            if link_elem:
+                title = self._clean_text(link_elem.get_text())
+                url = link_elem.get('href', '')
+                snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else ''
+
+                if self._is_valid_result(title, url):
+                    results.append({
+                        'title': title,
+                        'url': url,
+                        'snippet': snippet[:200],
+                        'source': 'Startpage'
+                    })
+
+        return results
+
+    def _extract_qwant_results(self, soup, limit):
+        """Extract results from Qwant's HTML."""
+        results = []
+
+        # Find the search result blocks
+        result_divs = soup.find_all('div', class_='result')
+
+        for div in result_divs[:limit]:
+            title_elem = div.find('a', class_='result--web')
+            snippet_elem = div.find('p', class_='result__desc')
+
+            if title_elem:
+                title = self._clean_text(title_elem.get_text())
+                url = title_elem.get('href', '')
+                snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else ''
+
+                if self._is_valid_result(title, url):
+                    results.append({
+                        'title': title,
+                        'url': url,
+                        'snippet': snippet[:200],
+                        'source': 'Qwant'
+                    })
+
+        return results
+
+    def _extract_duckduckgo_results(self, soup, limit=5):
+        """Extract DuckDuckGo results (lenient fallback version)."""
+        results = []
+
+        # DuckDuckGo sometimes answers with a 202 page that needs JavaScript;
+        # try to salvage whatever useful information the HTML contains.
+
+        # Strategy 1: collect all external links
+        all_links = soup.find_all('a', href=True)
+        external_links = []
+
+        for link in all_links:
+            href = link.get('href', '')
+            title = self._clean_text(link.get_text(strip=True))
+
+            # Keep external links only (skip DuckDuckGo-internal ones)
+            if (href and
+                not href.startswith('javascript:') and
+                not href.startswith('#') and
+                'duckduckgo.com' not in href and
+                len(title) > 3 and
+                self._is_valid_result(title, href)):
+
+                external_links.append({
+                    'title': title,
+                    'url': href,
+                    'snippet': '',
+                    'link_element': link
+                })
+
+        # Strategy 2: if too few links were found, mine the page text
+        if len(external_links) < 2:
+            print("⚠️ Few external links found; trying text extraction")
+
+            # Grab the page's main text content
+            text_content = soup.get_text()
+
+            # Look for URL patterns in the text
+            url_pattern = r'https?://[^\s<>"\'()]+'
+            urls = re.findall(url_pattern, text_content)
+
+            for url in urls[:limit]:
+                # Derive a rough title from the URL's domain
+                domain = url.split('/')[2] if '/' in url else url
+                title = domain.replace('www.', '').title()
+
+                if self._is_valid_result(title, url):
+                    external_links.append({
+                        'title': title,
+                        'url': url,
+                        'snippet': f'from {domain}',
+                        'link_element': None
+                    })
+
+        # Strategy 3: still not enough results, so offer search suggestions
+        if len(external_links) < 2:
+            print("⚠️ Limited search results; offering suggestions")
+
+            encoded = quote_plus(self.last_query)
+            suggestions = [
+                {
+                    'title': f'Search Google for "{self.last_query}"',
+                    'url': f'https://www.google.com/search?q={encoded}',
+                    'snippet': 'Search with Google',
+                    'link_element': None
+                },
+                {
+                    'title': f'Search Bing for "{self.last_query}"',
+                    'url': f'https://www.bing.com/search?q={encoded}',
+                    'snippet': 'Search with Bing',
+                    'link_element': None
+                }
+            ]
+            external_links.extend(suggestions)
+
+        # Deduplicate and cap the number of results
+        seen_urls = set()
+        unique_results = []
+
+        for result in external_links:
+            if result['url'] and result['url'] not in seen_urls:
+                seen_urls.add(result['url'])
+                unique_results.append(result)
+                if len(unique_results) >= limit:
+                    break
+
+        return unique_results
+
+    def _extract_content_from_url(self, url, max_length=300):
+        """Fetch a URL and extract its main content."""
+        try:
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
+            }
+
+            response = requests.get(url, headers=headers, timeout=10)
+            if response.status_code != 200:
+                return "Failed to fetch content"
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Drop script/style and page-chrome tags
+            for script in soup(["script", "style", "nav", "footer", "header", "aside", "advertisement"]):
+                script.decompose()
+
+            # Smart main-content extraction
+            content = self._extract_main_content(soup)
+
+            if not content:
+                content = soup.get_text(strip=True)
+
+            # Clean up and format the content
+            content = self._clean_and_format_content(content)
+
+            return content[:max_length] + "..." if len(content) > max_length else content
+
+        except Exception as e:
+            return f"Content extraction failed: {str(e)[:50]}"
+
+    def _extract_main_content(self, soup):
+        """Smartly extract the main content of a page."""
+        # Priority order: from the most specific selectors to the most generic
+        extraction_strategies = [
+            # 1. article-related tags
+            ['article', 'main article', '.article-content', '.post-content'],
+            # 2. main content areas
+            ['main', '.main', '.content', '.main-content'],
+            # 3. common content class names
+            ['.entry-content', '.post-body', '.article-body', '.content-area'],
+            # 4. generic containers
+            ['.container', '.wrapper', '.page-content'],
+            # 5. finally fall back to body
+            ['body']
+        ]
+
+        for strategy in extraction_strategies:
+            for selector in strategy:
+                element = soup.select_one(selector)
+                if element:
+                    content = element.get_text(strip=True)
+                    # Check the content quality
+                    if self._is_quality_content(content):
+                        return content
+
+        return ""
+
+    def _is_quality_content(self, content):
+        """Check whether extracted content looks meaningful."""
+        if not content or len(content) < 50:
+            return False
+
+        # Filter navigation and menu content (Chinese and English keywords)
+        nav_keywords = ['导航', '菜单', '首页', '登录', '注册', '搜索', '联系', '关于', 'privacy', 'terms', 'home', 'login', 'register', 'contact', 'about']
+        content_lower = content.lower()
+
+        for keyword in nav_keywords:
+            if keyword in content_lower:
+                return False
+
+        # Require at least a couple of meaningful sentences
+        sentences = content.split('。')
+        meaningful_sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
+
+        return len(meaningful_sentences) >= 2
+
+    def _clean_and_format_content(self, content):
+        """Clean up and format extracted content."""
+        if not content:
+            return ""
+
+        # Collapse extra whitespace
+        content = re.sub(r'\s+', ' ', content.strip())
+
+        # Strip special characters, keeping CJK punctuation
+        content = re.sub(r'[^\w\s\u4e00-\u9fff.,!?;:()[\]{}"\'。,!?:;()【】""''-]', '', content)
+
+        # Remove repeated newlines and spaces
+        content = re.sub(r'\n\s*\n', '\n', content)
+        content = re.sub(r' {2,}', ' ', content)
+
+        # Keep the first few meaningful sentences
+        sentences = re.split(r'[。!?.!?]', content)
+        meaningful_sentences = []
+
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if len(sentence) > 10 and len(sentence) < 100:  # a reasonable sentence length
+                meaningful_sentences.append(sentence)
+                if len(meaningful_sentences) >= 3:  # at most 3 sentences
+                    break
+
+        return '。'.join(meaningful_sentences)
+
+    def _enhance_search_results(self, results, limit=3):
+        """Enhance results by fetching a content preview for each."""
+        enhanced_results = []
+
+        for i, result in enumerate(results):
+            if i >= limit:  # only enhance the first few results
+                break
+
+            if result['url'] and result['url'].startswith('http'):
+                print(f"📄 Extracting content: {result['title'][:30]}...")
+                content = self._extract_content_from_url(result['url'])
+                result['snippet'] = content
+                result['enhanced'] = True
+            else:
+                result['enhanced'] = False
+
+            enhanced_results.append(result)
+
+        # Append the results that were not enhanced
+        enhanced_results.extend(results[limit:])
+
+        return enhanced_results
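+
+    # Note on cost: _enhance_search_results performs one extra HTTP request per
+    # enhanced result, so only the first `limit` results (3 by default) are
+    # enhanced; the remaining results keep their engine-provided snippets.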
"zh-CN,zh;q=0.9,en;q=0.8", + "Accept-Encoding": "gzip, deflate, br", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1" + } + + # 重试机制 + max_retries = 3 + for attempt in range(max_retries): + try: + print(f"🔍 搜索: {query} (尝试 {attempt + 1}/{max_retries})") + + response = requests.get(url, headers=headers, timeout=15) + + if response.status_code != 200: + if attempt < max_retries - 1: + time.sleep(2) + continue + return f"搜索失败: HTTP {response.status_code}" + + # 检查响应内容 + if len(response.text) < 1000: + if attempt < max_retries - 1: + time.sleep(2) + continue + return "搜索响应异常,内容过短" + + soup = BeautifulSoup(response.text, "html.parser") + + # 多引擎搜索策略 + results = [] + search_engine = "DuckDuckGo" + + # 首先尝试Searx搜索引擎 + print("🌐 尝试Searx搜索引擎...") + searx_results, searx_success = self._search_searx(query, limit) + + if searx_success and searx_results: + results = searx_results + search_engine = "Searx" + print(f"✅ Searx搜索成功,找到 {len(results)} 个结果") + else: + # 降级到DuckDuckGo + print("⚠️ Searx失败,降级到DuckDuckGo...") + results = self._extract_duckduckgo_results(soup, limit) + search_engine = "DuckDuckGo" + + # 如果主要方法都失败,尝试备用方法 + if not results: + print("⚠️ 主要提取方法失败,尝试备用方法") + results = self._fallback_extraction(soup, limit) + search_engine = "备用方法" + + # 增强搜索结果(提取内容预览) + if results: + print("🚀 增强搜索结果,提取内容预览...") + enhanced_results = self._enhance_search_results(results, limit=3) + results = enhanced_results + + # 格式化输出结果 + if results: + formatted_results = [] + for i, result in enumerate(results, 1): + result_text = f"{i}. {result['title']}" + + if result['url']: + result_text += f"\n 🔗 {result['url']}" + + if result['snippet']: + # 如果是增强的结果,显示内容预览 + if result.get('enhanced'): + result_text += f"\n 📄 内容预览: {result['snippet']}" + else: + result_text += f"\n 📝 {result['snippet']}" + + formatted_results.append(result_text) + + return "\n\n".join(formatted_results) + else: + return f"未找到关于 '{query}' 的搜索结果。请尝试使用不同的关键词。" + + except requests.Timeout: + if attempt < max_retries - 1: + time.sleep(2) + continue + return "搜索超时,请检查网络连接后重试。" + + except requests.ConnectionError: + if attempt < max_retries - 1: + time.sleep(2) + continue + return "网络连接失败,请检查网络设置。" + + except Exception as e: + print(f"❌ 搜索异常 (尝试 {attempt + 1}): {e}") + if attempt < max_retries - 1: + time.sleep(2) + continue + return f"搜索过程中发生错误: {str(e)}" + + def _get_search_suggestions(self, query): + """快速提供搜索建议""" + return [ + { + 'title': f'Google搜索: {query}', + 'url': f'https://www.google.com/search?q={query}', + 'snippet': '使用Google搜索引擎', + 'source': 'Google' + }, + { + 'title': f'Bing搜索: {query}', + 'url': f'https://www.bing.com/search?q={query}', + 'snippet': '使用Bing搜索引擎', + 'source': 'Bing' + } + ] + + return "搜索失败,已多次重试。请稍后再试。"