diff --git a/CHANGELOG.md b/CHANGELOG.md index ce63516f9..207508c1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **🔐 Claude Code Provider**: New LLM provider using Claude Code CLI authentication + - Enables LLM extraction using existing Claude Code subscriptions without API keys + - Integrated with LiteLLM as custom provider via `claude-code/` prefix + - Supports models: `claude-sonnet-4-20250514`, `claude-opus-4-20250514`, `claude-haiku-3-5-latest` + - Async and sync completion with automatic event loop handling + - New optional dependency: `pip install crawl4ai[claude-code]` + - Provider: `crawl4ai/providers/claude_code_provider.py` + - Example: `examples/claude_code_extraction.py` + - Contributor: [@chansearrington](https://github.com/chansearrington) + - **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag - Maintains HTTPS scheme for internal links even when servers redirect to HTTP - Prevents security downgrades during deep crawling @@ -15,6 +25,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fully backward compatible with opt-in flag (default: `False`) - Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP +### Improved +- **📊 LLM Verbose Logging**: Now logs provider and model name when `verbose=True` + - Shows which LLM provider/model is being used for each extraction + - Helps debug configuration issues and track usage across providers + - Example output: `[LOG] LLM Provider: claude-code | Model: claude-sonnet-4-20250514` + ## [0.7.3] - 2025-08-09 ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..574b2516a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,96 @@ +# Contributing to Crawl4AI + +Thank you for your interest in contributing to Crawl4AI! + +## Getting Started + +1. 
Fork the repository +2. Clone your fork +3. Install in development mode: `pip install -e ".[all]"` +4. Run tests: `pytest tests/` + +## Contributing New LLM Providers + +### 1. Create Provider Class + +Create `crawl4ai/providers/your_provider.py`: + +```python +from litellm import CustomLLM +from litellm.types.utils import ModelResponse + +class YourProvider(CustomLLM): + async def acompletion(self, model, messages, **kwargs) -> ModelResponse: + # Implement async completion + pass + + def completion(self, model, messages, **kwargs) -> ModelResponse: + # Implement sync completion + pass +``` + +### 2. Register Provider + +Update `crawl4ai/providers/__init__.py` to register your provider (append to the existing map so providers that are already registered are kept): + +```python +from .your_provider import YourProvider + +def register_your_provider(): + import litellm + your_provider = YourProvider() + litellm.custom_provider_map.append( + {"provider": "your-provider", "custom_handler": your_provider} + ) +``` + +### 3. Add Optional Dependency + +Update `pyproject.toml` with your SDK dependency: + +```toml +[project.optional-dependencies] +your-provider = ["your-sdk>=1.0.0"] +``` + +### 4. Write Tests + +Add tests at: +- `tests/unit/test_your_provider.py` - Unit tests for the provider +- `tests/integration/test_your_integration.py` - Integration tests + +### 5. Add Documentation + +- Update `docs/md_v2/extraction/llm-strategies.md` with a provider example +- Consider creating a dedicated doc page at `docs/md_v2/extraction/your-provider.md` + +## Pull Request Process + +1. Ensure all tests pass: `pytest tests/` +2. Update documentation as needed +3. Add a CHANGELOG.md entry under `[Unreleased]` +4. 
Submit PR against the `main` branch + +## Code Style + +- Follow PEP 8 +- Use type hints where possible +- Add docstrings to public functions and classes +- Keep functions focused and testable + +## Testing + +```bash +# Run all tests +pytest tests/ + +# Run specific test file +pytest tests/unit/test_your_provider.py -v + +# Run with coverage +pytest tests/ --cov=crawl4ai +``` + +## Questions? + +Open an issue for discussion before starting major changes. diff --git a/README.md b/README.md index 7535e6bd5..5c94ee95d 100644 --- a/README.md +++ b/README.md @@ -293,6 +293,7 @@ pip install -e ".[torch]" # With PyTorch features pip install -e ".[transformer]" # With Transformer features pip install -e ".[cosine]" # With cosine similarity features pip install -e ".[sync]" # With synchronous crawling (Selenium) +pip install -e ".[claude-code]" # With Claude Code provider (no API keys needed) pip install -e ".[all]" # Install all optional features ``` @@ -517,6 +518,52 @@ if __name__ == "__main__": +
+🔐 Using Claude Code Provider (No API Keys Required) + +Use your Claude Code CLI subscription for LLM extraction without managing API keys: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, LLMConfig, CrawlerRunConfig +from crawl4ai import LLMExtractionStrategy + +async def main(): + # Claude Code uses local CLI authentication - no API key needed! + llm_config = LLMConfig(provider="claude-code/claude-sonnet-4-20250514") + + strategy = LLMExtractionStrategy( + llm_config=llm_config, + instruction="Extract all product names and prices as JSON" + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + print(result.extracted_content) + +asyncio.run(main()) +``` + +**Installation:** +```bash +pip install crawl4ai[claude-code] +``` + +**Prerequisites:** +- Claude Code CLI installed: `npm install -g @anthropic-ai/claude-code` +- CLI authenticated: run `claude login` + +**Supported Models:** +| Model | Provider String | Use Case | +|-------|-----------------|----------| +| Sonnet 4 | `claude-code/claude-sonnet-4-20250514` | Balanced (recommended) | +| Opus 4 | `claude-code/claude-opus-4-20250514` | Most capable | +| Haiku 3.5 | `claude-code/claude-haiku-3-5-latest` | Fastest | + +
+
🤖 Using Your own Browser with Custom User Profile diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index af35e6a0e..f4b6e8ecd 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -1,6 +1,10 @@ # __init__.py import warnings +# Register custom LLM providers with LiteLLM +from .providers import register_custom_providers +register_custom_providers() + from .async_webcrawler import AsyncWebCrawler, CacheMode # MODIFIED: Add SeedingConfig and VirtualScrollConfig here from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 08f56b832..9e0d2f546 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -36,6 +36,7 @@ "anthropic": os.getenv("ANTHROPIC_API_KEY"), "gemini": os.getenv("GEMINI_API_KEY"), "deepseek": os.getenv("DEEPSEEK_API_KEY"), + "claude-code": "no-token-needed", # Uses local Claude Code CLI auth } # Chunk token threshold diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 7033e3800..6a46fdce1 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -616,7 +616,12 @@ def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]: A list of extracted blocks or chunks. 
""" if self.verbose: - # print("[LOG] Extracting blocks from URL:", url) + # Log which LLM provider/model is being used + provider_str = self.llm_config.provider if self.llm_config else "unknown" + provider_parts = provider_str.split('/') + provider_name = provider_parts[0] if provider_parts else "unknown" + model_name = provider_parts[1] if len(provider_parts) > 1 else provider_str + print(f"[LOG] LLM Provider: {provider_name} | Model: {model_name}") print(f"[LOG] Call LLM for {url} - block index: {ix}") variable_values = { @@ -817,6 +822,12 @@ async def aextract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]: from .utils import aperform_completion_with_backoff if self.verbose: + # Log which LLM provider/model is being used + provider_str = self.llm_config.provider if self.llm_config else "unknown" + provider_parts = provider_str.split('/') + provider_name = provider_parts[0] if provider_parts else "unknown" + model_name = provider_parts[1] if len(provider_parts) > 1 else provider_str + print(f"[LOG] LLM Provider: {provider_name} | Model: {model_name}") print(f"[LOG] Call LLM for {url} - block index: {ix}") variable_values = { diff --git a/crawl4ai/providers/__init__.py b/crawl4ai/providers/__init__.py new file mode 100644 index 000000000..7e56ed196 --- /dev/null +++ b/crawl4ai/providers/__init__.py @@ -0,0 +1,47 @@ +""" +Custom LLM providers for Crawl4AI. + +This module provides custom LLM provider integrations beyond what LiteLLM +offers out of the box. +""" + +_providers_registered = False + + +def register_custom_providers(): + """ + Register custom LLM providers with LiteLLM. + + This function registers all custom providers defined in this package + with LiteLLM's custom_provider_map. It is idempotent - calling it + multiple times has no additional effect. 
+ + Currently registered providers: + - claude-code: Uses Claude Code CLI for LLM completions (requires local auth) + """ + global _providers_registered + if _providers_registered: + return + + import litellm + + # Initialize custom_provider_map if it doesn't exist + if litellm.custom_provider_map is None: + litellm.custom_provider_map = [] + + # Try to register Claude Code provider (optional dependency) + try: + from .claude_code_provider import ClaudeCodeProvider + + # Check if already registered + existing_providers = [p.get("provider") for p in litellm.custom_provider_map] + if "claude-code" not in existing_providers: + litellm.custom_provider_map.append({ + "provider": "claude-code", + "custom_handler": ClaudeCodeProvider() + }) + except ImportError: + # claude-agent-sdk not installed, skip registration + pass + + _providers_registered = True diff --git a/crawl4ai/providers/claude_code_provider.py b/crawl4ai/providers/claude_code_provider.py new file mode 100644 index 000000000..d8bbba78a --- /dev/null +++ b/crawl4ai/providers/claude_code_provider.py @@ -0,0 +1,369 @@ +""" +Claude Code/Agent SDK provider for LiteLLM integration. + +This provider enables using Claude Code CLI authentication for LLM completions, +allowing users with Claude Code subscriptions to leverage their existing auth +without needing separate API keys. + +IMPORTANT: Uses LOCAL Claude Code CLI authentication. +Each user must have their own Claude Code CLI installed and authenticated. 
+ +Usage: + >>> from crawl4ai.async_configs import LLMConfig + >>> config = LLMConfig(provider="claude-code/claude-sonnet-4-20250514") + +Supported models: + - claude-code/claude-sonnet-4-20250514 (recommended) + - claude-code/claude-opus-4-20250514 + - claude-code/claude-haiku-3-5-latest + +Requirements: + - Claude Code CLI: npm install -g @anthropic-ai/claude-code + - Authenticated: run `claude login` + - SDK: pip install crawl4ai[claude-code] + +Exceptions: + - ClaudeCodeError: Base exception for all provider errors + - ClaudeCodeSDKError: SDK not installed + - ClaudeCodeAuthenticationError: CLI auth failed + - ClaudeCodeConnectionError: Connection to service failed + +See Also: + - LLMConfig: Configuration class for LLM providers + - https://docs.anthropic.com/claude-code/ + +.. versionadded:: 0.7.9 +""" +import asyncio +import logging +import time +from typing import Any, Dict, List, Optional, Tuple + +from litellm import CustomLLM +from litellm.types.utils import Choices, Message, ModelResponse, Usage + +logger = logging.getLogger(__name__) + + +class ClaudeCodeError(Exception): + """Base exception for Claude Code provider errors.""" + pass + + +class ClaudeCodeSDKError(ClaudeCodeError): + """Raised when claude-agent-sdk is not installed. + + Recovery: + 1. Install: pip install crawl4ai[claude-code] + 2. Or directly: pip install claude-agent-sdk + """ + pass + + +class ClaudeCodeAuthenticationError(ClaudeCodeError): + """Raised when Claude Code CLI authentication fails. + + Recovery: + 1. Install CLI: npm install -g @anthropic-ai/claude-code + 2. Authenticate: claude login + 3. Verify: run `claude` in terminal + """ + pass + + +class ClaudeCodeConnectionError(ClaudeCodeError): + """Raised when connection to Claude Code service fails.""" + pass + + +class ClaudeCodeProvider(CustomLLM): + """ + Custom LiteLLM provider for Claude Code/Agent SDK. 
+ + This provider wraps the Claude Agent SDK to provide LLM completions + using the user's local Claude Code CLI authentication. + """ + + def _extract_model(self, model: str) -> str: + """ + Extract model name from provider string. + + Args: + model: Provider string like "claude-code/claude-sonnet-4-20250514" + + Returns: + Model name like "claude-sonnet-4-20250514" + """ + if "/" in model: + return model.split("/", 1)[1] + return model + + def _convert_messages_to_prompt(self, messages: List[Dict]) -> str: + """ + Convert LiteLLM message format to single prompt string. + + The Claude Agent SDK expects a single prompt string, not a messages + array. This method converts the standard messages format to a + formatted prompt string. + + Args: + messages: List of message dicts with 'role' and 'content' keys + + Returns: + Formatted prompt string + + Raises: + TypeError: If a message is not a dict, or its content is neither a + string nor None (an empty ``messages`` list returns "" instead of raising) + """ + if not messages: + return "" + + parts = [] + for i, msg in enumerate(messages): + if not isinstance(msg, dict): + raise TypeError( + f"Message at index {i} must be a dict, got {type(msg).__name__}" + ) + + role = msg.get("role", "user") + content = msg.get("content", "") + + # Validate content is a string (not multimodal) + if content is not None and not isinstance(content, str): + raise TypeError( + f"Message at index {i} has non-string content (type: {type(content).__name__}). " + "Multimodal content is not supported by claude-code provider." + ) + + if role == "system": + parts.append(f"System: {content}") + elif role == "user": + parts.append(content or "") + elif role == "assistant": + parts.append(f"Assistant: {content}") + + return "\n\n".join(parts) + + async def _collect_response( + self, prompt: str, model: str + ) -> Tuple[str, Optional[Dict[str, Any]]]: + """ + Call Claude Agent SDK and collect the full response. 
+ + Args: + prompt: The prompt string to send + model: The model name to use + + Returns: + Tuple of (response_text, usage_info) + + Raises: + ClaudeCodeSDKError: If claude-agent-sdk is not installed + ClaudeCodeError: If the SDK call fails (an empty response only logs a warning) + """ + try: + from claude_agent_sdk import ( + AssistantMessage, + ClaudeAgentOptions, + ResultMessage, + TextBlock, + query, + ) + except ImportError: + raise ClaudeCodeSDKError( + "The claude-agent-sdk package is not installed.\n\n" + "To fix this:\n" + " 1. Install: pip install crawl4ai[claude-code]\n" + " 2. Or directly: pip install claude-agent-sdk\n\n" + "Docs: https://github.com/anthropics/claude-code" + ) + + options = ClaudeAgentOptions( + model=model, + max_turns=1, # Single turn for extraction tasks + allowed_tools=[], # No tools needed for text completion + ) + + logger.debug(f"Sending prompt to Claude Code: model={model}, prompt_length={len(prompt)}") + + collected_text = [] + usage_info = None + + try: + async for message in query(prompt=prompt, options=options): + if isinstance(message, AssistantMessage): + for block in message.content: + if isinstance(block, TextBlock): + collected_text.append(block.text) + elif isinstance(message, ResultMessage): + usage_info = { + "session_id": message.session_id, + "duration_ms": message.duration_ms, + "input_tokens": message.usage.get("input_tokens", 0) + if message.usage + else 0, + "output_tokens": message.usage.get("output_tokens", 0) + if message.usage + else 0, + } + else: + logger.debug(f"Received unexpected message type: {type(message).__name__}") + except ConnectionError as e: + raise ClaudeCodeConnectionError( + f"Failed to connect to Claude Code service: {e}\n\n" + "To fix this:\n" + " 1. Check Claude Code CLI is running: claude --version\n" + " 2. Re-authenticate if needed: claude login\n" + " 3. 
Verify it works: claude 'Hello'" + ) from e + except Exception as e: + # Re-raise ImportError as-is + if isinstance(e, ImportError): + raise + raise ClaudeCodeError( + f"Claude Code SDK error: {e}. " + "Verify your Claude Code CLI is properly installed and authenticated " + "by running 'claude' in your terminal." + ) from e + + response_text = "".join(collected_text) + + logger.debug(f"Received response: length={len(response_text)}, usage={usage_info}") + + # Warn if response is empty (but don't fail - the SDK may have valid reasons) + if not response_text.strip(): + logger.warning( + "Claude Code returned an empty response. This may indicate an " + "authentication issue, rate limiting, or API error." + ) + + return response_text, usage_info + + async def acompletion( + self, + model: str, + messages: List[Dict], + api_base: Optional[str] = None, + api_key: Optional[str] = None, + optional_params: Optional[Dict] = None, + timeout: Optional[float] = None, + client=None, + **kwargs, + ) -> ModelResponse: + """ + Async completion using Claude Code SDK. 
+ + Args: + model: Model string like "claude-code/claude-sonnet-4-20250514" + messages: List of message dicts with 'role' and 'content' + api_base: Not used (Claude Code uses local auth) + api_key: Not used (Claude Code uses local auth) + optional_params: Additional parameters (currently unused) + timeout: Request timeout (not currently enforced - SDK handles timeouts) + client: HTTP client (not used) + **kwargs: Additional arguments + + Returns: + LiteLLM ModelResponse object + """ + model_name = self._extract_model(model) + prompt = self._convert_messages_to_prompt(messages) + + logger.info(f"Claude Code completion: model={model_name}, messages={len(messages)}") + + response_text, usage_info = await self._collect_response(prompt, model_name) + + # Calculate token counts + input_tokens = usage_info.get("input_tokens", 0) if usage_info else 0 + output_tokens = usage_info.get("output_tokens", 0) if usage_info else 0 + + # Generate response ID + response_id = ( + usage_info.get("session_id") if usage_info + else f"claude-code-{int(time.time())}" + ) + + return ModelResponse( + id=response_id, + choices=[ + Choices( + message=Message(role="assistant", content=response_text), + index=0, + finish_reason="stop", + ) + ], + created=int(time.time()), + model=model_name, + usage=Usage( + prompt_tokens=input_tokens, + completion_tokens=output_tokens, + total_tokens=input_tokens + output_tokens, + ), + object="chat.completion", + ) + + def completion( + self, + model: str, + messages: List[Dict], + api_base: Optional[str] = None, + api_key: Optional[str] = None, + optional_params: Optional[Dict] = None, + timeout: Optional[float] = None, + client=None, + **kwargs, + ) -> ModelResponse: + """ + Sync completion - runs async version in event loop. 
+ + Args: + Same as acompletion + + Returns: + LiteLLM ModelResponse object + """ + # Check if we're in an async context by looking for a running event loop + loop = None + try: + loop = asyncio.get_running_loop() + except RuntimeError as e: + # Only catch the specific "no running event loop" error + if "no running event loop" not in str(e).lower(): + raise + + if loop and loop.is_running(): + # We're in an async context, need to use a new thread + import concurrent.futures + + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit( + asyncio.run, + self.acompletion( + model=model, + messages=messages, + api_base=api_base, + api_key=api_key, + optional_params=optional_params, + timeout=timeout, + client=client, + **kwargs, + ), + ) + try: + return future.result() + except Exception as e: + raise ClaudeCodeError(f"Claude Code completion failed: {e}") from e + else: + return asyncio.run( + self.acompletion( + model=model, + messages=messages, + api_base=api_base, + api_key=api_key, + optional_params=optional_params, + timeout=timeout, + client=client, + **kwargs, + ) + ) diff --git a/docs/md_v2/extraction/claude-code-provider.md b/docs/md_v2/extraction/claude-code-provider.md new file mode 100644 index 000000000..266a6faf4 --- /dev/null +++ b/docs/md_v2/extraction/claude-code-provider.md @@ -0,0 +1,177 @@ +# Claude Code Provider + +Use your Claude Code CLI subscription for LLM-powered web extraction without managing API keys. + +## Overview + +The Claude Code provider enables Crawl4AI users with Claude Code subscriptions to leverage their existing authentication for LLM extraction: + +- **No API Keys Required**: Uses local Claude Code CLI authentication +- **Familiar Models**: Access Claude Sonnet, Opus, and Haiku models +- **Seamless Integration**: Works with all existing `LLMExtractionStrategy` features + +## Prerequisites + +1. 
**Claude Code CLI** installed and authenticated: + ```bash + npm install -g @anthropic-ai/claude-code + claude login + ``` + +2. **Crawl4AI with claude-code extra**: + ```bash + pip install crawl4ai[claude-code] + ``` + +## Quick Start + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, LLMConfig, CrawlerRunConfig +from crawl4ai import LLMExtractionStrategy + +async def main(): + # No API token needed - uses local Claude Code authentication + llm_config = LLMConfig(provider="claude-code/claude-sonnet-4-20250514") + + strategy = LLMExtractionStrategy( + llm_config=llm_config, + instruction="Extract article titles and summaries as JSON" + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://news.ycombinator.com", + config=config + ) + print(result.extracted_content) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Supported Models + +| Model ID | Description | Use Case | +|----------|-------------|----------| +| `claude-code/claude-haiku-3-5-latest` | Fastest, most economical | Quick extractions, high volume | +| `claude-code/claude-sonnet-4-20250514` | Balanced performance | **Recommended default** | +| `claude-code/claude-opus-4-20250514` | Most capable | Complex reasoning tasks | + +## Configuration + +The Claude Code provider works with standard `LLMConfig`: + +```python +llm_config = LLMConfig( + provider="claude-code/claude-sonnet-4-20250514" + # api_token is optional (uses "no-token-needed" internally) +) +``` + +## Examples + +### Schema-Based Extraction + +```python +from pydantic import BaseModel + +class Product(BaseModel): + name: str + price: str + rating: float + +strategy = LLMExtractionStrategy( + llm_config=LLMConfig(provider="claude-code/claude-sonnet-4-20250514"), + schema=Product.model_json_schema(), + extraction_type="schema", + instruction="Extract all products" +) +``` + +### With Chunking for Large Pages + 
+```python +strategy = LLMExtractionStrategy( + llm_config=LLMConfig(provider="claude-code/claude-sonnet-4-20250514"), + instruction="Extract all article summaries", + chunk_token_threshold=2000, + overlap_rate=0.1, + apply_chunking=True +) +``` + +### Verbose Mode with Model Logging + +```python +strategy = LLMExtractionStrategy( + llm_config=LLMConfig(provider="claude-code/claude-sonnet-4-20250514"), + instruction="Extract main content", + verbose=True # Logs which provider/model is being used +) +``` + +Output: +``` +[LOG] LLM Provider: claude-code | Model: claude-sonnet-4-20250514 +[LOG] Call LLM for https://example.com - block index: 0 +``` + +## Comparison with API Providers + +| Feature | Claude Code | Anthropic API | +|---------|-------------|---------------| +| Authentication | Local CLI | API Key | +| Billing | Subscription | Per-token | +| Setup | CLI login once | Environment variable | + +## Docker Deployment + +When running Crawl4AI in Docker, mount your Claude Code credentials: + +```bash +docker run -d \ + -p 11235:11235 \ + -v /path/to/.claude:/home/appuser/.claude:rw \ + crawl4ai-claude:latest +``` + +Set the provider via environment variable: + +```bash +-e LLM_PROVIDER=claude-code/claude-sonnet-4-20250514 +``` + +## Troubleshooting + +### "claude-agent-sdk is not installed" + +```bash +pip install crawl4ai[claude-code] +``` + +### "Failed to connect to Claude Code service" + +```bash +claude --version # Check installation +claude login # Re-authenticate +``` + +### Empty Responses + +Verify CLI works: `claude "Hello"` + +### Permission Denied (Docker) + +Ensure the credentials directory is readable by the container user (uid 999): + +```bash +sudo chown -R 999:999 /path/to/.claude +``` + +## See Also + +- [LLM Extraction Strategies](./llm-strategies.md) +- [Claude Code Documentation](https://docs.anthropic.com/claude-code/) diff --git a/docs/md_v2/extraction/llm-strategies.md b/docs/md_v2/extraction/llm-strategies.md index df948a9eb..a0071b86c 
100644 --- a/docs/md_v2/extraction/llm-strategies.md +++ b/docs/md_v2/extraction/llm-strategies.md @@ -32,7 +32,18 @@ Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2. - **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it. - **`base_url`** (optional): If your provider has a custom endpoint. -This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily. +This means you **aren't locked** into a single LLM vendor. Switch or experiment easily. + +### Claude Code Provider (No API Keys) + +If you have a Claude Code subscription, use your local CLI authentication: + +```python +llm_config = LLMConfig(provider="claude-code/claude-sonnet-4-20250514") +# No api_token needed - uses local Claude Code CLI auth +``` + +See [Claude Code Provider](./claude-code-provider.md) for full documentation. --- diff --git a/examples/claude_code_error_handling.py b/examples/claude_code_error_handling.py new file mode 100644 index 000000000..dcb35036a --- /dev/null +++ b/examples/claude_code_error_handling.py @@ -0,0 +1,71 @@ +""" +Example: Robust Error Handling with Claude Code Provider + +Demonstrates proper error handling patterns for production applications. 
+""" + +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import LLMConfig, CrawlerRunConfig +from crawl4ai.extraction_strategy import LLMExtractionStrategy + + +async def extraction_with_retry(url: str, max_retries: int = 3): + """Extract content with automatic retry on transient failures.""" + from crawl4ai.providers.claude_code_provider import ( + ClaudeCodeError, + ClaudeCodeConnectionError, + ClaudeCodeSDKError, + ) + + llm_config = LLMConfig(provider="claude-code/claude-sonnet-4-20250514") + strategy = LLMExtractionStrategy( + llm_config=llm_config, + instruction="Extract the main content as a summary" + ) + run_config = CrawlerRunConfig(extraction_strategy=strategy) + + for attempt in range(max_retries): + try: + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=url, config=run_config) + if result.success: + return result.extracted_content + else: + print(f"Crawl failed: {result.error_message}") + + except ClaudeCodeSDKError as e: + # SDK not installed - no point retrying + print(f"SDK Error (no retry): {e}") + raise + + except ClaudeCodeConnectionError as e: + # Connection issue - may be transient, worth retrying + wait_time = 2 ** attempt + print(f"Connection error (attempt {attempt + 1}/{max_retries}), " + f"retrying in {wait_time}s: {e}") + await asyncio.sleep(wait_time) + + except ClaudeCodeError as e: + # Other Claude Code errors + wait_time = 2 ** attempt + print(f"Claude Code error (attempt {attempt + 1}/{max_retries}), " + f"retrying in {wait_time}s: {e}") + await asyncio.sleep(wait_time) + + raise Exception(f"Max retries ({max_retries}) exceeded") + + +async def main(): + print("Claude Code Error Handling Example") + print("=" * 50) + + try: + result = await extraction_with_retry("https://example.com") + print(f"\nSuccess! 
Extracted content:\n{result[:500]}...") + except Exception as e: + print(f"\nFailed: {e}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/claude_code_extraction.py b/examples/claude_code_extraction.py new file mode 100644 index 000000000..650dc08f0 --- /dev/null +++ b/examples/claude_code_extraction.py @@ -0,0 +1,166 @@ +""" +Example: Using Claude Code Provider for LLM Extraction + +Demonstrates using your Claude Code CLI subscription for web content extraction +with Crawl4AI. + +Prerequisites: + - Claude Code CLI installed and authenticated (run `claude` in terminal) + - pip install crawl4ai[claude-code] + +Usage: + python examples/claude_code_extraction.py +""" + +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import LLMConfig, CrawlerRunConfig +from crawl4ai.extraction_strategy import LLMExtractionStrategy + + +async def basic_extraction(): + """Basic example: Extract article titles from Hacker News.""" + print("=" * 60) + print("Example 1: Basic Extraction with Claude Code") + print("=" * 60) + + # Configure Claude Code provider (no API token needed - uses local auth) + llm_config = LLMConfig( + provider="claude-code/claude-sonnet-4-20250514" # Recommended model + ) + + strategy = LLMExtractionStrategy( + llm_config=llm_config, + instruction="Extract all article titles and their point counts as a JSON array" + ) + + run_config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://news.ycombinator.com", + config=run_config + ) + + if result.success: + print("\nExtracted Content:") + print(result.extracted_content[:500]) # First 500 chars + else: + print(f"\nExtraction failed: {result.error_message}") + + +async def model_comparison(): + """Compare different Claude models.""" + print("\n" + "=" * 60) + print("Example 2: Model Comparison") + print("=" * 60) + + models = [ + ("claude-code/claude-haiku-3-5-latest", 
"Haiku (fastest, cheapest)"), + ("claude-code/claude-sonnet-4-20250514", "Sonnet (balanced)"), + # Uncomment to test Opus (slowest, most capable): + # ("claude-code/claude-opus-4-20250514", "Opus (most capable)"), + ] + + html = "

Product: Widget Pro

Price: $99.99

Rating: 4.5 stars

" + + for provider, description in models: + print(f"\nTesting {description}...") + + llm_config = LLMConfig(provider=provider) + strategy = LLMExtractionStrategy( + llm_config=llm_config, + instruction="Extract product name, price, and rating as JSON" + ) + + result = await strategy.aextract( + url="https://example.com", + ix=0, + html=html + ) + + print(f" Result: {result}") + + +async def structured_extraction(): + """Extract structured data with a schema.""" + print("\n" + "=" * 60) + print("Example 3: Structured Extraction with Schema") + print("=" * 60) + + llm_config = LLMConfig( + provider="claude-code/claude-sonnet-4-20250514" + ) + + # Define expected schema + schema = { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "author": {"type": "string"}, + "date": {"type": "string"} + } + } + } + + strategy = LLMExtractionStrategy( + llm_config=llm_config, + instruction="Extract all blog posts with their titles, authors, and dates", + schema=schema, + extraction_type="schema" + ) + + html = """ + + +
+

First Post

+ John Doe + +
+
+

Second Post

+ Jane Smith + +
+ + + """ + + result = await strategy.aextract( + url="https://example.com", + ix=0, + html=html + ) + + print(f"\nExtracted Posts:") + print(result) + + +async def main(): + """Run all examples.""" + print("\nClaude Code Provider Examples for Crawl4AI") + print("=" * 60) + print("Using your Claude Code subscription for LLM extraction") + print("No API keys required - uses local Claude Code authentication") + print("=" * 60) + + try: + await basic_extraction() + await model_comparison() + await structured_extraction() + except ImportError as e: + print(f"\nError: {e}") + print("\nMake sure to install the claude-code extras:") + print(" pip install crawl4ai[claude-code]") + except Exception as e: + print(f"\nError: {e}") + print("\nMake sure Claude Code CLI is installed and authenticated:") + print(" 1. Install: npm install -g @anthropic-ai/claude-code") + print(" 2. Authenticate: claude login") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/mkdocs.yml b/mkdocs.yml index ef23b4551..00d4f99f6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -58,6 +58,7 @@ nav: - Extraction: - "LLM-Free Strategies": "extraction/no-llm-strategies.md" - "LLM Strategies": "extraction/llm-strategies.md" + - "Claude Code Provider": "extraction/claude-code-provider.md" - "Clustering Strategies": "extraction/clustring-strategies.md" - "Chunking": "extraction/chunking.md" - API Reference: diff --git a/pyproject.toml b/pyproject.toml index 06d1e4ab0..d4342045f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,11 +59,12 @@ classifiers = [ ] [project.optional-dependencies] -pdf = ["pypdf"] +pdf = ["pypdf"] torch = ["torch", "nltk", "scikit-learn"] transformer = ["transformers", "tokenizers", "sentence-transformers"] cosine = ["torch", "transformers", "nltk", "sentence-transformers"] sync = ["selenium"] +claude-code = ["claude-agent-sdk>=0.1.0"] all = [ "pypdf", "torch", @@ -72,7 +73,8 @@ all = [ "transformers", "tokenizers", "sentence-transformers", - "selenium" + 
"selenium", + "claude-agent-sdk>=0.1.0" ] [project.scripts] diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/integration/test_claude_code_integration.py b/tests/integration/test_claude_code_integration.py new file mode 100644 index 000000000..76b8782cc --- /dev/null +++ b/tests/integration/test_claude_code_integration.py @@ -0,0 +1,173 @@ +""" +Integration tests for Claude Code Provider. + +These tests require Claude Code CLI to be installed and authenticated, +AND the claude-agent-sdk package to be installed. +They will be skipped if either is not available. +""" +import pytest +import shutil +import asyncio + +# Check if claude-agent-sdk is installed +def _sdk_available(): + try: + import claude_agent_sdk + return True + except ImportError: + return False + +# Mark for tests that need both CLI and SDK +requires_claude_code = pytest.mark.skipif( + not shutil.which("claude") or not _sdk_available(), + reason="Claude Code CLI or claude-agent-sdk not installed" +) + + +class TestClaudeCodeIntegration: + """Integration tests with real Claude Code CLI.""" + + @requires_claude_code + @pytest.mark.asyncio + async def test_basic_completion_with_claude_code(self): + """Test basic completion using Claude Code provider.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + from litellm.types.utils import ModelResponse + + provider = ClaudeCodeProvider() + + # Simple prompt that should get a quick response + result = await provider.acompletion( + model="claude-code/claude-haiku-3-5-latest", # Use Haiku for speed + messages=[{"role": "user", "content": "Reply with only the word 'hello'"}] + ) + + assert isinstance(result, ModelResponse) + assert result.choices[0].message.content is not None + assert len(result.choices[0].message.content) > 0 + + def test_llm_config_with_claude_code(self): + """Test LLMConfig works with claude-code provider.""" + from 
crawl4ai.async_configs import LLMConfig + + config = LLMConfig(provider="claude-code/claude-haiku-3-5-latest") + + assert config.provider == "claude-code/claude-haiku-3-5-latest" + assert config.api_token == "no-token-needed" + + @requires_claude_code + @pytest.mark.asyncio + async def test_extraction_strategy_with_claude_code(self): + """Test LLMExtractionStrategy with Claude Code provider.""" + from crawl4ai.async_configs import LLMConfig + from crawl4ai.extraction_strategy import LLMExtractionStrategy + + config = LLMConfig(provider="claude-code/claude-haiku-3-5-latest") + + strategy = LLMExtractionStrategy( + llm_config=config, + instruction="Extract the main heading text" + ) + + # Simple HTML to extract from + html = "<html><body><h1>Test Heading</h1><p>Some content</p></body></html>" + + # Run extraction + result = await strategy.aextract( + url="https://example.com", + ix=0, + html=html + ) + + # Should get some result (exact format depends on extraction) + assert result is not None + + @requires_claude_code + @pytest.mark.asyncio + async def test_model_selection_sonnet(self): + """Test that Sonnet model can be used.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + from litellm.types.utils import ModelResponse + + provider = ClaudeCodeProvider() + + result = await provider.acompletion( + model="claude-code/claude-sonnet-4-20250514", + messages=[{"role": "user", "content": "Say 'test' only"}] + ) + + assert isinstance(result, ModelResponse) + assert result.model == "claude-sonnet-4-20250514" + + @requires_claude_code + @pytest.mark.asyncio + async def test_system_prompt_handling(self): + """Test that a request containing a system prompt completes.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + + provider = ClaudeCodeProvider() + + result = await provider.acompletion( + model="claude-code/claude-haiku-3-5-latest", + messages=[ + {"role": "system", "content": "You only respond with the word 'CONFIRMED'"}, + {"role": "user", "content": "Hello"} + ] + ) + + # Smoke check only: a response came back (its wording is not asserted) + assert result.choices[0].message.content is not None + + +class TestBackwardCompatibility: + """Test that existing providers still work alongside claude-code.""" + + def test_existing_providers_still_in_config(self): + """Ensure existing providers weren't removed.""" + from crawl4ai.config import PROVIDER_MODELS_PREFIXES + + # Original providers should still exist + assert "ollama" in PROVIDER_MODELS_PREFIXES + assert "openai" in PROVIDER_MODELS_PREFIXES + assert "anthropic" in PROVIDER_MODELS_PREFIXES + assert "gemini" in PROVIDER_MODELS_PREFIXES + assert "deepseek" in PROVIDER_MODELS_PREFIXES + + # New provider should also exist + assert "claude-code" in PROVIDER_MODELS_PREFIXES + + def 
test_llm_config_still_works_with_openai(self): + """LLMConfig should still work with OpenAI provider.""" + from crawl4ai.async_configs import LLMConfig + import os + + # Set a dummy key for testing; the previous value (even an empty one) is restored below + original = os.environ.get("OPENAI_API_KEY") + os.environ["OPENAI_API_KEY"] = "test-key" + + try: + config = LLMConfig(provider="openai/gpt-4o") + assert config.provider == "openai/gpt-4o" + finally: + if original is not None: + os.environ["OPENAI_API_KEY"] = original + else: + os.environ.pop("OPENAI_API_KEY", None) + + def test_llm_config_still_works_with_anthropic(self): + """LLMConfig should still work with Anthropic provider.""" + from crawl4ai.async_configs import LLMConfig + import os + + # Set a dummy key for testing; the previous value (even an empty one) is restored below + original = os.environ.get("ANTHROPIC_API_KEY") + os.environ["ANTHROPIC_API_KEY"] = "test-key" + + try: + config = LLMConfig(provider="anthropic/claude-3-5-sonnet-20240620") + assert config.provider == "anthropic/claude-3-5-sonnet-20240620" + finally: + if original is not None: + os.environ["ANTHROPIC_API_KEY"] = original + else: + os.environ.pop("ANTHROPIC_API_KEY", None) diff --git a/tests/unit/test_claude_code_provider.py b/tests/unit/test_claude_code_provider.py new file mode 100644 index 000000000..1b0f020bf --- /dev/null +++ b/tests/unit/test_claude_code_provider.py @@ -0,0 +1,261 @@ +""" +Unit tests for Claude Code Provider (TDD - RED Phase) + +These tests are written BEFORE the implementation to follow TDD methodology. +All tests should FAIL initially until the provider is implemented. 
+""" +import pytest +from unittest.mock import AsyncMock, MagicMock, patch +import asyncio + + +class TestClaudeCodeProviderImport: + """Test that the provider module can be imported.""" + + def test_provider_module_exists(self): + """The provider module should exist and be importable.""" + from crawl4ai.providers import claude_code_provider + assert claude_code_provider is not None + + def test_provider_class_exists(self): + """The ClaudeCodeProvider class should exist.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + assert ClaudeCodeProvider is not None + + def test_provider_inherits_from_custom_llm(self): + """ClaudeCodeProvider should inherit from LiteLLM's CustomLLM.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + from litellm import CustomLLM + assert issubclass(ClaudeCodeProvider, CustomLLM) + + +class TestModelExtraction: + """Test model name extraction from provider string.""" + + def test_extracts_model_from_provider_string(self): + """Should extract 'claude-sonnet-4-20250514' from 'claude-code/claude-sonnet-4-20250514'.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + provider = ClaudeCodeProvider() + + result = provider._extract_model("claude-code/claude-sonnet-4-20250514") + assert result == "claude-sonnet-4-20250514" + + def test_extracts_model_with_opus(self): + """Should strip the 'claude-code/' prefix from an Opus model string.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + provider = ClaudeCodeProvider() + + result = provider._extract_model("claude-code/claude-opus-4-20250514") + assert result == "claude-opus-4-20250514" + + def test_extracts_model_with_haiku(self): + """Should strip the 'claude-code/' prefix from a Haiku model string.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + provider = ClaudeCodeProvider() + + result = provider._extract_model("claude-code/claude-haiku-3-5-latest") + assert result == "claude-haiku-3-5-latest" + + def 
test_handles_model_without_prefix(self): + """Should return model as-is if no prefix.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + provider = ClaudeCodeProvider() + + result = provider._extract_model("claude-sonnet-4-20250514") + assert result == "claude-sonnet-4-20250514" + + +class TestMessageConversion: + """Test conversion from LiteLLM message format to prompt string.""" + + def test_single_user_message(self): + """Should convert single user message to prompt.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + provider = ClaudeCodeProvider() + + messages = [{"role": "user", "content": "Hello"}] + result = provider._convert_messages_to_prompt(messages) + + assert "Hello" in result + + def test_system_and_user_messages(self): + """Should include both system and user messages.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + provider = ClaudeCodeProvider() + + messages = [ + {"role": "system", "content": "You are helpful"}, + {"role": "user", "content": "Hello"} + ] + result = provider._convert_messages_to_prompt(messages) + + assert "You are helpful" in result + assert "Hello" in result + + def test_multi_turn_conversation(self): + """Should handle multi-turn conversations.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + provider = ClaudeCodeProvider() + + messages = [ + {"role": "system", "content": "Be helpful"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + {"role": "user", "content": "How are you?"} + ] + result = provider._convert_messages_to_prompt(messages) + + assert "Be helpful" in result + assert "Hello" in result + assert "Hi there" in result + assert "How are you?" 
in result + + def test_empty_messages(self): + """Should handle empty message list.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + provider = ClaudeCodeProvider() + + messages = [] + result = provider._convert_messages_to_prompt(messages) + + assert result == "" + + +class TestCompletionMethods: + """Test completion and acompletion methods.""" + + @pytest.mark.asyncio + async def test_acompletion_returns_model_response(self): + """acompletion should return a LiteLLM ModelResponse.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + from litellm.types.utils import ModelResponse + + provider = ClaudeCodeProvider() + + # Mock the _collect_response method instead of the SDK directly + async def mock_collect_response(prompt, model): + return ("Test response", {"session_id": "test-session", "input_tokens": 10, "output_tokens": 5}) + + with patch.object(provider, '_collect_response', mock_collect_response): + result = await provider.acompletion( + model="claude-code/claude-sonnet-4-20250514", + messages=[{"role": "user", "content": "Hello"}] + ) + + assert isinstance(result, ModelResponse) + + @pytest.mark.asyncio + async def test_acompletion_extracts_text_content(self): + """acompletion should extract text from Claude SDK response.""" + from crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + + provider = ClaudeCodeProvider() + + async def mock_collect_response(prompt, model): + return ("The capital of France is Paris.", {"session_id": "test-session", "input_tokens": 10, "output_tokens": 5}) + + with patch.object(provider, '_collect_response', mock_collect_response): + result = await provider.acompletion( + model="claude-code/claude-sonnet-4-20250514", + messages=[{"role": "user", "content": "What is the capital of France?"}] + ) + + assert "Paris" in result.choices[0].message.content + + def test_completion_calls_acompletion(self): + """Sync completion should internally call acompletion.""" + from 
crawl4ai.providers.claude_code_provider import ClaudeCodeProvider + + provider = ClaudeCodeProvider() + + with patch.object(provider, 'acompletion', new_callable=AsyncMock) as mock_acompletion: + mock_response = MagicMock() + mock_acompletion.return_value = mock_response + + provider.completion( + model="claude-code/claude-sonnet-4-20250514", + messages=[{"role": "user", "content": "Hello"}] + ) + + mock_acompletion.assert_awaited_once() + + +class TestErrorHandling: + """Test error handling scenarios.""" + + @pytest.mark.asyncio + async def test_import_error_when_sdk_not_installed(self): + """Should raise ClaudeCodeSDKError with helpful message if SDK not installed.""" + from crawl4ai.providers.claude_code_provider import ( + ClaudeCodeProvider, + ClaudeCodeSDKError, + ) + + provider = ClaudeCodeProvider() + + with patch.dict('sys.modules', {'claude_agent_sdk': None}): + with pytest.raises(ClaudeCodeSDKError) as exc_info: + await provider.acompletion( + model="claude-code/claude-sonnet-4-20250514", + messages=[{"role": "user", "content": "Hello"}] + ) + + assert "claude-agent-sdk" in str(exc_info.value).lower() or \ + "pip install" in str(exc_info.value).lower() + + +class TestProviderRegistration: + """Test that the provider is registered correctly with LiteLLM.""" + + def test_registration_function_exists(self): + """register_custom_providers function should exist.""" + from crawl4ai.providers import register_custom_providers + assert callable(register_custom_providers) + + def test_provider_registered_after_import(self): + """Provider should be registered after calling register_custom_providers.""" + import litellm + from crawl4ai.providers import register_custom_providers + + # Call registration + register_custom_providers() + + # Check if claude-code is in custom_provider_map + providers = [p.get("provider") for p in (litellm.custom_provider_map or [])] + assert "claude-code" in providers + + +class TestConfigIntegration: + """Test integration with 
Crawl4AI's config system.""" + + def test_provider_prefix_in_config(self): + """claude-code should be in PROVIDER_MODELS_PREFIXES.""" + from crawl4ai.config import PROVIDER_MODELS_PREFIXES + + assert "claude-code" in PROVIDER_MODELS_PREFIXES + + def test_provider_prefix_value_is_no_token_needed(self): + """claude-code should require no token (uses local auth).""" + from crawl4ai.config import PROVIDER_MODELS_PREFIXES + + assert PROVIDER_MODELS_PREFIXES.get("claude-code") == "no-token-needed" + + +class TestLLMConfigIntegration: + """Test integration with LLMConfig class.""" + + def test_llm_config_accepts_claude_code_provider(self): + """LLMConfig should accept claude-code provider without error.""" + from crawl4ai.async_configs import LLMConfig + + config = LLMConfig(provider="claude-code/claude-sonnet-4-20250514") + + assert config.provider == "claude-code/claude-sonnet-4-20250514" + + def test_llm_config_resolves_no_token_needed(self): + """LLMConfig should resolve api_token to 'no-token-needed' for claude-code.""" + from crawl4ai.async_configs import LLMConfig + + config = LLMConfig(provider="claude-code/claude-sonnet-4-20250514") + + assert config.api_token == "no-token-needed"