diff --git a/.github/workflows/build-artifacts.yml b/.github/workflows/build-artifacts.yml index bfe4fbf..cba567e 100644 --- a/.github/workflows/build-artifacts.yml +++ b/.github/workflows/build-artifacts.yml @@ -77,7 +77,7 @@ jobs: # - name: Build executable # run: | - # poetry run pyinstaller --onefile --name kb src/knowledgebase_processor/cli_v2/main.py + # poetry run pyinstaller --onefile --name kb src/knowledgebase_processor/cli/main.py # - name: Upload executable # uses: actions/upload-artifact@v4 diff --git a/ARCHITECTURE_V2.md b/ARCHITECTURE_V2.md index 95775b6..9e56ba1 100644 --- a/ARCHITECTURE_V2.md +++ b/ARCHITECTURE_V2.md @@ -2,7 +2,7 @@ ## ๐Ÿ—๏ธ Service-Oriented Architecture -The CLI v2 introduces a **service-oriented architecture** that separates the user interface from business logic, making the system more testable, maintainable, and extensible. +The CLI introduces a **service-oriented architecture** that separates the user interface from business logic, making the system more testable, maintainable, and extensible. ## ๐Ÿ“ Architecture Overview @@ -66,7 +66,7 @@ class OrchestratorService: ``` ### ๐Ÿ–ฅ๏ธ CLI Commands -**Location**: `src/knowledgebase_processor/cli_v2/commands/` +**Location**: `src/knowledgebase_processor/cli/commands/` **Purpose**: Thin wrappers that provide beautiful UI around service operations. @@ -147,7 +147,7 @@ class ProjectStats: - โœ… Error handling for uninitialized projects ### 2. **End-to-End Tests** (Minimal) -**Location**: `tests/cli_v2/test_cli_e2e.py` +**Location**: `tests/cli/test_cli_e2e.py` **Scope**: - Test CLI interface with Click test runner diff --git a/CLI_V2_DEMO.md b/CLI_V2_DEMO.md deleted file mode 100644 index 7fd60f8..0000000 --- a/CLI_V2_DEMO.md +++ /dev/null @@ -1,256 +0,0 @@ -# Knowledge Base Processor CLI v2.0 - User Experience Transformation - -## ๐ŸŽฏ Design Philosophy - -The new CLI v2 transforms the Knowledge Base Processor from a technical tool into a **delightful user experience** that guides users through their knowledge management journey. - -## โœจ Key Improvements - -### 1. **Human-Centric Commands** -```bash -# Old CLI (technical, intimidating) -kbp process --pattern "**/*.md" --rdf-output-dir output/rdf - -# New CLI (intuitive, friendly) -kb scan --watch -``` - -### 2. **Rich Visual Feedback** -- **Progress bars** with spinner animations -- **Colored output** with semantic meaning -- **Emojis** for better visual scanning -- **Tables** for structured data display -- **Panels** for important information - -### 3. **Interactive Mode** -```bash -# Simply run without arguments -kb - -# Enters a guided wizard experience: -๐Ÿง  Knowledge Base Processor -Your intelligent document companion - -โœจ Welcome! Let's set up your knowledge base. - -What would you like to do? - 1. Initialize a new knowledge base here - 2. Scan an existing document folder - 3. Connect to a SPARQL endpoint - 4. Just explore the CLI -``` - -### 4. **Intelligent Help System** -- **Contextual examples** in every command -- **Tip suggestions** based on usage patterns -- **Error recovery** with actionable suggestions -- **Progressive disclosure** of advanced features - -### 5. 
**Smart Defaults** -- Auto-detects knowledge base location -- Suggests sensible configuration values -- Remembers user preferences -- Graceful fallbacks for missing configs - -## ๐Ÿ“Š Before vs After Comparison - -| Aspect | CLI v1 (Old) | CLI v2 (New) | -|--------|-------------|-------------| -| **First Impression** | `kbp process --help` (intimidating) | `kb` (friendly wizard) | -| **Learning Curve** | Steep - requires manual reading | Gentle - guides you through | -| **Error Messages** | Technical stack traces | Helpful suggestions with solutions | -| **Visual Design** | Plain text output | Rich colors, emojis, progress bars | -| **User Journey** | Manual documentation lookup | Interactive guided experience | -| **Command Length** | Long, complex arguments | Short, memorable commands | -| **Discovery** | Hidden behind --help flags | Progressively revealed through use | - -## ๐Ÿš€ Command Showcase - -### Configure Processor for Documents -```bash -kb init ~/my-notes --name "Project Notes" - -# Output: -๐Ÿš€ Configuring Knowledge Base Processor - - Setting up processor configuration... โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” 100% -โœ“ Processor configured for project 'Project Notes'! - ๐Ÿ“ Document directory: /Users/me/my-notes - โš™๏ธ Configuration: /Users/me/my-notes/.kbp/config.yaml -โ„น๏ธ Found 156 existing documents ready to process - -Next steps: - 1. Run kb scan to process your existing documents - 2. Use kb search to explore extracted knowledge - 3. Check kb status for processing statistics -``` - -### Scan Documents -```bash -kb scan --watch - -# Output: -๐Ÿ“ Scanning Documents - -โ„น๏ธ Scanning: /Users/me/my-notes - ๐Ÿ“‹ Patterns: **/*.md, **/*.txt - ๐Ÿ”„ Recursive: Yes - ๐Ÿ’ช Force: No - -๐Ÿ“Š Found 156 files to process - - Processing files... โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” 100% 0:00:45 - -โœ“ Processing completed in 45.2s - -๐Ÿ“ˆ Results Summary: - โœ… Files processed: 156 - ๐Ÿ”— Entities extracted: 1,234 - โ˜ Todos found: 89 - -๐Ÿ‘€ Watch Mode Enabled -Monitoring for file changes... 
Press Ctrl+C to stop -``` - -### Search Knowledge Base -```bash -kb search "project todos" --type todo - -# Output: -๐Ÿ” Searching Knowledge Base - - ๐Ÿ” Query: 'project todos' - โ˜ Type: todo - ๐Ÿ“„ Limit: 20 - ๐Ÿ“ Scope: my-notes - -โœ“ Found 12 results in 0.15s - - Search Results -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ Type โ”‚ Title โ”‚ Score โ”‚ -โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค -โ”‚ โ˜ todo โ”‚ Implement user auth โ”‚ 95% โ”‚ -โ”‚ โ˜‘ todo โ”‚ Setup project structure โ”‚ 87% โ”‚ -โ”‚ โ˜ todo โ”‚ Write API documentation โ”‚ 78% โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - -Search Tips: - โ€ข Use quotes for exact phrases: kb search "exact phrase" - โ€ข Filter by type: kb search --type todo urgent - โ€ข Use regex patterns: kb search --regex "bug-[0-9]+" -``` - -### View Status -```bash -kb status --detailed - -# Output: -๐Ÿ“Š Knowledge Base Status - -โ„น๏ธ Knowledge Base: Project Notes -๐Ÿ“ Location: /Users/me/my-notes - - Overview -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ Metric โ”‚ Value โ”‚ Status โ”‚ -โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค -โ”‚ ๐Ÿ“„ Documents โ”‚ 156 / 156 โ”‚ โœ… Good โ”‚ -โ”‚ ๐Ÿ”— Total Entities โ”‚ 1,234 โ”‚ โœ… Active โ”‚ -โ”‚ โ˜ Todos โ”‚ 23 done, 66 pending โ”‚ ๐Ÿ“ Active โ”‚ -โ”‚ ๐Ÿท๏ธ Tags โ”‚ 45 โ”‚ โœ… Active โ”‚ -โ”‚ ๐Ÿ”— Wiki Links โ”‚ 567 โ”‚ โœ… Connected โ”‚ -โ”‚ ๐Ÿ“… Last Scan โ”‚ just now โ”‚ โœ… Recent โ”‚ -โ”‚ ๐Ÿ’พ Database Size โ”‚ 11.8 MB โ”‚ โœ… Normal โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - -โšก Performance - โ€ข Last processing time: 45.2s - โ€ข Average processing speed: 3.5 docs/second - -๐Ÿ’ก Suggestions - โ€ข Your knowledge base is in great shape! ๐ŸŒŸ -``` - -### Sync to SPARQL -```bash -kb sync fuseki --dataset kb-test - -# Output: -๐Ÿ”„ Syncing to SPARQL Endpoint - - ๐ŸŽฏ Endpoint: http://localhost:3030/kb-test - ๐Ÿ“Š Graph: http://example.org/knowledgebase - ๐Ÿ‘ค Authentication: No - ๐Ÿ“ฆ Batch size: 1,000 triples - -๐Ÿ“‹ Data to sync: - โ€ข 4,567 total triples - โ€ข 156 documents - โ€ข 1,234 entities - โ€ข 89 todos - โ€ข 567 relationships - - Uploading data in 5 batches... โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” 100% 0:00:12 - Verifying upload... โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” 100% 0:00:02 - -โœ“ Sync completed successfully in 14.3s - -๐Ÿ“ˆ Sync Results: - โœ… 4,567 triples uploaded - ๐Ÿ“Š Graph: http://example.org/knowledgebase - โšก Transfer rate: 319 triples/second - -๐ŸŽฏ Next Steps: - โ€ข Test your endpoint: http://localhost:3030/kb-test - โ€ข Run SPARQL queries to explore your data - โ€ข Use kb status to monitor your knowledge base -``` - -## ๐ŸŽจ Design Principles Applied - -### 1. **Progressive Disclosure** -- Start simple (`kb` with no args) -- Reveal complexity as needed -- Advanced features available but not overwhelming - -### 2. 
**Helpful Guidance** -- Every action suggests logical next steps -- Error messages include solutions -- Tips and examples throughout - -### 3. **Visual Hierarchy** -- Important information stands out -- Status indicators use consistent colors -- Tables organize complex data clearly - -### 4. **Human Language** -- "Scanning documents" vs "Processing files" -- "Your knowledge base" vs "Database" -- "Found 5 results" vs "Query returned 5 rows" - -### 5. **Forgiveness** -- Smart defaults reduce required input -- --dry-run options for safe testing -- --force flags for intentional overrides -- Auto-recovery suggestions - -## ๐Ÿ† User Experience Wins - -1. **Reduced Time to First Success**: 30 seconds vs 10+ minutes -2. **Error Recovery**: Actionable suggestions vs cryptic stack traces -3. **Discoverability**: Interactive exploration vs manual diving -4. **Confidence**: Clear feedback vs uncertainty about what happened -5. **Memorability**: Short, logical commands vs complex argument chains - -## ๐Ÿ”ฎ Future Enhancements - -- **Auto-completion** for bash/zsh shells -- **Configuration wizard** for complex setups -- **Health monitoring** with proactive suggestions -- **Usage analytics** to improve UX further -- **Plugin system** for extensibility - ---- - -The CLI v2 transforms the Knowledge Base Processor from a powerful but intimidating tool into an **intelligent companion** that guides users through their knowledge management journey with confidence and delight. \ No newline at end of file diff --git a/README.md b/README.md index 03f488a..db53520 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,17 @@ A Python tool for extracting, analyzing, and managing metadata from Markdown-bas ## Features -- Extracts metadata, tags, and structural elements from Markdown files -- Modular architecture for analyzers, extractors, and enrichers -- Easily extensible for new metadata types or processing logic -- Command-line interface for batch processing -- Comprehensive test suite +- ๐Ÿ” Extracts metadata, tags, and structural elements from Markdown files +- ๐Ÿ—๏ธ Modular architecture for analyzers, extractors, and enrichers +- ๐Ÿ”Œ Easily extensible for new metadata types or processing logic +- ๐ŸŽจ Modern command-line interface with rich terminal UI +- ๐Ÿ“Š Interactive mode for guided workflows +- ๐Ÿ”„ Real-time file watching and continuous processing +- ๐Ÿงช Comprehensive test suite -## Developing +## Quick Start -### Setup +### Installation 1. **Clone the repository:** ```bash @@ -30,68 +32,164 @@ A Python tool for extracting, analyzing, and managing metadata from Markdown-bas poetry install ``` -### Running Tests +### Basic Usage + +The Knowledge Base Processor provides a modern CLI interface with two command aliases: `kb` and `kbp`. 
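+
+For scripting or test automation, the same commands can also be driven in-process through Click's test runner — this is how the end-to-end tests in `tests/cli/test_cli_e2e.py` exercise the CLI. A minimal sketch (the choice of the `status` command here is only an illustration):
+
+```python
+from click.testing import CliRunner
+
+from knowledgebase_processor.cli.main import cli
+
+# Invoke `kb status` in-process; Click returns a Result whose exit_code
+# and output mirror what a shell invocation of `kb status` would produce.
+runner = CliRunner()
+result = runner.invoke(cli, ["status"])
+print(result.exit_code)
+print(result.output)
+```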
-Run all tests using the provided script: ```bash -poetry run python scripts/run_tests.py +# Initialize a new knowledge base in the current directory +kb init + +# Process documents in the current directory +kb scan + +# Search for content +kb search "todo items" + +# Process and sync to SPARQL endpoint in one command +kb publish --endpoint http://localhost:3030/kb + +# Enter interactive mode (just run kb without arguments) +kb ``` -### Running the Processor +## CLI Commands -To process your knowledge base, use: +### ๐Ÿš€ `kb init` - Initialize Knowledge Base +Configure the processor for your documents: ```bash -poetry run python scripts/run_processor.py +kb init # Interactive setup +kb init ~/Documents # Initialize specific directory +kb init --name "My KB" # Set project name ``` -For available options and arguments, run: + +### ๐Ÿ“ `kb scan` - Process Documents +Process documents and extract knowledge entities: ```bash -poetry run python scripts/run_processor.py --help +kb scan # Scan current directory +kb scan ~/Documents # Scan specific directory +kb scan --pattern "*.md" # Only process Markdown files +kb scan --watch # Watch for changes +kb scan --sync --endpoint # Process + sync to SPARQL ``` -### Process and Load Knowledgebase into SPARQL Endpoint +### ๐Ÿ” `kb search` - Search Knowledge Base +Search your processed knowledge base: +```bash +kb search "machine learning" # Full-text search +kb search --type todo "project" # Search specific entity types +kb search --tag important # Search by tags +``` -The `process-and-load` command processes all files in a knowledgebase directory, generates RDF data, and loads it into a SPARQL endpoint in a single step. +### ๐Ÿ“ค `kb publish` - Publish to SPARQL +Process and sync to SPARQL endpoint in one command: +```bash +kb publish # Use default endpoint +kb publish --endpoint # Specify endpoint +kb publish --watch # Continuous publishing mode +kb publish --graph # Specify named graph +``` -**Basic usage:** +### ๐Ÿ”„ `kb sync` - Sync to SPARQL +Sync already processed data to SPARQL endpoint: ```bash -poetry run python -m knowledgebase_processor.cli.main process-and-load /path/to/knowledgebase +kb sync # Sync to default endpoint +kb sync --endpoint # Specify endpoint +kb sync --clear # Clear endpoint before sync ``` -**Options:** -- `--pattern PATTERN` - Only process files matching the given glob pattern (e.g., `*.md`). +### ๐Ÿ“Š `kb status` - Show Status +Display knowledge base statistics and status: +```bash +kb status # Show current status +kb status --detailed # Show detailed statistics +``` -- `--graph GRAPH_URI` - Specify the named graph URI to load data into. +### โš™๏ธ `kb config` - Manage Configuration +View and manage configuration: +```bash +kb config show # Display current config +kb config set endpoint # Set SPARQL endpoint +kb config reset # Reset to defaults +``` -- `--endpoint ENDPOINT_URL` - Override the default SPARQL endpoint URL. +## Advanced Usage -- `--update-endpoint UPDATE_ENDPOINT_URL` - Specify a separate SPARQL update endpoint. +### Interactive Mode +Run `kb` without any arguments to enter interactive mode with a guided interface: +```bash +kb +``` -- `--cleanup` - Remove temporary RDF files after loading. 
+### Process with RDF Output +Generate RDF/TTL files during processing: +```bash +kb scan --rdf-output ./rdf_output +``` -**Example invocations:** +### Continuous Processing +Watch for file changes and automatically process: ```bash -# Process all Markdown files and load into default endpoint -poetry run python -m knowledgebase_processor.cli.main process-and-load ./sample_data +kb scan --watch +kb publish --watch +``` -# Process only daily notes and load into a specific named graph -poetry run python -m knowledgebase_processor.cli.main process-and-load ./sample_data --pattern "daily-note-*.md" --graph "http://example.org/graph/daily" +### Using as a Python Module +```bash +# Run CLI as a module +python -m knowledgebase_processor.cli --help +``` -# Specify a custom SPARQL endpoint and cleanup temporary files -poetry run python -m knowledgebase_processor.cli.main process-and-load ./sample_data --endpoint http://localhost:3030/ds --cleanup +## Development + +### Running Tests +Run all tests using the provided script: +```bash +poetry run python scripts/run_tests.py ``` -Progress and errors will be reported in the console. For more options, run: +Or use pytest directly: ```bash -poetry run python -m knowledgebase_processor.cli.main process-and-load --help +poetry run pytest +poetry run pytest tests/cli/ # Test CLI specifically +``` + +### Architecture +The processor uses a service-oriented architecture with clear separation between: +- **CLI Layer**: User interface and command handling +- **Service Layer**: Business logic and orchestration +- **Data Layer**: Document processing and persistence + +See [ARCHITECTURE_V2.md](ARCHITECTURE_V2.md) for detailed architecture documentation. + +## Configuration + +The processor can be configured via: +1. Command-line arguments (highest priority) +2. Configuration file (`.kbp/config.yaml`) +3. Environment variables +4. Default values + +Example configuration file: +```yaml +knowledge_base: + path: /path/to/documents + patterns: + - "*.md" + - "*.markdown" +sparql: + endpoint: http://localhost:3030/kb + graph: http://example.org/kb +processing: + batch_size: 100 + parallel: true ``` -And the processor also handles wikilinks [[A wikilink]] +## Wikilinks Support +The processor handles wikilinks [[A wikilink]] and extracts them as relationships between documents. -### Contributing +## Contributing +Fork the repository, create a feature branch, and submit a pull request. Please ensure all tests pass before submitting. -Fork the repository, create a feature branch, and submit a pull request. Please ensure all tests pass before submitting. 
\ No newline at end of file +## License +[Add your license information here] \ No newline at end of file diff --git a/docs/architecture/build-artifacts-options.md b/docs/architecture/build-artifacts-options.md index 2aa4821..634f769 100644 --- a/docs/architecture/build-artifacts-options.md +++ b/docs/architecture/build-artifacts-options.md @@ -146,7 +146,7 @@ Create a custom buildpack extension that includes pre-compiled wheels for srsly # BUILD.bazel py_binary( name = "kb", - srcs = ["src/knowledgebase_processor/cli_v2/main.py"], + srcs = ["src/knowledgebase_processor/cli/main.py"], deps = [ requirement("spacy"), requirement("srsly"), diff --git a/pyproject.toml b/pyproject.toml index beae663..b5ddd12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,8 +28,8 @@ docker = "^6.1.0" requests = "^2.31.0" [tool.poetry.scripts] -kbp = "knowledgebase_processor.cli:main" -kb = "knowledgebase_processor.cli_v2.main:main" +kbp = "knowledgebase_processor.cli.main:main" +kb = "knowledgebase_processor.cli.main:main" test = "scripts.run_tests:main" [build-system] diff --git a/scripts/run_processor.py b/scripts/run_processor.py old mode 100644 new mode 100755 index 8e6d381..c4af9e4 --- a/scripts/run_processor.py +++ b/scripts/run_processor.py @@ -1,28 +1,47 @@ +#!/usr/bin/env python +""" +DEPRECATED: This script is deprecated in favor of the new CLI interface. + +Please use the 'kb' or 'kbp' commands instead: + kb scan docs/ --rdf-output rdf_output/ + +For more information, run: + kb --help + kb scan --help +""" + import subprocess import sys import os def main(): + print("=" * 70) + print("DEPRECATION WARNING") + print("=" * 70) + print("This script is deprecated. Please use the 'kb' command instead:") + print() + print(" kb scan docs/ --rdf-output rdf_output/") + print() + print("For more information, run: kb --help") + print("=" * 70) + print() + + # Still run the command for backwards compatibility input_dir = "docs/" output_dir = "tmp/" - rdf_output_dir_value = "rdf_output/" # New RDF output directory + rdf_output_dir_value = "rdf_output/" os.makedirs(output_dir, exist_ok=True) - os.makedirs(rdf_output_dir_value, exist_ok=True) # Create RDF output dir + os.makedirs(rdf_output_dir_value, exist_ok=True) cmd = [ "poetry", "run", - "python", - "-m", - "knowledgebase_processor.cli.main", - "--knowledge-base", + "kb", + "scan", input_dir, - "--metadata-store", - output_dir, - "process", - "--rdf-output-dir", # Add the new argument - rdf_output_dir_value # Add its value + "--rdf-output", + rdf_output_dir_value ] result = subprocess.run(cmd, capture_output=True, text=True) print(result.stdout) diff --git a/src/knowledgebase_processor/cli/__init__.py b/src/knowledgebase_processor/cli/__init__.py index db8bd82..3e929c3 100644 --- a/src/knowledgebase_processor/cli/__init__.py +++ b/src/knowledgebase_processor/cli/__init__.py @@ -1,3 +1,3 @@ -"""Command-line interface for the Knowledge Base Processor.""" +"""Modern CLI for Knowledge Base Processor v2.""" -from .main import main \ No newline at end of file +__version__ = "2.0.0" \ No newline at end of file diff --git a/src/knowledgebase_processor/cli_v2/commands/__init__.py b/src/knowledgebase_processor/cli/commands/__init__.py similarity index 100% rename from src/knowledgebase_processor/cli_v2/commands/__init__.py rename to src/knowledgebase_processor/cli/commands/__init__.py diff --git a/src/knowledgebase_processor/cli_v2/commands/config.py b/src/knowledgebase_processor/cli/commands/config.py similarity index 100% rename from 
src/knowledgebase_processor/cli_v2/commands/config.py rename to src/knowledgebase_processor/cli/commands/config.py diff --git a/src/knowledgebase_processor/cli_v2/commands/init.py b/src/knowledgebase_processor/cli/commands/init.py similarity index 100% rename from src/knowledgebase_processor/cli_v2/commands/init.py rename to src/knowledgebase_processor/cli/commands/init.py diff --git a/src/knowledgebase_processor/cli_v2/commands/publish.py b/src/knowledgebase_processor/cli/commands/publish.py similarity index 100% rename from src/knowledgebase_processor/cli_v2/commands/publish.py rename to src/knowledgebase_processor/cli/commands/publish.py diff --git a/src/knowledgebase_processor/cli_v2/commands/scan.py b/src/knowledgebase_processor/cli/commands/scan.py similarity index 100% rename from src/knowledgebase_processor/cli_v2/commands/scan.py rename to src/knowledgebase_processor/cli/commands/scan.py diff --git a/src/knowledgebase_processor/cli_v2/commands/search.py b/src/knowledgebase_processor/cli/commands/search.py similarity index 100% rename from src/knowledgebase_processor/cli_v2/commands/search.py rename to src/knowledgebase_processor/cli/commands/search.py diff --git a/src/knowledgebase_processor/cli_v2/commands/status.py b/src/knowledgebase_processor/cli/commands/status.py similarity index 100% rename from src/knowledgebase_processor/cli_v2/commands/status.py rename to src/knowledgebase_processor/cli/commands/status.py diff --git a/src/knowledgebase_processor/cli_v2/commands/sync.py b/src/knowledgebase_processor/cli/commands/sync.py similarity index 100% rename from src/knowledgebase_processor/cli_v2/commands/sync.py rename to src/knowledgebase_processor/cli/commands/sync.py diff --git a/src/knowledgebase_processor/cli_v2/interactive.py b/src/knowledgebase_processor/cli/interactive.py similarity index 100% rename from src/knowledgebase_processor/cli_v2/interactive.py rename to src/knowledgebase_processor/cli/interactive.py diff --git a/src/knowledgebase_processor/cli/main.py b/src/knowledgebase_processor/cli/main.py index 08138da..1db05c6 100644 --- a/src/knowledgebase_processor/cli/main.py +++ b/src/knowledgebase_processor/cli/main.py @@ -1,520 +1,123 @@ -"""Main CLI implementation for the Knowledge Base Processor.""" +"""Modern CLI implementation with delightful user experience.""" -import argparse -import sys +import click from pathlib import Path -from typing import List, Optional -import json -from urllib.parse import urlparse - -from ..config import load_config -from ..api import KnowledgeBaseAPI -from ..utils.logging import setup_logging, get_logger -from SPARQLWrapper.SPARQLExceptions import SPARQLWrapperException - - -logger = get_logger("knowledgebase_processor.cli") - - -def is_valid_url(url: str) -> bool: - """Check if a string is a valid URL.""" - if not url: - return False - try: - result = urlparse(url) - return all([result.scheme, result.netloc]) - except ValueError: - return False - - -def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace: - """Parse command-line arguments. 
- - Args: - args: Command-line arguments (optional, defaults to sys.argv[1:]) - - Returns: - Parsed arguments - """ - parser = argparse.ArgumentParser( - description="Knowledge Base Processor - Extract and analyze knowledge base content" - ) - - parser.add_argument( - "--config", "-c", - help="Path to configuration file", - type=str - ) - - parser.add_argument( - "--knowledge-base", "-k", - help="Path to knowledge base directory", - type=str - ) - parser.add_argument( - "--metadata-store", "-m", - help="Path to metadata store directory (e.g., ~/.kbp/metadata). " - "The database file 'knowledgebase.db' will be created/used within this directory. " - "Defaults to the directory specified in the config file, or '~/.kbp/metadata' if not set.", - type=str - ) - - - parser.add_argument( - "--log-level", "-l", - help="Logging level", - choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], - default="INFO" - ) - - parser.add_argument( - "--log-file", - help="Path to log file", - type=str - ) - - parser.add_argument( - "--log-format", - help="Logging format", - choices=["text", "json"], - default="text" - ) - - subparsers = parser.add_subparsers(dest="command", help="Command to execute", required=True) - - # Process command - process_parser = subparsers.add_parser("process", help="Process knowledge base files") - process_parser.add_argument( - "--pattern", "-p", - help="File pattern to process (default: **/*.md)", - default="**/*.md" - ) - process_parser.add_argument( - "--rdf-output-dir", - help="Directory to save RDF output files (e.g., output/rdf). If provided, RDF/TTL files will be generated.", - type=str, - default=None - ) - - # Process-and-load command - process_and_load_parser = subparsers.add_parser("process-and-load", help="Process and load RDF into SPARQL endpoint") - process_and_load_parser.add_argument( - "knowledge_base_path", - help="Path to the knowledge base directory. 
If omitted, the value from the global " - "--knowledge-base argument, the config file, or the current directory is used.", - type=str, - nargs="?", - default=None, - ) - process_and_load_parser.add_argument( - "--pattern", "-p", - help="File pattern to process (default: **/*.md)", - default="**/*.md" - ) - process_and_load_parser.add_argument( - "--graph", "-g", - help="Named graph URI to load data into", - type=str, - default=None - ) - process_and_load_parser.add_argument( - "--endpoint-url", "-e", - help="SPARQL endpoint URL (overrides config)", - type=str, - default=None - ) - - process_and_load_parser.add_argument( - "--cleanup", - help="Remove temporary RDF files after loading", - action="store_true" - ) - process_and_load_parser.add_argument( - "--rdf-output-dir", - help="Directory to save temporary RDF output files", - type=str, - default=None - ) - process_and_load_parser.add_argument( - "--user", "-u", - help="Username for SPARQL endpoint authentication", - type=str - ) - process_and_load_parser.add_argument( - "--password", "-P", - help="Password for SPARQL endpoint authentication", - type=str - ) - - - # Query command - query_parser = subparsers.add_parser("query", help="Query the knowledge base") - query_parser.add_argument( - "query_string", - help="Query string" - ) - query_parser.add_argument( - "--type", "-t", - help="Query type", - choices=["text", "tag", "topic"], - default="text" - ) - - # SPARQL command group - sparql_parser = subparsers.add_parser("sparql", help="SPARQL operations") - sparql_subparsers = sparql_parser.add_subparsers(dest="sparql_command", help="SPARQL command to execute", required=True) - - # SPARQL query command - sparql_query_parser = sparql_subparsers.add_parser("query", help="Execute a SPARQL query") - sparql_query_parser.add_argument( - "sparql_query", - help="SPARQL query string" - ) - sparql_query_parser.add_argument( - "--endpoint-url", "-e", - help="SPARQL endpoint URL (overrides config)", - type=str - ) - sparql_query_parser.add_argument( - "--timeout", "-t", - help="Query timeout in seconds", - type=int, - default=30 - ) - sparql_query_parser.add_argument( - "--format", "-f", - help="Output format for results", - choices=["json", "table", "turtle"], - default="table" - ) - sparql_query_parser.add_argument( - "--user", "-u", - help="Username for SPARQL endpoint authentication", - type=str - ) - sparql_query_parser.add_argument( - "--password", "-P", - help="Password for SPARQL endpoint authentication", - type=str - ) - - # SPARQL load-file command - sparql_load_parser = sparql_subparsers.add_parser("load-file", help="Load an RDF file into the SPARQL store") - sparql_load_parser.add_argument( - "file_path", - help="Path to the RDF file to load" - ) - sparql_load_parser.add_argument( - "--graph", "-g", - help="Named graph URI to load data into", - type=str - ) - sparql_load_parser.add_argument( - "--endpoint-url", "-e", - help="SPARQL endpoint URL (overrides config)", - type=str - ) - sparql_load_parser.add_argument( - "--user", "-u", - help="Username for SPARQL endpoint authentication", - type=str - ) - sparql_load_parser.add_argument( - "--password", "-P", - help="Password for SPARQL endpoint authentication", - type=str - ) - sparql_load_parser.add_argument( - "--rdf-format", - help="RDF format of the input file", - choices=["turtle", "n3", "nt", "xml", "json-ld"], - default="turtle" - ) - - return parser.parse_args(args) - +from typing import Optional +import sys -def handle_sparql_query(api: KnowledgeBaseAPI, args: argparse.Namespace) -> 
int: - """Handle SPARQL query command via API. - - Args: - api: KnowledgeBaseAPI instance - args: Parsed command-line arguments - - Returns: - Exit code +from .commands import init, scan, search, sync, publish, status, config +from .utils import console, setup_error_handling +from .interactive import InteractiveMode + +# Version info +__version__ = "2.0.0" + +CONTEXT_SETTINGS = dict( + help_option_names=['-h', '--help'], + max_content_width=120, +) + + +class KBContext: + """Context object for passing state between commands.""" + + def __init__(self): + self.config_path: Optional[Path] = None + self.kb_path: Optional[Path] = None + self.verbose: bool = False + self.quiet: bool = False + self.no_color: bool = False + self.yes: bool = False # Auto-confirm prompts + + +@click.group(context_settings=CONTEXT_SETTINGS, invoke_without_command=True) +@click.option('--version', '-V', is_flag=True, help='Show version and exit.') +@click.option('--config', '-c', type=click.Path(exists=True), help='Path to config file.') +@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output.') +@click.option('--quiet', '-q', is_flag=True, help='Suppress all output except errors.') +@click.option('--no-color', is_flag=True, help='Disable colored output.') +@click.option('--yes', '-y', is_flag=True, help='Auto-confirm all prompts (non-interactive).') +@click.pass_context +def cli(ctx, version, config, verbose, quiet, no_color, yes): + """๐Ÿง  Knowledge Base Processor - Your intelligent document companion. + + A modern CLI for processing your existing documents into a searchable knowledge graph. + Run without arguments for interactive mode. + + \b + Quick Start: + kb init Configure processor for your documents + kb scan Process documents in current directory + kb publish Process + sync to SPARQL in one command + kb search "todo" Search for content + kb status Show processing statistics + + \b + Examples: + kb publish --watch Continuous publishing mode + kb scan --sync --endpoint Process + sync to endpoint + kb search --type todo "project tasks" """ - try: - result = api.sparql_query( - query=args.sparql_query, - endpoint_url=args.endpoint_url, - timeout=args.timeout, - format=args.format - ) - - # Format and print results based on the format - if args.format == "json": - print(json.dumps(result, indent=2)) - elif args.format == "table": - if isinstance(result, list) and result: - # Print headers - headers = list(result[0].keys()) - print(" | ".join(headers)) - print("-" * (len(" | ".join(headers)))) - - # Print rows - for row in result: - values = [str(row.get(header, "")) for header in headers] - print(" | ".join(values)) - elif isinstance(result, bool): - print(result) - else: - print("No results found.") - elif args.format == "turtle": - print(result) + # Create context + ctx.obj = KBContext() + ctx.obj.verbose = verbose + ctx.obj.quiet = quiet + ctx.obj.no_color = no_color + ctx.obj.yes = yes + + if config: + ctx.obj.config_path = Path(config) + + # Setup error handling + setup_error_handling() + + # Show version and exit + if version: + from rich import print as rprint + from rich.panel import Panel + rprint(Panel.fit( + f"[bold cyan]Knowledge Base Processor[/bold cyan]\n" + f"Version {__version__}", + border_style="cyan" + )) + ctx.exit(0) + + # If no command specified, enter interactive mode + if ctx.invoked_subcommand is None: + if not sys.stdin.isatty() or yes: + # Non-interactive environment or --yes flag + console.print("[yellow]No command specified.[/yellow] Use 'kb --help' for usage.") 
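+            # Exit non-zero rather than launching the wizard, so unattended runs
+            # (CI, piped input, or --yes) fail fast instead of blocking on prompts.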
+ ctx.exit(1) else: - print(result) - - return 0 - - except Exception as e: - logger.error(f"SPARQL query failed: {e}") - return 1 + # Enter interactive mode + interactive = InteractiveMode(ctx.obj) + interactive.run() + ctx.exit(0) -def handle_sparql_load(api: KnowledgeBaseAPI, args: argparse.Namespace) -> int: - """Handle SPARQL load-file command via API. - - Args: - api: KnowledgeBaseAPI instance - args: Parsed command-line arguments - - Returns: - Exit code - """ - try: - api.sparql_load( - file_path=Path(args.file_path), - graph_uri=args.graph, - endpoint_url=args.endpoint_url, - username=args.user, - password=args.password, - rdf_format=args.rdf_format - ) - logger.info(f"Successfully loaded RDF file '{args.file_path}' into graph '{args.graph}'.") - return 0 - - except Exception as e: - logger.error(f"Failed to load RDF file '{args.file_path}': {e}") - return 1 +# Register commands +cli.add_command(init.init_cmd) +cli.add_command(scan.scan_cmd) +cli.add_command(search.search_cmd) +cli.add_command(sync.sync_cmd) +cli.add_command(publish.publish_cmd) +cli.add_command(status.status_cmd) +cli.add_command(config.config_cmd) -def handle_sparql(api: KnowledgeBaseAPI, args: argparse.Namespace) -> int: - """Route SPARQL commands to their respective handlers. - - Args: - api: KnowledgeBaseAPI instance - args: Parsed command-line arguments - - Returns: - Exit code - """ - if args.sparql_command == "query": - return handle_sparql_query(api, args) - elif args.sparql_command == "load-file": - return handle_sparql_load(api, args) - else: - logger.error(f"Unknown SPARQL command: {args.sparql_command}") - return 1 - - -def main(args: Optional[List[str]] = None) -> int: - """Main entry point for the CLI. - - Args: - args: Command-line arguments (optional, defaults to sys.argv[1:]) - - Returns: - Exit code - """ - # Parse arguments - parsed_args = parse_args(args) - - # Set up logging - setup_logging(parsed_args.log_level, parsed_args.log_file, parsed_args.log_format) - - # Load configuration - config = load_config(parsed_args.config) - - # Override config with command-line arguments - if parsed_args.knowledge_base: - config.knowledge_base_path = parsed_args.knowledge_base - elif not hasattr(config, 'knowledge_base_path') or not config.knowledge_base_path: - config.knowledge_base_path = str(Path.cwd()) - logger.info(f"Knowledge base path not specified, defaulting to current directory: {config.knowledge_base_path}") - - # Handle metadata store path - if parsed_args.metadata_store: - db_directory_path_str = parsed_args.metadata_store - logger.info(f"Using metadata store directory from command line: {db_directory_path_str}") - elif hasattr(config, 'metadata_store_path') and config.metadata_store_path: - db_directory_path_str = config.metadata_store_path - logger.info(f"Using metadata store directory from config: {db_directory_path_str}") - else: - default_dir = Path.home() / ".kbp" / "metadata" - db_directory_path_str = str(default_dir) - logger.warning( - f"Metadata store directory not specified via CLI or config, " - f"or config value is empty. 
Defaulting to: {db_directory_path_str}" - ) - - db_directory_path = Path(db_directory_path_str) - db_filename = "knowledgebase.db" - db_file_path = db_directory_path / db_filename - config.metadata_store_path = str(db_file_path) - - logger.info(f"Final database file path: {db_file_path}") - - # Initialize API with config +def main(): + """Entry point for the CLI.""" try: - api = KnowledgeBaseAPI(config) + cli() + except KeyboardInterrupt: + console.print("\n[yellow]Interrupted by user.[/yellow]") + sys.exit(130) except Exception as e: - logger.error(f"Failed to initialize KnowledgeBaseAPI: {e}", exc_info=True) - return 1 - - # Route to appropriate handler - handlers = { - 'process': handle_process, - 'process-and-load': handle_process_and_load, - 'query': handle_query, - 'sparql': handle_sparql - } - - handler = handlers.get(parsed_args.command) - if handler: - return handler(api, parsed_args) - else: - logger.error("No command specified or unknown command. Use 'process', 'query', or 'sparql'.") - return 1 - -def handle_process_and_load(api: KnowledgeBaseAPI, args: argparse.Namespace) -> int: - """Handle process-and-load command via API. - - Args: - api: KnowledgeBaseAPI instance - args: Parsed command-line arguments - - Returns: - Exit code - """ - logger.info("Starting process-and-load operation") - - # --- Validation --- - kb_path = args.knowledge_base_path if args.knowledge_base_path is not None else api.config.knowledge_base_path - knowledge_base_path = Path(kb_path) - if not knowledge_base_path.is_dir(): - logger.error(f"Invalid knowledge base path: '{knowledge_base_path}' is not a directory.") - return 1 - - endpoint_url = args.endpoint_url or api.config.sparql_endpoint_url - if not endpoint_url: - logger.error("SPARQL endpoint URL is required. Provide it via --endpoint-url or in the config file.") - return 1 + console.print(f"\n[red]Unexpected error:[/red] {e}") + if '-v' in sys.argv or '--verbose' in sys.argv: + console.print_exception() + sys.exit(1) - if not is_valid_url(endpoint_url): - logger.error(f"Invalid SPARQL endpoint URL: {endpoint_url}") - return 1 - rdf_output_dir = Path(args.rdf_output_dir) if args.rdf_output_dir else None - if rdf_output_dir: - try: - rdf_output_dir.mkdir(parents=True, exist_ok=True) - except OSError as e: - logger.error(f"Could not create RDF output directory '{rdf_output_dir}': {e}") - return 1 - - # --- Execution --- - try: - result = api.processing_service.process_and_load( - pattern=args.pattern, - knowledge_base_path=knowledge_base_path, - rdf_output_dir=rdf_output_dir, - graph_uri=args.graph, - endpoint_url=endpoint_url, - cleanup=args.cleanup, - username=args.user, - password=args.password, - ) - if result == 0: - logger.info("Processing and loading completed successfully.") - else: - logger.error(f"Processing and loading failed with exit code {result}.") - return result - except SPARQLWrapperException as e: - logger.error(f"A SPARQL error occurred: {e}", exc_info=True) - logger.error(f"Please check if the SPARQL endpoint at '{endpoint_url}' is running and accessible.") - return 1 - except FileNotFoundError as e: - logger.error(f"File not found during processing: {e}", exc_info=True) - return 1 - except Exception as e: - logger.error(f"An unexpected error occurred during process-and-load: {e}", exc_info=True) - return 1 - - -def handle_process(api: KnowledgeBaseAPI, args: argparse.Namespace) -> int: - """Handle process command via API. 
- - Args: - api: KnowledgeBaseAPI instance - args: Parsed command-line arguments - - Returns: - Exit code - """ - try: - pattern = args.pattern - rdf_output_dir = Path(args.rdf_output_dir) if args.rdf_output_dir else None - - logger.info(f"Processing files matching pattern: {pattern}") - if rdf_output_dir: - logger.info(f"RDF output directory specified: {rdf_output_dir}") - - result = api.process_documents( - pattern=pattern, - rdf_output_dir=rdf_output_dir - ) - - if result == 0: - logger.info("Processing completed successfully") - else: - logger.error("Processing failed") - - return result - - except Exception as e: - logger.error(f"Error during processing: {e}") - return 1 - - -def handle_query(api: KnowledgeBaseAPI, args: argparse.Namespace) -> int: - """Handle query command via API. - - Args: - api: KnowledgeBaseAPI instance - args: Parsed command-line arguments - - Returns: - Exit code - """ - try: - results = api.query(args.query_string, args.type) - if results: - for result in results: - print(result) - else: - print("No results found.") - return 0 - except Exception as e: - logger.error(f"Error during query: {e}") - return 1 \ No newline at end of file +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/knowledgebase_processor/cli_v2/utils.py b/src/knowledgebase_processor/cli/utils.py similarity index 100% rename from src/knowledgebase_processor/cli_v2/utils.py rename to src/knowledgebase_processor/cli/utils.py diff --git a/src/knowledgebase_processor/cli_v2/__init__.py b/src/knowledgebase_processor/cli_v2/__init__.py deleted file mode 100644 index 3e929c3..0000000 --- a/src/knowledgebase_processor/cli_v2/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Modern CLI for Knowledge Base Processor v2.""" - -__version__ = "2.0.0" \ No newline at end of file diff --git a/src/knowledgebase_processor/cli_v2/main.py b/src/knowledgebase_processor/cli_v2/main.py deleted file mode 100644 index 1db05c6..0000000 --- a/src/knowledgebase_processor/cli_v2/main.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Modern CLI implementation with delightful user experience.""" - -import click -from pathlib import Path -from typing import Optional -import sys - -from .commands import init, scan, search, sync, publish, status, config -from .utils import console, setup_error_handling -from .interactive import InteractiveMode - -# Version info -__version__ = "2.0.0" - -CONTEXT_SETTINGS = dict( - help_option_names=['-h', '--help'], - max_content_width=120, -) - - -class KBContext: - """Context object for passing state between commands.""" - - def __init__(self): - self.config_path: Optional[Path] = None - self.kb_path: Optional[Path] = None - self.verbose: bool = False - self.quiet: bool = False - self.no_color: bool = False - self.yes: bool = False # Auto-confirm prompts - - -@click.group(context_settings=CONTEXT_SETTINGS, invoke_without_command=True) -@click.option('--version', '-V', is_flag=True, help='Show version and exit.') -@click.option('--config', '-c', type=click.Path(exists=True), help='Path to config file.') -@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output.') -@click.option('--quiet', '-q', is_flag=True, help='Suppress all output except errors.') -@click.option('--no-color', is_flag=True, help='Disable colored output.') -@click.option('--yes', '-y', is_flag=True, help='Auto-confirm all prompts (non-interactive).') -@click.pass_context -def cli(ctx, version, config, verbose, quiet, no_color, yes): - """๐Ÿง  Knowledge Base Processor - Your intelligent 
document companion. - - A modern CLI for processing your existing documents into a searchable knowledge graph. - Run without arguments for interactive mode. - - \b - Quick Start: - kb init Configure processor for your documents - kb scan Process documents in current directory - kb publish Process + sync to SPARQL in one command - kb search "todo" Search for content - kb status Show processing statistics - - \b - Examples: - kb publish --watch Continuous publishing mode - kb scan --sync --endpoint Process + sync to endpoint - kb search --type todo "project tasks" - """ - # Create context - ctx.obj = KBContext() - ctx.obj.verbose = verbose - ctx.obj.quiet = quiet - ctx.obj.no_color = no_color - ctx.obj.yes = yes - - if config: - ctx.obj.config_path = Path(config) - - # Setup error handling - setup_error_handling() - - # Show version and exit - if version: - from rich import print as rprint - from rich.panel import Panel - rprint(Panel.fit( - f"[bold cyan]Knowledge Base Processor[/bold cyan]\n" - f"Version {__version__}", - border_style="cyan" - )) - ctx.exit(0) - - # If no command specified, enter interactive mode - if ctx.invoked_subcommand is None: - if not sys.stdin.isatty() or yes: - # Non-interactive environment or --yes flag - console.print("[yellow]No command specified.[/yellow] Use 'kb --help' for usage.") - ctx.exit(1) - else: - # Enter interactive mode - interactive = InteractiveMode(ctx.obj) - interactive.run() - ctx.exit(0) - - -# Register commands -cli.add_command(init.init_cmd) -cli.add_command(scan.scan_cmd) -cli.add_command(search.search_cmd) -cli.add_command(sync.sync_cmd) -cli.add_command(publish.publish_cmd) -cli.add_command(status.status_cmd) -cli.add_command(config.config_cmd) - - -def main(): - """Entry point for the CLI.""" - try: - cli() - except KeyboardInterrupt: - console.print("\n[yellow]Interrupted by user.[/yellow]") - sys.exit(130) - except Exception as e: - console.print(f"\n[red]Unexpected error:[/red] {e}") - if '-v' in sys.argv or '--verbose' in sys.argv: - console.print_exception() - sys.exit(1) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/tests/cli_v2/__init__.py b/tests/cli/__init__.py similarity index 100% rename from tests/cli_v2/__init__.py rename to tests/cli/__init__.py diff --git a/tests/cli/test_cli_benchmarks.py b/tests/cli/test_cli_benchmarks.py deleted file mode 100644 index cf1383e..0000000 --- a/tests/cli/test_cli_benchmarks.py +++ /dev/null @@ -1,482 +0,0 @@ -""" -CLI Performance Benchmarks using pytest-benchmark - -This module provides pytest-benchmark based performance testing for the CLI. 
-Use: pytest tests/cli/test_cli_benchmarks.py --benchmark-only -""" - -import os -import tempfile -import shutil -import subprocess -import psutil -import pytest -from pathlib import Path -from typing import List, Dict -import time -import gc - -# Project root for running CLI commands -PROJECT_ROOT = Path(__file__).parent.parent.parent - - -class CLIBenchmarkFixtures: - """Shared fixtures for CLI benchmarking.""" - - @pytest.fixture(scope="session") - def temp_workspace(self): - """Create a temporary workspace for all benchmarks.""" - temp_dir = tempfile.mkdtemp(prefix="cli_benchmark_") - yield temp_dir - shutil.rmtree(temp_dir) - - @pytest.fixture(scope="session") - def small_kb(self, temp_workspace): - """Create a small knowledge base for benchmarking.""" - kb_dir = Path(temp_workspace) / "small_kb" - kb_dir.mkdir() - - # Create 10 small files - for i in range(10): - content = f"""# Document {i} - -This is document number {i} with some basic content. - -## Section A -Content for section A in document {i}. - -## Section B -Content for section B in document {i}. - -- Item 1 -- Item 2 -- Item 3 - -```python -def function_{i}(): - return {i} -``` - -#tag{i} #category -""" - (kb_dir / f"doc_{i:02d}.md").write_text(content) - - return kb_dir - - @pytest.fixture(scope="session") - def medium_kb(self, temp_workspace): - """Create a medium knowledge base for benchmarking.""" - kb_dir = Path(temp_workspace) / "medium_kb" - kb_dir.mkdir() - - # Create 50 medium files - for i in range(50): - sections = [] - sections.append(f"# Document {i}") - - for j in range(20): # 20 sections per document - sections.extend([ - f"## Section {j}", - f"This is section {j} of document {i}.", - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "", - f"### Subsection {j}.1", - "More detailed content here.", - "", - f"```yaml", - f"key_{j}: value_{j}", - f"nested:", - f" item: {j}", - f"```", - "", - f"- [ ] TODO: Task {j}", - f"- [x] DONE: Completed {j}", - "", - f"[[Link to Document {(i + j) % 50}]]", - "" - ]) - - content = "\n".join(sections) - (kb_dir / f"doc_{i:03d}.md").write_text(content) - - return kb_dir - - @pytest.fixture(scope="session") - def rdf_output_dir(self, temp_workspace): - """Create RDF output directory.""" - rdf_dir = Path(temp_workspace) / "rdf_output" - rdf_dir.mkdir() - return rdf_dir - - -# Inherit fixtures -@pytest.mark.usefixtures("small_kb", "medium_kb", "rdf_output_dir") -class TestCLIBenchmarks(CLIBenchmarkFixtures): - """CLI performance benchmarks using pytest-benchmark.""" - - def run_cli_command(self, args: List[str], timeout: int = 120) -> subprocess.CompletedProcess: - """Run a CLI command and return the result.""" - base_command = ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] - full_command = base_command + args - - # Force garbage collection before running - gc.collect() - - return subprocess.run( - full_command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT, - timeout=timeout - ) - - def cli_startup_benchmark(self): - """Benchmark CLI startup time.""" - result = self.run_cli_command(["--help"], timeout=30) - assert result.returncode == 0 - return result - - def small_kb_processing_benchmark(self, small_kb, rdf_output_dir): - """Benchmark small knowledge base processing.""" - args = [ - "process", - "--knowledge-base", str(small_kb), - "--rdf-output-dir", str(rdf_output_dir) - ] - result = self.run_cli_command(args, timeout=60) - assert result.returncode == 0 - return result - - def 
medium_kb_processing_benchmark(self, medium_kb, rdf_output_dir): - """Benchmark medium knowledge base processing.""" - args = [ - "process", - "--knowledge-base", str(medium_kb), - "--rdf-output-dir", str(rdf_output_dir) - ] - result = self.run_cli_command(args, timeout=180) - assert result.returncode == 0 - return result - - # Pytest-benchmark tests - - def test_cli_startup_performance(self, benchmark): - """Benchmark CLI startup time.""" - result = benchmark(self.cli_startup_benchmark) - - # Assertions about performance - assert result.returncode == 0 - - # Get benchmark stats - stats = benchmark.stats - assert stats.mean < 5.0, f"CLI startup too slow: {stats.mean:.3f}s" - assert stats.max < 10.0, f"CLI startup max too slow: {stats.max:.3f}s" - - def test_small_kb_processing_performance(self, benchmark, small_kb, rdf_output_dir): - """Benchmark small knowledge base processing.""" - result = benchmark(self.small_kb_processing_benchmark, small_kb, rdf_output_dir) - - assert result.returncode == 0 - - stats = benchmark.stats - assert stats.mean < 30.0, f"Small KB processing too slow: {stats.mean:.3f}s" - - # Check throughput (10 files) - throughput = 10 / stats.mean - assert throughput > 0.5, f"Throughput too low: {throughput:.2f} files/sec" - - def test_medium_kb_processing_performance(self, benchmark, medium_kb, rdf_output_dir): - """Benchmark medium knowledge base processing.""" - result = benchmark(self.medium_kb_processing_benchmark, medium_kb, rdf_output_dir) - - assert result.returncode == 0 - - stats = benchmark.stats - assert stats.mean < 120.0, f"Medium KB processing too slow: {stats.mean:.3f}s" - - # Check throughput (50 files) - throughput = 50 / stats.mean - assert throughput > 0.5, f"Throughput too low: {throughput:.2f} files/sec" - - @pytest.mark.parametrize("kb_size,expected_max_time", [ - ("small", 30.0), - ("medium", 120.0), - ]) - def test_processing_performance_scaling(self, benchmark, kb_size, expected_max_time, small_kb, medium_kb, rdf_output_dir): - """Test that processing time scales reasonably with KB size.""" - if kb_size == "small": - kb_path = small_kb - file_count = 10 - else: - kb_path = medium_kb - file_count = 50 - - def process_kb(): - args = [ - "process", - "--knowledge-base", str(kb_path), - "--rdf-output-dir", str(rdf_output_dir) - ] - return self.run_cli_command(args, timeout=int(expected_max_time * 2)) - - result = benchmark(process_kb) - - assert result.returncode == 0 - - stats = benchmark.stats - assert stats.mean < expected_max_time, f"{kb_size} KB processing too slow: {stats.mean:.3f}s" - - # Calculate files per second - fps = file_count / stats.mean - print(f"{kb_size} KB: {fps:.2f} files/sec, {stats.mean:.3f}s total") - - -@pytest.mark.memory -class TestCLIMemoryBenchmarks(CLIBenchmarkFixtures): - """Memory-focused CLI benchmarks.""" - - def run_cli_with_memory_tracking(self, args: List[str], timeout: int = 120) -> Dict: - """Run CLI command with memory tracking.""" - base_command = ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] - full_command = base_command + args - - # Get initial memory - initial_memory = psutil.virtual_memory().used - - # Start process - proc = subprocess.Popen( - full_command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT - ) - - # Track memory usage - peak_memory = initial_memory - memory_samples = [] - - start_time = time.time() - - while proc.poll() is None: - try: - current_memory = psutil.virtual_memory().used - memory_samples.append(current_memory) - 
peak_memory = max(peak_memory, current_memory) - - # Check timeout - if time.time() - start_time > timeout: - proc.kill() - break - - time.sleep(0.1) - except: - break - - stdout, stderr = proc.communicate() - end_time = time.time() - final_memory = psutil.virtual_memory().used - - return { - 'returncode': proc.returncode, - 'stdout': stdout, - 'stderr': stderr, - 'duration': end_time - start_time, - 'initial_memory_mb': initial_memory / 1024 / 1024, - 'peak_memory_mb': peak_memory / 1024 / 1024, - 'final_memory_mb': final_memory / 1024 / 1024, - 'memory_delta_mb': (final_memory - initial_memory) / 1024 / 1024, - 'memory_samples': len(memory_samples) - } - - def test_memory_usage_small_kb(self, small_kb, rdf_output_dir): - """Test memory usage with small knowledge base.""" - args = [ - "process", - "--knowledge-base", str(small_kb), - "--rdf-output-dir", str(rdf_output_dir) - ] - - result = self.run_cli_with_memory_tracking(args) - - assert result['returncode'] == 0 - assert result['memory_delta_mb'] < 100, f"Memory usage too high: {result['memory_delta_mb']:.1f}MB" - assert result['duration'] < 60, f"Processing too slow: {result['duration']:.1f}s" - - print(f"Small KB memory: {result['memory_delta_mb']:.1f}MB, duration: {result['duration']:.1f}s") - - def test_memory_usage_medium_kb(self, medium_kb, rdf_output_dir): - """Test memory usage with medium knowledge base.""" - args = [ - "process", - "--knowledge-base", str(medium_kb), - "--rdf-output-dir", str(rdf_output_dir) - ] - - result = self.run_cli_with_memory_tracking(args) - - assert result['returncode'] == 0 - assert result['memory_delta_mb'] < 300, f"Memory usage too high: {result['memory_delta_mb']:.1f}MB" - assert result['duration'] < 180, f"Processing too slow: {result['duration']:.1f}s" - - print(f"Medium KB memory: {result['memory_delta_mb']:.1f}MB, duration: {result['duration']:.1f}s") - - def test_memory_scaling(self, small_kb, medium_kb, rdf_output_dir): - """Test that memory usage scales reasonably.""" - # Test small KB - small_result = self.run_cli_with_memory_tracking([ - "process", - "--knowledge-base", str(small_kb), - "--rdf-output-dir", str(rdf_output_dir) - ]) - - # Test medium KB - medium_result = self.run_cli_with_memory_tracking([ - "process", - "--knowledge-base", str(medium_kb), - "--rdf-output-dir", str(rdf_output_dir) - ]) - - assert small_result['returncode'] == 0 - assert medium_result['returncode'] == 0 - - # Medium KB has 5x more files, memory should not scale linearly - if small_result['memory_delta_mb'] > 0: - memory_ratio = medium_result['memory_delta_mb'] / small_result['memory_delta_mb'] - assert memory_ratio < 20, f"Memory scaling too aggressive: {memory_ratio:.1f}x" - - print(f"Memory scaling: small={small_result['memory_delta_mb']:.1f}MB, " + - f"medium={medium_result['memory_delta_mb']:.1f}MB") - - -@pytest.mark.stress -class TestCLIStressBenchmarks(CLIBenchmarkFixtures): - """Stress testing benchmarks for the CLI.""" - - def test_large_file_processing(self, benchmark, temp_workspace): - """Benchmark processing of a single large file.""" - # Create large file (~1MB) - large_kb = Path(temp_workspace) / "large_file_kb" - large_kb.mkdir(exist_ok=True) - - # Generate large content - sections = [] - for i in range(500): # 500 sections โ‰ˆ 1MB - sections.extend([ - f"## Section {i}", - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 20, - "", - f"### Subsection {i}.1", - "Detailed content with more text. 
" * 15, - "", - f"```python", - f"def function_{i}():", - f" '''Function number {i}'''", - f" data = {{'key': {i}, 'value': 'data_{i}'}}", - f" return process_data(data)", - f"```", - "", - f"- Item {i}.1", - f"- Item {i}.2", - f"- Item {i}.3", - "" - ]) - - large_content = "# Large Document\n\n" + "\n".join(sections) - large_file = large_kb / "large_document.md" - large_file.write_text(large_content) - - rdf_dir = Path(temp_workspace) / "large_file_rdf" - rdf_dir.mkdir(exist_ok=True) - - def process_large_file(): - args = [ - "process", - "--knowledge-base", str(large_kb), - "--rdf-output-dir", str(rdf_dir) - ] - result = subprocess.run( - ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] + args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT, - timeout=300 - ) - assert result.returncode == 0 - return result - - result = benchmark(process_large_file) - - stats = benchmark.stats - assert stats.mean < 180, f"Large file processing too slow: {stats.mean:.3f}s" - - # Calculate throughput in MB/s - file_size_mb = len(large_content) / 1024 / 1024 - throughput = file_size_mb / stats.mean - print(f"Large file throughput: {throughput:.2f} MB/s") - - def test_concurrent_processing_simulation(self, small_kb, temp_workspace): - """Simulate concurrent processing by running multiple instances.""" - import concurrent.futures - import threading - - rdf_base_dir = Path(temp_workspace) / "concurrent_rdf" - rdf_base_dir.mkdir(exist_ok=True) - - def run_single_process(instance_id): - rdf_dir = rdf_base_dir / f"instance_{instance_id}" - rdf_dir.mkdir(exist_ok=True) - - args = [ - "process", - "--knowledge-base", str(small_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - start_time = time.time() - result = subprocess.run( - ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] + args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT, - timeout=60 - ) - end_time = time.time() - - return { - 'instance_id': instance_id, - 'returncode': result.returncode, - 'duration': end_time - start_time, - 'success': result.returncode == 0 - } - - # Run 3 concurrent instances - start_time = time.time() - with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: - futures = [executor.submit(run_single_process, i) for i in range(3)] - results = [f.result() for f in concurrent.futures.as_completed(futures, timeout=90)] - end_time = time.time() - - total_time = end_time - start_time - successful_runs = [r for r in results if r['success']] - - assert len(successful_runs) >= 2, f"Not enough successful concurrent runs: {len(successful_runs)}/3" - assert total_time < 90, f"Concurrent processing took too long: {total_time:.1f}s" - - avg_duration = sum(r['duration'] for r in successful_runs) / len(successful_runs) - print(f"Concurrent processing: {len(successful_runs)}/3 successful, avg={avg_duration:.1f}s, total={total_time:.1f}s") - - -# Configuration for pytest-benchmark -def pytest_configure(config): - """Configure pytest-benchmark settings.""" - config.option.benchmark_only = True - config.option.benchmark_sort = 'mean' - config.option.benchmark_columns = ['min', 'max', 'mean', 'stddev', 'median', 'ops', 'rounds'] - - -if __name__ == "__main__": - pytest.main([__file__, "--benchmark-only", "-v"]) \ No newline at end of file diff --git a/tests/cli_v2/test_cli_e2e.py b/tests/cli/test_cli_e2e.py similarity index 99% rename from tests/cli_v2/test_cli_e2e.py rename to tests/cli/test_cli_e2e.py index c0fd268..57e42c5 
100644 --- a/tests/cli_v2/test_cli_e2e.py +++ b/tests/cli/test_cli_e2e.py @@ -6,7 +6,7 @@ from pathlib import Path from click.testing import CliRunner -from knowledgebase_processor.cli_v2.main import cli +from knowledgebase_processor.cli.main import cli class TestCLIEndToEnd: diff --git a/tests/cli/test_cli_handlers.py b/tests/cli/test_cli_handlers.py deleted file mode 100644 index fc2a80f..0000000 --- a/tests/cli/test_cli_handlers.py +++ /dev/null @@ -1,602 +0,0 @@ -"""Unit tests for CLI command handlers.""" - -import argparse -import json -from pathlib import Path -from unittest.mock import Mock, patch, MagicMock, call -import pytest - -from knowledgebase_processor.cli.main import ( - handle_process, handle_query, handle_sparql, - handle_sparql_query, handle_sparql_load, handle_process_and_load -) -from knowledgebase_processor.api import KnowledgeBaseAPI -from knowledgebase_processor.config import Config -from SPARQLWrapper.SPARQLExceptions import SPARQLWrapperException - - -class TestHandleProcess: - """Test handle_process function.""" - - @pytest.fixture - def mock_api(self): - """Create a mock KnowledgeBaseAPI.""" - api = Mock(spec=KnowledgeBaseAPI) - api.process_documents.return_value = 0 - return api - - def test_handle_process_success(self, mock_api): - """Test successful process command.""" - args = argparse.Namespace( - pattern="**/*.md", - rdf_output_dir=None - ) - - result = handle_process(mock_api, args) - - assert result == 0 - mock_api.process_documents.assert_called_once_with( - pattern="**/*.md", - rdf_output_dir=None - ) - - def test_handle_process_with_rdf_output(self, mock_api): - """Test process command with RDF output directory.""" - args = argparse.Namespace( - pattern="*.txt", - rdf_output_dir="/tmp/rdf" - ) - - result = handle_process(mock_api, args) - - assert result == 0 - mock_api.process_documents.assert_called_once_with( - pattern="*.txt", - rdf_output_dir=Path("/tmp/rdf") - ) - - def test_handle_process_api_failure(self, mock_api): - """Test process command when API returns failure.""" - mock_api.process_documents.return_value = 1 - args = argparse.Namespace(pattern="**/*.md", rdf_output_dir=None) - - result = handle_process(mock_api, args) - - assert result == 1 - - def test_handle_process_exception(self, mock_api): - """Test process command when exception occurs.""" - mock_api.process_documents.side_effect = Exception("Processing failed") - args = argparse.Namespace(pattern="**/*.md", rdf_output_dir=None) - - result = handle_process(mock_api, args) - - assert result == 1 - - -class TestHandleQuery: - """Test handle_query function.""" - - @pytest.fixture - def mock_api(self): - """Create a mock KnowledgeBaseAPI.""" - api = Mock(spec=KnowledgeBaseAPI) - return api - - def test_handle_query_success_with_results(self, mock_api, capsys): - """Test successful query with results.""" - mock_api.query.return_value = ["Result 1", "Result 2", "Result 3"] - args = argparse.Namespace( - query_string="test query", - type="text" - ) - - result = handle_query(mock_api, args) - - assert result == 0 - mock_api.query.assert_called_once_with("test query", "text") - - captured = capsys.readouterr() - assert "Result 1" in captured.out - assert "Result 2" in captured.out - assert "Result 3" in captured.out - - def test_handle_query_success_no_results(self, mock_api, capsys): - """Test successful query with no results.""" - mock_api.query.return_value = [] - args = argparse.Namespace( - query_string="test query", - type="tag" - ) - - result = handle_query(mock_api, args) - - 
assert result == 0 - mock_api.query.assert_called_once_with("test query", "tag") - - captured = capsys.readouterr() - assert "No results found." in captured.out - - def test_handle_query_exception(self, mock_api): - """Test query command when exception occurs.""" - mock_api.query.side_effect = Exception("Query failed") - args = argparse.Namespace( - query_string="test query", - type="topic" - ) - - result = handle_query(mock_api, args) - - assert result == 1 - - -class TestHandleSparqlQuery: - """Test handle_sparql_query function.""" - - @pytest.fixture - def mock_api(self): - """Create a mock KnowledgeBaseAPI.""" - api = Mock(spec=KnowledgeBaseAPI) - return api - - def test_handle_sparql_query_json_format(self, mock_api, capsys): - """Test SPARQL query with JSON format.""" - mock_api.sparql_query.return_value = [ - {"subject": "http://example.org/1", "predicate": "http://example.org/title", "object": "Test 1"}, - {"subject": "http://example.org/2", "predicate": "http://example.org/title", "object": "Test 2"} - ] - args = argparse.Namespace( - sparql_query="SELECT * WHERE { ?s ?p ?o }", - endpoint_url="http://localhost:3030/sparql", - timeout=30, - format="json" - ) - - result = handle_sparql_query(mock_api, args) - - assert result == 0 - mock_api.sparql_query.assert_called_once_with( - query="SELECT * WHERE { ?s ?p ?o }", - endpoint_url="http://localhost:3030/sparql", - timeout=30, - format="json" - ) - - captured = capsys.readouterr() - # Verify JSON output is properly formatted - json.loads(captured.out) # Should not raise exception - - def test_handle_sparql_query_table_format(self, mock_api, capsys): - """Test SPARQL query with table format.""" - mock_api.sparql_query.return_value = [ - {"s": "subject1", "p": "predicate1", "o": "object1"}, - {"s": "subject2", "p": "predicate2", "o": "object2"} - ] - args = argparse.Namespace( - sparql_query="SELECT * WHERE { ?s ?p ?o }", - endpoint_url=None, - timeout=60, - format="table" - ) - - result = handle_sparql_query(mock_api, args) - - assert result == 0 - - captured = capsys.readouterr() - assert "s | p | o" in captured.out - assert "subject1 | predicate1 | object1" in captured.out - assert "subject2 | predicate2 | object2" in captured.out - - def test_handle_sparql_query_table_format_boolean_result(self, mock_api, capsys): - """Test SPARQL query with table format returning boolean.""" - mock_api.sparql_query.return_value = True - args = argparse.Namespace( - sparql_query="ASK WHERE { ?s ?p ?o }", - endpoint_url=None, - timeout=30, - format="table" - ) - - result = handle_sparql_query(mock_api, args) - - assert result == 0 - - captured = capsys.readouterr() - assert "True" in captured.out - - def test_handle_sparql_query_table_format_no_results(self, mock_api, capsys): - """Test SPARQL query with table format returning no results.""" - mock_api.sparql_query.return_value = [] - args = argparse.Namespace( - sparql_query="SELECT * WHERE { ?s ?p ?o }", - endpoint_url=None, - timeout=30, - format="table" - ) - - result = handle_sparql_query(mock_api, args) - - assert result == 0 - - captured = capsys.readouterr() - assert "No results found." in captured.out - - def test_handle_sparql_query_turtle_format(self, mock_api, capsys): - """Test SPARQL query with turtle format.""" - turtle_data = "@prefix ex: .\nex:subject ex:predicate ex:object ." 
- mock_api.sparql_query.return_value = turtle_data - args = argparse.Namespace( - sparql_query="CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }", - endpoint_url=None, - timeout=30, - format="turtle" - ) - - result = handle_sparql_query(mock_api, args) - - assert result == 0 - - captured = capsys.readouterr() - assert turtle_data in captured.out - - def test_handle_sparql_query_exception(self, mock_api): - """Test SPARQL query when exception occurs.""" - mock_api.sparql_query.side_effect = Exception("SPARQL query failed") - args = argparse.Namespace( - sparql_query="SELECT * WHERE { ?s ?p ?o }", - endpoint_url=None, - timeout=30, - format="json" - ) - - result = handle_sparql_query(mock_api, args) - - assert result == 1 - - -class TestHandleSparqlLoad: - """Test handle_sparql_load function.""" - - @pytest.fixture - def mock_api(self): - """Create a mock KnowledgeBaseAPI.""" - api = Mock(spec=KnowledgeBaseAPI) - return api - - def test_handle_sparql_load_success(self, mock_api): - """Test successful SPARQL load.""" - args = argparse.Namespace( - file_path="/path/to/data.ttl", - graph="http://example.org/graph", - endpoint_url="http://localhost:3030/sparql", - user="admin", - password="secret", - rdf_format="turtle" - ) - - result = handle_sparql_load(mock_api, args) - - assert result == 0 - mock_api.sparql_load.assert_called_once_with( - file_path=Path("/path/to/data.ttl"), - graph_uri="http://example.org/graph", - endpoint_url="http://localhost:3030/sparql", - username="admin", - password="secret", - rdf_format="turtle" - ) - - def test_handle_sparql_load_minimal_args(self, mock_api): - """Test SPARQL load with minimal arguments.""" - args = argparse.Namespace( - file_path="data.ttl", - graph=None, - endpoint_url=None, - user=None, - password=None, - rdf_format="turtle" - ) - - result = handle_sparql_load(mock_api, args) - - assert result == 0 - mock_api.sparql_load.assert_called_once_with( - file_path=Path("data.ttl"), - graph_uri=None, - endpoint_url=None, - username=None, - password=None, - rdf_format="turtle" - ) - - def test_handle_sparql_load_exception(self, mock_api): - """Test SPARQL load when exception occurs.""" - mock_api.sparql_load.side_effect = Exception("Load failed") - args = argparse.Namespace( - file_path="data.ttl", - graph=None, - endpoint_url=None, - user=None, - password=None, - rdf_format="turtle" - ) - - result = handle_sparql_load(mock_api, args) - - assert result == 1 - - -class TestHandleSparql: - """Test handle_sparql routing function.""" - - @pytest.fixture - def mock_api(self): - """Create a mock KnowledgeBaseAPI.""" - return Mock(spec=KnowledgeBaseAPI) - - @patch('knowledgebase_processor.cli.main.handle_sparql_query') - def test_handle_sparql_query_command(self, mock_handle_sparql_query, mock_api): - """Test routing to SPARQL query handler.""" - mock_handle_sparql_query.return_value = 0 - args = argparse.Namespace(sparql_command="query") - - result = handle_sparql(mock_api, args) - - assert result == 0 - mock_handle_sparql_query.assert_called_once_with(mock_api, args) - - @patch('knowledgebase_processor.cli.main.handle_sparql_load') - def test_handle_sparql_load_command(self, mock_handle_sparql_load, mock_api): - """Test routing to SPARQL load handler.""" - mock_handle_sparql_load.return_value = 0 - args = argparse.Namespace(sparql_command="load-file") - - result = handle_sparql(mock_api, args) - - assert result == 0 - mock_handle_sparql_load.assert_called_once_with(mock_api, args) - - def test_handle_sparql_unknown_command(self, mock_api): - """Test routing with 
unknown SPARQL command.""" - args = argparse.Namespace(sparql_command="unknown") - - result = handle_sparql(mock_api, args) - - assert result == 1 - - -class TestHandleProcessAndLoad: - """Test handle_process_and_load function.""" - - @pytest.fixture - def mock_api(self): - """Create a mock KnowledgeBaseAPI.""" - api = Mock(spec=KnowledgeBaseAPI) - api.config = Mock(spec=Config) - api.config.knowledge_base_path = "/test/kb" - api.config.sparql_endpoint_url = "http://localhost:3030/sparql" - api.processing_service = Mock() - api.processing_service.process_and_load.return_value = 0 - return api - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_handle_process_and_load_success(self, mock_is_dir, mock_is_valid_url, mock_api): - """Test successful process-and-load command.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - - args = argparse.Namespace( - knowledge_base_path="/custom/kb", - pattern="**/*.md", - endpoint_url="http://localhost:3030/sparql", - graph="http://example.org/graph", - cleanup=True, - rdf_output_dir="/tmp/rdf", - user="admin", - password="secret" - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 0 - mock_api.processing_service.process_and_load.assert_called_once_with( - pattern="**/*.md", - knowledge_base_path=Path("/custom/kb"), - rdf_output_dir=Path("/tmp/rdf"), - graph_uri="http://example.org/graph", - endpoint_url="http://localhost:3030/sparql", - cleanup=True, - username="admin", - password="secret" - ) - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_handle_process_and_load_use_api_config_kb_path(self, mock_is_dir, mock_is_valid_url, mock_api): - """Test process-and-load using API config knowledge base path.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - - args = argparse.Namespace( - knowledge_base_path=None, # Use API config - pattern="**/*.md", - endpoint_url=None, # Use API config - graph=None, - cleanup=False, - rdf_output_dir=None, - user=None, - password=None - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 0 - mock_api.processing_service.process_and_load.assert_called_once_with( - pattern="**/*.md", - knowledge_base_path=Path("/test/kb"), - rdf_output_dir=None, - graph_uri=None, - endpoint_url="http://localhost:3030/sparql", - cleanup=False, - username=None, - password=None - ) - - @patch('pathlib.Path.is_dir') - def test_handle_process_and_load_invalid_kb_path(self, mock_is_dir, mock_api): - """Test process-and-load with invalid knowledge base path.""" - mock_is_dir.return_value = False - - args = argparse.Namespace( - knowledge_base_path="/invalid/path", - endpoint_url="http://localhost:3030/sparql" - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_handle_process_and_load_no_endpoint_url(self, mock_is_dir, mock_is_valid_url, mock_api): - """Test process-and-load without endpoint URL.""" - mock_is_dir.return_value = True - mock_api.config.sparql_endpoint_url = None - - args = argparse.Namespace( - knowledge_base_path="/test/kb", - endpoint_url=None - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_handle_process_and_load_invalid_endpoint_url(self, mock_is_dir, mock_is_valid_url, 
mock_api): - """Test process-and-load with invalid endpoint URL.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = False - - args = argparse.Namespace( - knowledge_base_path="/test/kb", - endpoint_url="invalid-url" - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - @patch('pathlib.Path.mkdir') - def test_handle_process_and_load_rdf_output_dir_creation_failure(self, mock_mkdir, mock_is_dir, mock_is_valid_url, mock_api): - """Test process-and-load when RDF output directory creation fails.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - mock_mkdir.side_effect = OSError("Permission denied") - - args = argparse.Namespace( - knowledge_base_path="/test/kb", - endpoint_url="http://localhost:3030/sparql", - rdf_output_dir="/no/permission/rdf" - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_handle_process_and_load_sparql_exception(self, mock_is_dir, mock_is_valid_url, mock_api): - """Test process-and-load with SPARQL exception.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - mock_api.processing_service.process_and_load.side_effect = SPARQLWrapperException("SPARQL error") - - args = argparse.Namespace( - knowledge_base_path="/test/kb", - endpoint_url="http://localhost:3030/sparql", - pattern="**/*.md", - graph=None, - cleanup=False, - rdf_output_dir=None, - user=None, - password=None - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_handle_process_and_load_file_not_found_exception(self, mock_is_dir, mock_is_valid_url, mock_api): - """Test process-and-load with FileNotFoundError exception.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - mock_api.processing_service.process_and_load.side_effect = FileNotFoundError("File not found") - - args = argparse.Namespace( - knowledge_base_path="/test/kb", - endpoint_url="http://localhost:3030/sparql", - pattern="**/*.md", - graph=None, - cleanup=False, - rdf_output_dir=None, - user=None, - password=None - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_handle_process_and_load_general_exception(self, mock_is_dir, mock_is_valid_url, mock_api): - """Test process-and-load with general exception.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - mock_api.processing_service.process_and_load.side_effect = Exception("Unexpected error") - - args = argparse.Namespace( - knowledge_base_path="/test/kb", - endpoint_url="http://localhost:3030/sparql", - pattern="**/*.md", - graph=None, - cleanup=False, - rdf_output_dir=None, - user=None, - password=None - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_handle_process_and_load_api_failure(self, mock_is_dir, mock_is_valid_url, mock_api): - """Test process-and-load when API returns failure code.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - mock_api.processing_service.process_and_load.return_value = 1 - - args = 
argparse.Namespace( - knowledge_base_path="/test/kb", - endpoint_url="http://localhost:3030/sparql", - pattern="**/*.md", - graph=None, - cleanup=False, - rdf_output_dir=None, - user=None, - password=None - ) - - result = handle_process_and_load(mock_api, args) - - assert result == 1 \ No newline at end of file diff --git a/tests/cli/test_cli_integration.py b/tests/cli/test_cli_integration.py deleted file mode 100644 index 9acdf44..0000000 --- a/tests/cli/test_cli_integration.py +++ /dev/null @@ -1,570 +0,0 @@ -"""Integration tests for CLI argument combinations and edge cases.""" - -import argparse -import json -import tempfile -import shutil -from pathlib import Path -from unittest.mock import Mock, patch, MagicMock -import pytest - -from knowledgebase_processor.cli.main import main, parse_args -from knowledgebase_processor.config import Config -from knowledgebase_processor.api import KnowledgeBaseAPI - - -class TestCliIntegration: - """Test CLI integration scenarios with various argument combinations.""" - - @pytest.fixture - def temp_directory(self): - """Create a temporary directory for testing.""" - temp_dir = tempfile.mkdtemp() - yield temp_dir - shutil.rmtree(temp_dir) - - @pytest.fixture - def mock_api(self): - """Create a mock KnowledgeBaseAPI with typical configuration.""" - api = Mock(spec=KnowledgeBaseAPI) - api.config = Mock(spec=Config) - api.config.knowledge_base_path = "/test/kb" - api.config.sparql_endpoint_url = "http://localhost:3030/sparql" - api.config.metadata_store_path = "/test/.kbp/metadata/knowledgebase.db" - - # Mock service methods - api.process_documents = Mock(return_value=0) - api.query = Mock(return_value=["result1", "result2"]) - api.sparql_query = Mock(return_value=[{"subject": "s1", "predicate": "p1", "object": "o1"}]) - api.sparql_load = Mock() - - api.processing_service = Mock() - api.processing_service.process_and_load = Mock(return_value=0) - - return api - - def setup_common_mocks(self, mock_api): - """Setup common mocks for CLI testing.""" - with patch('knowledgebase_processor.cli.main.setup_logging'), \ - patch('knowledgebase_processor.cli.main.load_config') as mock_load_config, \ - patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') as mock_api_class: - - mock_load_config.return_value = mock_api.config - mock_api_class.return_value = mock_api - - return mock_load_config, mock_api_class - - -class TestProcessCommandIntegration(TestCliIntegration): - """Test process command with various configurations.""" - - def test_process_command_minimal_arguments(self, mock_api): - """Test process command with minimal arguments.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main(["process"]) - - assert result == 0 - mock_api.process_documents.assert_called_once_with( - pattern="**/*.md", - rdf_output_dir=None - ) - - def test_process_command_with_custom_pattern(self, mock_api): - """Test process command with custom file pattern.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main(["process", "--pattern", "*.txt"]) - - assert result == 0 - mock_api.process_documents.assert_called_once_with( - pattern="*.txt", - rdf_output_dir=None - ) - - def test_process_command_with_rdf_output(self, mock_api, temp_directory): - """Test process command with RDF output directory.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - rdf_dir = str(Path(temp_directory) / "rdf_output") - - result = main(["process", "--rdf-output-dir", rdf_dir]) - - assert result == 0 - 
mock_api.process_documents.assert_called_once_with( - pattern="**/*.md", - rdf_output_dir=Path(rdf_dir) - ) - - def test_process_command_with_global_options(self, mock_api, temp_directory): - """Test process command with global options.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - config_file = str(Path(temp_directory) / "config.yaml") - kb_path = str(Path(temp_directory) / "kb") - - result = main([ - "--config", config_file, - "--knowledge-base", kb_path, - "--log-level", "DEBUG", - "process", - "--pattern", "docs/**/*.md" - ]) - - assert result == 0 - # Verify config override - assert mock_api.config.knowledge_base_path == kb_path - - -class TestQueryCommandIntegration(TestCliIntegration): - """Test query command with various configurations.""" - - def test_query_command_text_search(self, mock_api, capsys): - """Test text query command.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main(["query", "test search"]) - - assert result == 0 - mock_api.query.assert_called_once_with("test search", "text") - - captured = capsys.readouterr() - assert "result1" in captured.out - assert "result2" in captured.out - - def test_query_command_tag_search(self, mock_api, capsys): - """Test tag query command.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main(["query", "important", "--type", "tag"]) - - assert result == 0 - mock_api.query.assert_called_once_with("important", "tag") - - def test_query_command_no_results(self, mock_api, capsys): - """Test query command with no results.""" - mock_api.query.return_value = [] - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main(["query", "nonexistent"]) - - assert result == 0 - captured = capsys.readouterr() - assert "No results found." 
in captured.out - - -class TestSparqlCommandIntegration(TestCliIntegration): - """Test SPARQL command with various configurations.""" - - def test_sparql_query_json_format(self, mock_api, capsys): - """Test SPARQL query with JSON format.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main([ - "sparql", "query", "SELECT * WHERE { ?s ?p ?o }", - "--format", "json" - ]) - - assert result == 0 - mock_api.sparql_query.assert_called_once_with( - query="SELECT * WHERE { ?s ?p ?o }", - endpoint_url=None, - timeout=30, - format="json" - ) - - captured = capsys.readouterr() - # Should be valid JSON - json.loads(captured.out) - - def test_sparql_query_table_format(self, mock_api, capsys): - """Test SPARQL query with table format.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main([ - "sparql", "query", "SELECT * WHERE { ?s ?p ?o }", - "--format", "table", - "--timeout", "60" - ]) - - assert result == 0 - mock_api.sparql_query.assert_called_once_with( - query="SELECT * WHERE { ?s ?p ?o }", - endpoint_url=None, - timeout=60, - format="table" - ) - - captured = capsys.readouterr() - assert "subject | predicate | object" in captured.out - - def test_sparql_query_with_endpoint_and_credentials(self, mock_api): - """Test SPARQL query with custom endpoint and credentials.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main([ - "sparql", "query", "SELECT * WHERE { ?s ?p ?o }", - "--endpoint-url", "http://custom:3030/sparql", - "--user", "admin", - "--password", "secret", - "--format", "json" - ]) - - assert result == 0 - mock_api.sparql_query.assert_called_once_with( - query="SELECT * WHERE { ?s ?p ?o }", - endpoint_url="http://custom:3030/sparql", - timeout=30, - format="json" - ) - - def test_sparql_load_file_minimal(self, mock_api, temp_directory): - """Test SPARQL load-file with minimal arguments.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - test_file = str(Path(temp_directory) / "test.ttl") - Path(test_file).touch() # Create empty file - - result = main(["sparql", "load-file", test_file]) - - assert result == 0 - mock_api.sparql_load.assert_called_once_with( - file_path=Path(test_file), - graph_uri=None, - endpoint_url=None, - username=None, - password=None, - rdf_format="turtle" - ) - - def test_sparql_load_file_with_all_options(self, mock_api, temp_directory): - """Test SPARQL load-file with all options.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - test_file = str(Path(temp_directory) / "test.n3") - Path(test_file).touch() # Create empty file - - result = main([ - "sparql", "load-file", test_file, - "--graph", "http://example.org/graph", - "--endpoint-url", "http://localhost:3030/sparql", - "--user", "admin", - "--password", "secret", - "--rdf-format", "n3" - ]) - - assert result == 0 - mock_api.sparql_load.assert_called_once_with( - file_path=Path(test_file), - graph_uri="http://example.org/graph", - endpoint_url="http://localhost:3030/sparql", - username="admin", - password="secret", - rdf_format="n3" - ) - - -class TestProcessAndLoadIntegration(TestCliIntegration): - """Test process-and-load command integration.""" - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_process_and_load_minimal(self, mock_is_dir, mock_is_valid_url, mock_api): - """Test process-and-load with minimal arguments.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - 
mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main([ - "process-and-load", - "--endpoint-url", "http://localhost:3030/sparql" - ]) - - assert result == 0 - mock_api.processing_service.process_and_load.assert_called_once() - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_process_and_load_with_knowledge_base_path(self, mock_is_dir, mock_is_valid_url, mock_api, temp_directory): - """Test process-and-load with explicit knowledge base path.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - kb_path = str(Path(temp_directory) / "kb") - Path(kb_path).mkdir() - - result = main([ - "process-and-load", kb_path, - "--endpoint-url", "http://localhost:3030/sparql", - "--pattern", "*.md", - "--cleanup" - ]) - - assert result == 0 - call_args = mock_api.processing_service.process_and_load.call_args - assert call_args[1]['knowledge_base_path'] == Path(kb_path) - assert call_args[1]['pattern'] == "*.md" - assert call_args[1]['cleanup'] is True - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_process_and_load_with_all_options(self, mock_is_dir, mock_is_valid_url, mock_api, temp_directory): - """Test process-and-load with all options.""" - mock_is_dir.return_value = True - mock_is_valid_url.return_value = True - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - kb_path = str(Path(temp_directory) / "kb") - rdf_path = str(Path(temp_directory) / "rdf") - Path(kb_path).mkdir() - - result = main([ - "process-and-load", kb_path, - "--pattern", "docs/**/*.md", - "--graph", "http://example.org/graph", - "--endpoint-url", "http://localhost:3030/sparql", - "--cleanup", - "--rdf-output-dir", rdf_path, - "--user", "admin", - "--password", "secret" - ]) - - assert result == 0 - call_args = mock_api.processing_service.process_and_load.call_args - assert call_args[1]['knowledge_base_path'] == Path(kb_path) - assert call_args[1]['pattern'] == "docs/**/*.md" - assert call_args[1]['graph_uri'] == "http://example.org/graph" - assert call_args[1]['endpoint_url'] == "http://localhost:3030/sparql" - assert call_args[1]['cleanup'] is True - assert call_args[1]['rdf_output_dir'] == Path(rdf_path) - assert call_args[1]['username'] == "admin" - assert call_args[1]['password'] == "secret" - - -class TestConfigurationOverrides(TestCliIntegration): - """Test configuration override scenarios.""" - - def test_knowledge_base_path_precedence(self, mock_api, temp_directory): - """Test knowledge base path precedence: CLI > config > default.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - # Test CLI override - cli_kb_path = str(Path(temp_directory) / "cli_kb") - result = main(["--knowledge-base", cli_kb_path, "process"]) - assert result == 0 - assert mock_api.config.knowledge_base_path == cli_kb_path - - @patch('pathlib.Path.cwd') - def test_knowledge_base_path_default_to_cwd(self, mock_cwd, mock_api): - """Test knowledge base path defaults to current working directory.""" - mock_cwd.return_value = Path("/current/working/dir") - - # Remove knowledge_base_path from config to test default - delattr(mock_api.config, 'knowledge_base_path') - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main(["process"]) - assert result == 0 - assert mock_api.config.knowledge_base_path == "/current/working/dir" - - 
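# A minimal, standalone sketch of the path-precedence rule the surrounding
# configuration-override tests exercise: CLI flag > config value > built-in
# default. The helper names below (resolve_kb_path, resolve_metadata_db) are
# illustrative assumptions only and are not part of the project's API; the
# expected fallbacks are taken from the assertions in the tests above and below.
from pathlib import Path
from typing import Optional


def resolve_kb_path(cli_value: Optional[str], config_value: Optional[str]) -> str:
    # Knowledge base path falls back to the current working directory.
    return cli_value or config_value or str(Path.cwd())


def resolve_metadata_db(cli_value: Optional[str], config_value: Optional[str]) -> str:
    # Metadata store falls back to ~/.kbp/metadata; the database filename
    # "knowledgebase.db" is appended regardless of which source supplied the path.
    base = cli_value or config_value or str(Path.home() / ".kbp" / "metadata")
    return str(Path(base) / "knowledgebase.db")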
@patch('pathlib.Path.home') - def test_metadata_store_path_precedence(self, mock_home, mock_api, temp_directory): - """Test metadata store path precedence: CLI > config > default.""" - mock_home.return_value = Path("/home/user") - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - # Test CLI override - cli_metadata_path = str(Path(temp_directory) / "cli_metadata") - result = main(["--metadata-store", cli_metadata_path, "process"]) - assert result == 0 - assert mock_api.config.metadata_store_path == f"{cli_metadata_path}/knowledgebase.db" - - @patch('pathlib.Path.home') - def test_metadata_store_path_default(self, mock_home, mock_api): - """Test metadata store path defaults to ~/.kbp/metadata.""" - mock_home.return_value = Path("/home/user") - - # Remove metadata_store_path from config to test default - delattr(mock_api.config, 'metadata_store_path') - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main(["process"]) - assert result == 0 - assert mock_api.config.metadata_store_path == "/home/user/.kbp/metadata/knowledgebase.db" - - -class TestErrorHandlingIntegration(TestCliIntegration): - """Test error handling in integration scenarios.""" - - def test_api_initialization_failure_integration(self, mock_api): - """Test complete flow when API initialization fails.""" - with patch('knowledgebase_processor.cli.main.setup_logging'), \ - patch('knowledgebase_processor.cli.main.load_config') as mock_load_config, \ - patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') as mock_api_class: - - mock_load_config.return_value = mock_api.config - mock_api_class.side_effect = Exception("API initialization failed") - - result = main(["process"]) - - assert result == 1 - - def test_command_handler_failure_integration(self, mock_api): - """Test complete flow when command handler fails.""" - mock_api.process_documents.return_value = 1 # Failure code - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main(["process"]) - - assert result == 1 - - def test_command_handler_exception_integration(self, mock_api): - """Test complete flow when command handler raises exception.""" - mock_api.process_documents.side_effect = Exception("Processing failed") - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main(["process"]) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.is_valid_url') - @patch('pathlib.Path.is_dir') - def test_process_and_load_validation_failures(self, mock_is_dir, mock_is_valid_url, mock_api): - """Test process-and-load validation failure scenarios.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - # Test invalid knowledge base path - mock_is_dir.return_value = False - result = main([ - "process-and-load", "/invalid/path", - "--endpoint-url", "http://localhost:3030/sparql" - ]) - assert result == 1 - - # Test invalid endpoint URL - mock_is_dir.return_value = True - mock_is_valid_url.return_value = False - result = main([ - "process-and-load", "/valid/path", - "--endpoint-url", "invalid-url" - ]) - assert result == 1 - - def test_missing_endpoint_url_integration(self, mock_api): - """Test process-and-load without endpoint URL.""" - mock_api.config.sparql_endpoint_url = None - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - with patch('pathlib.Path.is_dir', return_value=True): - result = main(["process-and-load", "/valid/path"]) - - assert result == 1 - - -class 
TestComplexArgumentCombinations(TestCliIntegration): - """Test complex argument combinations and edge cases.""" - - def test_all_global_options_with_process_and_load(self, mock_api, temp_directory): - """Test all global options with process-and-load command.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - config_file = str(Path(temp_directory) / "config.yaml") - kb_path = str(Path(temp_directory) / "kb") - metadata_path = str(Path(temp_directory) / "metadata") - log_file = str(Path(temp_directory) / "app.log") - rdf_path = str(Path(temp_directory) / "rdf") - Path(kb_path).mkdir() - - with patch('knowledgebase_processor.cli.main.is_valid_url', return_value=True), \ - patch('pathlib.Path.is_dir', return_value=True): - - result = main([ - "--config", config_file, - "--knowledge-base", kb_path, - "--metadata-store", metadata_path, - "--log-level", "DEBUG", - "--log-file", log_file, - "--log-format", "json", - "process-and-load", kb_path, - "--pattern", "docs/**/*.md", - "--graph", "http://example.org/graph", - "--endpoint-url", "http://localhost:3030/sparql", - "--cleanup", - "--rdf-output-dir", rdf_path, - "--user", "admin", - "--password", "secret" - ]) - - assert result == 0 - # Verify configuration overrides - assert mock_api.config.knowledge_base_path == kb_path - assert mock_api.config.metadata_store_path == f"{metadata_path}/knowledgebase.db" - - def test_sparql_query_with_all_options(self, mock_api, capsys): - """Test SPARQL query with all available options.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - result = main([ - "--log-level", "INFO", - "--log-format", "json", - "sparql", "query", "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10", - "--endpoint-url", "https://dbpedia.org/sparql", - "--timeout", "120", - "--format", "json", - "--user", "dbuser", - "--password", "dbpass" - ]) - - assert result == 0 - mock_api.sparql_query.assert_called_once_with( - query="SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10", - endpoint_url="https://dbpedia.org/sparql", - timeout=120, - format="json" - ) - - def test_query_with_special_characters(self, mock_api, capsys): - """Test query command with special characters and spaces.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - query_with_spaces = "search term with spaces and 'quotes'" - result = main(["query", query_with_spaces, "--type", "text"]) - - assert result == 0 - mock_api.query.assert_called_once_with(query_with_spaces, "text") - - def test_file_paths_with_spaces_and_special_chars(self, mock_api, temp_directory): - """Test file paths containing spaces and special characters.""" - mock_load_config, mock_api_class = self.setup_common_mocks(mock_api) - - # Create paths with spaces - kb_path = str(Path(temp_directory) / "knowledge base with spaces") - rdf_path = str(Path(temp_directory) / "rdf output") - test_file = str(Path(temp_directory) / "test file.ttl") - - Path(kb_path).mkdir() - Path(test_file).touch() - - # Test process command with spaces in paths - result = main([ - "--knowledge-base", kb_path, - "process", - "--rdf-output-dir", rdf_path - ]) - - assert result == 0 - assert mock_api.config.knowledge_base_path == kb_path - mock_api.process_documents.assert_called_once_with( - pattern="**/*.md", - rdf_output_dir=Path(rdf_path) - ) - - # Test SPARQL load with file containing spaces - result = main(["sparql", "load-file", test_file]) - - assert result == 0 - mock_api.sparql_load.assert_called_once_with( - file_path=Path(test_file), - graph_uri=None, - 
endpoint_url=None, - username=None, - password=None, - rdf_format="turtle" - ) \ No newline at end of file diff --git a/tests/cli/test_cli_main.py b/tests/cli/test_cli_main.py deleted file mode 100644 index efcb718..0000000 --- a/tests/cli/test_cli_main.py +++ /dev/null @@ -1,370 +0,0 @@ -"""Unit tests for CLI main functionality.""" - -import argparse -import json -import pytest -from pathlib import Path -from unittest.mock import Mock, patch, MagicMock -from urllib.parse import urlparse - -from knowledgebase_processor.cli.main import ( - main, parse_args, is_valid_url -) -from knowledgebase_processor.config import Config -from knowledgebase_processor.api import KnowledgeBaseAPI - - -class TestIsValidUrl: - """Test URL validation function.""" - - def test_valid_http_url(self): - assert is_valid_url("http://example.com") is True - - def test_valid_https_url(self): - assert is_valid_url("https://example.com") is True - - def test_valid_url_with_port(self): - assert is_valid_url("http://localhost:3030") is True - - def test_valid_url_with_path(self): - assert is_valid_url("https://example.com/sparql") is True - - def test_invalid_url_no_scheme(self): - assert is_valid_url("example.com") is False - - def test_invalid_url_no_netloc(self): - assert is_valid_url("http://") is False - - def test_invalid_url_empty_string(self): - assert is_valid_url("") is False - - def test_invalid_url_none(self): - assert is_valid_url(None) is False - - def test_invalid_url_malformed(self): - assert is_valid_url("not-a-url") is False - - def test_url_with_query_params(self): - assert is_valid_url("https://example.com/sparql?query=test") is True - - -class TestParseArgs: - """Test CLI argument parsing.""" - - def test_process_command_minimal(self): - args = parse_args(["process"]) - assert args.command == "process" - assert args.pattern == "**/*.md" - assert args.rdf_output_dir is None - - def test_process_command_with_pattern(self): - args = parse_args(["process", "--pattern", "*.txt"]) - assert args.command == "process" - assert args.pattern == "*.txt" - - def test_process_command_with_rdf_output(self): - args = parse_args(["process", "--rdf-output-dir", "/tmp/rdf"]) - assert args.command == "process" - assert args.rdf_output_dir == "/tmp/rdf" - - def test_query_command_minimal(self): - args = parse_args(["query", "test search"]) - assert args.command == "query" - assert args.query_string == "test search" - assert args.type == "text" - - def test_query_command_with_type(self): - args = parse_args(["query", "test", "--type", "tag"]) - assert args.command == "query" - assert args.query_string == "test" - assert args.type == "tag" - - def test_sparql_query_command(self): - args = parse_args(["sparql", "query", "SELECT * WHERE { ?s ?p ?o }"]) - assert args.command == "sparql" - assert args.sparql_command == "query" - assert args.sparql_query == "SELECT * WHERE { ?s ?p ?o }" - assert args.timeout == 30 - assert args.format == "table" - - def test_sparql_query_with_options(self): - args = parse_args([ - "sparql", "query", "SELECT * WHERE { ?s ?p ?o }", - "--endpoint-url", "http://localhost:3030/sparql", - "--timeout", "60", - "--format", "json", - "--user", "testuser", - "--password", "testpass" - ]) - assert args.endpoint_url == "http://localhost:3030/sparql" - assert args.timeout == 60 - assert args.format == "json" - assert args.user == "testuser" - assert args.password == "testpass" - - def test_sparql_load_file_command(self): - args = parse_args(["sparql", "load-file", "data.ttl"]) - assert args.command == 
"sparql" - assert args.sparql_command == "load-file" - assert args.file_path == "data.ttl" - assert args.rdf_format == "turtle" - - def test_sparql_load_file_with_options(self): - args = parse_args([ - "sparql", "load-file", "data.ttl", - "--graph", "http://example.org/graph", - "--endpoint-url", "http://localhost:3030/sparql", - "--rdf-format", "n3" - ]) - assert args.graph == "http://example.org/graph" - assert args.endpoint_url == "http://localhost:3030/sparql" - assert args.rdf_format == "n3" - - def test_process_and_load_command_minimal(self): - args = parse_args(["process-and-load"]) - assert args.command == "process-and-load" - assert args.knowledge_base_path is None - assert args.pattern == "**/*.md" - assert args.cleanup is False - - def test_process_and_load_command_with_path(self): - args = parse_args(["process-and-load", "/path/to/kb"]) - assert args.command == "process-and-load" - assert args.knowledge_base_path == "/path/to/kb" - - def test_process_and_load_command_with_options(self): - args = parse_args([ - "process-and-load", "/path/to/kb", - "--pattern", "*.md", - "--graph", "http://example.org/graph", - "--endpoint-url", "http://localhost:3030/sparql", - "--cleanup", - "--rdf-output-dir", "/tmp/rdf", - "--user", "admin", - "--password", "secret" - ]) - assert args.knowledge_base_path == "/path/to/kb" - assert args.pattern == "*.md" - assert args.graph == "http://example.org/graph" - assert args.endpoint_url == "http://localhost:3030/sparql" - assert args.cleanup is True - assert args.rdf_output_dir == "/tmp/rdf" - assert args.user == "admin" - assert args.password == "secret" - - def test_global_options(self): - args = parse_args([ - "--config", "/path/to/config.yaml", - "--knowledge-base", "/path/to/kb", - "--metadata-store", "/path/to/metadata", - "--log-level", "DEBUG", - "--log-file", "/tmp/app.log", - "--log-format", "json", - "process" - ]) - assert args.config == "/path/to/config.yaml" - assert args.knowledge_base == "/path/to/kb" - assert args.metadata_store == "/path/to/metadata" - assert args.log_level == "DEBUG" - assert args.log_file == "/tmp/app.log" - assert args.log_format == "json" - - def test_missing_command_fails(self): - with pytest.raises(SystemExit): - parse_args([]) - - def test_invalid_log_level_fails(self): - with pytest.raises(SystemExit): - parse_args(["--log-level", "INVALID", "process"]) - - def test_invalid_query_type_fails(self): - with pytest.raises(SystemExit): - parse_args(["query", "test", "--type", "invalid"]) - - def test_invalid_format_fails(self): - with pytest.raises(SystemExit): - parse_args(["sparql", "query", "SELECT * WHERE { ?s ?p ?o }", "--format", "invalid"]) - - -class TestMain: - """Test main function.""" - - @pytest.fixture - def mock_api(self): - """Create a mock KnowledgeBaseAPI.""" - api = Mock(spec=KnowledgeBaseAPI) - api.config = Mock(spec=Config) - api.config.knowledge_base_path = "/test/kb" - api.config.metadata_store_path = "/test/.kbp/metadata/knowledgebase.db" - return api - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - @patch('knowledgebase_processor.cli.main.handle_process') - def test_main_process_command_success(self, mock_handle_process, mock_api_class, mock_load_config, mock_setup_logging, mock_api): - """Test main function with process command.""" - mock_load_config.return_value = mock_api.config - mock_api_class.return_value = mock_api - mock_handle_process.return_value = 0 
- - result = main(["process"]) - - assert result == 0 - mock_setup_logging.assert_called_once() - mock_load_config.assert_called_once_with(None) - mock_api_class.assert_called_once_with(mock_api.config) - # Check that handle_process was called with correct arguments - mock_handle_process.assert_called_once() - call_args = mock_handle_process.call_args - assert call_args[0][0] == mock_api # First argument is the API - assert isinstance(call_args[0][1], argparse.Namespace) # Second is Namespace - assert call_args[0][1].command == "process" - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - @patch('knowledgebase_processor.cli.main.handle_query') - def test_main_query_command_success(self, mock_handle_query, mock_api_class, mock_load_config, mock_setup_logging, mock_api): - """Test main function with query command.""" - mock_load_config.return_value = mock_api.config - mock_api_class.return_value = mock_api - mock_handle_query.return_value = 0 - - result = main(["query", "test search"]) - - assert result == 0 - mock_handle_query.assert_called_once() - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - @patch('knowledgebase_processor.cli.main.handle_sparql') - def test_main_sparql_command_success(self, mock_handle_sparql, mock_api_class, mock_load_config, mock_setup_logging, mock_api): - """Test main function with sparql command.""" - mock_load_config.return_value = mock_api.config - mock_api_class.return_value = mock_api - mock_handle_sparql.return_value = 0 - - result = main(["sparql", "query", "SELECT * WHERE { ?s ?p ?o }"]) - - assert result == 0 - mock_handle_sparql.assert_called_once() - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - @patch('knowledgebase_processor.cli.main.handle_process_and_load') - def test_main_process_and_load_command_success(self, mock_handle_pal, mock_api_class, mock_load_config, mock_setup_logging, mock_api): - """Test main function with process-and-load command.""" - mock_load_config.return_value = mock_api.config - mock_api_class.return_value = mock_api - mock_handle_pal.return_value = 0 - - result = main(["process-and-load"]) - - assert result == 0 - mock_handle_pal.assert_called_once() - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - def test_main_api_initialization_failure(self, mock_api_class, mock_load_config, mock_setup_logging): - """Test main function when API initialization fails.""" - mock_load_config.return_value = Mock(spec=Config) - mock_api_class.side_effect = Exception("API init failed") - - result = main(["process"]) - - assert result == 1 - mock_api_class.assert_called_once() - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - def test_main_unknown_command(self, mock_api_class, mock_load_config, mock_setup_logging, mock_api): - """Test main function with unknown command.""" - mock_load_config.return_value = mock_api.config - mock_api_class.return_value = mock_api - - # This should not happen in 
practice due to argparse validation - # but test the handler logic directly - with patch('knowledgebase_processor.cli.main.parse_args') as mock_parse: - # Mock the parsed args with all required attributes - mock_parsed_args = Mock() - mock_parsed_args.command = "unknown" - mock_parsed_args.config = None - mock_parsed_args.knowledge_base = None - mock_parsed_args.metadata_store = None - mock_parsed_args.log_level = "INFO" - mock_parsed_args.log_file = None - mock_parsed_args.log_format = "text" - mock_parse.return_value = mock_parsed_args - - result = main([]) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - def test_main_config_override_knowledge_base(self, mock_api_class, mock_load_config, mock_setup_logging, mock_api): - """Test that command line knowledge base path overrides config.""" - mock_config = Mock(spec=Config) - mock_config.knowledge_base_path = "/config/kb" - mock_load_config.return_value = mock_config - mock_api_class.return_value = mock_api - - with patch('knowledgebase_processor.cli.main.handle_process') as mock_handle: - mock_handle.return_value = 0 - main(["--knowledge-base", "/cli/kb", "process"]) - - assert mock_config.knowledge_base_path == "/cli/kb" - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - @patch('pathlib.Path.cwd') - def test_main_default_knowledge_base_path(self, mock_cwd, mock_api_class, mock_load_config, mock_setup_logging, mock_api): - """Test that knowledge base path defaults to current directory.""" - mock_cwd.return_value = Path("/current/dir") - mock_config = Mock(spec=Config) - # Simulate no knowledge_base_path in config - delattr(mock_config, 'knowledge_base_path') - mock_load_config.return_value = mock_config - mock_api_class.return_value = mock_api - - with patch('knowledgebase_processor.cli.main.handle_process') as mock_handle: - mock_handle.return_value = 0 - main(["process"]) - - assert mock_config.knowledge_base_path == "/current/dir" - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - @patch('pathlib.Path.home') - def test_main_metadata_store_path_handling(self, mock_home, mock_api_class, mock_load_config, mock_setup_logging, mock_api): - """Test metadata store path handling with various scenarios.""" - mock_home.return_value = Path("/home/user") - mock_config = Mock(spec=Config) - mock_config.knowledge_base_path = "/test/kb" - mock_load_config.return_value = mock_config - mock_api_class.return_value = mock_api - - with patch('knowledgebase_processor.cli.main.handle_process') as mock_handle: - mock_handle.return_value = 0 - - # Test CLI override - main(["--metadata-store", "/cli/metadata", "process"]) - assert mock_config.metadata_store_path == "/cli/metadata/knowledgebase.db" - - # Test config value - mock_config.metadata_store_path = "/config/metadata" - main(["process"]) - assert mock_config.metadata_store_path == "/config/metadata/knowledgebase.db" - - # Test default value - delattr(mock_config, 'metadata_store_path') - main(["process"]) - assert mock_config.metadata_store_path == "/home/user/.kbp/metadata/knowledgebase.db" \ No newline at end of file diff --git a/tests/cli/test_cli_performance.py 
b/tests/cli/test_cli_performance.py deleted file mode 100644 index 738efca..0000000 --- a/tests/cli/test_cli_performance.py +++ /dev/null @@ -1,830 +0,0 @@ -""" -Comprehensive CLI Performance and Reliability Tests - -This module provides extensive testing of the CLI's performance characteristics, -reliability under various conditions, and stress testing capabilities. -""" - -import os -import sys -import time -import psutil -import tempfile -import shutil -import unittest -import subprocess -import signal -import threading -import gc -from pathlib import Path -from unittest.mock import patch, MagicMock -from concurrent.futures import ThreadPoolExecutor, TimeoutError -from typing import List, Dict, Any, Optional -import json -import resource - - -# Helper to get the project root -PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) - - -class PerformanceMetrics: - """Utility class for capturing and analyzing performance metrics.""" - - def __init__(self): - self.start_time = None - self.end_time = None - self.start_memory = None - self.peak_memory = None - self.cpu_percent = [] - self.process = None - self.monitoring_thread = None - self.monitoring_active = False - - def start_monitoring(self, process_pid: Optional[int] = None): - """Start performance monitoring.""" - self.start_time = time.time() - self.start_memory = psutil.virtual_memory().used - if process_pid: - try: - self.process = psutil.Process(process_pid) - self.monitoring_active = True - self.monitoring_thread = threading.Thread(target=self._monitor_process) - self.monitoring_thread.daemon = True - self.monitoring_thread.start() - except psutil.NoSuchProcess: - pass - - def stop_monitoring(self): - """Stop performance monitoring and return metrics.""" - self.end_time = time.time() - self.monitoring_active = False - - if self.monitoring_thread: - self.monitoring_thread.join(timeout=1) - - current_memory = psutil.virtual_memory().used - peak_memory = self.peak_memory or current_memory - - return { - 'duration': self.end_time - self.start_time, - 'start_memory_mb': self.start_memory / 1024 / 1024, - 'peak_memory_mb': peak_memory / 1024 / 1024, - 'memory_delta_mb': (current_memory - self.start_memory) / 1024 / 1024, - 'avg_cpu_percent': sum(self.cpu_percent) / len(self.cpu_percent) if self.cpu_percent else 0 - } - - def _monitor_process(self): - """Internal method to monitor process metrics.""" - while self.monitoring_active and self.process: - try: - if self.process.is_running(): - memory_info = self.process.memory_info() - if not self.peak_memory or memory_info.rss > self.peak_memory: - self.peak_memory = memory_info.rss - - cpu_percent = self.process.cpu_percent() - if cpu_percent is not None: - self.cpu_percent.append(cpu_percent) - - time.sleep(0.1) # Sample every 100ms - except (psutil.NoSuchProcess, psutil.AccessDenied): - break - - -class TestCLIPerformance(unittest.TestCase): - """Performance tests for the CLI application.""" - - @classmethod - def setUpClass(cls): - """Set up test class with resource limits and baseline metrics.""" - # Set resource limits for safety - if hasattr(resource, 'RLIMIT_AS'): - try: - resource.setrlimit(resource.RLIMIT_AS, (2 * 1024 * 1024 * 1024, -1)) # 2GB - except (ValueError, OSError): - pass # Not supported on all systems - - def setUp(self): - """Set up each test with temporary directories and sample data.""" - self.temp_dir = tempfile.mkdtemp() - self.kb_dir = os.path.join(self.temp_dir, "kb") - self.large_kb_dir = os.path.join(self.temp_dir, "large_kb") - 
self.rdf_output_dir = os.path.join(self.temp_dir, "rdf_output") - - os.makedirs(self.kb_dir) - os.makedirs(self.large_kb_dir) - os.makedirs(self.rdf_output_dir) - - # Create standard test files - self._create_test_files() - - # Create large dataset for performance testing - self._create_large_dataset() - - # Force garbage collection before tests - gc.collect() - - def tearDown(self): - """Clean up after each test.""" - shutil.rmtree(self.temp_dir) - gc.collect() - - def _create_test_files(self): - """Create standard test files for basic performance testing.""" - test_files = { - "simple.md": "# Simple Document\n\nBasic content.", - "medium.md": self._generate_markdown_content(100), - "complex.md": self._generate_complex_markdown(50), - } - - for filename, content in test_files.items(): - with open(os.path.join(self.kb_dir, filename), "w", encoding='utf-8') as f: - f.write(content) - - def _create_large_dataset(self): - """Create a large dataset for stress testing.""" - # Create 100 documents with varying sizes - for i in range(100): - size_category = i % 4 - if size_category == 0: # Small files (1KB) - content = self._generate_markdown_content(10) - elif size_category == 1: # Medium files (10KB) - content = self._generate_markdown_content(100) - elif size_category == 2: # Large files (100KB) - content = self._generate_markdown_content(1000) - else: # Very large files (1MB) - content = self._generate_markdown_content(10000) - - filename = f"doc_{i:03d}.md" - with open(os.path.join(self.large_kb_dir, filename), "w", encoding='utf-8') as f: - f.write(content) - - # Create some deeply nested directories - nested_path = os.path.join(self.large_kb_dir, "deep", "nested", "structure") - os.makedirs(nested_path) - with open(os.path.join(nested_path, "deep_file.md"), "w", encoding='utf-8') as f: - f.write(self._generate_markdown_content(50)) - - def _generate_markdown_content(self, sections: int) -> str: - """Generate markdown content with specified number of sections.""" - content = [] - content.append("# Main Document Title\n") - - for i in range(sections): - content.extend([ - f"## Section {i + 1}\n", - f"This is section {i + 1} with some sample content. ", - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
", - "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.\n\n", - f"- List item {i + 1}.1\n", - f"- List item {i + 1}.2\n", - f"- List item {i + 1}.3\n\n", - f"```python\n# Code block {i + 1}\ndef function_{i}():\n return {i}\n```\n\n", - f"[Link to section {i}](#section-{i})\n\n" - ]) - - return "".join(content) - - def _generate_complex_markdown(self, complexity: int) -> str: - """Generate complex markdown with various elements.""" - content = ["# Complex Document\n\n"] - - for i in range(complexity): - content.extend([ - f"## Complex Section {i}\n\n", - "| Column 1 | Column 2 | Column 3 |\n", - "|----------|----------|----------|\n", - f"| Data {i}.1 | Data {i}.2 | Data {i}.3 |\n", - f"| Data {i}.4 | Data {i}.5 | Data {i}.6 |\n\n", - "```yaml\n", - f"key_{i}:\n", - f" nested_key: value_{i}\n", - f" list:\n", - f" - item_{i}_1\n", - f" - item_{i}_2\n", - "```\n\n", - f"[[WikiLink {i}]]\n\n", - f"#tag{i} #category-{i % 5}\n\n", - f"- [ ] TODO: Task {i}\n", - f"- [x] DONE: Completed task {i}\n\n" - ]) - - return "".join(content) - - def run_cli_command(self, args: List[str], timeout: int = 120, **kwargs) -> subprocess.CompletedProcess: - """Run CLI command with performance monitoring.""" - base_command = ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] - full_command = base_command + args - - return subprocess.run( - full_command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT, - timeout=timeout, - **kwargs - ) - - def benchmark_command(self, args: List[str], iterations: int = 3) -> Dict[str, Any]: - """Benchmark a CLI command over multiple iterations.""" - results = [] - - for i in range(iterations): - metrics = PerformanceMetrics() - metrics.start_monitoring() - - try: - result = self.run_cli_command(args) - perf_data = metrics.stop_monitoring() - - results.append({ - 'iteration': i + 1, - 'returncode': result.returncode, - 'stdout_length': len(result.stdout), - 'stderr_length': len(result.stderr), - **perf_data - }) - - except subprocess.TimeoutExpired as e: - perf_data = metrics.stop_monitoring() - results.append({ - 'iteration': i + 1, - 'returncode': -1, - 'error': 'timeout', - **perf_data - }) - - # Calculate aggregate statistics - successful_runs = [r for r in results if r['returncode'] == 0] - if successful_runs: - durations = [r['duration'] for r in successful_runs] - memory_deltas = [r['memory_delta_mb'] for r in successful_runs] - - return { - 'iterations': iterations, - 'successful_runs': len(successful_runs), - 'avg_duration': sum(durations) / len(durations), - 'min_duration': min(durations), - 'max_duration': max(durations), - 'avg_memory_delta': sum(memory_deltas) / len(memory_deltas), - 'max_memory_delta': max(memory_deltas), - 'results': results - } - else: - return { - 'iterations': iterations, - 'successful_runs': 0, - 'error': 'No successful runs', - 'results': results - } - - # PERFORMANCE TESTS - - def test_cli_startup_time(self): - """Test CLI startup time with help command.""" - benchmark = self.benchmark_command(["--help"], iterations=5) - - self.assertGreater(benchmark['successful_runs'], 0, "Help command should succeed") - self.assertLess(benchmark['avg_duration'], 5.0, "Help command should complete within 5 seconds") - self.assertLess(benchmark['max_duration'], 10.0, "No help command should take more than 10 seconds") - - print(f"CLI startup time: avg={benchmark['avg_duration']:.3f}s, max={benchmark['max_duration']:.3f}s") - - def test_small_file_processing_performance(self): - """Test 
performance with small files.""" - args = ["process", "--knowledge-base", self.kb_dir, "--rdf-output-dir", self.rdf_output_dir] - benchmark = self.benchmark_command(args, iterations=3) - - self.assertGreater(benchmark['successful_runs'], 0, "Small file processing should succeed") - self.assertLess(benchmark['avg_duration'], 30.0, "Small files should process quickly") - self.assertLess(benchmark['avg_memory_delta'], 100.0, "Memory usage should be reasonable") - - print(f"Small files: avg={benchmark['avg_duration']:.3f}s, memory={benchmark['avg_memory_delta']:.1f}MB") - - @patch("knowledgebase_processor.services.processing_service.ProcessingService.process_and_load") - def test_large_dataset_processing_performance(self, mock_process_and_load): - """Test performance with large dataset.""" - mock_process_and_load.return_value = 0 - - args = [ - "process-and-load", self.large_kb_dir, - "--endpoint-url", "http://fake-endpoint:9999/sparql" - ] - - # Single run with extended timeout for large dataset - metrics = PerformanceMetrics() - metrics.start_monitoring() - - try: - result = self.run_cli_command(args, timeout=300) # 5 minute timeout - perf_data = metrics.stop_monitoring() - - self.assertEqual(result.returncode, 0, "Large dataset processing should succeed") - self.assertLess(perf_data['duration'], 240.0, "Large dataset should process within 4 minutes") - self.assertLess(perf_data['memory_delta_mb'], 500.0, "Memory usage should stay under 500MB") - - print(f"Large dataset: duration={perf_data['duration']:.3f}s, memory={perf_data['memory_delta_mb']:.1f}MB") - - except subprocess.TimeoutExpired: - perf_data = metrics.stop_monitoring() - self.fail(f"Large dataset processing timed out after {perf_data['duration']:.1f}s") - - def test_memory_usage_scaling(self): - """Test memory usage doesn't grow excessively with file count.""" - # Test with different sized datasets - datasets = [ - (self.kb_dir, "small"), - (self.large_kb_dir, "large") - ] - - memory_results = {} - - for kb_path, label in datasets: - args = ["process", "--knowledge-base", kb_path, "--rdf-output-dir", self.rdf_output_dir] - - metrics = PerformanceMetrics() - metrics.start_monitoring() - - try: - result = self.run_cli_command(args, timeout=180) - perf_data = metrics.stop_monitoring() - - if result.returncode == 0: - memory_results[label] = perf_data['memory_delta_mb'] - print(f"Memory usage for {label} dataset: {perf_data['memory_delta_mb']:.1f}MB") - - except subprocess.TimeoutExpired: - perf_data = metrics.stop_monitoring() - print(f"Dataset {label} timed out after {perf_data['duration']:.1f}s") - - # Verify memory scaling is reasonable (not exponential) - if 'small' in memory_results and 'large' in memory_results: - # Large dataset has ~33x more files, memory should not scale linearly - memory_ratio = memory_results['large'] / max(memory_results['small'], 1) - self.assertLess(memory_ratio, 50.0, f"Memory scaling too aggressive: {memory_ratio:.1f}x") - - def test_concurrent_cli_invocations(self): - """Test handling multiple concurrent CLI invocations.""" - num_concurrent = 3 - args = ["process", "--knowledge-base", self.kb_dir, "--rdf-output-dir", self.rdf_output_dir] - - def run_single_command(): - try: - result = self.run_cli_command(args, timeout=60) - return {'returncode': result.returncode, 'duration': time.time()} - except subprocess.TimeoutExpired: - return {'returncode': -1, 'error': 'timeout'} - except Exception as e: - return {'returncode': -1, 'error': str(e)} - - start_time = time.time() - - with 
ThreadPoolExecutor(max_workers=num_concurrent) as executor: - futures = [executor.submit(run_single_command) for _ in range(num_concurrent)] - results = [] - - for future in futures: - try: - result = future.result(timeout=90) - results.append(result) - except TimeoutError: - results.append({'returncode': -1, 'error': 'executor_timeout'}) - - end_time = time.time() - total_duration = end_time - start_time - - successful_runs = [r for r in results if r.get('returncode') == 0] - - self.assertGreater(len(successful_runs), 0, "At least one concurrent run should succeed") - self.assertLess(total_duration, 120.0, "Concurrent runs should complete within 2 minutes") - - print(f"Concurrent runs: {len(successful_runs)}/{num_concurrent} successful in {total_duration:.1f}s") - - # RELIABILITY TESTS - - def test_signal_handling_interrupt(self): - """Test CLI gracefully handles SIGINT.""" - if os.name == 'nt': - self.skipTest("Signal handling test not supported on Windows") - - args = ["process", "--knowledge-base", self.large_kb_dir, "--rdf-output-dir", self.rdf_output_dir] - base_command = ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] - full_command = base_command + args - - proc = subprocess.Popen( - full_command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT - ) - - # Let it start processing - time.sleep(2) - - # Send SIGINT - proc.send_signal(signal.SIGINT) - - try: - stdout, stderr = proc.communicate(timeout=30) - - # Process should exit gracefully - self.assertNotEqual(proc.returncode, 0, "Process should exit with error code after SIGINT") - # Should not crash with unhandled exception - self.assertNotIn("Traceback", stderr, "Process should not crash with unhandled exception") - - except subprocess.TimeoutExpired: - proc.kill() - proc.communicate() - self.fail("Process did not respond to SIGINT within timeout") - - def test_signal_handling_terminate(self): - """Test CLI gracefully handles SIGTERM.""" - if os.name == 'nt': - self.skipTest("Signal handling test not supported on Windows") - - args = ["process", "--knowledge-base", self.large_kb_dir, "--rdf-output-dir", self.rdf_output_dir] - base_command = ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] - full_command = base_command + args - - proc = subprocess.Popen( - full_command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT - ) - - # Let it start processing - time.sleep(2) - - # Send SIGTERM - proc.terminate() - - try: - stdout, stderr = proc.communicate(timeout=30) - - # Process should exit - self.assertNotEqual(proc.returncode, 0, "Process should exit after SIGTERM") - - except subprocess.TimeoutExpired: - proc.kill() - proc.communicate() - self.fail("Process did not respond to SIGTERM within timeout") - - def test_malformed_input_handling(self): - """Test CLI stability with malformed input files.""" - malformed_dir = os.path.join(self.temp_dir, "malformed") - os.makedirs(malformed_dir) - - # Create various malformed files - malformed_files = { - "binary.md": b"\x00\x01\x02\xff\xfe\xfd", # Binary data - "huge_line.md": "# Title\n" + "A" * 100000 + "\n", # Very long line - "unicode.md": "# Title\n\u0000\uffff\u200b\u2028\u2029\n", # Problematic Unicode - "empty.md": "", # Empty file - "only_spaces.md": " \n \t \n ", # Only whitespace - } - - for filename, content in malformed_files.items(): - filepath = os.path.join(malformed_dir, filename) - mode = 'wb' if isinstance(content, bytes) else 'w' - encoding = None if 
isinstance(content, bytes) else 'utf-8' - - with open(filepath, mode, encoding=encoding) as f: - f.write(content) - - # Process malformed files - args = ["process", "--knowledge-base", malformed_dir, "--rdf-output-dir", self.rdf_output_dir] - - metrics = PerformanceMetrics() - metrics.start_monitoring() - - try: - result = self.run_cli_command(args, timeout=60) - perf_data = metrics.stop_monitoring() - - # Should complete without crashing, even if some files fail - self.assertIn(result.returncode, [0, 1], "CLI should handle malformed files gracefully") - self.assertNotIn("Traceback", result.stderr, "Should not crash with unhandled exception") - self.assertLess(perf_data['duration'], 30.0, "Should not hang on malformed files") - - except subprocess.TimeoutExpired: - perf_data = metrics.stop_monitoring() - self.fail(f"CLI hung on malformed files for {perf_data['duration']:.1f}s") - - def test_resource_cleanup_on_error(self): - """Test proper resource cleanup when errors occur.""" - # Test with invalid endpoint to trigger error path - args = [ - "process-and-load", self.kb_dir, - "--endpoint-url", "http://invalid-endpoint:99999/sparql" - ] - - metrics = PerformanceMetrics() - metrics.start_monitoring() - - result = self.run_cli_command(args, timeout=60) - perf_data = metrics.stop_monitoring() - - # Should fail but not hang or leak resources - self.assertEqual(result.returncode, 1, "Should fail with invalid endpoint") - self.assertLess(perf_data['duration'], 30.0, "Should fail quickly") - - # Memory should not continue growing after process ends - post_memory = psutil.virtual_memory().used - self.assertLess( - abs(perf_data['memory_delta_mb']), 50.0, - "Should not leak significant memory on error" - ) - - def test_argument_parsing_stress(self): - """Test CLI with various argument combinations.""" - test_cases = [ - # Valid cases - ["--help"], - ["process", "--knowledge-base", self.kb_dir], - ["process", "--knowledge-base", self.kb_dir, "--pattern", "*.md"], - - # Edge cases - ["process", "--knowledge-base", self.kb_dir, "--pattern", ""], - ["process", "--knowledge-base", self.kb_dir, "--pattern", "*" * 100], - - # Invalid cases - ["invalid-command"], - ["process"], # Missing required args - ["process", "--knowledge-base", "/nonexistent/path"], - ] - - for i, args in enumerate(test_cases): - with self.subTest(case=i, args=args): - try: - result = self.run_cli_command(args, timeout=30) - - # Should complete quickly regardless of validity - # Valid cases should succeed, invalid should fail gracefully - self.assertIn(result.returncode, [0, 1, 2], - f"Unexpected return code for args: {args}") - self.assertNotIn("Traceback", result.stderr, - f"Should not crash on args: {args}") - - except subprocess.TimeoutExpired: - self.fail(f"Timeout on argument case: {args}") - - # LOAD TESTS - - def test_stress_large_file_processing(self): - """Stress test with very large individual files.""" - # Create a very large markdown file (10MB) - large_file_dir = os.path.join(self.temp_dir, "large_file") - os.makedirs(large_file_dir) - - large_content = self._generate_markdown_content(50000) # ~10MB - large_file_path = os.path.join(large_file_dir, "huge.md") - - with open(large_file_path, "w", encoding='utf-8') as f: - f.write(large_content) - - args = ["process", "--knowledge-base", large_file_dir, "--rdf-output-dir", self.rdf_output_dir] - - metrics = PerformanceMetrics() - metrics.start_monitoring() - - try: - result = self.run_cli_command(args, timeout=600) # 10 minute timeout - perf_data = 
metrics.stop_monitoring() - - self.assertEqual(result.returncode, 0, "Large file processing should succeed") - self.assertLess(perf_data['memory_delta_mb'], 1000.0, "Memory usage should be reasonable") - - print(f"Large file processing: {perf_data['duration']:.1f}s, {perf_data['memory_delta_mb']:.1f}MB") - - except subprocess.TimeoutExpired: - perf_data = metrics.stop_monitoring() - self.fail(f"Large file processing timed out after {perf_data['duration']:.1f}s") - - def test_stress_many_small_files(self): - """Stress test with many small files.""" - many_files_dir = os.path.join(self.temp_dir, "many_files") - os.makedirs(many_files_dir) - - # Create 1000 small files - for i in range(1000): - filename = f"small_{i:04d}.md" - content = f"# Small File {i}\n\nThis is small file number {i}.\n" - - with open(os.path.join(many_files_dir, filename), "w", encoding='utf-8') as f: - f.write(content) - - args = ["process", "--knowledge-base", many_files_dir, "--rdf-output-dir", self.rdf_output_dir] - - metrics = PerformanceMetrics() - metrics.start_monitoring() - - try: - result = self.run_cli_command(args, timeout=300) # 5 minute timeout - perf_data = metrics.stop_monitoring() - - self.assertEqual(result.returncode, 0, "Many files processing should succeed") - self.assertLess(perf_data['memory_delta_mb'], 200.0, "Memory usage should scale reasonably") - - print(f"Many files processing: {perf_data['duration']:.1f}s, {perf_data['memory_delta_mb']:.1f}MB") - - except subprocess.TimeoutExpired: - perf_data = metrics.stop_monitoring() - self.fail(f"Many files processing timed out after {perf_data['duration']:.1f}s") - - # BENCHMARK TESTS - - def test_command_performance_comparison(self): - """Compare performance across different CLI commands.""" - commands = [ - (["--help"], "help"), - (["process", "--knowledge-base", self.kb_dir, "--rdf-output-dir", self.rdf_output_dir], "process_small"), - ] - - results = {} - - for args, label in commands: - benchmark = self.benchmark_command(args, iterations=3) - if benchmark['successful_runs'] > 0: - results[label] = { - 'avg_duration': benchmark['avg_duration'], - 'avg_memory': benchmark['avg_memory_delta'] - } - - # Print comparison - print("\nCommand Performance Comparison:") - for label, metrics in results.items(): - print(f" {label:20s}: {metrics['avg_duration']:7.3f}s, {metrics['avg_memory']:6.1f}MB") - - # Basic assertions - if 'help' in results: - self.assertLess(results['help']['avg_duration'], 5.0, "Help should be very fast") - - def test_throughput_measurement(self): - """Measure document processing throughput.""" - # Count files in test directory - file_count = len([f for f in os.listdir(self.kb_dir) if f.endswith('.md')]) - - args = ["process", "--knowledge-base", self.kb_dir, "--rdf-output-dir", self.rdf_output_dir] - benchmark = self.benchmark_command(args, iterations=3) - - if benchmark['successful_runs'] > 0: - throughput = file_count / benchmark['avg_duration'] - print(f"Processing throughput: {throughput:.2f} files/second") - - # Should process at least 1 file per 5 seconds on average - self.assertGreater(throughput, 0.2, "Processing throughput too low") - - def test_performance_regression_detection(self): - """Detect potential performance regressions.""" - # Run the same command multiple times to detect variance - args = ["process", "--knowledge-base", self.kb_dir, "--rdf-output-dir", self.rdf_output_dir] - benchmark = self.benchmark_command(args, iterations=5) - - if benchmark['successful_runs'] >= 3: - durations = [r['duration'] for r in 
benchmark['results'] if r['returncode'] == 0] - - # Calculate coefficient of variation (std dev / mean) - mean_duration = sum(durations) / len(durations) - variance = sum((d - mean_duration) ** 2 for d in durations) / len(durations) - std_dev = variance ** 0.5 - cv = std_dev / mean_duration if mean_duration > 0 else 0 - - print(f"Performance variance: CV={cv:.3f}, range={min(durations):.3f}-{max(durations):.3f}s") - - # Performance should be reasonably consistent (CV < 50%) - self.assertLess(cv, 0.5, f"Performance too variable: {cv:.3f}") - - -class TestCLIReliability(unittest.TestCase): - """Reliability and error recovery tests for the CLI.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = tempfile.mkdtemp() - self.kb_dir = os.path.join(self.temp_dir, "kb") - os.makedirs(self.kb_dir) - - # Create basic test file - with open(os.path.join(self.kb_dir, "test.md"), "w") as f: - f.write("# Test\nContent") - - def tearDown(self): - """Clean up test environment.""" - shutil.rmtree(self.temp_dir) - - def run_cli_command(self, args: List[str], **kwargs) -> subprocess.CompletedProcess: - """Run CLI command.""" - base_command = ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] - full_command = base_command + args - - return subprocess.run( - full_command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT, - **kwargs - ) - - def test_error_recovery_invalid_config(self): - """Test recovery from invalid configuration.""" - invalid_config = os.path.join(self.temp_dir, "invalid_config.json") - with open(invalid_config, "w") as f: - f.write("invalid json content") - - args = ["--config", invalid_config, "process", "--knowledge-base", self.kb_dir] - result = self.run_cli_command(args, timeout=30) - - # Should fail gracefully, not crash - self.assertNotEqual(result.returncode, 0, "Should fail with invalid config") - self.assertNotIn("Traceback", result.stderr, "Should not crash") - - def test_graceful_degradation_no_permissions(self): - """Test graceful handling of permission errors.""" - if os.name == 'nt': - self.skipTest("Permission test not reliable on Windows") - - # Create a directory with no read permissions - no_read_dir = os.path.join(self.temp_dir, "no_read") - os.makedirs(no_read_dir) - os.chmod(no_read_dir, 0o000) - - try: - args = ["process", "--knowledge-base", no_read_dir] - result = self.run_cli_command(args, timeout=30) - - # Should handle permission error gracefully - self.assertNotEqual(result.returncode, 0, "Should fail due to permissions") - self.assertNotIn("Traceback", result.stderr, "Should not crash") - - finally: - # Restore permissions for cleanup - os.chmod(no_read_dir, 0o755) - - def test_disk_space_handling(self): - """Test behavior when disk space is limited.""" - # This is hard to test reliably, so we'll mock the scenario - large_output_dir = os.path.join(self.temp_dir, "large_output") - os.makedirs(large_output_dir) - - # Create a large KB that would generate significant output - large_kb = os.path.join(self.temp_dir, "large_kb") - os.makedirs(large_kb) - - for i in range(10): - with open(os.path.join(large_kb, f"doc_{i}.md"), "w") as f: - f.write("# Large Doc\n" + "Content " * 10000) - - args = ["process", "--knowledge-base", large_kb, "--rdf-output-dir", large_output_dir] - result = self.run_cli_command(args, timeout=60) - - # Should either succeed or fail gracefully - self.assertIn(result.returncode, [0, 1], "Should handle large output gracefully") - if result.returncode != 0: - 
self.assertNotIn("Traceback", result.stderr, "Should not crash on errors") - - def test_network_timeout_handling(self): - """Test handling of network timeouts.""" - # Use a non-routable IP to simulate network timeout - args = [ - "process-and-load", self.kb_dir, - "--endpoint-url", "http://10.255.255.1:9999/sparql" - ] - - start_time = time.time() - result = self.run_cli_command(args, timeout=60) - duration = time.time() - start_time - - # Should timeout reasonably quickly and handle it gracefully - self.assertNotEqual(result.returncode, 0, "Should fail with unreachable endpoint") - self.assertLess(duration, 45.0, "Should timeout within reasonable time") - self.assertNotIn("Traceback", result.stderr, "Should handle timeout gracefully") - - -if __name__ == "__main__": - # Configure test runner for performance testing - import sys - - if "--benchmark" in sys.argv: - sys.argv.remove("--benchmark") - # Run only benchmark tests - loader = unittest.TestLoader() - suite = unittest.TestSuite() - - # Add benchmark test methods - suite.addTest(TestCLIPerformance('test_command_performance_comparison')) - suite.addTest(TestCLIPerformance('test_throughput_measurement')) - suite.addTest(TestCLIPerformance('test_performance_regression_detection')) - - runner = unittest.TextTestRunner(verbosity=2) - result = runner.run(suite) - sys.exit(0 if result.wasSuccessful() else 1) - - else: - # Run all tests - unittest.main(verbosity=2) \ No newline at end of file diff --git a/tests/cli/test_cli_reliability.py b/tests/cli/test_cli_reliability.py deleted file mode 100644 index 2d71bdc..0000000 --- a/tests/cli/test_cli_reliability.py +++ /dev/null @@ -1,566 +0,0 @@ -""" -CLI Reliability and Error Recovery Tests - -This module focuses on testing the CLI's behavior under various error conditions, -signal handling, resource constraints, and edge cases. -""" - -import os -import sys -import tempfile -import shutil -import subprocess -import signal -import threading -import time -import pytest -from pathlib import Path -from unittest.mock import patch, mock_open -from typing import List, Dict, Any -import json -import resource - - -PROJECT_ROOT = Path(__file__).parent.parent.parent - - -class TestCLIReliability: - """Test CLI reliability and error handling.""" - - @pytest.fixture - def temp_workspace(self): - """Create temporary workspace for testing.""" - temp_dir = tempfile.mkdtemp(prefix="cli_reliability_") - yield Path(temp_dir) - shutil.rmtree(temp_dir) - - @pytest.fixture - def basic_kb(self, temp_workspace): - """Create basic knowledge base for testing.""" - kb_dir = temp_workspace / "basic_kb" - kb_dir.mkdir() - - (kb_dir / "test.md").write_text("""# Test Document - -This is a basic test document for reliability testing. - -## Section 1 -Content here. - -## Section 2 -More content here. 
-""") - return kb_dir - - def run_cli_command(self, args: List[str], timeout: int = 30, **kwargs) -> subprocess.CompletedProcess: - """Run CLI command with specified timeout.""" - base_command = ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] - full_command = base_command + args - - return subprocess.run( - full_command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT, - timeout=timeout, - **kwargs - ) - - def start_cli_process(self, args: List[str]) -> subprocess.Popen: - """Start CLI process for signal testing.""" - base_command = ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] - full_command = base_command + args - - return subprocess.Popen( - full_command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=PROJECT_ROOT - ) - - # Error Handling Tests - - def test_invalid_command_handling(self): - """Test handling of invalid commands.""" - result = self.run_cli_command(["invalid-command"]) - - assert result.returncode != 0 - assert "invalid choice" in result.stderr.lower() or "error" in result.stderr.lower() - # Should not crash with unhandled exception - assert "Traceback" not in result.stderr - - def test_missing_required_arguments(self): - """Test handling of missing required arguments.""" - test_cases = [ - ["process"], # Missing --knowledge-base - ["process-and-load"], # Missing knowledge base path - ["query"], # Missing query string - ["sparql", "query"], # Missing SPARQL query - ] - - for args in test_cases: - with pytest.subTest(args=args): - result = self.run_cli_command(args) - - assert result.returncode != 0 - # Should show usage information, not crash - assert "Traceback" not in result.stderr - assert len(result.stderr) > 0 # Should have error message - - def test_invalid_file_paths(self, temp_workspace): - """Test handling of invalid file paths.""" - test_cases = [ - str(temp_workspace / "nonexistent"), # Non-existent directory - str(temp_workspace / "file.txt"), # File instead of directory (we'll create this) - "/dev/null", # Device file - "", # Empty path - "." 
* 300, # Very long path - ] - - # Create a regular file for testing - (temp_workspace / "file.txt").write_text("not a directory") - - for path in test_cases: - with pytest.subTest(path=path): - args = ["process", "--knowledge-base", path] - result = self.run_cli_command(args) - - assert result.returncode != 0 - assert "Traceback" not in result.stderr - - def test_invalid_urls(self, basic_kb): - """Test handling of invalid SPARQL URLs.""" - invalid_urls = [ - "not-a-url", - "http://", - "ftp://example.com/sparql", - "http://256.256.256.256:99999/sparql", # Invalid IP - "http://localhost:99999/sparql", # Invalid port - "", - ] - - for url in invalid_urls: - with pytest.subTest(url=url): - args = [ - "process-and-load", str(basic_kb), - "--endpoint-url", url - ] - result = self.run_cli_command(args) - - assert result.returncode != 0 - assert "Traceback" not in result.stderr - - def test_malformed_configuration_files(self, temp_workspace, basic_kb): - """Test handling of malformed configuration files.""" - config_tests = [ - ("invalid_json.json", "{invalid json}"), - ("empty.json", ""), - ("binary.json", b"\x00\x01\x02\xff"), - ("huge.json", '{"key": "' + "a" * 1000000 + '"}'), - ] - - for filename, content in config_tests: - with pytest.subTest(config=filename): - config_file = temp_workspace / filename - - if isinstance(content, bytes): - config_file.write_bytes(content) - else: - config_file.write_text(content) - - args = [ - "--config", str(config_file), - "process", "--knowledge-base", str(basic_kb) - ] - result = self.run_cli_command(args) - - assert result.returncode != 0 - # Should handle gracefully, not crash - assert "Traceback" not in result.stderr - - # Malformed Input Tests - - def test_malformed_markdown_files(self, temp_workspace): - """Test processing malformed markdown files.""" - malformed_kb = temp_workspace / "malformed_kb" - malformed_kb.mkdir() - - malformed_files = { - "binary.md": b"\x00\x01\x02\xff\xfe\xfd", - "huge_line.md": "# Title\n" + "A" * 100000 + "\n", - "unicode_issues.md": "# Title\n\u0000\uffff\u200b\u2028\u2029\n", - "empty.md": "", - "only_whitespace.md": " \n \t \n ", - "control_chars.md": "# Title\n\x01\x02\x03\x1f\n", - "null_bytes.md": "# Title\nContent with \x00 null bytes\n", - } - - for filename, content in malformed_files.items(): - filepath = malformed_kb / filename - if isinstance(content, bytes): - filepath.write_bytes(content) - else: - filepath.write_text(content, encoding='utf-8', errors='replace') - - rdf_dir = temp_workspace / "malformed_rdf" - rdf_dir.mkdir() - - args = [ - "process", - "--knowledge-base", str(malformed_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - result = self.run_cli_command(args, timeout=60) - - # Should handle malformed files gracefully - # May succeed with warnings or fail gracefully, but shouldn't crash - assert result.returncode in [0, 1] - assert "Traceback" not in result.stderr - - def test_extremely_large_files(self, temp_workspace): - """Test handling of extremely large files.""" - large_kb = temp_workspace / "large_kb" - large_kb.mkdir() - - # Create a very large file (~5MB) - large_content = "# Huge Document\n\n" + ("Content line " * 50 + "\n") * 10000 - (large_kb / "huge.md").write_text(large_content) - - rdf_dir = temp_workspace / "large_rdf" - rdf_dir.mkdir() - - args = [ - "process", - "--knowledge-base", str(large_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - # Should either succeed or fail gracefully within reasonable time - result = self.run_cli_command(args, timeout=120) - - # Should not crash 
regardless of success/failure - assert "Traceback" not in result.stderr - - if result.returncode == 0: - # If successful, should have created output - assert len(list(rdf_dir.glob("*.ttl"))) > 0 - - # Signal Handling Tests - - @pytest.mark.skipif(os.name == 'nt', reason="Signal handling not reliable on Windows") - def test_sigint_handling(self, basic_kb, temp_workspace): - """Test graceful handling of SIGINT (Ctrl+C).""" - rdf_dir = temp_workspace / "sigint_rdf" - rdf_dir.mkdir() - - # Create larger dataset to ensure process runs long enough - large_kb = temp_workspace / "large_for_sigint" - large_kb.mkdir() - - for i in range(20): - content = f"# Document {i}\n\n" + ("Content line\n" * 100) - (large_kb / f"doc_{i:02d}.md").write_text(content) - - args = [ - "process", - "--knowledge-base", str(large_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - proc = self.start_cli_process(args) - - # Let it start processing - time.sleep(2) - - # Send SIGINT - proc.send_signal(signal.SIGINT) - - try: - stdout, stderr = proc.communicate(timeout=30) - - # Should exit with non-zero code but not crash - assert proc.returncode != 0 - assert "Traceback" not in stderr - - except subprocess.TimeoutExpired: - proc.kill() - proc.communicate() - pytest.fail("Process did not respond to SIGINT within timeout") - - @pytest.mark.skipif(os.name == 'nt', reason="Signal handling not reliable on Windows") - def test_sigterm_handling(self, basic_kb, temp_workspace): - """Test graceful handling of SIGTERM.""" - rdf_dir = temp_workspace / "sigterm_rdf" - rdf_dir.mkdir() - - # Create larger dataset - large_kb = temp_workspace / "large_for_sigterm" - large_kb.mkdir() - - for i in range(15): - content = f"# Document {i}\n\n" + ("Content line\n" * 150) - (large_kb / f"doc_{i:02d}.md").write_text(content) - - args = [ - "process", - "--knowledge-base", str(large_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - proc = self.start_cli_process(args) - - # Let it start processing - time.sleep(2) - - # Send SIGTERM - proc.terminate() - - try: - stdout, stderr = proc.communicate(timeout=30) - - # Should exit - assert proc.returncode != 0 - - except subprocess.TimeoutExpired: - proc.kill() - proc.communicate() - pytest.fail("Process did not respond to SIGTERM within timeout") - - # Resource Constraint Tests - - @pytest.mark.skipif(os.name == 'nt', reason="Resource limits not supported on Windows") - def test_memory_limit_handling(self, basic_kb, temp_workspace): - """Test behavior under memory constraints.""" - # Set a memory limit for the subprocess (not the test process) - # This is tricky to test reliably, so we'll do a basic check - - # Create many files to potentially trigger memory issues - many_files_kb = temp_workspace / "many_files" - many_files_kb.mkdir() - - for i in range(100): - content = f"# Document {i}\n\n" + ("Line of content\n" * 500) - (many_files_kb / f"doc_{i:03d}.md").write_text(content) - - rdf_dir = temp_workspace / "memory_test_rdf" - rdf_dir.mkdir() - - args = [ - "process", - "--knowledge-base", str(many_files_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - # Run with extended timeout - result = self.run_cli_command(args, timeout=180) - - # Should either succeed or fail gracefully, not crash with OOM - assert "Traceback" not in result.stderr - assert "MemoryError" not in result.stderr - - def test_disk_space_simulation(self, basic_kb, temp_workspace): - """Simulate disk space constraints by filling up output directory.""" - rdf_dir = temp_workspace / "full_disk_rdf" - rdf_dir.mkdir() - - # Fill the directory 
with dummy files to simulate disk space issues - # (This won't actually fill the disk but may trigger some error paths) - for i in range(10): - dummy_file = rdf_dir / f"dummy_{i}.txt" - dummy_file.write_text("x" * 100000) # 100KB files - - args = [ - "process", - "--knowledge-base", str(basic_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - result = self.run_cli_command(args, timeout=60) - - # Should handle gracefully regardless of success/failure - assert "Traceback" not in result.stderr - - # Network and Connectivity Tests - - def test_network_timeout_handling(self, basic_kb): - """Test handling of network timeouts.""" - # Use non-routable IP to simulate timeout - args = [ - "process-and-load", str(basic_kb), - "--endpoint-url", "http://10.255.255.1:9999/sparql" - ] - - start_time = time.time() - result = self.run_cli_command(args, timeout=60) - duration = time.time() - start_time - - # Should timeout reasonably quickly - assert result.returncode != 0 - assert duration < 45 # Should not hang for too long - assert "Traceback" not in result.stderr - - def test_invalid_endpoint_response_handling(self, basic_kb): - """Test handling of invalid endpoint responses.""" - # Use a non-SPARQL endpoint that will return unexpected responses - args = [ - "process-and-load", str(basic_kb), - "--endpoint-url", "http://httpbin.org/status/500" - ] - - result = self.run_cli_command(args, timeout=60) - - # Should fail gracefully - assert result.returncode != 0 - assert "Traceback" not in result.stderr - - # Resource Cleanup Tests - - def test_cleanup_on_early_exit(self, basic_kb, temp_workspace): - """Test that resources are cleaned up on early exit.""" - rdf_dir = temp_workspace / "cleanup_test_rdf" - rdf_dir.mkdir() - - # Create a scenario that might cause early exit - args = [ - "process", "--knowledge-base", "/nonexistent", - "--rdf-output-dir", str(rdf_dir) - ] - - result = self.run_cli_command(args, timeout=30) - - # Should exit quickly and cleanly - assert result.returncode != 0 - assert "Traceback" not in result.stderr - - # Directory should still exist but should be empty or minimal - assert rdf_dir.exists() - - def test_partial_processing_recovery(self, temp_workspace): - """Test recovery from partial processing failures.""" - mixed_kb = temp_workspace / "mixed_kb" - mixed_kb.mkdir() - - # Create mix of valid and invalid files - (mixed_kb / "valid1.md").write_text("# Valid Document 1\nContent here.") - (mixed_kb / "valid2.md").write_text("# Valid Document 2\nMore content.") - (mixed_kb / "invalid.md").write_bytes(b"\xff\xfe\xfd\x00") # Binary data - (mixed_kb / "valid3.md").write_text("# Valid Document 3\nFinal content.") - - rdf_dir = temp_workspace / "mixed_rdf" - rdf_dir.mkdir() - - args = [ - "process", - "--knowledge-base", str(mixed_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - result = self.run_cli_command(args, timeout=60) - - # Should process what it can, may succeed or fail but shouldn't crash - assert result.returncode in [0, 1] - assert "Traceback" not in result.stderr - - # Should have processed at least some valid files - ttl_files = list(rdf_dir.glob("*.ttl")) - # At least some processing should have occurred - assert len(ttl_files) >= 0 # May be 0 if it fails early, but shouldn't crash - - # Edge Case Tests - - def test_unicode_handling_in_paths(self, temp_workspace): - """Test handling of Unicode characters in file paths.""" - unicode_kb = temp_workspace / "unicode_ๆต‹่ฏ•_๐Ÿš€" - unicode_kb.mkdir() - - # Create files with Unicode names - unicode_files = [ - "ๆต‹่ฏ•ๆ–‡ๆกฃ.md", - 
"๐Ÿš€_rocket.md", - "cafรฉ_rรฉsumรฉ.md", - "ั„ะฐะนะป.md", - ] - - for filename in unicode_files: - try: - filepath = unicode_kb / filename - filepath.write_text(f"# {filename}\n\nUnicode content: ๆต‹่ฏ• ๐Ÿš€ cafรฉ ั„ะฐะนะป") - except (OSError, UnicodeError): - # Skip files that can't be created on this filesystem - continue - - # Only test if we successfully created files - if list(unicode_kb.glob("*.md")): - rdf_dir = temp_workspace / "unicode_rdf" - rdf_dir.mkdir() - - args = [ - "process", - "--knowledge-base", str(unicode_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - result = self.run_cli_command(args, timeout=60) - - # Should handle Unicode gracefully - assert "Traceback" not in result.stderr - - def test_concurrent_access_simulation(self, basic_kb, temp_workspace): - """Simulate concurrent access to the same knowledge base.""" - import threading - import queue - - results_queue = queue.Queue() - - def run_process(instance_id): - rdf_dir = temp_workspace / f"concurrent_{instance_id}" - rdf_dir.mkdir(exist_ok=True) - - args = [ - "process", - "--knowledge-base", str(basic_kb), - "--rdf-output-dir", str(rdf_dir) - ] - - try: - result = self.run_cli_command(args, timeout=60) - results_queue.put({ - 'instance': instance_id, - 'returncode': result.returncode, - 'stderr': result.stderr, - 'success': result.returncode == 0 - }) - except Exception as e: - results_queue.put({ - 'instance': instance_id, - 'error': str(e), - 'success': False - }) - - # Start 3 concurrent processes - threads = [] - for i in range(3): - thread = threading.Thread(target=run_process, args=(i,)) - threads.append(thread) - thread.start() - - # Wait for all to complete - for thread in threads: - thread.join(timeout=90) - - # Collect results - results = [] - while not results_queue.empty(): - results.append(results_queue.get()) - - # At least some should succeed - successful = [r for r in results if r.get('success', False)] - assert len(successful) >= 1, f"No concurrent processes succeeded: {results}" - - # None should crash - for result in results: - if 'stderr' in result: - assert "Traceback" not in result['stderr'] \ No newline at end of file diff --git a/tests/cli/test_cli_validation.py b/tests/cli/test_cli_validation.py deleted file mode 100644 index 23e1b55..0000000 --- a/tests/cli/test_cli_validation.py +++ /dev/null @@ -1,420 +0,0 @@ -"""Unit tests for CLI input validation and error conditions.""" - -import argparse -import pytest -from pathlib import Path -from unittest.mock import Mock, patch -from urllib.parse import urlparse - -from knowledgebase_processor.cli.main import ( - parse_args, is_valid_url, main -) -from knowledgebase_processor.config import Config -from knowledgebase_processor.api import KnowledgeBaseAPI - - -class TestArgumentValidation: - """Test CLI argument validation.""" - - def test_required_command_missing(self): - """Test that missing command raises SystemExit.""" - with pytest.raises(SystemExit): - parse_args([]) - - def test_invalid_log_level(self): - """Test invalid log level raises SystemExit.""" - with pytest.raises(SystemExit): - parse_args(["--log-level", "INVALID", "process"]) - - def test_valid_log_levels(self): - """Test all valid log levels are accepted.""" - valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] - for level in valid_levels: - args = parse_args(["--log-level", level, "process"]) - assert args.log_level == level - - def test_invalid_log_format(self): - """Test invalid log format raises SystemExit.""" - with pytest.raises(SystemExit): - 
parse_args(["--log-format", "invalid", "process"]) - - def test_valid_log_formats(self): - """Test all valid log formats are accepted.""" - valid_formats = ["text", "json"] - for format_type in valid_formats: - args = parse_args(["--log-format", format_type, "process"]) - assert args.log_format == format_type - - def test_invalid_query_type(self): - """Test invalid query type raises SystemExit.""" - with pytest.raises(SystemExit): - parse_args(["query", "test", "--type", "invalid"]) - - def test_valid_query_types(self): - """Test all valid query types are accepted.""" - valid_types = ["text", "tag", "topic"] - for query_type in valid_types: - args = parse_args(["query", "test", "--type", query_type]) - assert args.type == query_type - - def test_invalid_sparql_format(self): - """Test invalid SPARQL output format raises SystemExit.""" - with pytest.raises(SystemExit): - parse_args(["sparql", "query", "SELECT * WHERE { ?s ?p ?o }", "--format", "invalid"]) - - def test_valid_sparql_formats(self): - """Test all valid SPARQL output formats are accepted.""" - valid_formats = ["json", "table", "turtle"] - for format_type in valid_formats: - args = parse_args(["sparql", "query", "SELECT * WHERE { ?s ?p ?o }", "--format", format_type]) - assert args.format == format_type - - def test_invalid_rdf_format(self): - """Test invalid RDF format raises SystemExit.""" - with pytest.raises(SystemExit): - parse_args(["sparql", "load-file", "data.ttl", "--rdf-format", "invalid"]) - - def test_valid_rdf_formats(self): - """Test all valid RDF formats are accepted.""" - valid_formats = ["turtle", "n3", "nt", "xml", "json-ld"] - for format_type in valid_formats: - args = parse_args(["sparql", "load-file", "data.ttl", "--rdf-format", format_type]) - assert args.rdf_format == format_type - - def test_sparql_subcommand_required(self): - """Test that SPARQL subcommand is required.""" - with pytest.raises(SystemExit): - parse_args(["sparql"]) - - def test_sparql_query_requires_query_string(self): - """Test that SPARQL query requires query string.""" - with pytest.raises(SystemExit): - parse_args(["sparql", "query"]) - - def test_sparql_load_file_requires_file_path(self): - """Test that SPARQL load-file requires file path.""" - with pytest.raises(SystemExit): - parse_args(["sparql", "load-file"]) - - def test_query_command_requires_query_string(self): - """Test that query command requires query string.""" - with pytest.raises(SystemExit): - parse_args(["query"]) - - -class TestUrlValidation: - """Test URL validation function.""" - - def test_valid_urls(self): - """Test various valid URL formats.""" - valid_urls = [ - "http://example.com", - "https://example.com", - "http://localhost", - "https://localhost", - "http://127.0.0.1", - "https://127.0.0.1", - "http://example.com:8080", - "https://example.com:443", - "http://localhost:3030", - "http://example.com/sparql", - "https://example.com/sparql/query", - "http://example.com/path/to/endpoint", - "https://sub.example.com", - "http://example.com/sparql?param=value", - "https://example.com/sparql#fragment", - ] - - for url in valid_urls: - assert is_valid_url(url), f"URL should be valid: {url}" - - def test_invalid_urls(self): - """Test various invalid URL formats.""" - invalid_urls = [ - "", # Empty string - None, # None value - "example.com", # No scheme - "ftp://example.com", # Wrong scheme (not http/https) - "http://", # No netloc - "https://", # No netloc - "http:///path", # Empty netloc - "://example.com", # Empty scheme - "not-a-url", # Not a URL at all - "just 
text", # Plain text - "http:/example.com", # Missing slash - "http//example.com", # Missing colon - "http:", # Incomplete URL - "://", # Just scheme separator - " http://example.com", # Leading whitespace - "http://example.com ", # Trailing whitespace - ] - - for url in invalid_urls: - assert not is_valid_url(url), f"URL should be invalid: {url}" - - def test_url_validation_with_special_characters(self): - """Test URL validation with special characters.""" - # Valid URLs with special characters - valid_special_urls = [ - "http://example.com/path%20with%20spaces", - "https://user:pass@example.com", - "http://192.168.1.1:8080/endpoint", - "https://example-site.co.uk", - "http://api.example.com/v1/sparql", - ] - - for url in valid_special_urls: - assert is_valid_url(url), f"URL with special characters should be valid: {url}" - - def test_url_validation_edge_cases(self): - """Test URL validation edge cases.""" - # Edge cases that should be invalid - edge_cases = [ - "http://", - "https://", - "http:// ", - "http://.", - "http://..", - "http://../", - "http://?", - "http://#", - ] - - for url in edge_cases: - assert not is_valid_url(url), f"Edge case should be invalid: {url}" - - -class TestConfigurationValidation: - """Test configuration validation and error handling.""" - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - def test_config_loading_failure(self, mock_load_config, mock_setup_logging): - """Test behavior when config loading fails.""" - mock_load_config.side_effect = Exception("Config loading failed") - - # Should not crash, but may behave unexpectedly depending on implementation - # This tests that the exception propagates properly - with pytest.raises(Exception, match="Config loading failed"): - main(["process"]) - - @patch('knowledgebase_processor.cli.main.setup_logging') - @patch('knowledgebase_processor.cli.main.load_config') - @patch('knowledgebase_processor.cli.main.KnowledgeBaseAPI') - def test_api_initialization_failure(self, mock_api_class, mock_load_config, mock_setup_logging): - """Test behavior when API initialization fails.""" - mock_config = Mock(spec=Config) - mock_config.knowledge_base_path = "/test/kb" - mock_load_config.return_value = mock_config - mock_api_class.side_effect = Exception("API initialization failed") - - result = main(["process"]) - - assert result == 1 - - @patch('knowledgebase_processor.cli.main.setup_logging') - def test_logging_setup_failure(self, mock_setup_logging): - """Test behavior when logging setup fails.""" - mock_setup_logging.side_effect = Exception("Logging setup failed") - - # Should not crash the entire application - with pytest.raises(Exception, match="Logging setup failed"): - main(["process"]) - - -class TestFilePathValidation: - """Test file path validation scenarios.""" - - def test_config_file_path_validation(self): - """Test config file path handling.""" - args = parse_args(["--config", "/path/to/config.yaml", "process"]) - assert args.config == "/path/to/config.yaml" - - args = parse_args(["--config", "relative/config.yaml", "process"]) - assert args.config == "relative/config.yaml" - - def test_knowledge_base_path_validation(self): - """Test knowledge base path handling.""" - args = parse_args(["--knowledge-base", "/absolute/path", "process"]) - assert args.knowledge_base == "/absolute/path" - - args = parse_args(["--knowledge-base", "relative/path", "process"]) - assert args.knowledge_base == "relative/path" - - def 
test_metadata_store_path_validation(self): - """Test metadata store path handling.""" - args = parse_args(["--metadata-store", "/absolute/metadata", "process"]) - assert args.metadata_store == "/absolute/metadata" - - args = parse_args(["--metadata-store", "relative/metadata", "process"]) - assert args.metadata_store == "relative/metadata" - - def test_log_file_path_validation(self): - """Test log file path handling.""" - args = parse_args(["--log-file", "/var/log/app.log", "process"]) - assert args.log_file == "/var/log/app.log" - - args = parse_args(["--log-file", "app.log", "process"]) - assert args.log_file == "app.log" - - def test_rdf_output_dir_validation(self): - """Test RDF output directory path handling.""" - args = parse_args(["process", "--rdf-output-dir", "/tmp/rdf"]) - assert args.rdf_output_dir == "/tmp/rdf" - - args = parse_args(["process", "--rdf-output-dir", "output"]) - assert args.rdf_output_dir == "output" - - def test_sparql_load_file_path_validation(self): - """Test SPARQL load file path handling.""" - args = parse_args(["sparql", "load-file", "/absolute/path/data.ttl"]) - assert args.file_path == "/absolute/path/data.ttl" - - args = parse_args(["sparql", "load-file", "relative/data.ttl"]) - assert args.file_path == "relative/data.ttl" - - -class TestNumericParameterValidation: - """Test numeric parameter validation.""" - - def test_timeout_parameter_validation(self): - """Test timeout parameter accepts integers.""" - args = parse_args(["sparql", "query", "SELECT * WHERE { ?s ?p ?o }", "--timeout", "60"]) - assert args.timeout == 60 - assert isinstance(args.timeout, int) - - def test_timeout_parameter_default_value(self): - """Test timeout parameter default value.""" - args = parse_args(["sparql", "query", "SELECT * WHERE { ?s ?p ?o }"]) - assert args.timeout == 30 - - def test_invalid_timeout_parameter(self): - """Test invalid timeout parameter raises SystemExit.""" - with pytest.raises(SystemExit): - parse_args(["sparql", "query", "SELECT * WHERE { ?s ?p ?o }", "--timeout", "not-a-number"]) - - -class TestArgumentCombinations: - """Test argument combinations and conflicts.""" - - def test_global_options_with_process_command(self): - """Test global options work with process command.""" - args = parse_args([ - "--config", "config.yaml", - "--knowledge-base", "/kb", - "--metadata-store", "/metadata", - "--log-level", "DEBUG", - "--log-file", "app.log", - "--log-format", "json", - "process", - "--pattern", "*.md", - "--rdf-output-dir", "/output" - ]) - - assert args.config == "config.yaml" - assert args.knowledge_base == "/kb" - assert args.metadata_store == "/metadata" - assert args.log_level == "DEBUG" - assert args.log_file == "app.log" - assert args.log_format == "json" - assert args.command == "process" - assert args.pattern == "*.md" - assert args.rdf_output_dir == "/output" - - def test_global_options_with_query_command(self): - """Test global options work with query command.""" - args = parse_args([ - "--config", "config.yaml", - "--log-level", "WARNING", - "query", "test search", "--type", "tag" - ]) - - assert args.config == "config.yaml" - assert args.log_level == "WARNING" - assert args.command == "query" - assert args.query_string == "test search" - assert args.type == "tag" - - def test_global_options_with_sparql_command(self): - """Test global options work with SPARQL command.""" - args = parse_args([ - "--config", "config.yaml", - "--log-level", "ERROR", - "sparql", "query", "SELECT * WHERE { ?s ?p ?o }", - "--format", "json", - "--timeout", "120" - ]) 
- - assert args.config == "config.yaml" - assert args.log_level == "ERROR" - assert args.command == "sparql" - assert args.sparql_command == "query" - assert args.sparql_query == "SELECT * WHERE { ?s ?p ?o }" - assert args.format == "json" - assert args.timeout == 120 - - def test_credential_parameters_work_together(self): - """Test username and password parameters work together.""" - args = parse_args([ - "sparql", "query", "SELECT * WHERE { ?s ?p ?o }", - "--user", "testuser", - "--password", "testpass" - ]) - - assert args.user == "testuser" - assert args.password == "testpass" - - def test_endpoint_url_parameters_work_together(self): - """Test endpoint URL parameters work with other options.""" - args = parse_args([ - "process-and-load", "/kb", - "--endpoint-url", "http://localhost:3030/sparql", - "--graph", "http://example.org/graph", - "--user", "admin", - "--password", "secret" - ]) - - assert args.endpoint_url == "http://localhost:3030/sparql" - assert args.graph == "http://example.org/graph" - assert args.user == "admin" - assert args.password == "secret" - - -class TestErrorMessageValidation: - """Test that appropriate error messages are generated.""" - - def test_missing_required_argument_error_message(self): - """Test error messages for missing required arguments.""" - # Test missing command - with pytest.raises(SystemExit): - try: - parse_args([]) - except SystemExit as e: - # argparse should have printed an error message - pass - else: - pytest.fail("Should have raised SystemExit") - - def test_invalid_choice_error_message(self): - """Test error messages for invalid choices.""" - # Test invalid log level - with pytest.raises(SystemExit): - try: - parse_args(["--log-level", "INVALID", "process"]) - except SystemExit as e: - # argparse should have printed an error message about invalid choice - pass - else: - pytest.fail("Should have raised SystemExit") - - def test_invalid_type_error_message(self): - """Test error messages for invalid types.""" - # Test invalid timeout (non-integer) - with pytest.raises(SystemExit): - try: - parse_args(["sparql", "query", "SELECT * WHERE { ?s ?p ?o }", "--timeout", "not-a-number"]) - except SystemExit as e: - # argparse should have printed an error message about invalid type - pass - else: - pytest.fail("Should have raised SystemExit") \ No newline at end of file diff --git a/tests/cli_v2/test_cli_workflow_e2e.py b/tests/cli/test_cli_workflow_e2e.py similarity index 99% rename from tests/cli_v2/test_cli_workflow_e2e.py rename to tests/cli/test_cli_workflow_e2e.py index 352f785..6ed4e05 100644 --- a/tests/cli_v2/test_cli_workflow_e2e.py +++ b/tests/cli/test_cli_workflow_e2e.py @@ -8,7 +8,7 @@ import time import requests -from knowledgebase_processor.cli_v2.main import cli +from knowledgebase_processor.cli.main import cli class TestCLIWorkflowE2E: diff --git a/tests/cli/test_process_and_load_e2e.py b/tests/cli/test_process_and_load_e2e.py deleted file mode 100644 index d80b90c..0000000 --- a/tests/cli/test_process_and_load_e2e.py +++ /dev/null @@ -1,680 +0,0 @@ -import os -import tempfile -import shutil -import unittest -from unittest.mock import patch, MagicMock -from subprocess import run, PIPE, Popen, TimeoutExpired -import time -import json -from pathlib import Path - -# Helper to get the project root -PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) - -class TestProcessAndLoadE2E(unittest.TestCase): - def setUp(self): - self.temp_dir = tempfile.mkdtemp() - self.kb_dir = os.path.join(self.temp_dir, "kb") - 
os.makedirs(self.kb_dir) - - # Create sample files - with open(os.path.join(self.kb_dir, "sample1.md"), "w") as f: - f.write("# Sample Document 1\n\nContent one.") - with open(os.path.join(self.kb_dir, "sample2.md"), "w") as f: - f.write("# Sample Document 2\n\nContent two.") - - # Malformed file (if needed for a test) - with open(os.path.join(self.kb_dir, "malformed.md"), "w") as f: - f.write("Just some text without structure.") - - def tearDown(self): - shutil.rmtree(self.temp_dir) - - def run_cli_command(self, args, **kwargs): - base_command = ["poetry", "run", "python", "-m", "knowledgebase_processor.cli.main"] - full_command = base_command + args - return run(full_command, stdout=PIPE, stderr=PIPE, text=True, cwd=PROJECT_ROOT, **kwargs) - - @patch("knowledgebase_processor.services.processing_service.ProcessingService.process_and_load") - def test_successful_run_mocked(self, mock_process_and_load): - mock_process_and_load.return_value = 0 - - args = [ - "process-and-load", self.kb_dir, - "--endpoint-url", "http://fake-sparql-endpoint:9999/sparql", - "--cleanup" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0, f"CLI command failed with stderr: {result.stderr}") - self.assertIn("Processing and loading completed successfully", result.stderr) - mock_process_and_load.assert_called_once() - - @patch("src.knowledgebase_processor.services.sparql_service.SparqlQueryInterface") - def test_process_and_load_with_credentials(self, mock_sparql_interface): - """Test that user and password are passed to the SPARQL interface.""" - mock_instance = mock_sparql_interface.return_value - mock_instance.load_file.return_value = None - - args = [ - "process-and-load", self.kb_dir, - "--endpoint-url", "http://fake-sparql-endpoint:9999/sparql", - "--user", "testuser", - "--password", "testpass" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0, f"CLI command failed with stderr: {result.stderr}") - - # Check if SparqlQueryInterface was instantiated with credentials - mock_sparql_interface.assert_called_with( - endpoint_url='http://fake-sparql-endpoint:9999/sparql', - update_endpoint_url='http://fake-sparql-endpoint:9999/update', - username='testuser', - password='testpass' - ) - - def test_invalid_knowledge_base_path(self): - invalid_path = os.path.join(self.temp_dir, "non_existent_kb") - args = ["process-and-load", invalid_path, "--endpoint-url", "http://example.com/sparql"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 1) - self.assertIn("is not a directory", result.stderr) - - def test_missing_endpoint_url(self): - args = ["process-and-load", self.kb_dir] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 1) - self.assertIn("SPARQL endpoint URL is required", result.stderr) - - def test_invalid_endpoint_url(self): - args = ["process-and-load", self.kb_dir, "--endpoint-url", "not-a-valid-url"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 1) - self.assertIn("Invalid SPARQL endpoint URL", result.stderr) - - def test_unreachable_endpoint(self): - # This test attempts to connect to a non-existent port, simulating an unreachable endpoint. 
- args = [ - "process-and-load", self.kb_dir, - "--endpoint-url", "http://localhost:9999/sparql", - "--update-endpoint-url", "http://localhost:9999/update" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 1) - self.assertIn("A SPARQL error occurred", result.stderr) - self.assertIn("check if the SPARQL endpoint", result.stderr) - - def test_batch_processing_and_cleanup(self): - rdf_output_dir = os.path.join(self.temp_dir, "rdf_output") - - with patch("knowledgebase_processor.services.sparql_service.SparqlQueryInterface.update") as mock_update: - mock_update.return_value = True - - args = [ - "process-and-load", self.kb_dir, - "--endpoint-url", "http://example.com/sparql", - "--rdf-output-dir", rdf_output_dir, - "--cleanup" - ] - - # Ensure the output dir exists before running - os.makedirs(rdf_output_dir) - - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0, f"STDERR: {result.stderr}") - - # Check that RDF files were created and then removed - self.assertTrue(os.path.exists(rdf_output_dir)) # Dir should still be there - self.assertEqual(len(os.listdir(rdf_output_dir)), 0, "RDF output directory should be empty after cleanup") - - # Check that update was called for each file - self.assertEqual(mock_update.call_count, 3) # sample1, sample2, malformed - - def test_rdf_output_validation(self): - rdf_output_dir = os.path.join(self.temp_dir, "rdf_output_no_cleanup") - - with patch("knowledgebase_processor.services.sparql_service.SparqlQueryInterface.update"): - args = [ - "process-and-load", self.kb_dir, - "--endpoint-url", "http://example.com/sparql", - "--rdf-output-dir", rdf_output_dir - # No --cleanup - ] - - result = self.run_cli_command(args) - self.assertEqual(result.returncode, 0, f"STDERR: {result.stderr}") - - # Check that RDF files exist - self.assertTrue(os.path.exists(rdf_output_dir)) - output_files = os.listdir(rdf_output_dir) - self.assertEqual(len(output_files), 3) - - # Validate content of one of the TTL files - sample1_ttl_path = os.path.join(rdf_output_dir, "sample1.ttl") - self.assertTrue(os.path.exists(sample1_ttl_path)) - - with open(sample1_ttl_path, "r") as f: - content = f.read() - self.assertIn('kb:title "Sample Document 1"', content) - self.assertIn('kb:hasContent "Content one."', content) - - @unittest.skipIf("CI" not in os.environ, "Docker tests are run in CI environment only") - def test_e2e_with_docker_fuseki(self): - fuseki_proc = None - try: - # Start Fuseki from docker-compose - run(["docker-compose", "up", "-d", "fuseki"], check=True, cwd=PROJECT_ROOT) - time.sleep(10) # Give it time to start up - - endpoint_url = "http://localhost:3030/ds/query" - update_endpoint_url = "http://localhost:3030/ds/update" - graph_uri = "http://example.org/test_graph" - - # Run the process-and-load command - args = [ - "process-and-load", self.kb_dir, - "--endpoint-url", endpoint_url, - "--update-endpoint-url", update_endpoint_url, - "--graph", graph_uri, - "--log-format", "json" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0, f"STDERR: {result.stderr}") - - # Verify data was loaded by querying Fuseki - query_args = [ - "sparql", "query", - f'SELECT (COUNT(*) as ?count) FROM <{graph_uri}> WHERE {{ ?s ?p ?o }}', - "--endpoint-url", endpoint_url, - "--format", "json" - ] - query_result = self.run_cli_command(query_args) - - self.assertEqual(query_result.returncode, 0, f"Query failed: {query_result.stderr}") - - query_output = json.loads(query_result.stdout) - count = 
int(query_output[0]['count']) - self.assertGreater(count, 0, "No triples were loaded into the graph.") - - finally: - # Stop and remove the Fuseki container - run(["docker-compose", "down"], cwd=PROJECT_ROOT) - - # ========================================== - # NEW ENHANCED TEST METHODS - QA Agent Added - # ========================================== - - # Tests for 'process' command - def test_process_command_integration(self): - """Test process command integration without mocking.""" - args = ["--knowledge-base", self.kb_dir, "process", "--pattern", "**/*.md"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - # Check for processing-related messages in stdout (where logs go) - self.assertIn("Processing completed successfully", result.stdout) - self.assertIn("Processing files matching pattern", result.stdout) - - def test_process_command_success(self): - """Test successful process command execution (simplified).""" - # For now, just test that the command runs without error - # More detailed mocking can be added later if needed - args = ["--knowledge-base", self.kb_dir, "process", "--pattern", "*.md"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn("Processing completed successfully", result.stdout) - - def test_process_command_with_rdf_output(self): - """Test process command with RDF output directory.""" - rdf_dir = os.path.join(self.temp_dir, "rdf_out") - - args = ["--knowledge-base", self.kb_dir, "process", "--pattern", "*.md", "--rdf-output-dir", rdf_dir] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn("Processing completed successfully", result.stdout) - # Check if RDF directory was created - self.assertTrue(os.path.exists(rdf_dir)) - - def test_process_command_invalid_knowledge_base(self): - """Test process command with invalid knowledge base path.""" - invalid_dir = os.path.join(self.temp_dir, "nonexistent") - - args = ["--knowledge-base", invalid_dir, "process"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 1) - self.assertIn("Base path does not exist", result.stdout) - - def test_process_command_with_pattern(self): - """Test process command with custom pattern.""" - # Create a .txt file to test pattern matching - with open(os.path.join(self.kb_dir, "test.txt"), "w") as f: - f.write("# Test TXT file") - - args = ["--knowledge-base", self.kb_dir, "process", "--pattern", "**/*.txt"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn("Processing completed successfully", result.stdout) - - # Tests for 'query' command - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.query") - def test_query_command_success(self, mock_query): - """Test successful query command execution.""" - mock_query.return_value = ["Result 1", "Result 2", "Result 3"] - - args = ["--knowledge-base", self.kb_dir, "query", "test search", "--type", "text"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn("Result 1", result.stdout) - self.assertIn("Result 2", result.stdout) - self.assertIn("Result 3", result.stdout) - mock_query.assert_called_once_with("test search", "text") - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.query") - def test_query_command_no_results(self, mock_query): - """Test query command with no results.""" - mock_query.return_value = [] - - args = ["--knowledge-base", self.kb_dir, "query", "nonexistent", "--type", "tag"] - 
result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn("No results found", result.stdout) - mock_query.assert_called_once_with("nonexistent", "tag") - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.query") - def test_query_command_topic_type(self, mock_query): - """Test query command with topic type.""" - mock_query.return_value = ["Topic result"] - - args = ["--knowledge-base", self.kb_dir, "query", "python programming", "--type", "topic"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn("Topic result", result.stdout) - mock_query.assert_called_once_with("python programming", "topic") - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.query") - def test_query_command_exception(self, mock_query): - """Test query command handling exceptions.""" - mock_query.side_effect = Exception("Query error") - - args = ["--knowledge-base", self.kb_dir, "query", "test"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 1) - self.assertIn("Error during query", result.stderr) - - # Tests for 'sparql query' command - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_query") - def test_sparql_query_json_format(self, mock_sparql_query): - """Test SPARQL query with JSON output format.""" - mock_sparql_query.return_value = [{"count": "42"}, {"count": "24"}] - - args = [ - "sparql", "query", "SELECT (COUNT(*) as ?count) WHERE { ?s ?p ?o }", - "--endpoint-url", "http://example.com/sparql", - "--format", "json" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn('"count": "42"', result.stdout) - mock_sparql_query.assert_called_once() - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_query") - def test_sparql_query_table_format(self, mock_sparql_query): - """Test SPARQL query with table output format.""" - mock_sparql_query.return_value = [{"name": "John", "age": "30"}, {"name": "Jane", "age": "25"}] - - args = [ - "sparql", "query", "SELECT ?name ?age WHERE { ?person foaf:name ?name; foaf:age ?age }", - "--endpoint-url", "http://example.com/sparql", - "--format", "table", - "--timeout", "60" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn("name | age", result.stdout) - self.assertIn("John | 30", result.stdout) - self.assertIn("Jane | 25", result.stdout) - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_query") - def test_sparql_query_turtle_format(self, mock_sparql_query): - """Test SPARQL query with turtle output format.""" - turtle_result = "@prefix ex: .\nex:subject ex:predicate ex:object ." 
- mock_sparql_query.return_value = turtle_result - - args = [ - "sparql", "query", "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }", - "--endpoint-url", "http://example.com/sparql", - "--format", "turtle" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn("@prefix ex:", result.stdout) - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_query") - def test_sparql_query_with_credentials(self, mock_sparql_query): - """Test SPARQL query with authentication.""" - mock_sparql_query.return_value = [{"result": "authenticated"}] - - args = [ - "sparql", "query", "SELECT ?result WHERE { ?s ?p ?result }", - "--endpoint-url", "http://secure.example.com/sparql", - "--user", "testuser", - "--password", "testpass", - "--format", "json" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - mock_sparql_query.assert_called_once() - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_query") - def test_sparql_query_boolean_result(self, mock_sparql_query): - """Test SPARQL ASK query returning boolean.""" - mock_sparql_query.return_value = True - - args = [ - "sparql", "query", "ASK { ?s ?p ?o }", - "--endpoint-url", "http://example.com/sparql", - "--format", "table" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn("True", result.stdout) - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_query") - def test_sparql_query_exception(self, mock_sparql_query): - """Test SPARQL query handling exceptions.""" - mock_sparql_query.side_effect = Exception("SPARQL endpoint unreachable") - - args = [ - "sparql", "query", "SELECT ?s WHERE { ?s ?p ?o }", - "--endpoint-url", "http://unreachable.example.com/sparql" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 1) - self.assertIn("SPARQL query failed", result.stderr) - - # Tests for 'sparql load-file' command - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_load") - def test_sparql_load_file_success(self, mock_sparql_load): - """Test successful SPARQL load-file command.""" - # Create a dummy RDF file - rdf_file = os.path.join(self.temp_dir, "test.ttl") - with open(rdf_file, "w") as f: - f.write("@prefix ex: .\nex:subject ex:predicate ex:object .") - - mock_sparql_load.return_value = None - - args = [ - "sparql", "load-file", rdf_file, - "--endpoint-url", "http://example.com/sparql", - "--graph", "http://example.org/graph", - "--rdf-format", "turtle" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - self.assertIn(f"Successfully loaded RDF file '{rdf_file}'", result.stderr) - mock_sparql_load.assert_called_once() - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_load") - def test_sparql_load_file_with_auth(self, mock_sparql_load): - """Test SPARQL load-file with authentication.""" - rdf_file = os.path.join(self.temp_dir, "auth_test.ttl") - with open(rdf_file, "w") as f: - f.write("@prefix ex: .\nex:auth ex:test ex:data .") - - mock_sparql_load.return_value = None - - args = [ - "sparql", "load-file", rdf_file, - "--endpoint-url", "http://secure.example.com/sparql", - "--user", "admin", - "--password", "secret", - "--graph", "http://example.org/secure-graph" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - mock_sparql_load.assert_called_once() - - @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_load") - def 
test_sparql_load_file_different_formats(self, mock_sparql_load):
-        """Test SPARQL load-file with different RDF formats."""
-        formats_to_test = ["turtle", "n3", "nt", "xml", "json-ld"]
-
-        for rdf_format in formats_to_test:
-            with self.subTest(format=rdf_format):
-                rdf_file = os.path.join(self.temp_dir, f"test.{rdf_format}")
-                with open(rdf_file, "w") as f:
-                    f.write("# Test RDF content")
-
-                mock_sparql_load.return_value = None
-
-                args = [
-                    "sparql", "load-file", rdf_file,
-                    "--endpoint-url", "http://example.com/sparql",
-                    "--rdf-format", rdf_format
-                ]
-                result = self.run_cli_command(args)
-
-                self.assertEqual(result.returncode, 0)
-
-    @patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.sparql_load")
-    def test_sparql_load_file_exception(self, mock_sparql_load):
-        """Test SPARQL load-file handling exceptions."""
-        rdf_file = os.path.join(self.temp_dir, "error_test.ttl")
-        with open(rdf_file, "w") as f:
-            f.write("invalid rdf content")
-
-        mock_sparql_load.side_effect = Exception("Failed to load RDF")
-
-        args = [
-            "sparql", "load-file", rdf_file,
-            "--endpoint-url", "http://example.com/sparql"
-        ]
-        result = self.run_cli_command(args)
-
-        self.assertEqual(result.returncode, 1)
-        self.assertIn(f"Failed to load RDF file '{rdf_file}'", result.stderr)
-
-    # Edge cases and error handling tests
-    def test_missing_command(self):
-        """Test CLI with no command specified."""
-        args = ["--log-level", "DEBUG"]
-        result = self.run_cli_command(args)
-
-        self.assertEqual(result.returncode, 2)  # argparse error
-        self.assertIn("required", result.stderr.lower())
-
-    def test_unknown_command(self):
-        """Test CLI with unknown command."""
-        args = ["unknown-command"]
-        result = self.run_cli_command(args)
-
-        self.assertEqual(result.returncode, 2)  # argparse error
-        self.assertIn("invalid choice", result.stderr.lower())
-
-    def test_malformed_arguments_process(self):
-        """Test process command with malformed arguments (missing option values)."""
-        args = ["process", "--pattern", "--rdf-output-dir"]  # Missing values
-        result = self.run_cli_command(args)
-
-        self.assertEqual(result.returncode, 2)  # argparse error
-
-    def test_malformed_arguments_sparql_query(self):
-        """Test SPARQL query with missing query string."""
-        args = ["sparql", "query", "--endpoint-url", "http://example.com/sparql"]
-        result = self.run_cli_command(args)
-
-        self.assertEqual(result.returncode, 2)  # argparse error
-        self.assertIn("required", result.stderr.lower())
-
-    def test_malformed_arguments_sparql_load(self):
-        """Test SPARQL load-file with missing file path."""
-        args = ["sparql", "load-file", "--endpoint-url", "http://example.com/sparql"]
-        result = self.run_cli_command(args)
-
-        self.assertEqual(result.returncode, 2)  # argparse error
-
-    def test_invalid_sparql_subcommand(self):
-        """Test SPARQL with invalid subcommand."""
-        args = ["sparql", "invalid-subcommand"]
-        result = self.run_cli_command(args)
-
-        self.assertEqual(result.returncode, 2)  # argparse error
-
-    def test_empty_query_string(self):
-        """Test query command with empty query string."""
-        args = ["--knowledge-base", self.kb_dir, "query", ""]  # Empty query
-        result = self.run_cli_command(args)
-
-        # Should not crash, but may return no results
-        self.assertIn(result.returncode, [0, 1])  # Could be 0 or 1 depending on implementation
-
-    def test_nonexistent_rdf_file_load(self):
-        """Test SPARQL load-file with non-existent file."""
-        nonexistent_file = os.path.join(self.temp_dir, "does_not_exist.ttl")
-
-        args = [
-            "sparql", "load-file", nonexistent_file,
-            "--endpoint-url",
"http://example.com/sparql" - ] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 1) - # Should contain some error about file not found - - # Configuration and logging tests - @patch("src.knowledgebase_processor.config.load_config") - def test_custom_config_file(self, mock_load_config): - """Test CLI with custom configuration file.""" - config_file = os.path.join(self.temp_dir, "test_config.json") - with open(config_file, "w") as f: - json.dump({"knowledge_base_path": "/test/path"}, f) - - mock_config = MagicMock() - mock_config.knowledge_base_path = "/test/path" - mock_config.sparql_endpoint_url = "http://example.com/sparql" - mock_load_config.return_value = mock_config - - with patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.query") as mock_query: - mock_query.return_value = ["Config test result"] - - args = ["--config", config_file, "--knowledge-base", self.kb_dir, "query", "test"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - mock_load_config.assert_called_with(config_file) - - def test_logging_configurations(self): - """Test different logging configurations.""" - log_configs = [ - (["--log-level", "DEBUG"], "DEBUG"), - (["--log-level", "ERROR"], "ERROR"), - (["--log-format", "json"], "json"), - (["--log-format", "text"], "text") - ] - - for args_addition, expected in log_configs: - with self.subTest(config=expected): - with patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.query") as mock_query: - mock_query.return_value = ["Log test"] - - args = args_addition + ["--knowledge-base", self.kb_dir, "query", "test"] - result = self.run_cli_command(args) - - # Should not crash with different log configurations - self.assertIn(result.returncode, [0, 1]) # May succeed or fail, but shouldn't crash - - def test_log_file_output(self): - """Test logging to file.""" - log_file = os.path.join(self.temp_dir, "test.log") - - with patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.query") as mock_query: - mock_query.return_value = ["Log file test"] - - args = ["--log-file", log_file, "--log-level", "DEBUG", "--knowledge-base", self.kb_dir, "query", "test"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - # Log file should exist (though it might be empty due to mocking) - self.assertTrue(os.path.exists(log_file) or result.returncode == 0) - - def test_knowledge_base_path_override(self): - """Test knowledge base path override behavior.""" - custom_kb_path = os.path.join(self.temp_dir, "custom_kb") - os.makedirs(custom_kb_path) - - with open(os.path.join(custom_kb_path, "custom.md"), "w") as f: - f.write("# Custom KB Content") - - with patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.query") as mock_query: - mock_query.return_value = ["Custom KB result"] - - args = ["--knowledge-base", custom_kb_path, "query", "custom"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - - def test_metadata_store_path_override(self): - """Test metadata store path override behavior.""" - custom_metadata_path = os.path.join(self.temp_dir, "custom_metadata") - os.makedirs(custom_metadata_path, exist_ok=True) - - with patch("src.knowledgebase_processor.api.KnowledgeBaseAPI.query") as mock_query: - mock_query.return_value = ["Metadata test"] - - args = ["--metadata-store", custom_metadata_path, "--knowledge-base", self.kb_dir, "query", "metadata"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 0) - - 
@patch("src.knowledgebase_processor.api.KnowledgeBaseAPI") - def test_api_initialization_failure(self, mock_api_class): - """Test CLI behavior when API initialization fails.""" - mock_api_class.side_effect = Exception("API initialization failed") - - args = ["--knowledge-base", self.kb_dir, "query", "test"] - result = self.run_cli_command(args) - - self.assertEqual(result.returncode, 1) - self.assertIn("Failed to initialize KnowledgeBaseAPI", result.stderr) - - def test_process_and_load_update_endpoint_url(self): - """Test process-and-load with explicit update endpoint URL.""" - with patch("knowledgebase_processor.services.sparql_service.SparqlQueryInterface.update") as mock_update: - mock_update.return_value = True - - args = [ - "process-and-load", self.kb_dir, - "--endpoint-url", "http://example.com/query", - "--update-endpoint-url", "http://example.com/update" # This should be handled by the CLI arg parser - ] - result = self.run_cli_command(args) - - # Should not crash even if update endpoint is different - # The actual implementation may or may not use this arg, but CLI should parse it - self.assertIn(result.returncode, [0, 1, 2]) - -if __name__ == "__main__": - unittest.main() \ No newline at end of file