LGDiMaggio
diff --git a/‎CHANGELOG.md‎
Lines changed: 30 additions & 1 deletion b/‎CHANGELOG.md‎
Lines changed: 30 additions & 1 deletion
diff --git a/‎CITATION.cff‎
Lines changed: 4 additions & 4 deletions b/‎CITATION.cff‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎INSTALL.md‎
Lines changed: 28 additions & 0 deletions b/‎INSTALL.md‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 30 additions & 16 deletions b/‎README.md‎
Lines changed: 30 additions & 16 deletions
diff --git a/‎SECURITY.md‎
Lines changed: 3 additions & 3 deletions b/‎SECURITY.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎docs/index.html‎
Lines changed: 5 additions & 5 deletions b/‎docs/index.html‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 15 additions & 2 deletions b/‎pyproject.toml‎
Lines changed: 15 additions & 2 deletions
diff --git a/‎server.json‎
Lines changed: 2 additions & 2 deletions b/‎server.json‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎src/__init__.py‎
Lines changed: 1 addition & 1 deletion
@@ -5,6 +5,35 @@ All notable changes to the Predictive Maintenance MCP Server project will be doc
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.7.0] - 2025-07-14
+
+### Added
+- **FAISS vector search** — `search_documentation` now uses FAISS + sentence-transformers for semantic retrieval when installed (`pip install predictive-maintenance-mcp[vector-search]`). Falls back to TF-IDF keyword search when not installed. Dual-backend `DocumentIndex` in `src/rag.py`.
+- **OCR for scanned PDFs** — `document_reader.extract_text_from_pdf()` automatically falls back to Tesseract OCR for pages with empty/minimal text. Requires optional `pytesseract` + `pdf2image` + Poppler.
+- **DOCX diagnostic reports** — New `generate_diagnostic_report_docx` MCP tool and `save_diagnostic_report_docx()` in report generator. Creates structured Word documents with statistics tables, FFT/envelope peaks, bearing frequencies, ISO evaluation, and diagnostic summary. Requires optional `python-docx`.
+- **New optional dependency groups** in `pyproject.toml`: `vector-search`, `ocr`, `docx`. The `full` extra now includes all of them.
+- **Overlapping chunking** — New `chunk_text()` helper in RAG module for character-level overlapping chunks alongside paragraph-aware chunking.
+
+### Changed
+- **`search_documentation`** now reports active backend (`faiss` or `tfidf`) in response
+- **27 MCP tools** (was 26) — added `generate_diagnostic_report_docx`
+- Version bumped to 0.7.0
+
+## [0.6.0] - 2025-07-08
+
+### Added
+- **RAG-based document search** — New `search_documentation` MCP tool using TF-IDF indexing over machine manuals and bearing catalogs (`src/rag.py`)
+- **`SpectralPeak` model** — Structured representation for individual FFT peaks (frequency, magnitude, dB, annotation)
+
+### Changed
+- **Compact FFT output** — `analyze_fft` now returns top-20 peaks + RMS/stats instead of full frequency/magnitude arrays (~200 KB → ~2 KB per call), eliminating LLM context overflow
+- **Compact signal resource** — `read_signal_file` returns metadata + statistics only (no raw samples), preventing large JSON payloads
+- **Server instructions** updated with output-efficiency policy and RAG documentation guidance
+- **`pypdf`** promoted from optional to required dependency
+
+### Fixed
+- LLM "output too long" errors caused by full-array serialisation in `FFTResult`
+
 ## [0.5.0] - 2026-02-16
 
 ### Added
@@ -138,7 +167,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Roadmap
 
-### Planned for v0.6.0
+### Planned for v0.8.0
 - **📦 Docker image** for zero-install setup
 - **📏 Customizable ISO report thresholds**
 - Multi-signal comparison tools
 
@@ -1,8 +1,8 @@
 cff-version: 1.2.0
 message: "If you use this software, please cite it as below."
 title: "Predictive Maintenance MCP Server: An open-source framework for integrating Large Language Models with predictive maintenance and fault diagnosis workflows"
-version: 0.5.0
-date-released: 2026-02-16
+version: 0.7.0
+date-released: 2025-07-14
 authors:
   - family-names: Di Maggio
     given-names: Luigi Gianpio
@@ -51,8 +51,8 @@ preferred-citation:
   authors:
     - family-names: Di Maggio
       given-names: Luigi Gianpio
-  year: 2025
-  version: 0.5.0
+  year: 2026
+  version: 0.7.0
   repository-code: "https://github.com/LGDiMaggio/predictive-maintenance-mcp"
   license: MIT
   doi: "10.5281/zenodo.17611542"
@@ -234,6 +234,34 @@ python validate_server.py
 - `scikit-learn>=1.7.2` - Machine learning
 - `plotly>=5.24.0` - Interactive visualizations
 - `pydantic>=2.12.0` - Data validation
+- `pypdf>=4.0` - PDF text extraction
+
+### Optional Extras
+
+Install any combination using pip extras:
+
+```bash
+# Semantic vector search (FAISS + sentence-transformers)
+pip install "predictive-maintenance-mcp[vector-search]"
+
+# OCR for scanned PDF manuals (Tesseract)
+pip install "predictive-maintenance-mcp[ocr]"
+
+# DOCX diagnostic report generation
+pip install "predictive-maintenance-mcp[docx]"
+
+# Everything (all optional features)
+pip install "predictive-maintenance-mcp[full]"
+```
+
+| Extra | Packages | Purpose |
+|-------|----------|---------|
+| `vector-search` | `faiss-cpu`, `sentence-transformers` | Semantic document search (FAISS). Falls back to TF-IDF when not installed. |
+| `ocr` | `pytesseract`, `Pillow`, `pdf2image` | OCR for scanned/image-based PDF manuals. Requires the Tesseract OCR engine and Poppler ([Windows builds](https://github.com/ossamamehmood/Poppler-windows/releases)) on the system PATH. |
+| `docx` | `python-docx` | Generate structured Word (.docx) diagnostic reports. |
+| `full` | All of the above | Install all optional features at once. |
+
+> **Note**: `vector-search` pulls in PyTorch (~2 GB). For lightweight installs, skip it — TF-IDF keyword search works well for technical documentation.
 
 ### Development Dependencies
 - `pytest>=8.0.0` - Testing
 
@@ -59,6 +59,10 @@ This project is built around the **Model Context Protocol (MCP)** — an open st
 │   │ ML Anomaly     │  │ Manual/PDF    │  │ Bearing       │  │
 │   │ Detection      │  │ Reader        │  │ Catalog       │  │
 │   └────────────────┘  └───────────────┘  └───────────────┘  │
+│   ┌────────────────┐  ┌───────────────┐                     │
+│   │ RAG Document   │  │ DOCX Report   │                     │
+│   │ Search (FAISS) │  │ Generation    │                     │
+│   └────────────────┘  └───────────────┘                     │
 └──────────────┬───────────────────────────────────────────────┘
                │
                ▼
@@ -173,7 +177,11 @@ This project serves two audiences. Pick the door that fits you:
 
   </details>
 - **📁 Multi-Format Support** — Load signals from CSV, MAT (MATLAB), WAV, NPY, and Parquet files
-- **🚀 Zero Configuration** — Works out of the box with sample data, auto-detects sampling rates from metadata
+- **🔎 RAG Document Search** — Vector search (FAISS + sentence-transformers) with TF-IDF fallback over machine manuals and bearing catalogs. Auto-cached.
+- **📝 DOCX Reports** — Generate structured Word diagnostic reports alongside interactive HTML (requires `python-docx`)
+- **🔍 OCR for Scanned PDFs** — Automatic OCR fallback (Tesseract) for image-based equipment manuals
+- **⚡ LLM-Optimised Output** — Tool responses return compact summaries (top peaks, statistics) instead of raw arrays, keeping LLM context windows lean
+- **🚀 Zero Configuration** — Works out of the box with sample data, auto-detects sampling rates from metadata
 
 ---
 
@@ -393,12 +401,13 @@ Tools perform **computations and generate outputs**:
 - **`generate_fft_report`** — Interactive FFT spectrum HTML report with peak table
 - **`generate_envelope_report`** — Envelope analysis report with bearing fault markers
 - **`generate_iso_report`** — ISO 20816-3 evaluation with zone visualization
+- **`generate_diagnostic_report_docx`** — Structured Word (.docx) diagnostic report (requires `python-docx`)
 - **`generate_pca_visualization_report`** — 2D/3D PCA projection report for anomaly exploration
 - **`generate_feature_comparison_report`** — Feature-level comparison report across signals/classes
 - **`list_html_reports`** — List all generated reports with metadata
 - **`get_report_info`** — Get report details without loading full HTML
 
-> 💡 **All reports are interactive Plotly visualizations saved to `reports/` directory**
+> 💡 **HTML reports are interactive Plotly visualizations saved to `reports/`. DOCX reports are structured Word documents for stakeholders.**
 
 </details>
 
@@ -410,6 +419,7 @@ Tools perform **computations and generate outputs**:
 - **`calculate_bearing_characteristic_frequencies`** — Calculate BPFO/BPFI/BSF/FTF from geometry
 - **`read_manual_excerpt`** — Read manual text excerpt (configurable page limit)
 - **`search_bearing_catalog`** — Search bearing geometry in local catalog (20+ common bearings)
+- **`search_documentation`** — Semantic search across machine manuals and bearing catalogs (FAISS vector search or TF-IDF fallback)
 
 **MCP Resources:**
 - `manual://list` — Browse available manuals
@@ -442,7 +452,9 @@ The `skills/` directory contains pre-built guided workflows that orchestrate mul
 | [**quick-screening**](skills/quick-screening/SKILL.md) | 5 | Fast health screening with clear Healthy/Suspicious/Critical classification |
 | [**report-generation**](skills/report-generation/SKILL.md) | 6 | Professional HTML report generation with composite multi-report option |
 
-> 💡 Skills are standalone markdown files that any MCP-compatible LLM client can use as system instructions to coordinate multi-step diagnostic workflows.
+> ⚠️ **Skills are Claude / GitHub Copilot-specific.** They use [SKILL.md with YAML frontmatter](https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/skills), a convention recognised by Claude.ai, Claude Code, and Copilot agents. Other LLM clients can still read them as plain markdown, but automatic skill invocation requires a Claude or Copilot-compatible host.
+>
+> **MCP tools, resources, and HTML reports are LLM-agnostic** — they work with any MCP-compatible client (ChatGPT, Ollama, LM Studio, etc.).
 
 ---
 
@@ -480,10 +492,11 @@ The system follows a **hybrid MCP architecture** combining Resources (direct dat
 │  │  TOOLS (Analysis & Processing)                       │  │
 │  │  • FFT, Envelope, ISO 20816-3                        │  │
 │  │  • ML Anomaly Detection                              │  │
-│  │  • Report Generation (HTML)                          │  │
+│  │  • Report Generation (HTML + DOCX)                   │  │
 │  │  • Manual Spec Extraction                            │  │
 │  │  • Bearing Frequency Calculation                     │  │
 │  │  • Bearing Catalog Search                            │  │
+│  │  • RAG Document Search (FAISS / TF-IDF)              │  │
 │  └──────────────────────────────────────────────────────┘  │
 └────────────────────┬────────────────────────────────────────┘
                      │
@@ -523,8 +536,8 @@ The system follows a **hybrid MCP architecture** combining Resources (direct dat
 
 **Key Features:**
 - ✅ **4 MCP Resources** — Direct read access to signals and manuals
-- ✅ **25 MCP Tools** — Complete diagnostic workflow (analysis, plotting, ML, reporting, manuals)
-- ✅ **4 MCP Prompts** — Guided diagnostic workflows
+- ✅ **27 MCP Tools** — Complete diagnostic workflow (analysis, plotting, ML, reporting incl. DOCX, manuals, RAG search)
+- ✅ **3 Copilot Skills** — Guided diagnostic workflows (Claude / Copilot-specific)
 - ✅ **Hybrid Architecture** — Resources for reading, Tools for processing
 - ✅ **Local-First** — All data stays on your machine (privacy-preserving)
 
@@ -601,6 +614,7 @@ All analysis tools generate **interactive HTML reports** with Plotly visualizati
 | 🔊 **FFT Spectrum** | `generate_fft_report()` | Frequency analysis, peak detection, harmonic markers |
 | 🎯 **Envelope Analysis** | `generate_envelope_report()` | Bearing fault frequencies, modulation detection |
 | 📏 **ISO 20816-3** | `generate_iso_report()` | Vibration severity zones, compliance assessment |
+| 📝 **Diagnostic DOCX** | `generate_diagnostic_report_docx()` | Word document with stats, peaks, ISO, diagnosis |
 
 All reports include:
 - Interactive Plotly charts (pan/zoom/hover)
@@ -628,7 +642,7 @@ Generate FFT report for baseline_1.csv
 | [Ollama Guide](docs/OLLAMA_GUIDE.md) | Engineers | Use with local LLMs (fully air-gapped) |
 | [CHANGELOG.md](CHANGELOG.md) | Everyone | Version history |
 | [data/README.md](data/README.md) | Everyone | Dataset documentation |
-| [skills/](skills/) | 🤖 LLM Clients | Copilot Skills — guided diagnostic workflows (bearing, screening, reporting) |
+| [skills/](skills/) | 🤖 Claude / Copilot | Copilot Skills — guided diagnostic workflows (bearing, screening, reporting) |
 
 ---
 
@@ -701,13 +715,12 @@ npx @modelcontextprotocol/inspector python -m predictive_maintenance_mcp
 
 ## 🚀 Roadmap
 
-### ✨ Recent: v0.5.0 — Code Quality & Multi-Format Support
+### ✨ Recent: v0.7.0 — Vector Search, OCR & DOCX Reports
 
-- 📂 **Multi-format signal loading** — CSV, MAT, WAV, NPY, Parquet via unified `load_signal_data()`
-- 🔧 **ML code deduplication** — 4 helper functions reduce ~163 statements
-- 📦 **pypdf migration** — Replaced deprecated PyPDF2 with pypdf
-- ▶️ **`python -m` support** — Run as `python -m predictive_maintenance_mcp`
-- 🧹 **Consolidated metadata reads** — ISO evaluation no longer double-reads metadata files
+- 🔎 **FAISS vector search** — Semantic document retrieval with sentence-transformers (TF-IDF fallback when not installed)
+- 🔍 **OCR for scanned PDFs** — Automatic Tesseract OCR fallback for image-based equipment manuals
+- 📝 **DOCX diagnostic reports** — Structured Word documents with statistics, peaks, ISO evaluation, and diagnostic summary
+- ⚡ **Compact FFT output** — Top-20 peaks + RMS/stats instead of full arrays (~200 KB → ~2 KB)
 
 ### 🔮 Planned Enhancements
 
@@ -718,8 +731,9 @@ Each item below links to an open issue where you can **discuss, contribute, or c
 | ✅ Done | **Parquet/MAT/WAV/NPY data format support** | v0.5.0 | — |
 | 🔴 High | **Customizable ISO report thresholds** | Open | [Good First Issue](https://github.com/LGDiMaggio/predictive-maintenance-mcp/issues) |
 | 🔴 High | **Docker image for zero-install setup** | Open | [Help Wanted](https://github.com/LGDiMaggio/predictive-maintenance-mcp/issues) |
-| 🟡 Medium | **Vector search for large documents** (ChromaDB/FAISS) | Planned | [Discuss](https://github.com/LGDiMaggio/predictive-maintenance-mcp/discussions) |
-| 🟡 Medium | **OCR for scanned PDF manuals** (Tesseract) | Planned | [Discuss](https://github.com/LGDiMaggio/predictive-maintenance-mcp/discussions) |
+| ✅ Done | **Vector search for large documents** (FAISS + sentence-transformers) | v0.7.0 | — |
+| ✅ Done | **OCR for scanned PDF manuals** (Tesseract) | v0.7.0 | — |
+| ✅ Done | **DOCX diagnostic reports** (python-docx) | v0.7.0 | — |
 | 🟡 Medium | **Multi-signal trending** — Compare historical data | Planned | [Discuss](https://github.com/LGDiMaggio/predictive-maintenance-mcp/discussions) |
 | 🟢 Future | **Real-time streaming** — Live vibration monitoring | Concept | — |
 | 🟢 Future | **Dashboard** — Multi-asset fleet monitoring | Concept | — |
@@ -766,7 +780,7 @@ If you use this server in your research or projects, please cite:
   title = {Predictive Maintenance MCP Server: An open-source framework for integrating Large Language Models with predictive maintenance and fault diagnosis workflows},
   author = {Di Maggio, Luigi Gianpio},
   year = {2025},
-  version = {0.5.0},
+  version = {0.7.0},
   url = {https://github.com/LGDiMaggio/predictive-maintenance-mcp},
   doi = {10.5281/zenodo.17611542}
 }
 
@@ -4,10 +4,10 @@
 
 | Version | Supported          |
 | ------- | ------------------ |
+| 0.6.x   | :white_check_mark: |
 | 0.5.x   | :white_check_mark: |
-| 0.4.x   | :white_check_mark: |
-| 0.3.x   | :x:                |
-| < 0.3   | :x:                |
+| 0.4.x   | :x:                |
+| < 0.4   | :x:                |
 
 ## Reporting a Vulnerability
 
 
@@ -38,7 +38,7 @@
     "description": "Open-source MCP server for AI-powered predictive maintenance, bearing diagnostics, vibration analysis, and ISO 20816 compliance.",
     "applicationCategory": "EngineeringApplication",
     "operatingSystem": "Windows, macOS, Linux",
-    "softwareVersion": "0.5.0",
+    "softwareVersion": "0.7.0",
     "license": "https://opensource.org/licenses/MIT",
     "url": "https://lgdimaggio.github.io/predictive-maintenance-mcp/",
     "codeRepository": "https://github.com/LGDiMaggio/predictive-maintenance-mcp",
@@ -667,7 +667,7 @@
   <div class="container">
     <div class="hero-badge">
       <span class="pulse"></span>
-      v0.5.0 — Multi-format support, ML dedup &amp; more
+      v0.7.0 — Vector search, OCR &amp; DOCX reports
     </div>
 
     <h1>
@@ -709,7 +709,7 @@ <h1>
   <div class="container">
     <div class="stats-grid">
       <div class="stat-card">
-        <div class="stat-num" data-target="25">0</div>
+        <div class="stat-num" data-target="27">0</div>
         <div class="stat-label">MCP Tools</div>
       </div>
       <div class="stat-card">
@@ -765,7 +765,7 @@ <h3>ML Anomaly Detection</h3>
       <div class="feature-card reveal">
         <div class="feature-icon green">📄</div>
         <h3>Interactive HTML Reports</h3>
-        <p>Publication-quality reports with Plotly charts, auto-generated summaries. Shareable files for ops teams and management.</p>
+        <p>Publication-quality reports with Plotly charts and structured DOCX diagnostics. Shareable files for ops teams and management.</p>
       </div>
       <div class="feature-card reveal">
         <div class="feature-icon purple">📁</div>
@@ -903,7 +903,7 @@ <h2>Reports &amp; Visualizations</h2>
       </figure>
       <figure class="screenshot-card reveal">
         <img src="https://raw.githubusercontent.com/LGDiMaggio/predictive-maintenance-mcp/main/assets/MCPserver.png"
-             alt="MCP server architecture showing 25 tools, 4 resources, 4 prompts"
+             alt="MCP server architecture showing 27 tools, 4 resources, 4 prompts"
              loading="lazy" width="600" height="400">
         <figcaption>MCP server architecture overview</figcaption>
       </figure>
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "predictive-maintenance-mcp"
-version = "0.5.0"
+version = "0.7.0"
 description = "Proof of Concept: AI-Powered Predictive Maintenance & Fault Diagnosis MCP Server - Industrial machinery condition monitoring, vibration analysis, bearing diagnostics, and ML-based anomaly detection through Model Context Protocol"
 readme = "README.md"
 authors = [
@@ -51,6 +51,7 @@ dependencies = [
     "pandas>=2.3.3",
     "plotly>=5.24.0",
     "pydantic>=2.12.0",
+    "pypdf>=4.0",
     "scikit-learn>=1.7.2",
     "scipy>=1.16.2",
 ]
@@ -81,8 +82,20 @@ ml = [
 viz = [
     "plotly>=5.20",
 ]
+vector-search = [
+    "faiss-cpu>=1.7",
+    "sentence-transformers>=2.0",
+]
+ocr = [
+    "pytesseract>=0.3",
+    "Pillow>=10.0",
+    "pdf2image>=1.16",
+]
+docx = [
+    "python-docx>=1.0",
+]
 full = [
-    "predictive-maintenance-mcp[docs,ml,viz]",
+    "predictive-maintenance-mcp[docs,ml,viz,vector-search,ocr,docx]",
 ]
 dev = [
     "pytest>=8.0.0",
 
@@ -3,12 +3,12 @@
   "name": "io.github.LGDiMaggio/predictive-maintenance-mcp",
   "title": "Predictive Maintenance",
   "description": "Industrial vibration analysis, bearing fault diagnosis, ISO 20816, and ML anomaly detection",
-  "version": "0.5.0",
+  "version": "0.7.0",
   "packages": [
     {
       "registryType": "pypi",
       "identifier": "predictive-maintenance-mcp",
-      "version": "0.5.0",
+      "version": "0.7.0",
       "transport": {
         "type": "stdio"
       }
 
@@ -8,7 +8,7 @@
 Package name: predictive_maintenance_mcp (mapped from src/ directory).
 """
 
-__version__ = "0.5.0"
+__version__ = "0.7.0"
 __author__ = "Luigi Gianpio Di Maggio"
 __license__ = "MIT"