diff --git a/.github/workflows/build-doc-mcp.yml b/.github/workflows/build-doc-mcp.yml new file mode 100644 index 0000000..a5c14a7 --- /dev/null +++ b/.github/workflows/build-doc-mcp.yml @@ -0,0 +1,75 @@ +name: ๐Ÿ› ๏ธ Build Docker image for Oracle Database Documentation MCP Server +on: + push: + paths: + - "src/oracle-db-doc-mcp-server/**" + pull_request: + paths: + - "src/oracle-db-doc-mcp-server/**" + +jobs: + build-image: + strategy: + matrix: + runner: ["ubuntu-24.04", "ubuntu-24.04-arm"] + + permissions: + packages: write + + name: ๐Ÿ› ๏ธ Build image + runs-on: ${{ matrix.runner }} + + steps: + - name: ๐Ÿ“‚ Checkout repo + uses: actions/checkout@v4 + + - name: ๐Ÿ”„ Generate environment variables + id: os_arch + run: | + if [ "$(uname -m)" == "aarch64" ]; then + echo "OS_ARCH=arm64" >> "$GITHUB_OUTPUT" + else + echo "OS_ARCH=amd64" >> "$GITHUB_OUTPUT" + fi; + + - name: Build image + run: | + cd src/oracle-db-doc-mcp-server/ + buildah bud -f Dockerfile -t oracle-db-doc:latest-${{ steps.os_arch.outputs.OS_ARCH }} . 
+ + - name: ๐Ÿ”“ Login to GHCR registry + uses: redhat-actions/podman-login@v1 + with: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + registry: ghcr.io + + - name: ๐Ÿซธ Push image to Container Registry + uses: redhat-actions/push-to-registry@v2 + with: + registry: ghcr.io/gvenzl/mcp + image: oracle-db-doc + tags: latest-${{ steps.os_arch.outputs.OS_ARCH }} + + upload-multi-arch: + name: ๐Ÿซธ Push multi-arch manifest + runs-on: "ubuntu-24.04" + needs: build-image + + permissions: + packages: write + + steps: + - name: ๐Ÿ”“ Login to GHCR registry + uses: redhat-actions/podman-login@v1 + with: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + registry: ghcr.io + + - name: ๐Ÿซธ Push multi-arch manifest + run: | + podman manifest create ghcr.io/gvenzl/mcp/oracle-db-doc:latest + podman manifest add ghcr.io/gvenzl/mcp/oracle-db-doc:latest ghcr.io/gvenzl/mcp/oracle-db-doc:latest-amd64 + podman manifest add ghcr.io/gvenzl/mcp/oracle-db-doc:latest ghcr.io/gvenzl/mcp/oracle-db-doc:latest-arm64 + podman push ghcr.io/gvenzl/mcp/oracle-db-doc:latest diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3ed1f15..4e8e0fd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.13' + python-version: "3.13" - name: Install requirements run: pip install -r requirements-dev.txt @@ -53,7 +53,6 @@ jobs: working-directory: src/${{ matrix.directory }} run: uv pip install . 
- get-directories: runs-on: ubuntu-latest outputs: @@ -65,5 +64,5 @@ jobs: - name: Get directories id: get-directories run: | - directories=$(ls src | grep -v dbtools-mcp-server | grep -v mysql-mcp-server | grep -v oci-pricing-mcp-server | jq -R -s -c 'split("\n")[:-1]') + directories=$(ls src | grep -v dbtools-mcp-server | grep -v mysql-mcp-server | grep -v oci-pricing-mcp-server | grep -v oracle-db-doc-mcp-server | jq -R -s -c 'split("\n")[:-1]') echo "directories=$directories" >> $GITHUB_OUTPUT diff --git a/.gitignore b/.gitignore index e9e0828..343680d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,15 @@ env/ env.bak/ venv.bak/ +# Python uv +uv.lock +.python-version + +# VScode +.vscode + +# Mac files +.DS_Store # test environments .env diff --git a/src/oracle-db-doc-mcp-server/.gitignore b/src/oracle-db-doc-mcp-server/.gitignore new file mode 100644 index 0000000..992244a --- /dev/null +++ b/src/oracle-db-doc-mcp-server/.gitignore @@ -0,0 +1,2 @@ +*.log +index* diff --git a/src/oracle-db-doc-mcp-server/Dockerfile b/src/oracle-db-doc-mcp-server/Dockerfile new file mode 100644 index 0000000..4fa72ed --- /dev/null +++ b/src/oracle-db-doc-mcp-server/Dockerfile @@ -0,0 +1,39 @@ +# +# Since: August 2025 +# Author: Gerald Venzl +# Name: Dockerfile +# Description: Dockerfile to build Docker image +# +# Copyright 2025 Oracle Corporation and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +FROM alpine + +COPY oracle-db-doc-mcp-server.py fastmcp.json requirements.txt ./ + +RUN apk --update --no-cache add python3 py3-pip curl && \ + pip install -r requirements.txt --break-system-packages && \ + rm requirements.txt && \ + mkdir /input && \ + curl -L -o /input/dbdoc.zip https://docs.oracle.com/en/database/oracle/oracle-database/26/zip/oracle-database_26.zip && \ + python3 oracle-db-doc-mcp-server.py -log-level DEBUG idx -path /input/dbdoc.zip && \ + rm -r /input && \ + apk del curl && \ + rm -rf /var/cache/apk/* /tmp/* + +LABEL org.opencontainers.image.source=https://github.com/oracle/mcp +LABEL org.opencontainers.image.description="Oracle Database Documentation MCP Server" +LABEL org.opencontainers.image.licenses=Apache-2.0 + +ENTRYPOINT [ "python3", "oracle-db-doc-mcp-server.py", "mcp" ] diff --git a/src/oracle-db-doc-mcp-server/README.md b/src/oracle-db-doc-mcp-server/README.md new file mode 100644 index 0000000..f247fa3 --- /dev/null +++ b/src/oracle-db-doc-mcp-server/README.md @@ -0,0 +1,155 @@ +# Oracle Database Documentation MCP Server + +A Python-based MCP (Model Context Protocol) server that provides tools for searching the official Oracle Database documentation. + +The MCP server leverages an inverted index to serve snippets of the Oracle Database documentation. Because the Oracle Database documentation is large and gets updated from time to time, it is unfeasible to ship a ready to go documentation index with this repository. Doing so will bloat the repository and runs risk of users searching on an outdated documentation. + +Instead, users can create their own index and maintain it as often as required. See [Index creation/maintenance](#index-creationmaintenance) for more on that topic. 
+ +## Features + +- **Search** + - Search the documentation by keywords and phrases + +## Prerequisites + +- Python 3.x +- Downloaded [Oracle Database Documentation zip file](https://docs.oracle.com/en/database/oracle/oracle-database/26/zip/oracle-database_26.zip) to build the initial index + +## Installation + +```console +git clone https://github.com/oracle/mcp.git + +cd mcp/src/oracle-db-doc-mcp-server + +python3 -m venv .venv + +source .venv/bin/activate + +python3 -m pip install -r requirements.txt +``` + +## Usage + +```console +usage: oracle-db-doc-mcp-server.py [-h] [-log-level LOG_LEVEL] {idx,mcp} ... + +Oracle Database Documentation MCP Server. + +options: + -h, --help show this help message and exit + -log-level LOG_LEVEL Set the log level (DEBUG, INFO, WARNING, ERROR (default), CRITICAL). + +subcommands: + {idx,mcp} + idx create/maintain the index + mcp run the MCP server +``` + +The MCP server has two subcommands: + +1. `idx`: Creates or maintains the documentation index. +2. `mcp`: Runs the MCP server. + +Building the index will take some time and some MCP clients will time out while waiting for the index to be built. Hence the two subcommands cannot be intermixed. Users will first have to create the documentation index via the `idx` subcommand and once completed, run the server with the `mcp` subcommand. 
+ +### Index creation/maintenance + +```console +usage: oracle-db-doc-mcp-server.py idx [-h] -path PATH [-preprocess PREPROCESS] + +options: + -h, --help show this help message and exit + -path PATH path to the documentation input zip file or extracted directory + -preprocess PREPROCESS + preprocessing level of documentation (NONE, BASIC (default), ADVANCED) +``` + +To create or maintain the index, use the `idx` subcommand and point the `-path` parameter to either the Oracle Database Documentation zip file (the file will be automatically unzipped into a temporary location under `$HOME/.oracle/oracle-db-doc-mcp-server`) or an **already extracted** location of the Oracle Database Documentation. + +The server will create a new folder under `$HOME/.oracle/oracle-db-doc-mcp-server` and store the index and the server log file within. Subsequent runs of `mcp` will open that index. The index can be updated by running the `idx` subcommand again. + +The index creation will take several minutes to complete depending on your environment and the level of preprocessing specified via the `-preprocess` parameter. + +A checksum of the indexed documentation is kept so that subsequent executions of the program only rebuild the index when the documentation content or the index version has changed. + +For example, to create an index on a downloaded Oracle Database documentation zip file under `~/Downloads/oracle-database_26.zip`, run: + +```console +python3 oracle-db-doc-mcp-server.py idx -path ~/Downloads/oracle-database_26.zip +``` + +### Running the MCP Server + +```console +usage: oracle-db-doc-mcp-server.py mcp [-h] [-mode {stdio,http}] [-host HOST] [-port PORT] + +options: + -h, --help show this help message and exit + -mode {stdio,http} the transport mode for the MCP server (stdio (default) or http) + -host HOST the IP address (default 0.0.0.0) that the MCP server is reachable at + -port PORT the port (default 8000) that the MCP server is reachable at +``` + +To run the MCP server, use the `mcp` subcommand. 
+ +**Note:** The index will have to exist. If it doesn't, the MCP server will exit with an error. + +By default, the MCP server runs on `stdio`. Hence, the simplest way to run it is: + +```console +python3 oracle-db-doc-mcp-server.py mcp +``` + +### VSCode integration + +#### Running the MCP server via Docker/Podman + +To run the MCP server from inside a Docker container: + +1. Add a new `.vscode/mcp.json` file to your project folder. +2. Add the following content to your `.vscode/mcp.json` file. + +``` +{ + "servers": { + "oracle-db-doc": { + "type": "stdio", + "command": "docker", + "args": [ "run", "--rm", "-i", "ghcr.io/oracle/mcp/oracle-db-doc" ] + } + } +} +``` + +#### Running the MCP server directly + +To run the MCP server directly from your machine: + +1. Follow the [Installation](#installation) instructions first. +2. Create an index as explained in [Index creation/maintenance](#index-creationmaintenance). +3. Add a new `.vscode/mcp.json` file to your project folder. +4. Add the following content to your `.vscode/mcp.json` file. Replace the `<>` placeholders with the paths to the MCP server installation. + +``` +{ + "servers": { + "oracle-db-doc": { + "type": "stdio", + "command": "/.venv/bin/python3", + "args": [ "oracle-db-doc-mcp-server.py", "mcp" ] + } + } +} +``` + +## Tools + +### search_oracle_database_documentation + +Searches the documentation for keywords and key phrases. 
+ +```python +search_oracle_database_documentation(search_query: str, max_results: int) -> list[str]: +``` diff --git a/src/oracle-db-doc-mcp-server/fastmcp.json b/src/oracle-db-doc-mcp-server/fastmcp.json new file mode 100644 index 0000000..656d093 --- /dev/null +++ b/src/oracle-db-doc-mcp-server/fastmcp.json @@ -0,0 +1,11 @@ +{ + "entrypoint": "oracle-db-doc-mcp-server.py", + "environment": { + "dependencies": [ + "beautifulsoup4 >= 4.9.0", + "markdownify >= 1.2.0", + "fastmcp >= 2.11.3", + "pocketsearch >= 0.40.0" + ] + } +} \ No newline at end of file diff --git a/src/oracle-db-doc-mcp-server/oracle-db-doc-mcp-server.py b/src/oracle-db-doc-mcp-server/oracle-db-doc-mcp-server.py new file mode 100644 index 0000000..09360fb --- /dev/null +++ b/src/oracle-db-doc-mcp-server/oracle-db-doc-mcp-server.py @@ -0,0 +1,592 @@ +# +# Since: August 2025 +# Author: Gerald Venzl +# Name: main.py +# Description: The Oracle Database Documentation MCP Server +# +# Copyright 2025 Oracle Corporation and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import hashlib +import logging +import re +import shutil +import zipfile +from pathlib import Path, PurePath + +import markdownify as md +from bs4 import BeautifulSoup +from fastmcp import FastMCP +from pocketsearch import PocketSearch, PocketWriter + +# Working home directory +HOME_DIR = Path.home().joinpath(PurePath(".oracle/oracle-db-doc-mcp-server")) + +# Index +INDEX = None +INDEX_FILE = HOME_DIR.joinpath(PurePath("index.db")) +INDEX_VERSION = "1.0.0" +INDEX_VERSION_FILE = HOME_DIR.joinpath(PurePath("index.version")) +CONTENT_CHECKSUM_FILE = HOME_DIR.joinpath(PurePath("content.checksum")) + +# Resources folder +RESOURCES_DIR = HOME_DIR.joinpath(PurePath("resources")) + +# Temp directory for zip file extraction +ZIP_TEMP_OUTPUT = HOME_DIR.joinpath("zip_temp") + +PREPROCESS = "BASIC" + +logger = logging.getLogger(__name__) + + +mcp = FastMCP( + "oracle-doc", + instructions=""" + # Oracle Database Documentation MCP Server. + + This server is used to search the Oracle Database documentation for information. + It can be used to find information about SQL syntax, PL/SQL, database concepts, best practices, + examples and many more. + It is also used to search the official Oracle Database documentation for additional information + on a particular feature, its use cases, restrictions or interoperability with other features. + The tool should be used to augment any existing knowledge or to find information that is + not available in the current context. + The server is designed to search the Oracle Database documentation for search phrases and + will return a list of results. + + You can use the following tools to search the documentation: + - search: Search the documentation for a query string or search phrase. + + The search tool takes a search query as input and returns a list of results. + The results are returned as a list of strings containing relevant information. 
+ + ## Best Practices + + - Use the search tool to search for phrases or query strings. + - Use the search tool to search for specific topics or features. + - Always use the search tool to search for additional and official information + for Oracle Database features. + - If the search tool returns no results, try to rephrase the query. + - If the search tool returns too few results, increase the max_results limit. + - If the search tool returns too many results, reduce the max_results limit. + - If the search tool returns results that are not relevant, try to refine the query. + """, +) + + +@mcp.tool() +def search_oracle_database_documentation( + search_query: str, + max_results: int = 4, +) -> list[str]: + """Search for information about how to use Oracle Database for a query string + and return a list of results. + + Args: + search_query: The search phrase to search for. + max_results: The maximum number of results to return, defaults to 4. + + Usage: + search_oracle_database_documentation(search_query="create table syntax") + search_oracle_database_documentation(search_query="alter a parameter", max_results=13) + search_oracle_database_documentation(search_query="database user concept", max_results=20) + search_oracle_database_documentation(search_query="data use case domains best practices", + max_results=15) + search_oracle_database_documentation(search_query="external table definition", max_results=100) + Returns: + A list of results. + Each result a string in Markdown format with the most relevant search topic. + + """ + logger.info(f"query={search_query!r}") + return search_index(search_query, max_results) + + +# Function to search the index +def search_index(query_str: str, limit: int = 4) -> list[str]: + """ + Search the index for the query string and return matching sections with context. + Returns a list of content. 
+ """ + results = [] + hits = INDEX.search(text=query_str) + finds = 0 + for hit in hits: + results.append(hit.text) + finds += 1 + if finds >= limit: + break + return results + + +def maintain_content(path: str) -> None: + """Maintains the content for the MCP server. + This function checks if the index needs to be created or updated based on the + contents of the provided location, which can be a directory or a zip file. + + Args: + path (str): The path to the documentation directory or zip file. + + Returns: + None + """ + logger.info("Maintaining index...") + # Logic to create or update the index goes here + + location = Path(path) + if not location.exists(): + logger.error(f"Provided path does not exist: {location}") + return + + # Get the old index checksum, if it exists + content_checksum = get_file_content(CONTENT_CHECKSUM_FILE) + + # Get the old index version, if it exists + index_version = get_file_content(INDEX_VERSION_FILE) + + # Only directories and zip files are currently supported + if location.is_file() and not location.suffix == ".zip": + logger.error( + f"Unsupported file type: {location}. Must be a zip file or directory." 
+ ) + return + + # Calculate the checksum of the input directory or zip file + logger.debug(f"Calculating checksum for location: {location}") + input_checksum = shasum_directory(location) + logger.debug(f"Checksum is {input_checksum} for location '{location}'") + + # See whether checksum matches the old index checksum and the index has not changed + if input_checksum == content_checksum and index_version == INDEX_VERSION: + logger.info("Index is up to date, no changes needed.") + return + # Data has changed, re-index + else: + if input_checksum != content_checksum: + logger.info("Checksum has changed.") + logger.debug( + f"Old index checksum: {content_checksum}, New input checksum: {input_checksum}" + ) + + if index_version != INDEX_VERSION: + logger.info("Index version has changed.") + logger.debug( + f"Old index version: {index_version}, New index version: {INDEX_VERSION}" + ) + + INDEX_FILE.unlink(missing_ok=True) + logger.info("Recreating index...") + # Extract the zip file to a temporary directory + if location.is_file() and location.suffix == ".zip": + + # Check if temp output directory exists and remove it + zip_output = Path(ZIP_TEMP_OUTPUT) + if zip_output.exists(): + logger.debug(f"Removing existing zip output directory: {zip_output}") + shutil.rmtree(zip_output) + + logger.debug(f"Creating zip output directory: {zip_output}") + zip_output.mkdir() + with zipfile.ZipFile(location, "r") as zip_ref: + logger.debug(f"Extracting zip file {location} to {zip_output}") + zip_ref.extractall(ZIP_TEMP_OUTPUT) + + logger.debug(f"Done creating zip output directory: {zip_output}") + # Set the location to the extracted output directory + location = zip_output + + logger.debug("Indexing all html files in the directory...") + + update_content(location) + + # Write the new checksum to the checksum file + logger.debug( + f"Writing new checksum {input_checksum} to {CONTENT_CHECKSUM_FILE}" + ) + write_file_content(CONTENT_CHECKSUM_FILE, input_checksum) + + if index_version != 
INDEX_VERSION: + # Write index version to version file + logger.debug( + f"Writing index version {INDEX_VERSION} to {INDEX_VERSION_FILE}" + ) + write_file_content(INDEX_VERSION_FILE, INDEX_VERSION) + + # Delete temporary zip output directory if it exists + if Path(ZIP_TEMP_OUTPUT).exists(): + logger.debug(f"Removing temporary zip output directory: {zip_output}") + shutil.rmtree(zip_output) + + +def update_content(location: Path) -> None: + """Updates the stored content with the source provided. + + Args: + location (Path): The path to the documentation directory. + Returns: + None + """ + logger.debug("Updating content") + + files_processed = 0 + for file in location.rglob("*"): + process_file(file) + files_processed += 1 + logger.info(f"Processed {files_processed} files from '{location}'.") + + logger.debug("Optimizing index...") + optimize_index() + logger.debug("Index optimized") + + +def process_file(file: Path) -> None: + """Process the file.""" + # Only index html file + if file.suffix == ".html" or file.suffix == ".htm": + name = file.stem.lower() + # Ignore ReadMes, table of contents, indexes + if name not in ("readme", "toc", "index"): + content_chunks = convert_to_markdown_chunks(file) + update_index(content_chunks) + + +def optimize_index() -> None: + """Optimizes index.""" + ps = PocketSearch(db_name=INDEX_FILE, writeable=True) + ps.optimize() + + +def update_index(content: list[str]) -> None: + """Update the index with content. + + Args: + content list[str]: The list of HTML content to index. 
+ Returns: + None + """ + with PocketWriter(db_name=INDEX_FILE) as writer: + for segment in content: + writer.insert(text=segment) + + +def shasum_directory(directory: Path) -> str: + """Calculate the SHA256 checksum of all files in a directory.""" + sha256 = hashlib.sha256() + for file in sorted(directory.rglob("*")): + if file.is_file(): + # Include relative path for uniqueness + sha256.update(str(file.relative_to(directory)).encode()) + with file.open("rb") as f: + while chunk := f.read(8192): + sha256.update(chunk) + return sha256.hexdigest() + + +def convert_to_markdown_chunks(file: Path) -> list[str]: + """Convert an HTML file to Markdown format. + + Args: + file (Path): The path to the HTML file. + + Returns: + str: The converted Markdown content. + """ + logger.debug(f"Converting {file} to Markdown format.") + + with file.open("r", encoding="utf-8") as f: + html = f.read() + + if PREPROCESS == "ADVANCED": + # Preprocess HTML to remove boilerplate and navigation + html = preprocess_html(html) + + # Convert HTML to Markdown + markdown = md.markdownify(html) + if PREPROCESS != "NONE": + markdown = markdown.replace( + "Previous\nNext\n JavaScript must be enabled to correctly display this content", + "", + ) + markdown = remove_markdown_urls(markdown) + + # Split markdown into sections based on headings + pattern = r"(^#{1,6}\s+[^\n]*\n?)(.*?)(?=(?:^#{1,6}\s+|\Z))" + + # Find all matches with re.MULTILINE and re.DOTALL flags + matches = re.finditer(pattern, markdown, re.MULTILINE | re.DOTALL) + + # Create sections list + sections = [] + for match in matches: + # Get heading without the leading "### " + heading = re.sub("^#{1,6}\\s+", "", match.group(1).strip()) + # Get content without URLs within them + content = match.group(2).strip() + sections.append(heading + "\n\n" + content) + + if len(sections) == 0: + return [markdown] + else: + return sections + + +def remove_markdown_urls(text): + # Remove Markdown links [text](url) and replace with just the text + 
text = re.sub(r"\[([^\]]*)\]\([^\)]*\)", r"\1", text) + + # Remove URLs with GUIDs (32-char hex with hyphens) + text = re.sub( + r"https?://[^\s]*[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}[^\s]*", + "", + text, + ) + + # Remove URLs with long hex strings (likely file hashes or identifiers) + text = re.sub(r"https?://[^\s]*[a-f0-9]{16,}[^\s]*", "", text) + + # Remove standalone URLs that start with http/https + text = re.sub(r"https?://[^\s]+", "", text) + + # Clean up extra spaces/tabs but preserve new lines (\s includes \n) + text = re.sub(r"[ \t]+", " ", text) + + # Clean up extra spaces within new lines + text = re.sub(r"\n *\n", "\n\n", text) + + return text.strip() + + +def preprocess_html(html_content: str) -> str: + """Preprocess HTML to remove boilerplate and navigation elements. + + Args: + html_content (str): The raw HTML content. + + Returns: + str: Cleaned HTML content ready for markdown conversion. + """ + soup = BeautifulSoup(html_content, "html.parser") + + # Remove script and style tags + for tag in soup.find_all(["script", "style"]): + tag.decompose() + + # Remove navigation elements + for tag in soup.find_all(["nav", "header", "footer"]): + tag.decompose() + + # Remove elements with navigation-related classes/ids + nav_classes = [ + "noscript", + "alert", + "pull-left", + "pull-right", + "skip", + "navigation", + "breadcrumb", + "nav-", + "header-", + "footer-", + "menu", + "sidebar", + "toc", + ] + for nav_class in nav_classes: + for tag in soup.find_all( + attrs={ + "class": lambda x: x + and any( + nav_class in str(cls).lower() + for cls in (x if isinstance(x, list) else [x]) + ) + } + ): + tag.decompose() + for tag in soup.find_all( + attrs={"id": lambda x: x and nav_class in str(x).lower()} + ): + tag.decompose() + + # Remove common Oracle doc boilerplate text patterns + boilerplate_patterns = [ + r"JavaScript.*(?:disabled|enabled).*browser", + r"Skip navigation.*", + 
r"Oracleยฎ.*(?:Database.*)?(?:Reference|Guide|Manual|Documentation)", + r"Release \d+[a-z]*[\s-]*[A-Z0-9-]*", + r"Previous.*Next", + r"All Classes.*", + r"Overview.*Package.*Class.*Use.*Tree.*Deprecated.*Index.*Help", + ] + + for pattern in boilerplate_patterns: + for tag in soup.find_all(string=re.compile(pattern, re.IGNORECASE)): + parent = tag.parent if hasattr(tag, "parent") else None + if parent: + parent.decompose() + + # Remove elements likely to be navigation by common Oracle doc structure + # Remove elements with common Oracle navigation text content + nav_text_patterns = [ + "Skip navigation links", + "JavaScript is disabled on your browser", + "All Classes", + "SEARCH:", + ] + + for pattern in nav_text_patterns: + for element in soup.find_all(string=lambda text: text and pattern in text): + parent = element.parent if hasattr(element, "parent") else None + if parent: + parent.decompose() + + return str(soup) + + +def build_folder_structure() -> None: + """Builds the home directory structure.""" + if not RESOURCES_DIR.exists(): + RESOURCES_DIR.mkdir(parents=True) + + +def get_file_content(path: str) -> str: + """Reads the content of a file and returns it or 'N/A' if the file does not exist. + + Args: + file (Path): The path to the file. + """ + if Path(path).exists(): + with Path(path).open("r") as f: + return f.read().strip() + else: + return "N/A" + + +def write_file_content(path: str, content: str) -> None: + """Writes the content to a file.""" + with Path(path).open("w") as f: + f.write(content) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Oracle Database Documentation MCP Server." 
+ ) + + parser.add_argument( + "-log-level", + type=str, + default="ERROR", + help="Set the log level (DEBUG, INFO, WARNING, ERROR (default), CRITICAL).", + ) + + subparser = parser.add_subparsers( + title="subcommands", dest="command", required=True + ) + + parser_doc = subparser.add_parser("idx", help="create/maintain the index") + parser_doc.add_argument( + "-path", + type=str, + required=True, + help="path to the documentation input zip file or extracted directory", + ) + parser_doc.add_argument( + "-preprocess", + type=str, + default="BASIC", + help="preprocessing level of documentation (NONE, BASIC (default), ADVANCED)", + ) + + parser_mcp = subparser.add_parser("mcp", help="run the MCP server") + parser_mcp.add_argument( + "-mode", + choices=["stdio", "http"], + default="stdio", + help="the transport mode for the MCP server (stdio (default) or http)", + ) + parser_mcp.add_argument( + "-host", + type=str, + default="0.0.0.0", + help="the IP address (default 0.0.0.0) that the MCP server is reachable at", + ) + parser_mcp.add_argument( + "-port", + type=int, + default=8000, + help="the port (default 8000) that the MCP server is reachable at", + ) + + args = parser.parse_args() + + return args + + +def main(): + """Main entrypoint for the Oracle Documentation MCP server.""" + + # Parse command line arguments + args = parse_args() + + # Build the home directory structure, needed also for the log file + build_folder_structure() + + # Set up logging + ch = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + ch.setFormatter(formatter) + logger.addHandler(ch) + + # Set log level + logging.basicConfig( + filename=HOME_DIR.joinpath(Path("oracle-db-doc-mcp-server.log")), + filemode="w", + level=logging.ERROR, + ) + logger.setLevel(getattr(logging, args.log_level.upper(), logging.ERROR)) + + if args.command == "idx": + global PREPROCESS + PREPROCESS = args.preprocess.upper() + maintain_content(args.path) 
+ + if args.command == "mcp": + + # If no index is present (not index was built), refuse to start the server. + if not INDEX_FILE.exists(): + logger.error( + "Index does not exist. Please create the index first via the 'idx' subcommand." + ) + return + + global INDEX + logger.debug("Opening index file.") + INDEX = PocketSearch(db_name=INDEX_FILE) + + logger.info("Serving MCP server for Oracle Database documentation.") + if args.mode == "stdio": + mcp.run(transport="stdio", show_banner=False) + elif args.mode == "http": + mcp.run(transport="http", host=args.host, port=args.port, show_banner=False) + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + logger.info("Shutting down Oracle Database Documentation MCP Server.") diff --git a/src/oracle-db-doc-mcp-server/pyproject.toml b/src/oracle-db-doc-mcp-server/pyproject.toml new file mode 100644 index 0000000..326bfee --- /dev/null +++ b/src/oracle-db-doc-mcp-server/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "oracle-db-doc-mcp-server" +version = "1.0.0" +description = "The Oracle Database Documentation MCP Server" +readme = "README.md" +requires-python = ">=3.13" +dependencies = [ + "beautifulsoup4>=4.9.0", + "fastmcp>=2.11.3", + "markdownify>=1.2.0", + "pocketsearch>=0.40.0", +] diff --git a/src/oracle-db-doc-mcp-server/requirements.txt b/src/oracle-db-doc-mcp-server/requirements.txt new file mode 100644 index 0000000..a08e586 --- /dev/null +++ b/src/oracle-db-doc-mcp-server/requirements.txt @@ -0,0 +1,4 @@ +beautifulsoup4 >= 4.9.0 +markdownify >= 1.2.0 +fastmcp >= 2.11.3 +pocketsearch >= 0.40.0