diff --git a/docscontent/quickstart_content.txt b/docscontent/quickstart_content.txt index da885f1..75165c8 100644 --- a/docscontent/quickstart_content.txt +++ b/docscontent/quickstart_content.txt @@ -35,7 +35,7 @@ type: markdown --- ## 1. LLM Configuration -Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For the semantic search feature, you will also need to set up Qdrant and provide an OpenAI API key. For detailed setup instructions, please refer to the [README.md](README.md) file. +Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For the semantic search feature, you will also need to set up Qdrant and provide an OpenAI API key. For detailed setup instructions, please refer to the [README.md](https://github.com/Intugle/data-tools/blob/main/README.md) file. You can configure the necessary services by setting the following environment variables: @@ -196,7 +196,7 @@ type: markdown The semantic search feature allows you to search for columns in your datasets using natural language. -> **Note:** To use this feature, you need to have a running Qdrant instance and an OpenAI API key. Please refer to the [README.md](README.md) for detailed setup instructions. +> **Note:** To use this feature, you need to have a running Qdrant instance and an OpenAI API key. Please refer to the [README.md](https://github.com/Intugle/data-tools/blob/main/README.md) for detailed setup instructions. > > **Google Colab Users:** If you are running this notebook in Google Colab, you may not be able to connect to a local Qdrant instance running in Docker. In this case, you will need to use a remotely hosted Qdrant server. > diff --git a/docsite/docs/mcp-server.md b/docsite/docs/mcp-server.md new file mode 100644 index 0000000..e694626 --- /dev/null +++ b/docsite/docs/mcp-server.md @@ -0,0 +1,58 @@ +--- +sidebar_position: 6 +title: MCP Server +--- + +# Intugle MCP Server + +The Intugle library includes a built-in MCP (Model Context Protocol) server that exposes your data environment as a set of tools that can be understood and used by AI assistants and LLM-powered clients. + +By serving your project's context through this standardized protocol, you enable powerful conversational workflows, such as [Vibe Coding](./vibe-coding.md), and allow AI agents to interact with your data securely. + +## 1. Setting up the MCP Server + +Once you have built your semantic layer using the `SemanticModel`, you can easily expose it as a set of tools for an AI assistant by starting the built-in MCP server. + +### Starting the Server + +To start the server, run the following command in your terminal from your project's root directory: + +```bash +intugle-mcp +``` + +This will start a server on `localhost:8080` by default. You should see output indicating that the server is running and that the `semantic_layer` service is mounted. + +### Connecting from an MCP Client + +With the server running, you can connect to it from any MCP-compatible client. The endpoint for the semantic layer is: + +`http://localhost:8080/semantic_layer/mcp` + +Popular clients that support MCP include AI-powered IDEs and standalone applications. 
Here’s how to configure a few of them: + +- **Cursor**: [Configuring MCP Servers](https://docs.cursor.com/en/context/mcp#configuring-mcp-servers) +- **Claude Code**: [Using MCP with Claude Code](https://docs.claude.com/en/docs/claude-code/mcp) +- **Claude Desktop**: [User Quickstart](https://modelcontextprotocol.info/docs/quickstart/user/) +- **Gemini CLI**: [Configure MCP Servers](https://cloud.google.com/gemini/docs/codeassist/use-agentic-chat-pair-programmer#configure-mcp-servers) + +## 2. Data Discovery Tools + +The MCP server provides tools that allow an LLM client to discover and understand the structure of your data. These tools are essential for providing the AI with the context it needs to answer questions and generate valid queries or specifications. + +These tools are only available after a `SemanticModel` has been successfully generated and loaded. + +### `get_tables` + +This tool returns a list of all available tables in your semantic model, along with their descriptions. It's the primary way for an AI assistant to discover what data is available. + +- **Description**: Get list of tables in database along with their technical description. +- **Returns**: A list of objects, where each object contains the `table_name` and `table_description`. + +### `get_schema` + +This tool retrieves the schema for one or more specified tables, including column names, data types, and other metadata including links. This allows the AI to understand the specific attributes of each table before attempting to query it. + +- **Description**: Given database table names, get the schemas of the tables. +- **Parameters**: `table_names` (a list of strings). +- **Returns**: A dictionary where keys are table names and values are their detailed schemas. diff --git a/docsite/docs/vibe-coding.md b/docsite/docs/vibe-coding.md index 7101433..25ad0de 100644 --- a/docsite/docs/vibe-coding.md +++ b/docsite/docs/vibe-coding.md @@ -1,17 +1,13 @@ --- -sidebar_position: 6 +sidebar_position: 7 title: Vibe Coding --- # Vibe Coding with the MCP Server -"Vibe Coding" is an interactive, conversational approach to development where you use natural language to generate code or specifications. Intugle embraces this by allowing you to serve your semantic layer through an MCP (Model Context Protocol) server. +"Vibe Coding" is an interactive, conversational approach to data intelligence. Intugle embraces this by allowing you to serve your project as an MCP (Model Context Protocol) server. -This turns your data into a "self-describing" resource that an AI assistant can understand, allowing you to "vibe" with your data to create specifications without writing them by hand. - -:::info In Progress -Currently, Vibe Coding is available for generating **Data Product** specifications. We are actively working on extending this capability to other modules in the Intugle ecosystem. Stay tuned for more updates! -::: +This turns your entire data workflow into a "self-describing" resource that an AI assistant can understand and operate. It allows you to "vibe" with the intugle library—using natural language to build semantic models, perform searches, and create data products from scratch. ## 1. Setting up the MCP Server @@ -29,13 +25,13 @@ To start the server, run the following command in your terminal from your projec intugle-mcp ``` -This will start a server on `localhost:8000` by default. You should see output indicating that the server is running and that the `semantic_layer` and `adapter` services are mounted. 
+This will start a server on `localhost:8080` by default. You should see output indicating that the server is running and that the `semantic_layer` service is mounted. ### Connecting from an MCP Client With the server running, you can connect to it from any MCP-compatible client. The endpoint for the semantic layer is: -`http://localhost:8000/semantic_layer/mcp` +`http://localhost:8080/semantic_layer/mcp` Popular clients that support MCP include AI-powered IDEs and standalone applications. Here’s how to configure a few of them: @@ -44,56 +40,53 @@ Popular clients that support MCP include AI-powered IDEs and standalone applicat - **Claude Desktop**: [User Quickstart](https://modelcontextprotocol.info/docs/quickstart/user/) - **Gemini CLI**: [Configure MCP Servers](https://cloud.google.com/gemini/docs/codeassist/use-agentic-chat-pair-programmer#configure-mcp-servers) -## 2. Using Vibe Coding +## 2. Vibe Coding -The MCP server exposes powerful prompts that are designed to take your natural language requests and convert them directly into valid specifications. +The MCP server exposes the `intugle-vibe` prompt. This prompt equips an AI assistant with knowledge of the Intugle library and access to its core tools. You can use it to guide you through the entire data intelligence workflow using natural language. -### Example: Generating a Data Product +In your MCP-compatible client, you can invoke the prompt and provide your request. In most clients, this is done by typing `/` followed by the prompt name. -Currently, you can use the `create-dp` prompt to generate a `product_spec` dictionary for a Data Product. +### Example 1: Getting Started and Building a Semantic Model -In your MCP-compatible client, you can invoke the prompt and provide your request. In most clients, this is done by typing `/` followed by the prompt name. +If you are unsure how to start, you can ask for guidance. You can also ask the assistant to perform actions like creating a semantic model. ``` -/create-dp show me the top 5 patients with the most claims +/intugle-vibe How do I create a semantic model? +``` +``` +/intugle-vibe Create a semantic model over my healthcare data. ``` -:::tip Client-Specific Commands -The exact command to invoke a prompt (e.g., using `/` or another prefix) can vary between clients. Be sure to check the documentation for your specific tool. -::: +The assistant will read the relevant documentation and guide you through the process or execute the steps if possible. + +### Example 2: Generating a Data Product Specification + +Once you have a semantic model, you can ask the assistant to create a specification for a reusable data product. + +``` +/intugle-vibe create a data product specification for the top 5 patients with the most claims +``` + +The AI assistant, connected to your MCP server, will understand that you are requesting a `product_spec`. It will use the `get_tables` and `get_schema` tools to find the `patients` and `claims` tables, and generate the specification. 
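+
+For illustration, the generated specification might look roughly like the sketch below. The exact field IDs (for example `patients.first` or `claims.id`) depend on your semantic model, so treat them as assumptions rather than guaranteed output.
+
+```python
+# Hypothetical product_spec for "top 5 patients with the most claims".
+# Field IDs are assumptions based on the sample healthcare data.
+product_spec = {
+    "name": "top_5_patients_by_claims",
+    "fields": [
+        {"id": "patients.first", "name": "first_name"},
+        {"id": "patients.last", "name": "last_name"},
+        {
+            "id": "claims.id",
+            "name": "number_of_claims",
+            "category": "measure",
+            "measure_func": "count",
+        },
+    ],
+    "filter": {
+        "sort_by": [
+            {"id": "claims.id", "alias": "number_of_claims", "direction": "desc"}
+        ],
+        "limit": 5,
+    },
+}
+```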
+ +### Example 3: Performing a Semantic Search -The AI assistant, connected to your MCP server, will understand the request, use the `get_tables` and `get_schema` tools to find the `patients` and `claims` tables, and generate the following `product_spec`: - -```json -{ - "name": "top_5_patients_by_claims", - "fields": [ - { - "id": "patients.first", - "name": "first_name" - }, - { - "id": "patients.last", - "name": "last_name" - }, - { - "id": "claims.id", - "name": "number_of_claims", - "category": "measure", - "measure_func": "count" - } - ], - "filter": { - "sort_by": [ - { - "id": "claims.id", - "alias": "number_of_claims", - "direction": "desc" - } - ], - "limit": 5 - } -} +You can also perform a semantic search on your data. + +``` +/intugle-vibe use semantic search to find columns related to 'hospital visit reasons' ``` -This workflow allows you to stay in your creative flow, rapidly iterating on data product ideas by describing what you want in plain English. +The assistant will code out the semantic search capabilities of your `SemanticModel` to find and return relevant columns from your datasets. + +:::tip Agent Mode +Most modern, AI-powered clients support an "agent mode" where the coding assistant can handle the entire workflow for you. + +For example, you can directly ask for a final output, like a CSV file: + +`/intugle-vibe create a CSV of the top 10 patients by claim count` + +The agent will understand the end goal and perform all the necessary intermediate steps for you. It will realize it needs to build the semantic model, generate the data product specification, execute it, and finally provide you with the resulting CSV file—all without you needing to manage the code or the process. +::: + +This workflow accelerates your journey from raw data to insightful data products. Simply describe what you want in plain English and let the assistant handle the details, freeing you from the hassle of digging through documentation. diff --git a/notebooks/quickstart_fmcg.ipynb b/notebooks/quickstart_fmcg.ipynb index a53a073..1d84379 100644 --- a/notebooks/quickstart_fmcg.ipynb +++ b/notebooks/quickstart_fmcg.ipynb @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "f0ecc2ba", "metadata": {}, "outputs": [], @@ -131,7 +131,7 @@ "def generate_config(table_name: str) -> str:\n", " \"\"\"Append the base URL to the table name.\"\"\"\n", " return {\n", - " \"path\": f\"./sample_data/fmcg/{table_name}.csv\",\n", + " \"path\": f\"https://github.com/Intugle/data-tools/tree/main/sample_data/fmcg/{table_name}.csv\",\n", " \"type\": \"csv\",\n", " }\n", "\n", @@ -2528,7 +2528,7 @@ "\n", "The semantic search feature allows you to search for columns in your datasets using natural language. \n", "\n", - "> **Note:** To use this feature, you need to have a running Qdrant instance and an OpenAI API key. Please refer to the [README.md](README.md) for detailed setup instructions.\n", + "> **Note:** To use this feature, you need to have a running Qdrant instance and an OpenAI API key. Please refer to the [README.md](https://github.com/Intugle/data-tools/blob/main/README.md) for detailed setup instructions.\n", ">\n", "> **Google Colab Users:** If you are running this notebook in Google Colab, you may not be able to connect to a local Qdrant instance running in Docker. 
In this case, you will need to use a remotely hosted Qdrant server.\n", ">\n", diff --git a/notebooks/quickstart_fmcg_snowflake.ipynb b/notebooks/quickstart_fmcg_snowflake.ipynb index 29e0295..4c21511 100644 --- a/notebooks/quickstart_fmcg_snowflake.ipynb +++ b/notebooks/quickstart_fmcg_snowflake.ipynb @@ -193,7 +193,7 @@ "source": [ "## 1. LLM Configuration\n", "\n", - "Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For detailed setup instructions, please refer to the [README.md](README.md) file.\n", + "Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For detailed setup instructions, please refer to the [README.md](https://github.com/Intugle/data-tools/blob/main/README.md) file.\n", "\n", "You can configure the necessary services by setting the following environment variables:\n", "\n", diff --git a/notebooks/quickstart_healthcare.ipynb b/notebooks/quickstart_healthcare.ipynb index a1ec3ab..a544a52 100644 --- a/notebooks/quickstart_healthcare.ipynb +++ b/notebooks/quickstart_healthcare.ipynb @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "76771eda", "metadata": {}, "outputs": [], @@ -132,7 +132,7 @@ "def generate_config(table_name: str) -> str:\n", " \"\"\"Append the base URL to the table name.\"\"\"\n", " return {\n", - " \"path\": f\"./sample_data/healthcare/{table_name}.csv\",\n", + " \"path\": f\"https://github.com/Intugle/data-tools/tree/main/sample_data/healthcare/{table_name}.csv\",\n", " \"type\": \"csv\",\n", " }\n", "\n", diff --git a/notebooks/quickstart_healthcare_databricks.ipynb b/notebooks/quickstart_healthcare_databricks.ipynb index 88cc33d..f013d49 100644 --- a/notebooks/quickstart_healthcare_databricks.ipynb +++ b/notebooks/quickstart_healthcare_databricks.ipynb @@ -107,7 +107,7 @@ "source": [ "## 1. LLM Configuration\n", "\n", - "Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For detailed setup instructions, please refer to the [README.md](README.md) file.\n", + "Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For detailed setup instructions, please refer to the [README.md](https://github.com/Intugle/data-tools/blob/main/README.md) file.\n", "\n", "You can configure the necessary services by setting the following environment variables:\n", "\n", diff --git a/notebooks/quickstart_native_databricks.ipynb b/notebooks/quickstart_native_databricks.ipynb index 164bb50..b20ccc4 100644 --- a/notebooks/quickstart_native_databricks.ipynb +++ b/notebooks/quickstart_native_databricks.ipynb @@ -61,7 +61,7 @@ "source": [ "## 1. LLM Configuration\n", "\n", - "Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For the semantic search feature, you will also need to set up Qdrant and provide an OpenAI API key. For detailed setup instructions, please refer to the [README.md](README.md) file.\n", + "Before running the project, you need to configure a Large Language Model (LLM). 
This is used for tasks like generating business glossaries and predicting links between tables. For the semantic search feature, you will also need to set up Qdrant and provide an OpenAI API key. For detailed setup instructions, please refer to the [README.md](https://github.com/Intugle/data-tools/blob/main/README.md) file.\n", "\n", "You can configure the necessary services by setting the following environment variables:\n", "\n", diff --git a/notebooks/quickstart_native_snowflake.ipynb b/notebooks/quickstart_native_snowflake.ipynb index d63f245..b3a68eb 100644 --- a/notebooks/quickstart_native_snowflake.ipynb +++ b/notebooks/quickstart_native_snowflake.ipynb @@ -110,7 +110,7 @@ "source": [ "## 1. LLM Configuration\n", "\n", - "Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For the semantic search feature, you will also need to set up Qdrant and provide an OpenAI API key. For detailed setup instructions, please refer to the [README.md](README.md) file.\n", + "Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For the semantic search feature, you will also need to set up Qdrant and provide an OpenAI API key. For detailed setup instructions, please refer to the [README.md](https://github.com/Intugle/data-tools/blob/main/README.md) file.\n", "\n", "You can configure the necessary services by setting the following environment variables:\n", "\n", diff --git a/notebooks/quickstart_sports_media.ipynb b/notebooks/quickstart_sports_media.ipynb index 232347f..f3c7867 100644 --- a/notebooks/quickstart_sports_media.ipynb +++ b/notebooks/quickstart_sports_media.ipynb @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "f7367816", "metadata": {}, "outputs": [], @@ -132,7 +132,7 @@ "def generate_config(table_name: str) -> str:\n", " \"\"\"Append the base URL to the table name.\"\"\"\n", " return {\n", - " \"path\": f\"./sample_data/sports_media/{table_name}.csv\",\n", + " \"path\": f\"https://github.com/Intugle/data-tools/tree/main/sample_data/sports_media/{table_name}.csv\",\n", " \"type\": \"csv\",\n", " }\n", "\n", diff --git a/notebooks/quickstart_tech_manufacturing.ipynb b/notebooks/quickstart_tech_manufacturing.ipynb index 619248a..24d6388 100644 --- a/notebooks/quickstart_tech_manufacturing.ipynb +++ b/notebooks/quickstart_tech_manufacturing.ipynb @@ -119,14 +119,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def generate_config(table_name: str) -> str:\n", " \"\"\"Append the base URL to the table name.\"\"\"\n", " return {\n", - " \"path\": f\"./sample_data/tech_manufacturing/{table_name}.csv\",\n", + " \"path\": f\"https://github.com/Intugle/data-tools/tree/main/sample_data/tech_company/{table_name}.csv\",\n", " \"type\": \"csv\",\n", " }\n", "\n", diff --git a/pyproject.toml b/pyproject.toml index bc64354..8558b77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "intugle" -version = "1.0.4" +version = "1.0.5" authors = [ { name="Intugle", email="hello@intugle.ai" }, ] @@ -50,6 +50,7 @@ dependencies = [ "langchain[anthropic,google-genai,openai]>=0.3.27", "qdrant-client>=1.15.1", "rich>=14.1.0", + "aiohttp>=3.9.5", ] 
[project.optional-dependencies] diff --git a/src/intugle/cli.py b/src/intugle/cli.py deleted file mode 100644 index bbd98e1..0000000 --- a/src/intugle/cli.py +++ /dev/null @@ -1,11 +0,0 @@ -from intugle.streamlit import StreamlitApp - - -def export_data(): - """Exports the analysis results to CSV files.""" - app = StreamlitApp() - app.export_analysis_to_csv() - - -if __name__ == "__main__": - export_data() diff --git a/src/intugle/core/settings.py b/src/intugle/core/settings.py index 6dbdea7..2034245 100644 --- a/src/intugle/core/settings.py +++ b/src/intugle/core/settings.py @@ -70,6 +70,7 @@ class Settings(BaseSettings): CUSTOM_EMBEDDINGS_INSTANCE: Optional[Any] = None # LP + RELATIONSHIPS_FILE: str = "__relationships__.yml" HALLUCINATIONS_MAX_RETRY: int = 2 UNIQUENESS_THRESHOLD: float = 0.9 INTERSECT_RATIO_THRESHOLD: float = 0.9 diff --git a/src/intugle/link_predictor/predictor.py b/src/intugle/link_predictor/predictor.py index 5fdc93b..5555277 100644 --- a/src/intugle/link_predictor/predictor.py +++ b/src/intugle/link_predictor/predictor.py @@ -157,11 +157,14 @@ def _predict_for_pair( ] return pair_links - def predict(self, filename='__relationships__.yml', save: bool = False, force_recreate: bool = False) -> 'LinkPredictor': + def predict(self, filename: str = None, save: bool = False, force_recreate: bool = False) -> 'LinkPredictor': """ Iterates through all unique pairs of datasets, predicts the links for each pair, and returns the aggregated results. """ + if filename is None: + filename = settings.RELATIONSHIPS_FILE + relationships_file = os.path.join(settings.PROJECT_BASE, filename) if not force_recreate and os.path.exists(relationships_file): diff --git a/src/intugle/mcp/docs_search/service.py b/src/intugle/mcp/docs_search/service.py new file mode 100644 index 0000000..6c88ad0 --- /dev/null +++ b/src/intugle/mcp/docs_search/service.py @@ -0,0 +1,83 @@ + +import asyncio +import aiohttp +from typing import List + +class DocsSearchService: + """ + Service for searching Intugle's documentation. + """ + + BASE_URL = "https://raw.githubusercontent.com/Intugle/data-tools/main/docsite/docs/" + API_URL = "https://api.github.com/repos/Intugle/data-tools/contents/docsite/docs" + + def __init__(self): + self._doc_paths = None + + async def list_doc_paths(self) -> List[str]: + """ + Fetches and returns a list of all documentation file paths from the GitHub repository. + Caches the result to avoid repeated API calls. + """ + if self._doc_paths is None: + async with aiohttp.ClientSession() as session: + self._doc_paths = await self._fetch_paths_recursively(session, self.API_URL) + return self._doc_paths + + async def _fetch_paths_recursively(self, session: aiohttp.ClientSession, url: str) -> List[str]: + """ + Recursively fetches file paths from the GitHub API. 
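+        Only markdown files (.md/.mdx) are collected; subdirectories are walked recursively via their GitHub API URLs.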
+ """ + paths = [] + try: + async with session.get(url) as response: + if response.status != 200: + # Optionally log an error here + return [f"Error: Could not fetch {url}, status code: {response.status}"] + + items = await response.json() + + for item in items: + if item['type'] == 'file' and (item['name'].endswith('.md') or item['name'].endswith('.mdx')): + # Strip the base 'docsite/docs/' part to make it a relative path + paths.append(item['path'].replace('docsite/docs/', '', 1)) + elif item['type'] == 'dir': + paths.extend(await self._fetch_paths_recursively(session, item['url'])) + except Exception as e: + # Optionally log the exception + return [f"Error: Exception while fetching {url}: {e}"] + + return paths + + async def search_docs(self, paths: List[str]) -> str: + """ + Fetches and concatenates content from a list of documentation paths. + + Args: + paths (List[str]): A list of markdown file paths (e.g., ["intro.md", "core-concepts/semantic-model.md"]) + + Returns: + str: The concatenated content of the documentation files. + """ + async with aiohttp.ClientSession() as session: + tasks = [self._fetch_doc(session, path) for path in paths] + results = await asyncio.gather(*tasks) + return "\n\n---\n\n".join(filter(None, results)) + + async def _fetch_doc(self, session: aiohttp.ClientSession, path: str) -> str | None: + """ + Fetches a single documentation file. + """ + url = f"{self.BASE_URL}{path}" + try: + async with session.get(url) as response: + if response.status == 200: + return await response.text() + else: + # Optionally log an error here + return f"Error: Could not fetch {url}, status code: {response.status}" + except Exception as e: + # Optionally log the exception + return f"Error: Exception while fetching {url}: {e}" + +docs_search_service = DocsSearchService() diff --git a/src/intugle/mcp/semantic_layer/prompt.py b/src/intugle/mcp/semantic_layer/prompt.py index 1c3ddb9..82cafcd 100644 --- a/src/intugle/mcp/semantic_layer/prompt.py +++ b/src/intugle/mcp/semantic_layer/prompt.py @@ -2,6 +2,7 @@ from pathlib import Path +from intugle.mcp.docs_search.service import docs_search_service from intugle.mcp.semantic_layer.schema import SQLDialect @@ -11,14 +12,43 @@ class Prompts: """ @classmethod - def create_dp_prompt(cls, user_request: str) -> str: + async def intugle_vibe_prompt(cls, user_query: str = "") -> str: """ - Returns the prompt for creating a data product specification. + Returns the prompt for the Intugle Vibe agent. """ - prompt_path = Path(__file__).parent / "prompts" / "create_dp_prompt.md" + prompt_path = Path(__file__).parent / "prompts" / "intugle_vibe_prompt.md" with open(prompt_path, "r") as f: base_prompt = f.read() - return base_prompt.format(user_request=user_request) + + library_overview = textwrap.dedent(""" + Intugle is a GenAI-powered open-source Python library that builds a semantic data model over your existing data systems. + It discovers meaningful links and relationships across data assets, enriching them with profiles, classifications, and business glossaries. + With this connected knowledge layer, you can enable semantic search and auto-generate queries to create unified data products, + making data integration and exploration faster, more accurate, and far less manual. 
+ """) + + doc_paths = await docs_search_service.list_doc_paths() + formatted_doc_paths = "\n".join(f"- `{path}`" for path in doc_paths) + + query_section = "" + if user_query: + query_section = f"Conversation starts:\n\n---\n\n{user_query}" + + return base_prompt.format( + library_overview=library_overview.strip(), + doc_paths=formatted_doc_paths, + user_query=query_section + ) + + # @classmethod + # def create_dp_prompt(cls, user_request: str) -> str: + # """ + # Returns the prompt for creating a data product specification. + # """ + # prompt_path = Path(__file__).parent / "prompts" / "create_dp_prompt.md" + # with open(prompt_path, "r") as f: + # base_prompt = f.read() + # return base_prompt.format(user_request=user_request) @classmethod def raw_executor_prompt( diff --git a/src/intugle/mcp/semantic_layer/prompts/intugle_vibe_prompt.md b/src/intugle/mcp/semantic_layer/prompts/intugle_vibe_prompt.md new file mode 100644 index 0000000..d921335 --- /dev/null +++ b/src/intugle/mcp/semantic_layer/prompts/intugle_vibe_prompt.md @@ -0,0 +1,35 @@ +You are Intugle Vibe, a helpful AI assistant for the Intugle library. + +## About Intugle + +{library_overview} + +## How to Use the Documentation + +Below is a list of all available documentation pages. You can read the content of any of these pages using the `search_intugle_docs` tool. Simply pass the path or paths you want to read to the tool. + +For example: `search_intugle_docs(paths=["intro.md", "getting-started.md"])` + +### Available Documentation Paths: + +{doc_paths} + +## Other Available Tools + +You also have access to the following tools to inspect the data model: + +- `get_tables`: Lists all tables in the semantic model. +- `get_schema`: Retrieves the schema for specified tables. + +These tools are useful for understanding the available data to answer user questions or to gather the necessary information for building a data product specification. + +**Important:** +- These tools will only return a response if a semantic model has already been generated and loaded in the user's environment. + +> **Semantic Search** and **Data Product Generation** both require a `SemanticModel` to be built first. Before you can perform a search or create a data product, you MUST ensure a semantic model has been built. If it hasn't, you should guide the user to build one or build it for them depending on the scenario. + +## Your Task + +Your goal is to help the user achieve their task by leveraging the Intugle library. Use the documentation to understand how the library works and guide the user. You can read from the documentation to answer questions or provide explanations. 
+ +{user_query} \ No newline at end of file diff --git a/src/intugle/mcp/semantic_layer/router.py b/src/intugle/mcp/semantic_layer/router.py index 4c53905..71b4386 100644 --- a/src/intugle/mcp/semantic_layer/router.py +++ b/src/intugle/mcp/semantic_layer/router.py @@ -2,6 +2,7 @@ from intugle.core.settings import settings from intugle.mcp.adapter.service import adapter_service +from intugle.mcp.docs_search.service import docs_search_service from intugle.mcp.semantic_layer.prompt import Prompts from intugle.mcp.semantic_layer.service import semantic_layer_service @@ -56,12 +57,38 @@ async def get_schema(table_names: list[str]) -> dict[str, str]: # return Prompts.raw_executor_prompt(settings.SQL_DIALECT, settings.DOMAIN, settings.UNIVERSAL_INSTRUCTIONS) -@semantic_layer_mcp.prompt(name="create-dp", title="Create Data Product Specification") -async def create_dp_prompt(user_request: str) -> str: - return Prompts.create_dp_prompt(user_request) +@semantic_layer_mcp.prompt( + name="intugle-vibe", + title="Intugle Vibe Prompt", + description="A helpful AI assistant for the Intugle library.", +) +async def intugle_vibe_prompt(user_query: str) -> str: + return await Prompts.intugle_vibe_prompt(user_query) + + +# @semantic_layer_mcp.prompt(name="create-dp", title="Create Data Product Specification") +# async def create_dp_prompt(user_request: str) -> str: +# return Prompts.create_dp_prompt(user_request) + +# @semantic_layer_mcp.tool(name="execute_query", description="Return the result of a query execution") +# async def execute_query(sql_query: str) -> list[dict]: +# data = await adapter_service.execute_query(sql_query) +# return data -@semantic_layer_mcp.tool(name="execute_query", description="Return the result of a query execution") -async def execute_query(sql_query: str) -> list[dict]: - data = await adapter_service.execute_query(sql_query) - return data \ No newline at end of file + +@semantic_layer_mcp.tool( + name="search_intugle_docs", + description="Fetches content from the Intugle documentation for a given list of page paths.", +) +async def search_intugle_docs(paths: list[str]) -> str: + """ + Fetches content from the Intugle documentation. + + Args: + paths (list[str]): A list of markdown file paths (e.g., ["intro.md", "core-concepts/semantic-model.md"]) + + Returns: + str: The concatenated content of the documentation files. 
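+
+    Paths are resolved against the docsite/docs directory of the Intugle GitHub repository.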
+ """ + return await docs_search_service.search_docs(paths) \ No newline at end of file diff --git a/src/intugle/semantic_model.py b/src/intugle/semantic_model.py index bd4b898..c3ef8f1 100644 --- a/src/intugle/semantic_model.py +++ b/src/intugle/semantic_model.py @@ -1,19 +1,20 @@ import logging -import yaml from typing import TYPE_CHECKING, Any, Dict, List import pandas as pd +import yaml from intugle.analysis.models import DataSet from intugle.core.console import console, success_style +from intugle.exporters.factory import factory as exporter_factory from intugle.link_predictor.predictor import LinkPredictor from intugle.semantic_search import SemanticSearch -from intugle.exporters.factory import factory as exporter_factory +from intugle.utils.files import update_relationship_file_mtime if TYPE_CHECKING: - from intugle.link_predictor.models import PredictedLink from intugle.adapters.adapter import Adapter + from intugle.link_predictor.models import PredictedLink log = logging.getLogger(__name__) @@ -30,7 +31,9 @@ def __init__(self, data_input: Dict[str, Any] | List[DataSet], domain: str = "") elif isinstance(data_input, list): self._initialize_from_list(data_input) else: - raise TypeError("Input must be a dictionary of named dataframes or a list of DataSet objects.") + raise TypeError( + "Input must be a dictionary of named dataframes or a list of DataSet objects." + ) def _initialize_from_dict(self, data_dict: Dict[str, Any]): """Creates and processes DataSet objects from a dictionary of raw dataframes.""" @@ -42,12 +45,16 @@ def _initialize_from_list(self, data_list: List[DataSet]): """Processes a list of existing DataSet objects""" for dataset in data_list: if not dataset.name: - raise ValueError("DataSet objects provided in a list must have a 'name' attribute.") + raise ValueError( + "DataSet objects provided in a list must have a 'name' attribute." + ) self.datasets[dataset.name] = dataset def profile(self, force_recreate: bool = False): """Run profiling, datatype identification, and key identification for all datasets.""" - console.print("Starting profiling and key identification stage...", style="yellow") + console.print( + "Starting profiling and key identification stage...", style="yellow" + ) for dataset in self.datasets.values(): # Check if this stage is already complete if dataset.source_table_model.key is not None and not force_recreate: @@ -58,7 +65,9 @@ def profile(self, force_recreate: bool = False): dataset.profile(save=True) dataset.identify_datatypes(save=True) dataset.identify_keys(save=True) - console.print("Profiling and key identification complete.", style="bold green") + console.print( + "Profiling and key identification complete.", style="bold green" + ) def predict_links(self, force_recreate: bool = False): """Run link prediction across all datasets.""" @@ -74,10 +83,14 @@ def generate_glossary(self, force_recreate: bool = False): for dataset in self.datasets.values(): # Check if this stage is already complete if dataset.source_table_model.description and not force_recreate: - console.print(f"Glossary for '{dataset.name}' already exists. Skipping.") + console.print( + f"Glossary for '{dataset.name}' already exists. Skipping." 
+ ) continue - console.print(f"Generating glossary for dataset: {dataset.name}", style=success_style) + console.print( + f"Generating glossary for dataset: {dataset.name}", style=success_style + ) dataset.generate_glossary(domain=self.domain, save=True) console.print("Business glossary generation complete.", style="bold green") @@ -87,6 +100,8 @@ def build(self, force_recreate: bool = False): self.predict_links() self.generate_glossary(force_recreate=force_recreate) + update_relationship_file_mtime() + # Initialize semantic search try: self.initialize_semantic_search() @@ -99,21 +114,22 @@ def export(self, format: str, **kwargs): """Export the semantic model to a specified format.""" # This assumes that the manifest is already loaded in the SemanticModel # In a real implementation, you would get the manifest from the SemanticModel instance - from intugle.parser.manifest import ManifestLoader from intugle.core import settings + from intugle.parser.manifest import ManifestLoader + manifest_loader = ManifestLoader(settings.PROJECT_BASE) manifest_loader.load() manifest = manifest_loader.manifest exporter = exporter_factory.get_exporter(format, manifest) exported_data = exporter.export(**kwargs) - + output_path = kwargs.get("path") if output_path: with open(output_path, "w") as f: yaml.dump(exported_data, f, sort_keys=False, default_flow_style=False) print(f"Successfully exported to {output_path}") - + return exported_data @property @@ -180,22 +196,27 @@ def deploy(self, target: str, **kwargs): target (str): The target platform to deploy to (e.g., "snowflake"). **kwargs: Additional keyword arguments specific to the target platform. """ - console.print(f"Starting deployment to '{target}' based on project YAML files...", style="yellow") + console.print( + f"Starting deployment to '{target}' based on project YAML files...", + style="yellow", + ) # 1. Load the entire project state from YAML files - from intugle.parser.manifest import ManifestLoader from intugle.core import settings + from intugle.parser.manifest import ManifestLoader + manifest_loader = ManifestLoader(settings.PROJECT_BASE) manifest_loader.load() manifest = manifest_loader.manifest # 2. Find a suitable adapter from the loaded manifest adapter_to_use: "Adapter" = None - + # Dynamically get the adapter class from the factory from intugle.adapters.factory import AdapterFactory + factory = AdapterFactory() - + target_adapter_class = None for name, (checker, creator) in factory.dataframe_funcs.items(): if name == target.lower(): @@ -203,25 +224,34 @@ def deploy(self, target: str, **kwargs): break if not target_adapter_class: - raise ValueError(f"Deployment target '{target}' is not supported or its dependencies are not installed.") + raise ValueError( + f"Deployment target '{target}' is not supported or its dependencies are not installed." + ) # Find a source that matches the target type to instantiate the adapter for source in manifest.sources.values(): - if source.table.details and source.table.details.get("type") == target.lower(): + if ( + source.table.details + and source.table.details.get("type") == target.lower() + ): adapter_to_use = target_adapter_class() break - + if not adapter_to_use: raise RuntimeError( f"Cannot deploy to '{target}'. No '{target}' source found in the project YAML files " "to provide connection details." ) - # 4. 
Delegate the deployment to the adapter, passing full manifest try: adapter_to_use.deploy_semantic_model(manifest, **kwargs) - console.print(f"Successfully deployed semantic model to '{target}'.", style="bold green") + console.print( + f"Successfully deployed semantic model to '{target}'.", + style="bold green", + ) except Exception as e: - console.print(f"Failed to deploy semantic model to '{target}': {e}", style="bold red") + console.print( + f"Failed to deploy semantic model to '{target}': {e}", style="bold red" + ) raise diff --git a/src/intugle/utils/files.py b/src/intugle/utils/files.py new file mode 100644 index 0000000..beb0107 --- /dev/null +++ b/src/intugle/utils/files.py @@ -0,0 +1,28 @@ +import os + +from pathlib import Path + +from intugle.core import settings + + +def touch(path: str | Path) -> None: + """ + Updates the modified time of a file, creating it if it doesn't exist. + Similar to the 'touch' command in Unix. + """ + with open(path, 'a'): + os.utime(path, None) + + +def update_relationship_file_mtime() -> None: + """ + Updates the modified time of the relationships file. + """ + if not settings.RELATIONSHIPS_FILE: + return + + file_path = os.path.join(settings.PROJECT_BASE, settings.RELATIONSHIPS_FILE) + + # Check if the file exists before touching it + if os.path.exists(file_path): + touch(file_path) diff --git a/tests/mcp/test_docs_search.py b/tests/mcp/test_docs_search.py new file mode 100644 index 0000000..cb9953a --- /dev/null +++ b/tests/mcp/test_docs_search.py @@ -0,0 +1,81 @@ +import functools + +from enum import Enum + +import pytest +import pytest_asyncio + +from mcp import ClientSession +from mcp.client.streamable_http import streamablehttp_client + +URL = "http://127.0.0.1:8080/semantic_layer/mcp" + + +class Prompts(str, Enum): + INTUGLE_VIBE = "intugle-vibe" + + def __repr__(self) -> str: + return self.value + + +class Tools(str, Enum): + SEARCH_INTUGLE_DOCS = "search_intugle_docs" + + def __repr__(self) -> str: + return self.value + + +def connection_decorator(): + def decorator(func): + @functools.wraps(func) + async def wrapper(*args, **kwargs): + async with streamablehttp_client(URL) as (read, write, _): + async with ClientSession(read, write) as session: + await session.initialize() + return await func(*args, **kwargs, session=session) + + return wrapper + + return decorator + + +class MCPTools: + def __init__(self): + ... 
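+
+    # Each check below opens a fresh MCP client session through connection_decorator,
+    # so it can be awaited independently against the server running at URL.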
+ + @connection_decorator() + async def search_intugle_docs(self, session: ClientSession = None): + docs = await session.call_tool( + name=Tools.SEARCH_INTUGLE_DOCS, arguments={"paths": ["intro.md"]} + ) + + assert isinstance(docs.structuredContent["result"], str) + assert len(docs.structuredContent["result"]) > 0 + + @connection_decorator() + async def intugle_vibe_prompt(self, session: ClientSession = None): + prompt = await session.get_prompt(name=Prompts.INTUGLE_VIBE, arguments={}) + + prompt_text = prompt.messages[0].content.text + + assert isinstance(prompt_text, str) + assert len(prompt_text) > 0 + assert "About Intugle" in prompt_text + assert "Available Documentation Paths:" in prompt_text + assert ".md" in prompt_text + assert prompt.description + + +@pytest_asyncio.fixture +async def mcp_server_tools() -> MCPTools: + return MCPTools() + + +@pytest.mark.asyncio +async def test_mcp_search_intugle_docs(mcp_server_tools): + await mcp_server_tools.search_intugle_docs() + + +@pytest.mark.asyncio +async def test_mcp_intugle_vibe_prompt(mcp_server_tools): + await mcp_server_tools.intugle_vibe_prompt() \ No newline at end of file diff --git a/uv.lock b/uv.lock index fb41e9e..c864996 100644 --- a/uv.lock +++ b/uv.lock @@ -1668,9 +1668,10 @@ wheels = [ [[package]] name = "intugle" -version = "1.0.4" +version = "1.0.5" source = { editable = "." } dependencies = [ + { name = "aiohttp" }, { name = "asyncpg" }, { name = "duckdb" }, { name = "fastapi", extra = ["standard"] }, @@ -1730,6 +1731,7 @@ test = [ [package.metadata] requires-dist = [ + { name = "aiohttp", specifier = ">=3.9.5" }, { name = "asyncpg", specifier = ">=0.30.0" }, { name = "databricks-sql-connector", marker = "extra == 'databricks'", specifier = ">=4.1.3" }, { name = "duckdb", specifier = ">=1.3.2" },