diff --git a/README.md b/README.md index 18c195e..cb8efff 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,8 @@ For a detailed, hands-on introduction to the project, please see our quickstart | **Sports Media** | [`quickstart_sports_media.ipynb`](notebooks/quickstart_sports_media.ipynb) | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_sports_media.ipynb) | | **Databricks Unity Catalog [Health Care]** | [`quickstart_healthcare_databricks.ipynb`](notebooks/quickstart_healthcare_databricks.ipynb) | Databricks Notebook Only | | **Snowflake Horizon Catalog [ FMCG ]** | [`quickstart_fmcg_snowflake.ipynb`](notebooks/quickstart_fmcg_snowflake.ipynb) | Snowflake Notebook Only | -| **Native Snowflake with Cortex Analyst [ Tech Manufacturing ]** | [`quickstart_native_snowflake.ipynb`](notebooks/quickstart_native_snowflake.ipynb) | Snowflake Notebook Only | +| **Native Snowflake with Cortex Analyst [ Tech Manufacturing ]** | [`quickstart_native_snowflake.ipynb`](notebooks/quickstart_native_snowflake.ipynb) | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_native_snowflake.ipynb) | +| **Native Databricks with AI/BI Genie [ Tech Manufacturing ]** | [`quickstart_native_databricks.ipynb`](notebooks/quickstart_native_databricks.ipynb) | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_native_databricks.ipynb) | These datasets will take you through the following steps: diff --git a/docsite/docs/connectors/databricks.md b/docsite/docs/connectors/databricks.md new file mode 100644 index 0000000..b1e0b41 --- /dev/null +++ b/docsite/docs/connectors/databricks.md @@ -0,0 +1,104 @@ +--- +sidebar_position: 2 +--- + +# Databricks + +`intugle` integrates 
with Databricks, allowing you to read data from your tables and deploy your `SemanticModel` by setting constraints and comments directly in your Databricks account. + +## Installation + +To use `intugle` with Databricks, you must install the optional dependencies: + +```bash +pip install "intugle[databricks]" +``` + +This installs the `pyspark`, `sqlglot` and `databricks-sql-connector` libraries. + +## Configuration + +The Databricks adapter can connect using credentials from a `profiles.yml` file or automatically use an active session when running inside a Databricks notebook. + +### Connecting from an External Environment + +When running `intugle` outside of a Databricks notebook, you must provide full connection credentials in a `profiles.yml` file at the root of your project. The adapter looks for a top-level `databricks:` key. + +**Example `profiles.yml`:** + +```yaml +databricks: + host: + http_path: + token: + schema: + catalog: # Optional, for Unity Catalog +``` + +### Connecting from a Databricks Notebook + +When your code is executed within a Databricks Notebook, the adapter automatically detects and uses the notebook's active Spark session for execution. However, it still requires a `profiles.yml` file to determine the target `schema` and `catalog` for your operations. + +**Example `profiles.yml` for Notebooks:** + +```yaml +databricks: + schema: + catalog: # Optional, for Unity Catalog +``` + +## Usage + +### Reading Data from Databricks + +To include a Databricks table in your `SemanticModel`, define it in your input dictionary with `type: "databricks"` and use the `identifier` key to specify the table name. + +:::caution Important +The dictionary key for your dataset (e.g., `"CUSTOMERS"`) must exactly match the table name specified in the `identifier`. 
+::: + +```python +from intugle import SemanticModel + +datasets = { + "CUSTOMERS": { + "identifier": "CUSTOMERS", # Must match the key above + "type": "databricks" + }, + "ORDERS": { + "identifier": "ORDERS", # Must match the key above + "type": "databricks" + } +} + +# Initialize the semantic model +sm = SemanticModel(datasets, domain="E-commerce") + +# Build the model as usual +sm.build() +``` + +### Materializing Data Products + +When you use the `DataProduct` class with a Databricks connection, the resulting data product will be materialized as a new **view** directly within your target schema. + +### Deploying the Semantic Model + +Once your semantic model is built, you can deploy it to Databricks using the `deploy()` method. This process syncs your model's intelligence to your physical tables by: +1. **Syncing Metadata:** It updates the comments on your physical Databricks tables and columns with the business glossaries from your `intugle` model. You can also sync tags. +2. **Setting Constraints:** It sets `PRIMARY KEY` and `FOREIGN KEY` constraints on your tables based on the relationships discovered in the model. 
+ +```python +# Deploy the model to Databricks +sm.deploy(target="databricks") + +# You can also control which parts of the deployment to run +sm.deploy( + target="databricks", + sync_glossary=True, + sync_tags=True, + set_primary_keys=True, + set_foreign_keys=True +) +``` + diff --git a/docsite/docs/examples.md b/docsite/docs/examples.md index b359143..80ac6b7 100644 --- a/docsite/docs/examples.md +++ b/docsite/docs/examples.md @@ -15,7 +15,8 @@ For a detailed, hands-on introduction to the project, please see our quickstart | **Sports Media** | [`quickstart_sports_media.ipynb`](https://github.com/Intugle/data-tools/blob/main/notebooks/quickstart_sports_media.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_sports_media.ipynb) | | **Databricks Unity Catalog [Health Care]** | [`quickstart_healthcare_databricks.ipynb`](https://github.com/Intugle/data-tools/blob/main/notebooks/quickstart_healthcare_databricks.ipynb) | Databricks Notebook Only | | **Snowflake Horizon Catalog [ FMCG ]** | [`quickstart_fmcg_snowflake.ipynb`](https://github.com/Intugle/data-tools/blob/main/notebooks/quickstart_fmcg_snowflake.ipynb) | Snowflake Notebook Only | -| **Native Snowflake with Cortex Analyst [ Tech Manufacturing ]** | [`quickstart_native_snowflake.ipynb`](https://github.com/Intugle/data-tools/blob/main/notebooks/quickstart_native_snowflake.ipynb) | Snowflake Notebook Only | +| **Native Snowflake with Cortex Analyst [ Tech Manufacturing ]** | [`quickstart_native_snowflake.ipynb`](https://github.com/Intugle/data-tools/blob/main/notebooks/quickstart_native_snowflake.ipynb) | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_native_snowflake.ipynb) | +| **Native Databricks with AI/BI Genie [ Tech Manufacturing ]** | 
[`quickstart_native_databricks.ipynb`](https://github.com/Intugle/data-tools/blob/main/notebooks/quickstart_native_databricks.ipynb) | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_native_databricks.ipynb) | These datasets will take you through the following steps: diff --git a/notebooks/quickstart_native_databricks.ipynb b/notebooks/quickstart_native_databricks.ipynb new file mode 100644 index 0000000..164bb50 --- /dev/null +++ b/notebooks/quickstart_native_databricks.ipynb @@ -0,0 +1,2332 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "51afdb98-f483-427c-bf3d-99e67555384f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "collapsed": false, + "name": "cell1" + }, + "source": [ + "# Quickstart: Building a GenAI powered Semantic Data Model with Intugle natively in Databricks\n", + "\n", + "This quickstart shows you how to use Intugle’s open-source library to transform fragmented datasets into a connected semantic model. The semantic model captures profiling, relationships, and business meaning of your data — making it instantly usable for exploration, search, and data product creation.\n", + "\n", + "**What is a Semantic Model?**\n", + "\n", + "A semantic model is an intelligent knowledge graph of your data. It connects tables, discovers relationships, and enriches them with business glossaries — so both data teams and business users can query with clarity, not complexity.\n", + "\n", + "**Who is this for?**\n", + "\n", + "* **Data Engineers & Architects** often spend weeks manually profiling, classifying, and stitching together fragmented data assets. 
With Intugle, they can automate this process end-to-end, uncovering meaningful links and relationships to instantly generate a connected semantic layer.\n", + "* **Data Analysts & Scientists** spend endless hours on data readiness and preparation before they can even start the real analysis. Intugle accelerates this by providing contextual intelligence, automatically generating SQL and reusable data products enriched with relationships and business meaning.\n", + "* **Business Analysts & Decision Makers** are slowed down by constant dependence on technical teams for answers. Intugle removes this bottleneck by enabling natural language queries and semantic search, giving them trusted insights on demand.\n", + "\n", + "**In this notebook, you will learn how to:**\n", + "\n", + "* **Generate Semantic Model** → The unified layer that transforms fragmented datasets, creating the foundation for connected intelligence.\n", + " * **1.1 Profile and classify data** → Analyze your data sources to understand their structure, data types, and other characteristics.\n", + " * **1.2 Discover links & relationships among data** → Reveal meaningful connections (PK & FK) across fragmented tables.\n", + " * **1.3 Generate a business glossary** → Create business-friendly terms and use them to query data with context.\n", + " * **1.4 Enable Semantic search** → Intelligent search that understands meaning, not just keywords—making data more accessible across both technical and business users.\n", + " * **1.5 Visualize semantic model** → Get access to enriched metadata of the semantic model and visualize your data and relationships.\n", + "* **Build Unified Data Products** → Simply pick the attributes across your data tables, and let the toolkit auto-generate queries with all the required joins, transformations, and aggregations using the semantic layer. 
When executed, these queries produce reusable data products.\n", + "* Sync the semantic model and data products to Databricks Unity Catalog\n", + "* Converse with your data using Databricks Genie\n", + "\n", + "Before you start, make sure you install the **Intugle Data Tools** in your environment: `pip install intugle[databricks]`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "df4da28c-dbaf-4440-84a0-95cbfd4683f3", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "collapsed": false, + "name": "cell3" + }, + "source": [ + "## 1. LLM Configuration\n", + "\n", + "Before running the project, you need to configure a Large Language Model (LLM). This is used for tasks like generating business glossaries and predicting links between tables. For the semantic search feature, you will also need to set up Qdrant and provide an OpenAI API key. For detailed setup instructions, please refer to the [README.md](README.md) file.\n", + "\n", + "You can configure the necessary services by setting the following environment variables:\n", + "\n", + "* `LLM_PROVIDER`: The LLM provider and model to use (e.g., `openai:gpt-3.5-turbo`). The format follows langchain's format for initializing chat models. Check out how to specify your model [here](https://python.langchain.com/docs/integrations/chat/)\n", + "* `API_KEY`: Your API key for the LLM provider. 
The exact name of the variable may vary from provider to provider (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`).\n", + "\n", + "Here's an example of how to set these variables in your environment:\n", + "\n", + "```bash\n", + "export LLM_PROVIDER=\"openai:gpt-3.5-turbo\"\n", + "export OPENAI_API_KEY=\"your-openai-api-key\"\n", + "```\n", + "Alternatively, you can set them in the notebook like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c60bc6a8-2656-4393-a92d-f0a0a0122a79", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell4" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "\n", + "\n", + "os.environ[\"LLM_PROVIDER\"] = \"openai:gpt-3.5-turbo\"\n", + "os.environ[\"OPENAI_API_KEY\"] = \"your-openai-api-key\" # Replace with your actual key\n", + "\n", + "\n", + "# Load environment variables from .env file\n", + "load_dotenv(override=True)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb8538ea-a5d8-4501-a13a-6bff29d7d5aa", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell5" + }, + "source": [ + "> Currently the langchain packages for OpenAI, Anthropic and Gemini are installed by default. For additional models, make sure you have the integration packages installed. E.g. you should have langchain-deepseek installed to use a DeepSeek model. 
You can get these packages here: [LangChain Chat Models](https://python.langchain.com/docs/integrations/chat/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "db5c1e88-f1b4-4f50-a584-6a9a0584fa2a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell6" + }, + "source": [ + "## 2. Building the Semantic Model\n", + "\n", + "The `SemanticModel` is the entry point for building your semantic layer. It takes a dictionary of datasets as input and performs the following steps:\n", + "\n", + "1. **Data Profiling:** Calculates statistics for each column, such as distinct count, uniqueness, and completeness.\n", + "2. **Datatype Identification:** Identifies the data type of each column (e.g., integer, string, datetime).\n", + "3. **Key Identification:** Identifies potential primary keys.\n", + "4. **Glossary Generation:** Generates a business glossary for each column using an LLM.\n", + "5. **Link Prediction:** Predicts the relationships (foreign keys) between tables.\n", + "\n", + "Let's start by defining the datasets we want to use. Each dataset below is referenced by its Databricks table name through the `identifier` key." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cfe62d69-97cc-420a-8405-6560a6384407", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "collapsed": false, + "name": "cell37" + }, + "source": [ + "> For this demo, we will be using the technology manufacturing dataset which can be found under [sample_data/tech_manufacturing](https://github.com/Intugle/data-tools/blob/main/notebooks/quickstart_native_snowflake.ipynb) in the repo" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b563f3bc-bc34-4d09-85b7-ebe94f2b3a95", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell7" + }, + "outputs": [], + "source": [ + "def generate_config(table_name: str) -> dict:\n", + " \"\"\"Build the Databricks dataset config (identifier and type) for a table.\"\"\"\n", + " return {\n", + " \"identifier\": table_name,\n", + " \"type\": \"databricks\"\n", + " }\n", + "\n", + "\n", + "table_names = \\\n", + "[\n", + " \"campaigns\",\n", + " \"campaign_survey\",\n", + " \"customer_hierarchy\",\n", + " \"customers\",\n", + " \"delivery_survey\",\n", + " \"expense\",\n", + " \"install_base\",\n", + " \"inventory\",\n", + " \"logistics\",\n", + " \"nps_survey\",\n", + " \"opportunity\",\n", + " \"orders\",\n", + " \"prob_statement_issue\",\n", + " \"product_feature\",\n", + " \"product_hierarchy\",\n", + " \"products\",\n", + " \"renewals\",\n", + " \"returns\",\n", + " \"service_requests\",\n", + " \"website\",\n", + "]\n", + "\n", + "\n", + "datasets = {table: generate_config(table) for table in table_names}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { 
+ "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cee3f9d4-9cb1-412f-8cc2-5f0f9c8fd65a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell8" + }, + "source": [ + "Now, let's use the `SemanticModel` to build our semantic layer:\n", + "\n", + "> The `domain` parameter helps the LLM generate a more contextual business glossary. It specifies the industry domain that the dataset belongs to (e.g., \"Healthcare\", \"Finance\", \"E-commerce\")." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fdc5c410-f3a7-4c12-83c6-ca2fa71f1446", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing YAML for 'campaigns'. Checking for staleness.\ncampaigns loaded\nFound existing YAML for 'campaign_survey'. Checking for staleness.\ncampaign_survey loaded\nFound existing YAML for 'customer_hierarchy'. Checking for staleness.\ncustomer_hierarchy loaded\nFound existing YAML for 'customers'. Checking for staleness.\ncustomers loaded\nFound existing YAML for 'delivery_survey'. Checking for staleness.\ndelivery_survey loaded\nFound existing YAML for 'expense'. Checking for staleness.\nexpense loaded\nFound existing YAML for 'install_base'. Checking for staleness.\ninstall_base loaded\nFound existing YAML for 'inventory'. Checking for staleness.\ninventory loaded\nFound existing YAML for 'logistics'. Checking for staleness.\nlogistics loaded\nFound existing YAML for 'nps_survey'. Checking for staleness.\nnps_survey loaded\nFound existing YAML for 'opportunity'. Checking for staleness.\nopportunity loaded\nFound existing YAML for 'orders'. 
Checking for staleness.\norders loaded\nFound existing YAML for 'prob_statement_issue'. Checking for staleness.\nprob_statement_issue loaded\nFound existing YAML for 'product_feature'. Checking for staleness.\nproduct_feature loaded\nFound existing YAML for 'product_hierarchy'. Checking for staleness.\nproduct_hierarchy loaded\nFound existing YAML for 'products'. Checking for staleness.\nproducts loaded\nFound existing YAML for 'renewals'. Checking for staleness.\nrenewals loaded\nFound existing YAML for 'returns'. Checking for staleness.\nreturns loaded\nFound existing YAML for 'service_requests'. Checking for staleness.\nservice_requests loaded\nFound existing YAML for 'website'. Checking for staleness.\nwebsite loaded\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Starting profiling and key identification stage...\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[33mStarting profiling and key identification stage\u001B[0m\u001B[33m...\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset 'campaigns' already profiled. Skipping.\nDataset 'campaign_survey' already profiled. Skipping.\nDataset 'customer_hierarchy' already profiled. Skipping.\nDataset 'customers' already profiled. Skipping.\nDataset 'delivery_survey' already profiled. Skipping.\nDataset 'expense' already profiled. Skipping.\nDataset 'install_base' already profiled. Skipping.\nDataset 'inventory' already profiled. Skipping.\nDataset 'logistics' already profiled. Skipping.\nDataset 'nps_survey' already profiled. Skipping.\nDataset 'opportunity' already profiled. Skipping.\nDataset 'orders' already profiled. Skipping.\nDataset 'prob_statement_issue' already profiled. Skipping.\nDataset 'product_feature' already profiled. Skipping.\nDataset 'product_hierarchy' already profiled. Skipping.\nDataset 'products' already profiled. Skipping.\nDataset 'renewals' already profiled. Skipping.\nDataset 'returns' already profiled. Skipping.\nDataset 'service_requests' already profiled. Skipping.\nDataset 'website' already profiled. Skipping.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Profiling and key identification complete.\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[1;32mProfiling and key identification complete.\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Starting link prediction stage...\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[33mStarting link prediction stage\u001B[0m\u001B[33m...\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset 'campaigns' already processed. Skipping analysis.\nDataset 'campaign_survey' already processed. Skipping analysis.\nDataset 'customer_hierarchy' already processed. Skipping analysis.\nDataset 'customers' already processed. Skipping analysis.\nDataset 'delivery_survey' already processed. Skipping analysis.\nDataset 'expense' already processed. Skipping analysis.\nDataset 'install_base' already processed. Skipping analysis.\nDataset 'inventory' already processed. Skipping analysis.\nDataset 'logistics' already processed. Skipping analysis.\nDataset 'nps_survey' already processed. Skipping analysis.\nDataset 'opportunity' already processed. Skipping analysis.\nDataset 'orders' already processed. Skipping analysis.\nDataset 'prob_statement_issue' already processed. Skipping analysis.\nDataset 'product_feature' already processed. Skipping analysis.\nDataset 'product_hierarchy' already processed. Skipping analysis.\nDataset 'products' already processed. Skipping analysis.\nDataset 'renewals' already processed. Skipping analysis.\nDataset 'returns' already processed. Skipping analysis.\nDataset 'service_requests' already processed. Skipping analysis.\nDataset 'website' already processed. Skipping analysis.\nLinkPredictor initialized with datasets: ['campaigns', 'campaign_survey', 'customer_hierarchy', 'customers', 'delivery_survey', 'expense', 'install_base', 'inventory', 'logistics', 'nps_survey', 'opportunity', 'orders', 'prob_statement_issue', 'product_feature', 'product_hierarchy', 'products', 'renewals', 'returns', 'service_requests', 'website']\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Link predictions are up-to-date. Loading from cache.\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[32mLink predictions are up-to-date. Loading from cache.\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Link prediction complete.\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[1;32mLink prediction complete.\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Starting business glossary generation stage...\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[33mStarting business glossary generation stage\u001B[0m\u001B[33m...\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'campaigns' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'campaigns'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'campaign_survey' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'campaign_survey'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'customer_hierarchy' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'customer_hierarchy'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'customers' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'customers'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'delivery_survey' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'delivery_survey'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'expense' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'expense'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'install_base' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'install_base'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'inventory' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'inventory'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'logistics' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'logistics'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'nps_survey' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'nps_survey'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'opportunity' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'opportunity'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'orders' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'orders'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'prob_statement_issue' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'prob_statement_issue'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'product_feature' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'product_feature'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'product_hierarchy' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'product_hierarchy'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'products' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'products'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'renewals' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'renewals'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'returns' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'returns'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'service_requests' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'service_requests'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Glossary for 'website' already exists. Skipping.\n",
+       "
\n" + ], + "text/plain": [ + "Glossary for \u001B[32m'website'\u001B[0m already exists. Skipping.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Business glossary generation complete.\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[1;32mBusiness glossary generation complete.\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Initializing semantic search...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not initialize semantic search: Error code: 401 - {'error': {'message': 'Incorrect API key provided: your-ope*******-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\nSemantic search initialization failed during build: Error code: 401 - {'error': {'message': 'Incorrect API key provided: your-ope*******-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from intugle import SemanticModel\n", + "\n", + "# Initialize the semantic model\n", + "sm = SemanticModel(datasets, domain=\"Technology Manufacturing Company\")\n", + "\n", + "# Run the prediction\n", + "sm.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c0547583-6cc3-4a36-9d64-a7e178de19be", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell10" + }, + "source": [ + "## 3. 
Accessing Enriched Metadata\n", + "\n", + "Now that the semantic model is built, you can easily access the enriched metadata for each dataset.\n", + "\n", + "### Accessing a Dataset\n", + "\n", + "You can access a specific dataset by its name from the `sm.datasets` dictionary:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c7fdfeab-a67b-4032-beae-c43c8a61fffa", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell11" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column_nametable_namebusiness_namedatatype_l1datatype_l2business_glossarybusiness_tagscountnull_countdistinct_countuniquenesscompletenesssample_data
0Product IDproductsproduct_idalphanumericdimensionServes as a unique identifier for individual i...[Product Identification, Inventory Tracking, M...1000010001.0001.0[PROD-11196, PROD-11735, PROD-11723, PROD-1140...
1Product Nameproductsproduct_nameclose_ended_textdimensionIdentifies the name used to distinguish a spec...[Product Portfolio, Technology Solutions, Bran...1000050.0051.0[CoreAnalytics, CloudEdge, NetConnect, Insight...
2Product Categoryproductsproduct_categoryclose_ended_textdimensionGroups products into predefined classification...[Technology Solutions, Business Segmentation, ...1000050.0051.0[Networking, Cloud Platform, IoT, Security, An...
3Product Statusproductsproduct_statusclose_ended_textdimensionIndicates the current lifecycle phase of a pro...[Lifecycle Management, Product Development Sta...1000050.0051.0[In Development, GA (General Availability), Ac...
4R&D Initiation Dateproductsr_d_initiation_datedate & timedimensionMarks the date when the research and developme...[Research And Development Timeline, Product De...100007770.7771.0[2022-06-05, 2025-01-08, 2021-07-07, 2023-02-0...
5R&D Stageproductsr_d_stageclose_ended_textdimensionIndicates the current phase of development and...[Product Development Lifecycle, Innovation Tra...1000050.0051.0[Concept, Pre-Launch, Enhancement, Testing, Pr...
6Launch Statusproductslaunch_statusclose_ended_textdimensionIndicates the current phase or outcome of a pr...[Product Lifecycle, Launch Planning, Market Re...1000050.0051.0[Not Launched, Planned, Launched, Cancelled, I...
7Launch Dateproductslaunch_datedate & timedimensionIndicates the scheduled date and time when a p...[Product Launch Timeline, Go-To-Market Strateg...100006590.6591.0[2025-07-14, 2025-07-03, 2025-03-15, 2024-07-2...
8PM Nameproductspm_nameclose_ended_textdimensionIdentifies the individual responsible for mana...[Product Manager, Responsible Party, Ownership]100009940.9941.0[Lucas Marsh, Jillian Brady, Theresa Hall, Bri...
9Product Cost ($)productsproduct_costintegermeasureMonetary value associated with the production ...[Product Manufacturing Cost, Cost Analysis, Fi...100009980.9981.0[189597, 576010, 217526, 391686, 218954, 26701...
\n", + "
" + ], + "text/plain": [ + " column_name ... sample_data\n", + "0 Product ID ... [PROD-11196, PROD-11735, PROD-11723, PROD-1140...\n", + "1 Product Name ... [CoreAnalytics, CloudEdge, NetConnect, Insight...\n", + "2 Product Category ... [Networking, Cloud Platform, IoT, Security, An...\n", + "3 Product Status ... [In Development, GA (General Availability), Ac...\n", + "4 R&D Initiation Date ... [2022-06-05, 2025-01-08, 2021-07-07, 2023-02-0...\n", + "5 R&D Stage ... [Concept, Pre-Launch, Enhancement, Testing, Pr...\n", + "6 Launch Status ... [Not Launched, Planned, Launched, Cancelled, I...\n", + "7 Launch Date ... [2025-07-14, 2025-07-03, 2025-03-15, 2024-07-2...\n", + "8 PM Name ... [Lucas Marsh, Jillian Brady, Theresa Hall, Bri...\n", + "9 Product Cost ($) ... [189597, 576010, 217526, 391686, 218954, 26701...\n", + "\n", + "[10 rows x 13 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "products_dataset = sm.datasets['products']\n", + "products_dataset.profiling_df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5ccdcad7-775f-4450-98c5-b0edf9290669", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell12" + }, + "source": [ + "The profiling results can be accessed through the `profiling_df` property of the `DataSet` object. It's a pandas DataFrame that you can easily explore. 
\n", + "> The business glossary is also available in the `profiling_df`:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c1fd43dd-8b0a-4ccd-b060-39529680c265", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell13" + }, + "source": [ + "### Visualizing Relationships\n", + "\n", + "The `SemanticModel` automatically discovers the relationships between your tables. You can access the predicted links as a list of `PredictedLink` objects:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a7171e09-db28-42cb-b110-58f3a2935590", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell14" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[PredictedLink(from_dataset='campaigns', from_column='Campaign ID', to_dataset='campaign_survey', to_column='Camp ID', intersect_count=635, intersect_ratio_from_col=0.634, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='campaigns', from_column='Prospect ID', to_dataset='customers', to_column='C_ID', intersect_count=622, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.622, accuracy=1.0),\n", + " PredictedLink(from_dataset='campaigns', from_column='Prod_ID', to_dataset='products', to_column='Product ID', intersect_count=633, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.633, accuracy=1.0),\n", + " PredictedLink(from_dataset='campaigns', from_column='Campaign ID', to_dataset='website', to_column='Cmgn ID', intersect_count=618, intersect_ratio_from_col=0.618, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " 
PredictedLink(from_dataset='customer_hierarchy', from_column='Party ID', to_dataset='customers', to_column='C_ID', intersect_count=616, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.616, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='delivery_survey', to_column='Customer ID', intersect_count=627, intersect_ratio_from_col=0.627, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='expense', to_column='Contact ID', intersect_count=646, intersect_ratio_from_col=0.646, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='install_base', to_column='Party ID', intersect_count=638, intersect_ratio_from_col=0.638, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='logistics', to_column='Cust_ID', intersect_count=612, intersect_ratio_from_col=0.612, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='nps_survey', to_column='Customer_ID', intersect_count=615, intersect_ratio_from_col=0.615, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='opportunity', to_column='Cust ID', intersect_count=636, intersect_ratio_from_col=0.636, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='orders', to_column='Party ID', intersect_count=994, intersect_ratio_from_col=0.994, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='renewals', to_column='Cus ID', intersect_count=637, intersect_ratio_from_col=0.637, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='returns', 
to_column='Customer ID', intersect_count=652, intersect_ratio_from_col=0.652, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='customers', from_column='C_ID', to_dataset='service_requests', to_column='C ID', intersect_count=993, intersect_ratio_from_col=0.993, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='delivery_survey', from_column='Odr ID', to_dataset='orders', to_column='Order ID', intersect_count=893, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.179, accuracy=1.0),\n", + " PredictedLink(from_dataset='delivery_survey', from_column='PD ID', to_dataset='products', to_column='Product ID', intersect_count=640, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.64, accuracy=1.0),\n", + " PredictedLink(from_dataset='expense', from_column='Invoice Number', to_dataset='inventory', to_column='Inventory ID', intersect_count=1000, intersect_ratio_from_col=1.0, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='install_base', from_column='Order ID', to_dataset='orders', to_column='Order ID', intersect_count=1000, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.2, accuracy=1.0),\n", + " PredictedLink(from_dataset='install_base', from_column='Product ID', to_dataset='products', to_column='Product ID', intersect_count=630, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.63, accuracy=1.0),\n", + " PredictedLink(from_dataset='inventory', from_column='Prod_ID', to_dataset='products', to_column='Product ID', intersect_count=621, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.621, accuracy=1.0),\n", + " PredictedLink(from_dataset='logistics', from_column='Od_Contract_ID', to_dataset='orders', to_column='Order ID', intersect_count=890, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.178, accuracy=1.0),\n", + " PredictedLink(from_dataset='nps_survey', from_column='Return_ID', to_dataset='returns', to_column='Return_ID', intersect_count=627, 
intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.627, accuracy=1.0),\n", + " PredictedLink(from_dataset='nps_survey', from_column='Incident ID', to_dataset='service_requests', to_column='SR ID', intersect_count=911, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.182, accuracy=1.0),\n", + " PredictedLink(from_dataset='orders', from_column='Product ID', to_dataset='products', to_column='Product ID', intersect_count=994, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.994, accuracy=1.0),\n", + " PredictedLink(from_dataset='orders', from_column='Order ID', to_dataset='renewals', to_column='Od ID', intersect_count=906, intersect_ratio_from_col=0.181, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='orders', from_column='Order ID', to_dataset='returns', to_column='R Order ID', intersect_count=925, intersect_ratio_from_col=0.185, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='orders', from_column='Order ID', to_dataset='service_requests', to_column='Sales Ord ID', intersect_count=3223, intersect_ratio_from_col=0.645, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='prob_statement_issue', from_column='Pd_ID', to_dataset='products', to_column='Product ID', intersect_count=628, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.628, accuracy=1.0),\n", + " PredictedLink(from_dataset='prob_statement_issue', from_column='SR ID', to_dataset='service_requests', to_column='SR ID', intersect_count=890, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.178, accuracy=1.0),\n", + " PredictedLink(from_dataset='product_feature', from_column='P_ID', to_dataset='products', to_column='Product ID', intersect_count=644, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.644, accuracy=1.0),\n", + " PredictedLink(from_dataset='product_hierarchy', from_column='Product ID', to_dataset='products', to_column='Product ID', intersect_count=631, intersect_ratio_from_col=1.0, 
intersect_ratio_to_col=0.631, accuracy=1.0),\n", + " PredictedLink(from_dataset='products', from_column='Product ID', to_dataset='renewals', to_column='Pr ID', intersect_count=646, intersect_ratio_from_col=0.646, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='products', from_column='Product ID', to_dataset='returns', to_column='Product ID', intersect_count=642, intersect_ratio_from_col=0.642, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='products', from_column='Product ID', to_dataset='service_requests', to_column='Prod ID', intersect_count=993, intersect_ratio_from_col=0.993, intersect_ratio_to_col=1.0, accuracy=1.0),\n", + " PredictedLink(from_dataset='returns', from_column='SR ID', to_dataset='service_requests', to_column='SR ID', intersect_count=1000, intersect_ratio_from_col=1.0, intersect_ratio_to_col=0.2, accuracy=1.0)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sm.links" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2033c057-a741-4fa2-8fc0-ef1ffd2085f2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell15" + }, + "source": [ + "You can also visualize these relationships as a graph. 
In case you run into an error, make sure you install/upgrade your ipykernel package:\n", + "> %pip install --upgrade ipykernel" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "81aa7147-5d92-48f1-a908-2816920d886b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell16" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sm.visualize() # To visualize the relationships as a graph" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e957ab4c-80e7-4542-a0ef-5f2c070636f0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell17" + }, + "source": [ + "## 4. The Semantic Layer\n", + "\n", + "The SemanticModel results are used to generate YAML files which are saved automatically. These files define the semantic layer, including the models (tables) and their relationships. \n", + "\n", + "By default, these files are saved in the current working directory. You can configure this path by setting the `PROJECT_BASE` environment variable." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4eb1f6de-9b2e-45e8-89d6-1351a713bcac", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "collapsed": false, + "name": "cell19" + }, + "source": [ + "## 5. Deploying to Databricks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "93149df1-dc2d-4947-aba4-20517b702a1d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "collapsed": false, + "name": "cell29" + }, + "source": [ + "Syncs the business glossaries, tags, primary keys and relationships with the source tables. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "61ab3d8e-ffd8-4b68-8716-4ffb7906585c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell21" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Starting deployment to 'databricks' based on project YAML files...\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[33mStarting deployment to \u001B[0m\u001B[32m'databricks'\u001B[0m\u001B[33m based on project YAML files\u001B[0m\u001B[33m...\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting primary key constraints...\nSet primary key on `intugle`.`tech_manufacturing`.`product_hierarchy` (`SKU ID`)\nSkipping primary key for table 'product_feature' due to missing or invalid key.\nSet primary key on `intugle`.`tech_manufacturing`.`website` (`Customer Session ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`prob_statement_issue` (`Issue ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`customer_hierarchy` (`Account Manager`)\nSet primary key on `intugle`.`tech_manufacturing`.`campaigns` (`Campaign ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`products` (`Product ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`service_requests` (`SR ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`logistics` (`Logistics ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`customers` (`C_ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`expense` (`Expense ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`install_base` (`Order ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`campaign_survey` (`Participant Name`)\nSet primary key on `intugle`.`tech_manufacturing`.`nps_survey` (`NPS_ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`orders` (`Order ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`opportunity` (`Opportunity ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`renewals` (`Renw ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`returns` (`Return_ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`delivery_survey` (`Odr ID`)\nSet primary key on `intugle`.`tech_manufacturing`.`inventory` (`Inventory ID`)\nPrimary key setting 
complete.\nSetting foreign key constraints...\nCould not set foreign key for relationship install_base_orders: Failed to create foreign key constraint `fk_install_base_orders`: table `intugle.tech_manufacturing.orders` already has a foreign key constraint: `fk_delivery_survey_orders`, that has the same set of child columns.\n\nJVM stacktrace:\norg.apache.spark.sql.catalyst.analysis.ConstraintAlreadyExistsException\n\tat com.databricks.managedcatalog.ManagedCatalogClientImpl.$anonfun$createTableConstraint$1(ManagedCatalogClientImpl.scala:4008)\n\tat com.databricks.managedcatalog.ManagedCatalogClientImpl.$anonfun$recordAndWrapExceptionBase$2(ManagedCatalogClientImpl.scala:7505)\n\tat com.databricks.spark.util.FrameProfiler$.$anonfun$record$1(FrameProfiler.scala:114)\n\tat com.databricks.spark.util.FrameProfilerExporter$.maybeExportFrameProfiler(FrameProfilerExporter.scala:200)\n\tat com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:105)\n\tat com.databricks.managedcatalog.ManagedCatalogClientImpl.$anonfun$recordAndWrapExceptionBase$1(ManagedCatalogClientImpl.scala:7504)\n\tat com.databricks.managedcatalog.ErrorDetailsHandler.wrapServiceException(ErrorDetailsHandler.scala:74)\n\tat com.databricks.managedcatalog.ErrorDetailsHandler.wrapServiceException$(ErrorDetailsHandler.scala:66)\n\tat com.databricks.managedcatalog.ManagedCatalogClientImpl.wrapServiceException(ManagedCatalogClientImpl.scala:268)\n\tat com.databricks.managedcatalog.ManagedCatalogClientImpl.recordAndWrapExceptionBase(ManagedCatalogClientImpl.scala:7485)\n\tat com.databricks.managedcatalog.ManagedCatalogClientImpl.recordAndWrapException(ManagedCatalogClientImpl.scala:7471)\n\tat com.databricks.managedcatalog.ManagedCatalogClientImpl.createTableConstraint(ManagedCatalogClientImpl.scala:3913)\n\tat com.databricks.sql.managedcatalog.ManagedCatalogCommon.$anonfun$addTableConstraint$1(ManagedCatalogCommon.scala:2084)\n\tat 
scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)\n\tat com.databricks.sql.managedcatalog.ManagedCatalogCommon.withTableCacheInvalidated(ManagedCatalogCommon.scala:2336)\n\tat com.databricks.sql.managedcatalog.ManagedCatalogCommon.addTableConstraint(ManagedCatalogCommon.scala:2076)\n\tat com.databricks.sql.managedcatalog.ProfiledManagedCatalog.$anonfun$addTableConstraint$1(ProfiledManagedCatalog.scala:356)\n\tat scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)\n\tat org.apache.spark.sql.catalyst.MetricKeyUtils$.measure(MetricKey.scala:1892)\n\tat com.databricks.sql.managedcatalog.ProfiledManagedCatalog.$anonfun$profile$1(ProfiledManagedCatalog.scala:64)\n\tat com.databricks.spark.util.FrameProfiler$.$anonfun$record$1(FrameProfiler.scala:114)\n\tat com.databricks.spark.util.FrameProfilerExporter$.maybeExportFrameProfiler(FrameProfilerExporter.scala:200)\n\tat com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:105)\n\tat com.databricks.sql.managedcatalog.ProfiledManagedCatalog.profile(ProfiledManagedCatalog.scala:63)\n\tat com.databricks.sql.managedcatalog.ProfiledManagedCatalog.addTableConstraint(ProfiledManagedCatalog.scala:356)\n\tat com.databricks.sql.managedcatalog.ManagedCatalogSessionCatalog.addTableConstraint(ManagedCatalogSessionCatalog.scala:1551)\n\tat com.databricks.sql.transaction.tahoe.commands.AlterTableAddTableConstraintDeltaCommand.$anonfun$run$66(alterDeltaTableCommands.scala:1924)\n\tat com.databricks.sql.transaction.tahoe.commands.AlterTableAddTableConstraintDeltaCommand.$anonfun$run$66$adapted(alterDeltaTableCommands.scala:1923)\n\tat scala.Option.foreach(Option.scala:437)\n\tat com.databricks.sql.transaction.tahoe.commands.AlterTableAddTableConstraintDeltaCommand.run(alterDeltaTableCommands.scala:1923)\n\tat com.databricks.sql.transaction.tahoe.catalog.DeltaCatalog.$anonfun$alterTable$44(DeltaCatalog.scala:2083)\n\tat 
scala.collection.IterableOnceOps.foreach(IterableOnce.scala:619)\n\tat scala.collection.IterableOnceOps.foreach$(IterableOnce.scala:617)\n\tat scala.collection.AbstractIterable.foreach(Iterable.scala:935)\n\tat com.databricks.sql.transaction.tahoe.catalog.DeltaCatalog.$anonfun$alterTable$22(DeltaCatalog.scala:2079)\n\tat scala.collection.immutable.HashMap.foreach(HashMap.scala:1115)\n\tat com.databricks.sql.transaction.tahoe.catalog.DeltaCatalog.$anonfun$alterTable$8(DeltaCatalog.scala:1853)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:59)\n\tat com.databricks.sql.transaction.tahoe.redirect.RedirectFeature$.withUpdateTableRedirectDDL(TableRedirect.scala:796)\n\tat com.databricks.sql.transaction.tahoe.catalog.DeltaCatalog.$anonfun$alterTable$1(DeltaCatalog.scala:1762)\n\tat com.databricks.spark.util.FrameProfiler$.$anonfun$record$1(FrameProfiler.scala:114)\n\tat com.databricks.spark.util.FrameProfilerExporter$.maybeExportFrameProfiler(FrameProfilerExporter.scala:200)\n\tat com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:105)\n\tat com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:420)\n\tat com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:418)\n\tat com.databricks.sql.transaction.tahoe.catalog.DeltaCatalog.recordFrameProfile(DeltaCatalog.scala:144)\n\tat com.databricks.sql.transaction.tahoe.catalog.DeltaCatalog.alterTable(DeltaCatalog.scala:1741)\n\tat com.databricks.sql.transaction.tahoe.catalog.DeltaCatalog.alterTable(DeltaCatalog.scala:144)\n\tat com.databricks.sql.managedcatalog.UnityCatalogV2Proxy.alterTable(UnityCatalogV2Proxy.scala:261)\n\tat com.databricks.sql.managedcatalog.UnityCatalogV2Proxy.alterTable(UnityCatalogV2Proxy.scala:57)\n\tat org.apache.spark.sql.execution.datasources.v2.AlterTableExec.run(AlterTableExec.scala:38)\n\tat 
org.apache.spark.sql.execution.datasources.v2.V2CommandExec.$anonfun$result$2(V2CommandExec.scala:48)\n\tat org.apache.spark.sql.execution.SparkPlan.runCommandInAetherOrSpark(SparkPlan.scala:195)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.$anonfun$result$1(V2CommandExec.scala:48)\n\tat com.databricks.spark.util.FrameProfiler$.$anonfun$record$1(FrameProfiler.scala:114)\n\tat com.databricks.spark.util.FrameProfilerExporter$.maybeExportFrameProfiler(FrameProfilerExporter.scala:200)\n\tat com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:105)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:47)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:45)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:56)\n\tat org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$5(QueryExecution.scala:507)\n\tat com.databricks.util.LexicalThreadLocal$Handle.runWith(LexicalThreadLocal.scala:63)\n\tat org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$4(QueryExecution.scala:507)\n\tat org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:267)\n\tat org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$3(QueryExecution.scala:506)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$14(SQLExecution.scala:561)\n\tat com.databricks.sql.util.MemoryTrackerHelper.withMemoryTracking(MemoryTrackerHelper.scala:111)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$13(SQLExecution.scala:475)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSessionTagsApplied(SQLExecution.scala:859)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$12(SQLExecution.scala:403)\n\tat 
org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:97)\n\tat org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:121)\n\tat org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:115)\n\tat org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:120)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$11(SQLExecution.scala:403)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:888)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:402)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:860)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:238)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:812)\n\tat org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$2(QueryExecution.scala:502)\n\tat org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:1449)\n\tat org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$1(QueryExecution.scala:498)\n\tat org.apache.spark.sql.execution.QueryExecution.withMVTagsIfNecessary(QueryExecution.scala:418)\n\tat org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$eagerlyExecute$1(QueryExecution.scala:496)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$$nestedInanonfun$eagerlyExecuteCommands$8$1.applyOrElse(QueryExecution.scala:578)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$$nestedInanonfun$eagerlyExecuteCommands$8$1.applyOrElse(QueryExecution.scala:570)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:529)\n\tat 
org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:121)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:529)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:42)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:361)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:357)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:42)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:42)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:505)\n\tat org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$8(QueryExecution.scala:570)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:418)\n\tat org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:570)\n\tat org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyCommandExecuted$1(QueryExecution.scala:374)\n\tat scala.util.Try$.apply(Try.scala:217)\n\tat org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1686)\n\tat org.apache.spark.util.Utils$.getTryWithCallerStacktrace(Utils.scala:1747)\n\tat org.apache.spark.util.LazyTry.get(LazyTry.scala:75)\n\tat org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:379)\n\tat org.apache.spark.sql.classic.Dataset.(Dataset.scala:432)\n\tat org.apache.spark.sql.classic.Dataset$.$anonfun$ofRows$3(Dataset.scala:155)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:860)\n\tat 
org.apache.spark.sql.classic.SparkSession.$anonfun$withActiveAndFrameProfiler$1(SparkSession.scala:1072)\n\tat com.databricks.spark.util.FrameProfiler$.$anonfun$record$1(FrameProfiler.scala:114)\n\tat com.databricks.spark.util.FrameProfilerExporter$.maybeExportFrameProfiler(FrameProfilerExporter.scala:200)\n\tat com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:105)\n\tat org.apache.spark.sql.classic.SparkSession.withActiveAndFrameProfiler(SparkSession.scala:1072)\n\tat org.apache.spark.sql.classic.Dataset$.ofRows(Dataset.scala:146)\n\tat org.apache.spark.sql.classic.SparkSession.$anonfun$sql$4(SparkSession.scala:851)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:860)\n\tat org.apache.spark.sql.classic.SparkSession.sql(SparkSession.scala:814)\n\tat org.apache.spark.sql.connect.planner.SparkConnectPlanner.executeSQL(SparkConnectPlanner.scala:3531)\n\tat org.apache.spark.sql.connect.planner.SparkConnectPlanner.handleSqlCommand(SparkConnectPlanner.scala:3361)\n\tat org.apache.spark.sql.connect.planner.SparkConnectPlanner.process(SparkConnectPlanner.scala:3238)\n\tat org.apache.spark.sql.connect.execution.ExecuteThreadRunner.handleCommand(ExecuteThreadRunner.scala:385)\n\tat org.apache.spark.sql.connect.execution.ExecuteThreadRunner.$anonfun$executeInternal$1(ExecuteThreadRunner.scala:282)\n\tat org.apache.spark.sql.connect.execution.ExecuteThreadRunner.$anonfun$executeInternal$1$adapted(ExecuteThreadRunner.scala:238)\n\tat org.apache.spark.sql.connect.service.SessionHolder.$anonfun$withSession$2(SessionHolder.scala:466)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:860)\n\tat org.apache.spark.sql.connect.service.SessionHolder.$anonfun$withSession$1(SessionHolder.scala:466)\n\tat org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:97)\n\tat org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:121)\n\tat 
org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:115)\n\tat org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:120)\n\tat org.apache.spark.sql.connect.service.SessionHolder.withSession(SessionHolder.scala:465)\n\tat org.apache.spark.sql.connect.execution.ExecuteThreadRunner.executeInternal(ExecuteThreadRunner.scala:238)\n\tat org.apache.spark.sql.connect.execution.ExecuteThreadRunner.$anonfun$execute$1(ExecuteThreadRunner.scala:141)\n\tat scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)\n\tat com.databricks.spark.connect.service.UtilizationMetrics.recordActiveQueries(UtilizationMetrics.scala:43)\n\tat com.databricks.spark.connect.service.UtilizationMetrics.recordActiveQueries$(UtilizationMetrics.scala:40)\n\tat org.apache.spark.sql.connect.execution.ExecuteThreadRunner.recordActiveQueries(ExecuteThreadRunner.scala:53)\n\tat org.apache.spark.sql.connect.execution.ExecuteThreadRunner.org$apache$spark$sql$connect$execution$ExecuteThreadRunner$$execute(ExecuteThreadRunner.scala:139)\n\tat org.apache.spark.sql.connect.execution.ExecuteThreadRunner$ExecutionThread.$anonfun$run$2(ExecuteThreadRunner.scala:586)\n\tat scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)\n\tat com.databricks.unity.UCSEphemeralState$Handle.runWith(UCSEphemeralState.scala:51)\n\tat com.databricks.unity.HandleImpl.runWith(UCSHandle.scala:104)\n\tat com.databricks.unity.HandleImpl.$anonfun$runWithAndClose$1(UCSHandle.scala:109)\n\tat scala.util.Using$.resource(Using.scala:296)\n\tat com.databricks.unity.HandleImpl.runWithAndClose(UCSHandle.scala:108)\n\tat org.apache.spark.sql.connect.execution.ExecuteThreadRunner$ExecutionThread.run(ExecuteThreadRunner.scala:586)\nForeign key setting complete.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Successfully deployed semantic model to 'databricks'.\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[1;32mSuccessfully deployed semantic model to \u001B[0m\u001B[32m'databricks'\u001B[0m\u001B[1;32m.\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sm.deploy('databricks', sync_glossary=True, sync_tags=True, set_primary_keys=True, set_foreign_keys=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "37951c61-5627-434c-adfc-f55b9b1d4c34", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell22" + }, + "source": [ + "## 6. Data Product Creation\n", + "\n", + "The semantic layer serves as a foundation for the DataProduct, which streamlines the creation of reusable data products. This allows you\n", + "to encapsulate business logic and create standardized, trustworthy data assets that can be easily shared and reused across different teams and \n", + "applications.\n", + "\n", + "Let's define the model for the data product we want to build:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5322b219-a8c2-4dab-89a1-e245fa2a7a6d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell23" + }, + "outputs": [], + "source": [ + "data_product_config = \\\n", + "{\n", + " \"name\": \"customer_kpi_summary\",\n", + " \"fields\": [\n", + " {\n", + " \"id\": \"customers.C_ID\",\n", + " \"name\": \"c_id\"\n", + " },\n", + " {\n", + " \"id\": \"customers.Customer Name\",\n", + " \"name\": \"customer_name\"\n", + " },\n", + " {\n", + " \"id\": \"customer_hierarchy.Global / Parent Account\",\n", + " \"name\": \"global_parent_account\"\n", + " },\n", + " {\n", + " \"id\": 
\"customer_hierarchy.Region\",\n", + " \"name\": \"region\"\n", + " },\n", + " {\n", + " \"id\": \"customer_hierarchy.Global / Local Entity\",\n", + " \"name\": \"global_local_entity\"\n", + " },\n", + " {\n", + " \"id\": \"products.Product Name\",\n", + " \"name\": \"product_name\"\n", + " },\n", + " {\n", + " \"id\": \"orders.Order Value ($)\",\n", + " \"name\": \"sum_order_value\",\n", + " \"category\": \"measure\",\n", + " \"measure_func\": \"sum\"\n", + " },\n", + " {\n", + " \"id\": \"orders.Order ID\",\n", + " \"name\": \"count_distinct_order_id\",\n", + " \"category\": \"measure\",\n", + " \"measure_func\": \"count\"\n", + " },\n", + " {\n", + " \"id\": \"orders.Order Qty\",\n", + " \"name\": \"sum_order_qty\",\n", + " \"category\": \"measure\",\n", + " \"measure_func\": \"sum\"\n", + " },\n", + " {\n", + " \"id\": \"service_requests.SR ID\",\n", + " \"name\": \"count_distinct_sr_id\",\n", + " \"category\": \"measure\",\n", + " \"measure_func\": \"count\"\n", + " },\n", + " {\n", + " \"id\": \"service_requests.Prod ID\",\n", + " \"name\": \"count_distinct_prod_id\",\n", + " \"category\": \"measure\",\n", + " \"measure_func\": \"count\"\n", + " },\n", + " {\n", + " \"id\": \"returns.R Order ID\",\n", + " \"name\": \"count_distinct_r_order_id\",\n", + " \"category\": \"measure\",\n", + " \"measure_func\": \"count\"\n", + " },\n", + " {\n", + " \"id\": \"nps_survey.Survey Score\",\n", + " \"name\": \"sum_survey_score\",\n", + " \"category\": \"measure\",\n", + " \"measure_func\": \"sum\"\n", + " }\n", + " ]\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1618b7a2-46ef-4cbf-9d8c-036d9dce7f51", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "> Check out the Intugle documentation to learn how to add sorting and filters to your data product." + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "94509207-5e86-4470-91df-c394197839fa", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell24" + }, + "source": [ + "Now, let's use the `DataProduct` to generate the data product:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6061cfc7-59ec-4ca8-8e8a-91c1c57511ee", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell25" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing YAML for 'product_hierarchy'. Checking for staleness.\nproduct_hierarchy loaded\nFound existing YAML for 'product_feature'. Checking for staleness.\nproduct_feature loaded\nFound existing YAML for 'website'. Checking for staleness.\nwebsite loaded\nFound existing YAML for 'prob_statement_issue'. Checking for staleness.\nprob_statement_issue loaded\nFound existing YAML for 'customer_hierarchy'. Checking for staleness.\ncustomer_hierarchy loaded\nFound existing YAML for 'campaigns'. Checking for staleness.\ncampaigns loaded\nFound existing YAML for 'products'. Checking for staleness.\nproducts loaded\nFound existing YAML for 'service_requests'. Checking for staleness.\nservice_requests loaded\nFound existing YAML for 'logistics'. Checking for staleness.\nlogistics loaded\nFound existing YAML for 'customers'. Checking for staleness.\ncustomers loaded\nFound existing YAML for 'expense'. Checking for staleness.\nexpense loaded\nFound existing YAML for 'install_base'. Checking for staleness.\ninstall_base loaded\nFound existing YAML for 'campaign_survey'. 
Checking for staleness.\ncampaign_survey loaded\nFound existing YAML for 'nps_survey'. Checking for staleness.\nnps_survey loaded\nFound existing YAML for 'orders'. Checking for staleness.\norders loaded\nFound existing YAML for 'opportunity'. Checking for staleness.\nopportunity loaded\nFound existing YAML for 'renewals'. Checking for staleness.\nrenewals loaded\nFound existing YAML for 'returns'. Checking for staleness.\nreturns loaded\nFound existing YAML for 'delivery_survey'. Checking for staleness.\ndelivery_survey loaded\nFound existing YAML for 'inventory'. Checking for staleness.\ninventory loaded\ncustomer_kpi_summary loaded\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
c_idcustomer_nameglobal_parent_accountregionglobal_local_entityproduct_namesum_order_valuecount_distinct_order_idsum_order_qtycount_distinct_sr_idcount_distinct_prod_idcount_distinct_r_order_idsum_survey_score
0CUST-10521Morris, Tate and WoodsPowell HoldingsEuropeLocalCoreAnalyticsNaN0NaN00022
1CUST-10902Allen, Hernandez and TylerSchultz-Reyes HoldingsMiddle EastGlobalNoneNaN0NaN0005
2CUST-10214King, Reynolds and KennedyAnderson, HoldingsAPACLocalNoneNaN0NaN00029
3CUST-10976Rush LLCSchultz-Vargas HoldingsNorth AmericaGlobalCloudEdge53870.04158.044414
4CUST-10502Conner IncWarren HoldingsAPACLocalCoreAnalytics22939.0249.022212
..........................................
1084CUST-10668Jefferson, Riggs and MorrowBennett-Flores HoldingsEuropeGlobalNoneNaN0NaN00014
1085CUST-10050Glover GroupNoneNoneNoneDataSphere22286.0128.01119
1086CUST-10870Wiley, Perez and RuizWalker-Shea HoldingsAPACLocalCoreAnalytics12736.0121.01117
1087CUST-10384Rice, Phillips and SmithBerry-Bates HoldingsNorth AmericaGlobalCoreAnalytics7256.011.01119
1088CUST-10683Hess, Moreno and MillerNoneNoneNoneCoreAnalytics19839.0144.01116
\n", + "

1089 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " c_id ... sum_survey_score\n", + "0 CUST-10521 ... 22\n", + "1 CUST-10902 ... 5\n", + "2 CUST-10214 ... 29\n", + "3 CUST-10976 ... 14\n", + "4 CUST-10502 ... 12\n", + "... ... ... ...\n", + "1084 CUST-10668 ... 14\n", + "1085 CUST-10050 ... 9\n", + "1086 CUST-10870 ... 7\n", + "1087 CUST-10384 ... 9\n", + "1088 CUST-10683 ... 6\n", + "\n", + "[1089 rows x 13 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from intugle import DataProduct\n", + "\n", + "# Create a DataProduct\n", + "dp = DataProduct()\n", + "\n", + "# Generate the data product\n", + "data_product = dp.build(data_product_config)\n", + "\n", + "data_product.to_df()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "25339458-dfcc-4548-86ed-0d9b6e552206", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "name": "cell26" + }, + "source": [ + "The `build` function returns a `DataSet` object. 
You can also view the generated SQL query used for creating the data product:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "111d079c-6d52-4a26-b281-5db95662d5d7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell27" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'SELECT `customers`.`C_ID` AS c_id, `customers`.`Customer Name` AS customer_name, `customer_hierarchy`.`Global / Parent Account` AS global_parent_account, `customer_hierarchy`.`Region` AS region, `customer_hierarchy`.`Global / Local Entity` AS global_local_entity, `products`.`Product Name` AS product_name, SUM(`orders`.`Order Value ($)`) AS sum_order_value, COUNT(`orders`.`Order ID`) AS count_distinct_order_id, SUM(`orders`.`Order Qty`) AS sum_order_qty, COUNT(`service_requests`.`SR ID`) AS count_distinct_sr_id, COUNT(`service_requests`.`Prod ID`) AS count_distinct_prod_id, COUNT(`returns`.`R Order ID`) AS count_distinct_r_order_id, SUM(`nps_survey`.`Survey Score`) AS sum_survey_score FROM nps_survey LEFT JOIN customers ON `customers`.`C_ID` = `nps_survey`.`Customer_ID` LEFT JOIN customer_hierarchy ON `customer_hierarchy`.`Party ID` = `customers`.`C_ID` LEFT JOIN campaigns ON `campaigns`.`Prospect ID` = `customers`.`C_ID` LEFT JOIN products ON `campaigns`.`Prod_ID` = `products`.`Product ID` LEFT JOIN returns ON `products`.`Product ID` = `returns`.`Product ID` LEFT JOIN service_requests ON `returns`.`SR ID` = `service_requests`.`SR ID` LEFT JOIN orders ON `orders`.`Order ID` = `service_requests`.`Sales Ord ID` GROUP BY `customers`.`C_ID`, `customers`.`Customer Name`, `customer_hierarchy`.`Global / Parent Account`, `customer_hierarchy`.`Region`, `customer_hierarchy`.`Global / Local Entity`, `products`.`Product Name`'" + ] + }, + 
"execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The generated SQL query\n", + "data_product.sql_query" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e6f15e02-9bf3-4448-97e2-71b856ac231a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "collapsed": false, + "name": "cell36" + }, + "source": [ + "### Enriching the Data Product\n", + "\n", + "The `data_product` is itself a `DataSet` object, so we can generate business glossaries for it as well." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fc185269-6aa9-4486-a623-5e6faca1a306", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell28" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1759658943.597447 27837 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\nI0000 00:00:1759658943.622177 27837 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n/databricks/python/lib/python3.12/site-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice.\n return _methods._mean(a, axis=axis, dtype=dtype,\n/databricks/python/lib/python3.12/site-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in scalar divide\n ret = ret.dtype.type(ret / rcount)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5b9161d8b4cf45649235f2931933223c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| 
| 0/13 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column_nametable_namebusiness_namedatatype_l1datatype_l2business_glossarybusiness_tagscountnull_countdistinct_countuniquenesscompletenesssample_data
0c_idcustomer_kpi_summaryc_idalphanumericL2OutputTypes.dimensionIdentifies individual customers for tracking a...[Customer Identifier, Customer Record Tracking...108906150.5647381.000000[CUST-10707, CUST-10806, CUST-10591, CUST-1048...
1customer_namecustomer_kpi_summarycustomer_nameclose_ended_textL2OutputTypes.dimensionIdentifies the legal or commercial entity asso...[Customer Identification, Business Entity Name...108905990.5500461.000000[Lee PLC, Turner, Butler and Morgan, House-Ram...
2global_parent_accountcustomer_kpi_summaryglobal_parent_accountclose_ended_textL2OutputTypes.dimensionIdentifies the overarching corporate entity as...[Global Account Hierarchy, Parent Company Iden...10892994090.3755740.725436[French Holdings, Morales Holdings, Perez-Camp...
3regioncustomer_kpi_summaryregionclose_ended_textL2OutputTypes.dimensionGeographical area associated with customer per...[Geographical Region, Market Segmentation, Reg...108929950.0045910.725436[APAC, Europe, LATAM, North America, Middle East]
4global_local_entitycustomer_kpi_summaryglobal_local_entityclose_ended_textL2OutputTypes.dimensionIndicates whether a customer-related metric is...[Entity Scope, Operational Coverage, Geographi...108929920.0018370.725436[Local, Global]
5product_namecustomer_kpi_summaryproduct_nameclose_ended_textL2OutputTypes.dimensionName of a product associated with customer per...[Product Portfolio, Customer Offering, Technol...108932950.0045910.697888[CoreAnalytics, NetConnect, Insight360, DataSp...
6sum_order_valuecustomer_kpi_summarysum_order_valueintegerL2OutputTypes.measureAggregates the total monetary value of all ord...[Customer Order Value, Revenue Aggregation, Sa...10895623420.3140500.483930[8922, 69204, 2469, 18393, 75163, 62820, 6389,...
7count_distinct_order_idcustomer_kpi_summarycount_distinct_order_idintegerL2OutputTypes.measureTracks the total number of unique orders assoc...[Customer Order Metrics, Unique Order Tracking...10890140.0128561.000000[10, 4, 6, 3, 14, 2, 12, 1, 5, 0]
8sum_order_qtycustomer_kpi_summarysum_order_qtyintegerL2OutputTypes.measureAggregates the total quantity of orders associ...[Customer Order Volume, Sales Performance Metr...10895621380.1267220.483930[116, 96, 54, 7, 50, 232, 76, 52, 61, 15]
9count_distinct_sr_idcustomer_kpi_summarycount_distinct_sr_idintegerL2OutputTypes.measureTracks the number of unique service request id...[Service Request Tracking, Customer Interactio...10890140.0128561.000000[14, 5, 6, 1, 0, 20, 2, 12, 3, 9]
10count_distinct_prod_idcustomer_kpi_summarycount_distinct_prod_idintegerL2OutputTypes.measureTracks the number of unique products associate...[Product Diversity, Customer Purchase Insights...10890140.0128561.000000[16, 9, 2, 4, 1, 5, 14, 6, 12, 3]
11count_distinct_r_order_idcustomer_kpi_summarycount_distinct_r_order_idintegerL2OutputTypes.measureTracks the number of unique orders associated ...[Customer Order Metrics, Distinct Order Tracki...10890140.0128561.000000[14, 5, 6, 1, 16, 20, 3, 12, 10, 2]
12sum_survey_scorecustomer_kpi_summarysum_survey_scoreintegerL2OutputTypes.measureAggregates the total score from customer feedb...[Customer Satisfaction, Survey Performance Met...10890530.0486691.000000[29, 9, 23, 24, 40, 51, 13, 152, 36, 69]
\n", + "" + ], + "text/plain": [ + " column_name ... sample_data\n", + "0 c_id ... [CUST-10707, CUST-10806, CUST-10591, CUST-1048...\n", + "1 customer_name ... [Lee PLC, Turner, Butler and Morgan, House-Ram...\n", + "2 global_parent_account ... [French Holdings, Morales Holdings, Perez-Camp...\n", + "3 region ... [APAC, Europe, LATAM, North America, Middle East]\n", + "4 global_local_entity ... [Local, Global]\n", + "5 product_name ... [CoreAnalytics, NetConnect, Insight360, DataSp...\n", + "6 sum_order_value ... [8922, 69204, 2469, 18393, 75163, 62820, 6389,...\n", + "7 count_distinct_order_id ... [10, 4, 6, 3, 14, 2, 12, 1, 5, 0]\n", + "8 sum_order_qty ... [116, 96, 54, 7, 50, 232, 76, 52, 61, 15]\n", + "9 count_distinct_sr_id ... [14, 5, 6, 1, 0, 20, 2, 12, 3, 9]\n", + "10 count_distinct_prod_id ... [16, 9, 2, 4, 1, 5, 14, 6, 12, 3]\n", + "11 count_distinct_r_order_id ... [14, 5, 6, 1, 16, 20, 3, 12, 10, 2]\n", + "12 sum_survey_score ... [29, 9, 23, 24, 40, 51, 13, 152, 36, 69]\n", + "\n", + "[13 rows x 13 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_product.run(domain=\"Technology Manufacturing Company\")\n", + "data_product.profiling_df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "09a1ce49-e9d8-41d1-b26d-b2c92a105675", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "collapsed": false, + "name": "cell35" + }, + "source": [ + "### Syncing with Databricks Unity Catalog\n", + "Let's sync the data product with the Databricks Unity Catalog as well." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c2e48111-74fe-4fca-a795-f066865a1230", + 
"showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "language": "python", + "name": "cell18" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Starting deployment to 'databricks' based on project YAML files...\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[33mStarting deployment to \u001B[0m\u001B[32m'databricks'\u001B[0m\u001B[33m based on project YAML files\u001B[0m\u001B[33m...\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Syncing metadata to Databricks tables...\nMetadata sync complete.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
Successfully deployed semantic model to 'databricks'.\n",
+       "
\n" + ], + "text/plain": [ + "\u001B[1;32mSuccessfully deployed semantic model to \u001B[0m\u001B[32m'databricks'\u001B[0m\u001B[1;32m.\u001B[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sm.deploy('databricks')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9e763741-1f33-4efc-8670-213fedbbbb97", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "%md\n", + "> Now that you have synced with Databricks, you can use **AI|BI Genie** to **converse with your data** using natural language. AI|BI Genie leverages the relationships and context that were synced to Databricks to answer questions without requiring you to write SQL.\n", + "To get started, navigate to **Genie** -> Create a new space -> Pick your datasets and start conversing." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "847eadda-f22a-4998-a6bf-35ce4010267c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + }, + "collapsed": false, + "name": "cell30" + }, + "source": [ + "## Conclusion\n", + "\n", + "You've learned how to:\n", + "\n", + "* Configure your LLM provider\n", + "* Build a semantic model using the `SemanticModel`.\n", + "* Access enriched metadata, business glossaries and visualize the relationships between your tables.\n", + "* Generate data products from the semantic layer using the `DataProduct`.\n", + "* Sync the semantic model with Databricks Unity Catalog\n", + "* Converse with your data using AI|BI Genie\n", + "\n", + "This is just a starting point. This project has many other features to explore. We encourage you to try it with your own data and see how it can help you build a powerful semantic layer." 
+ ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": { + "hardware": { + "accelerator": null, + "gpuPoolId": null, + "memory": null + } + }, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "dependencies": [ + "/Workspace/Users/raphael.tony@intugle.ai/intugle-1.0.4rc1-py3-none-any.whl[databricks]" + ], + "environment_version": "3" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "quickstart_native_databricks", + "widgets": {} + }, + "kernelspec": { + "display_name": "intugle", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "lastEditStatus": { + "authorEmail": "RAPHAEL.TONY@INTUGLE.AI", + "authorId": "9200264168148", + "authorName": "RAPHAEL.TONY", + "lastEditTime": 1759433896509, + "notebookId": "vk6kvtijqawizengiokk", + "sessionId": "ce6ad5bc-8f6f-4bfd-93a8-bd6841b44952" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/notebooks/quickstart_native_snowflake.ipynb b/notebooks/quickstart_native_snowflake.ipynb index 6d685ec..d63f245 100644 --- a/notebooks/quickstart_native_snowflake.ipynb +++ b/notebooks/quickstart_native_snowflake.ipynb @@ -203,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "a55fa189-8c6c-45b0-a58c-f58bd448985d", "metadata": { "language": "python", @@ -215,6 +215,7 @@ " \"\"\"Append the base URL to the table name.\"\"\"\n", " return {\n", " \"identifier\": table_name,\n", + " \"type\": \"snowflake\"\n", " }\n", "\n", "\n", diff --git a/pyproject.toml b/pyproject.toml index 5d2344b..bc64354 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "intugle" -version = "1.0.3" +version = "1.0.4" authors = [ { name="Intugle", email="hello@intugle.ai" }, ] @@ -49,13 +49,18 @@ dependencies = [ 
"scikit-learn==1.7.1", "langchain[anthropic,google-genai,openai]>=0.3.27", "qdrant-client>=1.15.1", - "rich>=14.1.0" + "rich>=14.1.0", ] [project.optional-dependencies] snowflake = [ "snowflake-snowpark-python[pandas]>=1.12.0" ] +databricks = [ + "databricks-sql-connector>=4.1.3", + "pyspark>=3.5.0", + "sqlglot>=27.20.0", +] [project.urls] diff --git a/src/intugle/adapters/adapter.py b/src/intugle/adapters/adapter.py index 9d2fae9..f7fe7db 100644 --- a/src/intugle/adapters/adapter.py +++ b/src/intugle/adapters/adapter.py @@ -48,7 +48,7 @@ def to_df_from_query(self, query: str) -> pd.DataFrame: raise NotImplementedError() @abstractmethod - def create_table_from_query(self, table_name: str, query: str): + def create_table_from_query(self, table_name: str, query: str) -> str: raise NotImplementedError() @abstractmethod diff --git a/src/intugle/adapters/common/models.py b/src/intugle/adapters/common/models.py new file mode 100644 index 0000000..c612780 --- /dev/null +++ b/src/intugle/adapters/common/models.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + + +class ResolvedRelationship(BaseModel): + """Represents a relationship with a clearly identified parent (one) and child (many) side.""" + + parent_table: str + parent_column: str + child_table: str + child_column: str diff --git a/src/intugle/adapters/common/relationships.py b/src/intugle/adapters/common/relationships.py new file mode 100644 index 0000000..691bb54 --- /dev/null +++ b/src/intugle/adapters/common/relationships.py @@ -0,0 +1,55 @@ +import re + +from typing import Dict, Optional + +from intugle.adapters.common.models import ResolvedRelationship +from intugle.models.resources.relationship import Relationship +from intugle.models.resources.source import Source + + +def clean_name(name: str) -> str: + """Cleans a string to be a valid SQL identifier by replacing non-alphanumeric characters with underscores.""" + return re.sub(r'[^a-zA-Z0-9_]', '_', name) + + +def resolve_relationship_direction( + rel: 
Relationship, sources: Dict[str, Source] +) -> Optional[ResolvedRelationship]: + """ + Determines the direction of a relationship by identifying the primary key. + + Args: + rel: The relationship object from the manifest. + sources: A dictionary of all sources from the manifest. + + Returns: + A ResolvedRelationship object with parent/child identified, or None if it's not a valid FK relationship. + """ + source_table_info = sources.get(rel.source.table) + target_table_info = sources.get(rel.target.table) + + if not source_table_info or not target_table_info: + return None + + # Case 1: The source column is the primary key of the source table. + # This means the target table is the child (many side). + if source_table_info.table.key and source_table_info.table.key == rel.source.column: + return ResolvedRelationship( + parent_table=rel.source.table, + parent_column=rel.source.column, + child_table=rel.target.table, + child_column=rel.target.column, + ) + + # Case 2: The target column is the primary key of the target table. + # This means the source table is the child (many side). + elif target_table_info.table.key and target_table_info.table.key == rel.target.column: + return ResolvedRelationship( + parent_table=rel.target.table, + parent_column=rel.target.column, + child_table=rel.source.table, + child_column=rel.source.column, + ) + + # If neither side is a primary key, it's not a valid FK relationship for our purposes. 
+ return None diff --git a/src/intugle/adapters/factory.py b/src/intugle/adapters/factory.py index 3b0bb65..3521932 100644 --- a/src/intugle/adapters/factory.py +++ b/src/intugle/adapters/factory.py @@ -22,6 +22,7 @@ def import_module(name: str) -> ModuleInterface: "intugle.adapters.types.pandas.pandas", "intugle.adapters.types.duckdb.duckdb", "intugle.adapters.types.snowflake.snowflake", + "intugle.adapters.types.databricks.databricks", ] diff --git a/src/intugle/adapters/models.py b/src/intugle/adapters/models.py index b5393f9..0ccc1ad 100644 --- a/src/intugle/adapters/models.py +++ b/src/intugle/adapters/models.py @@ -5,11 +5,12 @@ from pydantic import BaseModel, Field +from intugle.adapters.types.databricks.models import DatabricksConfig from intugle.adapters.types.duckdb.models import DuckdbConfig from intugle.adapters.types.snowflake.models import SnowflakeConfig # FIXME load dynamically -DataSetData = pd.DataFrame | DuckdbConfig | SnowflakeConfig +DataSetData = pd.DataFrame | DuckdbConfig | SnowflakeConfig | DatabricksConfig class ProfilingOutput(BaseModel): diff --git a/src/intugle/adapters/types/databricks/databricks.py b/src/intugle/adapters/types/databricks/databricks.py new file mode 100644 index 0000000..ed5f8de --- /dev/null +++ b/src/intugle/adapters/types/databricks/databricks.py @@ -0,0 +1,421 @@ +import re +import time + +from typing import TYPE_CHECKING, Any, Optional + +import numpy as np +import pandas as pd + +from intugle.adapters.adapter import Adapter +from intugle.adapters.common.relationships import clean_name, resolve_relationship_direction +from intugle.adapters.factory import AdapterFactory +from intugle.adapters.models import ColumnProfile, DataSetData, ProfilingOutput +from intugle.adapters.types.databricks.models import ( + DatabricksConfig, + DatabricksNotebookConfig, + DatabricksSQLConnectorConfig, +) +from intugle.adapters.utils import convert_to_native +from intugle.core import settings +from intugle.core.utilities.processing 
import string_standardization + +if TYPE_CHECKING: + from intugle.analysis.models import DataSet + from intugle.models.manifest import Manifest + +try: + from pyspark.sql import SparkSession + PYSPARK_AVAILABLE = True +except ImportError: + PYSPARK_AVAILABLE = False + +try: + from databricks import sql + DATABRICKS_SQL_AVAILABLE = True +except ImportError: + DATABRICKS_SQL_AVAILABLE = False + +try: + from sqlglot import transpile + SQLGLOT_AVAILABLE = True +except ImportError: + SQLGLOT_AVAILABLE = False + + +DATABRICKS_AVAILABLE = PYSPARK_AVAILABLE and DATABRICKS_SQL_AVAILABLE and SQLGLOT_AVAILABLE + + +def clean_tag(name: str) -> str: + """Cleans a string to be a valid Databricks tag name.""" + return re.sub(r'[^a-zA-Z0-9_ ]', '_', name) + + +class DatabricksAdapter(Adapter): + _instance = None + _initialized = False + + def __new__(cls, *args, **kwargs): + if not cls._instance: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self): + if self._initialized: + return + + if not DATABRICKS_AVAILABLE: + raise ImportError( + "Databricks dependencies are not installed. Please run 'pip install intugle[databricks]'.." + ) + + self.spark: Optional["SparkSession"] = None + self.connection: Optional[Any] = None + self.catalog: Optional[str] = None + self.schema: Optional[str] = None + self.connect() + self._initialized = True + + def connect(self): + connection_parameters_dict = settings.PROFILES.get("databricks", {}) + if not connection_parameters_dict: + raise ValueError( + "Could not create Databricks connection. No 'databricks' section found in profiles.yml." + ) + + # Try to get an active Spark session (for notebook environment) + if PYSPARK_AVAILABLE: + try: + self.spark = SparkSession.getActiveSession() + if self.spark: + print("Found active Spark session. 
Using it for execution.") + params = DatabricksNotebookConfig.model_validate(connection_parameters_dict) + self.catalog = params.catalog + self.schema = params.schema + return + except (AttributeError, TypeError): + self.spark = None + + # If no active Spark session, create a SQL connector connection (for external environment) + if not self.spark: + if not DATABRICKS_SQL_AVAILABLE: + raise ImportError( + "databricks-sql-connector is not installed. Please run 'pip install intugle[databricks]' to connect from outside a Databricks notebook." + ) + print("No active Spark session found. Creating a new SQL connector connection.") + params = DatabricksSQLConnectorConfig.model_validate(connection_parameters_dict) + self.catalog = params.catalog + self.schema = params.schema + self.connection = sql.connect( + server_hostname=params.host, http_path=params.http_path, access_token=params.token + ) + + def _get_fqn(self, identifier: str) -> str: + """Gets the fully qualified name for a table identifier.""" + # An identifier is already fully qualified if it contains a dot. + if "." in identifier: + return identifier + + # Backticks are used to handle reserved keywords and special characters. 
+ safe_schema = f"`{self.schema}`" + safe_identifier = f"`{identifier}`" + + if self.catalog: + safe_catalog = f"`{self.catalog}`" + return f"{safe_catalog}.{safe_schema}.{safe_identifier}" + + return f"{safe_schema}.{safe_identifier}" + + @staticmethod + def check_data(data: Any) -> DatabricksConfig: + try: + data = DatabricksConfig.model_validate(data) + except Exception: + raise TypeError("Input must be a Databricks config.") + return data + + def _execute_sql(self, query: str) -> list[Any]: + if self.spark: + if self.catalog: + self.spark.sql(f"USE CATALOG `{self.catalog}`") + if self.schema: + self.spark.sql(f"USE `{self.schema}`") + return self.spark.sql(query).collect() + elif self.connection: + with self.connection.cursor() as cursor: + if self.catalog: + cursor.execute(f"USE CATALOG `{self.catalog}`") + if self.schema: + cursor.execute(f"USE `{self.schema}`") + cursor.execute(query) + try: + return cursor.fetchall() + except Exception: + return [] + raise ConnectionError("No active Databricks connection.") + + def _get_pandas_df(self, query: str) -> pd.DataFrame: + if self.spark: + if self.catalog: + self.spark.sql(f"USE CATALOG `{self.catalog}`") + if self.schema: + self.spark.sql(f"USE `{self.schema}`") + return self.spark.sql(query).toPandas() + elif self.connection: + with self.connection.cursor() as cursor: + if self.catalog: + cursor.execute(f"USE CATALOG `{self.catalog}`") + if self.schema: + cursor.execute(f"USE `{self.schema}`") + cursor.execute(query) + data = cursor.fetchall() + columns = [column[0] for column in cursor.description] + return pd.DataFrame(data, columns=columns) + raise ConnectionError("No active Databricks connection.") + + def profile(self, data: DatabricksConfig, table_name: str) -> ProfilingOutput: + data = self.check_data(data) + fqn = self._get_fqn(data.identifier) + if self.spark: + table = self.spark.table(fqn) + total_count = table.count() + columns = table.columns + dtypes = {field.name: str(field.dataType) for field in 
table.schema.fields} + else: + rows = self._execute_sql(f"DESCRIBE TABLE {fqn}") + columns = [row.col_name for row in rows] + dtypes = {row.col_name: row.data_type for row in rows} + total_count = self._execute_sql(f"SELECT COUNT(*) FROM {fqn}")[0][0] + + return ProfilingOutput( + count=total_count, + columns=columns, + dtypes=dtypes, + ) + + def column_profile( + self, + data: DatabricksConfig, + table_name: str, + column_name: str, + total_count: int, + sample_limit: int = 10, + dtype_sample_limit: int = 10000, + ) -> Optional[ColumnProfile]: + data = self.check_data(data) + fqn = self._get_fqn(data.identifier) + start_ts = time.time() + + # Null and distinct counts + query = f""" + SELECT + COUNT(CASE WHEN `{column_name}` IS NULL THEN 1 END) as null_count, + COUNT(DISTINCT `{column_name}`) as distinct_count + FROM {fqn} + """ + result = self._execute_sql(query)[0] + null_count = result.null_count + distinct_count = result.distinct_count + not_null_count = total_count - null_count + + # Sampling + sample_query = f""" + SELECT DISTINCT CAST(`{column_name}` AS STRING) FROM {fqn} WHERE `{column_name}` IS NOT NULL LIMIT {dtype_sample_limit} + """ + distinct_values_result = self._execute_sql(sample_query) + distinct_values = [row[0] for row in distinct_values_result] + + if distinct_count > 0: + distinct_sample_size = min(distinct_count, dtype_sample_limit) + sample_data = list(np.random.choice(distinct_values, distinct_sample_size, replace=False)) + else: + sample_data = [] + + dtype_sample = None + if distinct_count >= dtype_sample_limit: + dtype_sample = sample_data + elif distinct_count > 0 and not_null_count > 0: + remaining_sample_size = dtype_sample_limit - distinct_count + additional_samples_query = f""" + SELECT CAST(`{column_name}` AS STRING) FROM {fqn} WHERE `{column_name}` IS NOT NULL ORDER BY RAND() LIMIT {remaining_sample_size} + """ + additional_samples_result = self._execute_sql(additional_samples_query) + additional_samples = [row[0] for row in 
additional_samples_result] + dtype_sample = list(distinct_values) + additional_samples + else: + dtype_sample = [] + + native_sample_data = convert_to_native(sample_data) + native_dtype_sample = convert_to_native(dtype_sample) + business_name = string_standardization(column_name) + + return ColumnProfile( + column_name=column_name, + table_name=table_name, + business_name=business_name, + null_count=null_count, + count=total_count, + distinct_count=distinct_count, + uniqueness=distinct_count / total_count if total_count > 0 else 0.0, + completeness=not_null_count / total_count if total_count > 0 else 0.0, + sample_data=native_sample_data[:sample_limit], + dtype_sample=native_dtype_sample, + ts=time.time() - start_ts, + ) + + def load(self, data: DatabricksConfig, table_name: str): + self.check_data(data) + # No-op, we assume the table already exists in Databricks. + + def execute(self, query: str): + return self._execute_sql(query) + + def to_df(self, data: DatabricksConfig, table_name: str) -> pd.DataFrame: + data = self.check_data(data) + fqn = self._get_fqn(data.identifier) + return self._get_pandas_df(f"SELECT * FROM {fqn}") + + def to_df_from_query(self, query: str) -> pd.DataFrame: + return self._get_pandas_df(query) + + def create_table_from_query(self, table_name: str, query: str): + fqn = self._get_fqn(table_name) + transpiled_sql = transpile(query, write="databricks")[0] + self._execute_sql(f"CREATE OR REPLACE VIEW {fqn} AS {transpiled_sql}") + return transpiled_sql + + def create_new_config_from_etl(self, etl_name: str) -> "DataSetData": + fqn = self._get_fqn(etl_name) + return DatabricksConfig(identifier=fqn) + + def deploy_semantic_model( + self, + manifest: "Manifest", + sync_glossary: bool = True, + sync_tags: bool = False, + set_primary_keys: bool = True, + set_foreign_keys: bool = True, + **kwargs, + ): + if sync_glossary or sync_tags: + self._sync_metadata(manifest, sync_glossary, sync_tags) + if set_primary_keys: + 
self._set_primary_keys(manifest) + if set_foreign_keys: + self._set_foreign_keys(manifest) + + def _sync_metadata(self, manifest: "Manifest", sync_glossary: bool, sync_tags: bool): + """ + Syncs metadata (comments for glossaries, and tags) from the manifest to the physical Databricks tables. + """ + print("Syncing metadata to Databricks tables...") + + for source in manifest.sources.values(): + fqn = self._get_fqn(source.table.name) + + # Set table comment + if sync_glossary and source.table.description: + table_comment = source.table.description.replace("'", "\\'") + self._execute_sql(f"COMMENT ON TABLE {fqn} IS '{table_comment}'") #Works for views too + + # Set column comments and tags + for column in source.table.columns: + if sync_glossary and column.description: + col_comment = column.description.replace("'", "\\'") + self._execute_sql(f"COMMENT ON COLUMN {fqn}.`{column.name}` IS '{col_comment}'") + + if sync_tags and column.tags: + cleaned_tags = [clean_tag(tag) for tag in column.tags] + tag_assignments = ", ".join([f"'{tag}'" for tag in cleaned_tags]) + + # FIXME: Need to differentiate between TABLES and VIEWS for setting tags + try: + self._execute_sql(f"ALTER TABLE {fqn} ALTER COLUMN `{column.name}` SET TAGS ({tag_assignments})") + except Exception as e: + try: + self._execute_sql(f"ALTER VIEW {fqn} ALTER COLUMN `{column.name}` SET TAGS ({tag_assignments})") + except Exception as e: + print(f"Could not set tags '{tag_assignments}' on {fqn}.`{column.name}`: {e}") + + + print("Metadata sync complete.") + + def _set_primary_keys(self, manifest: "Manifest"): + """ + Sets primary key constraints on the tables based on the manifest. 
+ """ + print("Setting primary key constraints...") + for source in manifest.sources.values(): + if not source.table.key or not isinstance(source.table.key, str): + print(f"Skipping primary key for table '{source.table.name}' due to missing or invalid key.") + continue + + fqn = self._get_fqn(source.table.name) + pk_column = source.table.key + constraint_name = f"pk_{source.table.name}" + try: + # First, ensure the column is not nullable + self._execute_sql(f"ALTER TABLE {fqn} ALTER COLUMN `{pk_column}` SET NOT NULL") + # Then, add the primary key constraint + self._execute_sql(f"ALTER TABLE {fqn} ADD CONSTRAINT {constraint_name} PRIMARY KEY (`{pk_column}`)") + print(f"Set primary key on {fqn} (`{pk_column}`)") + except Exception as e: + print(f"Could not set primary key for {fqn}: {e}") + print("Primary key setting complete.") + + def _set_foreign_keys(self, manifest: "Manifest"): + """ + Sets foreign key constraints between tables based on the manifest relationships. + """ + print("Setting foreign key constraints...") + for rel in manifest.relationships.values(): + resolved = resolve_relationship_direction(rel, manifest.sources) + if not resolved: + print(f"Skipping invalid or ambiguous relationship '{rel.name}'.") + continue + + try: + child_fqn = self._get_fqn(resolved.child_table) + parent_fqn = self._get_fqn(resolved.parent_table) + constraint_name = f"fk_{rel.name}" + cleaned_constraint_name = clean_name(constraint_name) + + self._execute_sql( + f"ALTER TABLE {child_fqn} ADD CONSTRAINT {cleaned_constraint_name} " + f"FOREIGN KEY (`{resolved.child_column}`) REFERENCES {parent_fqn} (`{resolved.parent_column}`)" + ) + except Exception as e: + print(f"Could not set foreign key for relationship {rel.name}: {e}") + print("Foreign key setting complete.") + + def intersect_count(self, table1: "DataSet", column1_name: str, table2: "DataSet", column2_name: str) -> int: + table1_adapter = self.check_data(table1.data) + table2_adapter = self.check_data(table2.data) + + 
fqn1 = self._get_fqn(table1_adapter.identifier) + fqn2 = self._get_fqn(table2_adapter.identifier) + + query = f""" + SELECT COUNT(*) FROM ( + SELECT DISTINCT `{column1_name}` FROM {fqn1} WHERE `{column1_name}` IS NOT NULL + INTERSECT + SELECT DISTINCT `{column2_name}` FROM {fqn2} WHERE `{column2_name}` IS NOT NULL + ) + """ + return self._execute_sql(query)[0][0] + + def get_details(self, data: DatabricksConfig): + data = self.check_data(data) + return data.model_dump() + + +def can_handle_databricks(df: Any) -> bool: + try: + DatabricksConfig.model_validate(df) + return True + except Exception: + return False + + +def register(factory: AdapterFactory): + if DATABRICKS_AVAILABLE: + factory.register("databricks", can_handle_databricks, DatabricksAdapter) \ No newline at end of file diff --git a/src/intugle/adapters/types/databricks/models.py b/src/intugle/adapters/types/databricks/models.py new file mode 100644 index 0000000..1566bbe --- /dev/null +++ b/src/intugle/adapters/types/databricks/models.py @@ -0,0 +1,21 @@ +from typing import Optional + +from intugle.common.schema import SchemaBase + + +class DatabricksSQLConnectorConfig(SchemaBase): + host: str + http_path: str + token: str + schema: str + catalog: Optional[str] = None + + +class DatabricksNotebookConfig(SchemaBase): + schema: str + catalog: Optional[str] = None + + +class DatabricksConfig(SchemaBase): + identifier: str + type: str = "databricks" diff --git a/src/intugle/adapters/types/duckdb/duckdb.py b/src/intugle/adapters/types/duckdb/duckdb.py index 5966d19..301517f 100644 --- a/src/intugle/adapters/types/duckdb/duckdb.py +++ b/src/intugle/adapters/types/duckdb/duckdb.py @@ -1,6 +1,6 @@ import time -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import duckdb import numpy as np @@ -18,6 +18,9 @@ from intugle.common.exception import errors from intugle.core.utilities.processing import string_standardization +if TYPE_CHECKING: + from intugle.analysis.models import 
DataSet + class DuckdbAdapter(Adapter): @@ -240,6 +243,7 @@ def to_df_from_query(self, query: str) -> pd.DataFrame: def create_table_from_query(self, table_name: str, query: str): duckdb.sql(f'CREATE OR REPLACE VIEW "{table_name}" AS {query}') + return query def create_new_config_from_etl(self, etl_name: str) -> "DataSetData": return DuckdbConfig(path=etl_name, type="table") diff --git a/src/intugle/adapters/types/snowflake/snowflake.py b/src/intugle/adapters/types/snowflake/snowflake.py index 3f384cb..9797bcf 100644 --- a/src/intugle/adapters/types/snowflake/snowflake.py +++ b/src/intugle/adapters/types/snowflake/snowflake.py @@ -11,6 +11,7 @@ from intugle.models.manifest import Manifest from intugle.adapters.adapter import Adapter +from intugle.adapters.common.relationships import resolve_relationship_direction from intugle.adapters.factory import AdapterFactory from intugle.adapters.models import ( ColumnProfile, @@ -37,7 +38,18 @@ class SnowflakeAdapter(Adapter): + _instance = None + _initialized = False + + def __new__(cls, *args, **kwargs): + if not cls._instance: + cls._instance = super().__new__(cls) + return cls._instance + def __init__(self): + if self._initialized: + return + if not SNOWFLAKE_AVAILABLE: raise ImportError("Snowflake dependencies are not installed. 
Please run 'pip install intugle[snowflake]'.") @@ -45,6 +57,7 @@ def __init__(self): self.database: Optional[str] = None self.schema: Optional[str] = None self.connect() + self._initialized = True def connect(self): try: @@ -179,6 +192,7 @@ def _clean_column_quotes(sql: str) -> str: query = _clean_column_quotes(query) self.session.sql(f"CREATE OR REPLACE TABLE {table_name} AS {query}").collect() + return query def create_new_config_from_etl(self, etl_name: str) -> "DataSetData": return SnowflakeConfig(identifier=etl_name) @@ -268,28 +282,16 @@ def deploy_semantic_model(self, manifest: "Manifest", **kwargs): # -- RELATIONSHIPS clause -- relationship_clauses = [] for rel in manifest.relationships.values(): - source_table_info = manifest.sources.get(rel.source.table) - target_table_info = manifest.sources.get(rel.target.table) - - if not source_table_info or not target_table_info: + resolved = resolve_relationship_direction(rel, manifest.sources) + if not resolved: continue - # Determine which table is the 'one' side (contains the PK for the join) - if source_table_info.table.key == rel.source.column: - # source is the 'one' side (referenced table) - ref_table_alias = clean_name(rel.source.table) - ref_column = clean_name(rel.source.column) - table_alias = clean_name(rel.target.table) - column = clean_name(rel.target.column) - elif target_table_info.table.key == rel.target.column: - # target is the 'one' side (referenced table) - ref_table_alias = clean_name(rel.target.table) - ref_column = clean_name(rel.target.column) - table_alias = clean_name(rel.source.table) - column = clean_name(rel.source.column) - else: - # This is not a valid FK relationship for the semantic view, skip it - continue + # The table with the FK is the "referencing" table + table_alias = clean_name(resolved.child_table) + column = clean_name(resolved.child_column) + # The table with the PK is the "referenced" table + ref_table_alias = clean_name(resolved.parent_table) + ref_column = 
clean_name(resolved.parent_column) clause = f"{clean_name(rel.name)} AS {table_alias}({column}) REFERENCES {ref_table_alias}({ref_column})" relationship_clauses.append(clause) diff --git a/src/intugle/data_product.py b/src/intugle/data_product.py index bc104a7..bf76494 100644 --- a/src/intugle/data_product.py +++ b/src/intugle/data_product.py @@ -86,7 +86,7 @@ def build(self, etl: ETLModel) -> DataSet: sql_query = self.generate_query(etl) # 3. Materialize the query as a new table in the target database - execution_adapter.create_table_from_query(etl.name, sql_query) + dialect_sql = execution_adapter.create_table_from_query(etl.name, sql_query) # 4. Create a new config object pointing to the newly created table new_config = execution_adapter.create_new_config_from_etl(etl.name) @@ -94,7 +94,7 @@ def build(self, etl: ETLModel) -> DataSet: # 5. Return a new DataSet pointing to the materialized table result_dataset = DataSet(data=new_config, name=etl.name) # Attach the query for inspection - result_dataset.sql_query = sql_query + result_dataset.sql_query = dialect_sql return result_dataset diff --git a/src/intugle/exporters/snowflake.py b/src/intugle/exporters/snowflake.py index 4754aac..467d1c1 100644 --- a/src/intugle/exporters/snowflake.py +++ b/src/intugle/exporters/snowflake.py @@ -1,11 +1,15 @@ import re -from .base import Exporter -from intugle.libs.smart_query_generator.models.models import CategoryType + +from intugle.adapters.common.relationships import resolve_relationship_direction from intugle.core import settings +from intugle.libs.smart_query_generator.models.models import CategoryType from intugle.models.resources.relationship import RelationshipType +from .base import Exporter + RESERVED_WORDS = {'start', 'end', 'select', 'from', 'where', 'order', 'group', 'join', 'table', 'on'} + def clean_name(name: str) -> str: """Cleans an identifier to be a safe logical name for the Snowflake Semantic Model.""" cleaned = name.strip().strip('"') @@ -19,11 +23,13 @@ 
def clean_name(name: str) -> str: return f'_{cleaned}' return cleaned + def quote_identifier(name: str) -> str: """Ensure the identifier is wrapped in exactly one pair of double quotes.""" clean_name = name.strip().strip('"') return f'"{clean_name}"' + # Mapping from our types to Snowflake's expected data types DATA_TYPE_MAPPING = { "integer": "NUMBER", @@ -36,6 +42,7 @@ def quote_identifier(name: str) -> str: # Add other mappings as necessary } + class SnowflakeExporter(Exporter): def export(self, **kwargs) -> dict: """ @@ -73,7 +80,7 @@ def export(self, **kwargs) -> dict: # Map columns to dimensions and facts for column in source.table.columns: - snowflake_type = DATA_TYPE_MAPPING.get(column.type, "TEXT") # Default to TEXT + snowflake_type = DATA_TYPE_MAPPING.get(column.type, "TEXT") # Default to TEXT if column.category == CategoryType.dimension: dimension = { "name": clean_name(column.name), @@ -96,40 +103,18 @@ def export(self, **kwargs) -> dict: # Process relationships for rel in manifest.relationships.values(): - source_table_name = rel.source.table - target_table_name = rel.target.table - - source_table_info = manifest.sources.get(source_table_name) - target_table_info = manifest.sources.get(target_table_name) - - if not source_table_info or not target_table_info: - continue - - # Determine which table is the 'one' side (contains the PK for the join) - if source_table_info.table.key == rel.source.column: - # source is the 'one' side - right_table = source_table_name - right_column = rel.source.column - left_table = target_table_name - left_column = rel.target.column - elif target_table_info.table.key == rel.target.column: - # target is the 'one' side - right_table = target_table_name - right_column = rel.target.column - left_table = source_table_name - left_column = rel.source.column - else: - # This is not a valid FK relationship for Snowflake's semantic model + resolved = resolve_relationship_direction(rel, manifest.sources) + if not resolved: continue 
relationship = { "name": rel.name, - "left_table": clean_name(left_table), - "right_table": clean_name(right_table), + "left_table": clean_name(resolved.child_table), + "right_table": clean_name(resolved.parent_table), "relationship_columns": [ { - "left_column": clean_name(left_column), - "right_column": clean_name(right_column) + "left_column": clean_name(resolved.child_column), + "right_column": clean_name(resolved.parent_column) } ], "join_type": "left_outer", diff --git a/uv.lock b/uv.lock index fd9f494..fb41e9e 100644 --- a/uv.lock +++ b/uv.lock @@ -867,6 +867,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, ] +[[package]] +name = "databricks-sql-connector" +version = "4.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lz4" }, + { name = "oauthlib" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "pyjwt" }, + { name = "python-dateutil" }, + { name = "requests" }, + { name = "thrift" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e5/e9/6f538a27ffa79a34ddb7ea61e18fdce8420aca899c7cecdcad61b56d75c7/databricks_sql_connector-4.1.3.tar.gz", hash = "sha256:225cef7c3454e93d7a700dd336c665a44b04ab8f80236bd9be815bc42e7d2468", size = 175553, upload-time = "2025-09-17T17:56:28.733Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/6f/56617e6a866d8f0ba54f1df1b7d2feb9fe63e27e06033c81ce20ad894721/databricks_sql_connector-4.1.3-py3-none-any.whl", hash = "sha256:471c5acc2ce4ee4efedf66d33e6f86f5cd349496a85826ee95fc59b485d2daef", size = 198774, upload-time = "2025-09-17T17:56:27.112Z" }, +] + [[package]] name = "dataclasses-json" version = "0.6.7" @@ -1036,6 +1056,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/de/15/545e2b6cf2e3be84bc1ed85613edd75b8aea69807a71c26f4ca6a9258e82/email_validator-2.3.0-py3-none-any.whl", hash = "sha256:80f13f623413e6b197ae73bb10bf4eb0908faf509ad8362c5edeb0be7fd450b4", size = 35604, upload-time = "2025-08-26T13:09:05.858Z" }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -1639,7 +1668,7 @@ wheels = [ [[package]] name = "intugle" -version = "1.0.3" +version = "1.0.4" source = { editable = "." 
} dependencies = [ { name = "asyncpg" }, @@ -1672,6 +1701,11 @@ dependencies = [ ] [package.optional-dependencies] +databricks = [ + { name = "databricks-sql-connector" }, + { name = "pyspark" }, + { name = "sqlglot" }, +] snowflake = [ { name = "snowflake-snowpark-python", extra = ["pandas"] }, ] @@ -1697,6 +1731,7 @@ test = [ [package.metadata] requires-dist = [ { name = "asyncpg", specifier = ">=0.30.0" }, + { name = "databricks-sql-connector", marker = "extra == 'databricks'", specifier = ">=4.1.3" }, { name = "duckdb", specifier = ">=1.3.2" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.116.1" }, { name = "langchain", extras = ["anthropic", "google-genai", "openai"], specifier = ">=0.3.27" }, @@ -1713,17 +1748,19 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.11.7" }, { name = "pydantic-settings", specifier = ">=2.10.1" }, { name = "pyfunctional", specifier = ">=1.5.0" }, + { name = "pyspark", marker = "extra == 'databricks'", specifier = ">=3.5.0" }, { name = "python-dotenv", specifier = ">=1.1.1" }, { name = "pyyaml", specifier = ">=6.0.2" }, { name = "qdrant-client", specifier = ">=1.15.1" }, { name = "rich", specifier = ">=14.1.0" }, { name = "scikit-learn", specifier = "==1.7.1" }, { name = "snowflake-snowpark-python", extras = ["pandas"], marker = "extra == 'snowflake'", specifier = ">=1.12.0" }, + { name = "sqlglot", marker = "extra == 'databricks'", specifier = ">=27.20.0" }, { name = "symspellpy", specifier = ">=6.9.0" }, { name = "trieregex", specifier = ">=1.0.0" }, { name = "xgboost", specifier = ">=3.0.4" }, ] -provides-extras = ["snowflake"] +provides-extras = ["snowflake", "databricks"] [package.metadata.requires-dev] dev = [ @@ -2411,6 +2448,46 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3e/8e/e7a43d907a147e1f87eebdd6737483f9feba52a5d4b20f69d0bd6f2fa22f/langsmith-0.4.31-py3-none-any.whl", hash = "sha256:64f340bdead21defe5f4a6ca330c11073e35444989169f669508edf45a19025f", size = 386347, upload-time = 
"2025-09-25T04:18:16.69Z" }, ] +[[package]] +name = "lz4" +version = "4.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c6/5a/945f5086326d569f14c84ac6f7fcc3229f0b9b1e8cc536b951fd53dfb9e1/lz4-4.4.4.tar.gz", hash = "sha256:070fd0627ec4393011251a094e08ed9fdcc78cb4e7ab28f507638eee4e39abda", size = 171884, upload-time = "2025-04-01T22:55:58.62Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/80/4054e99cda2e003097f59aeb3ad470128f3298db5065174a84564d2d6983/lz4-4.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f170abb8416c4efca48e76cac2c86c3185efdf841aecbe5c190121c42828ced0", size = 220896, upload-time = "2025-04-01T22:55:13.577Z" }, + { url = "https://files.pythonhosted.org/packages/dd/4e/f92424d5734e772b05ddbeec739e2566e2a2336995b36a180e1dd9411e9a/lz4-4.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d33a5105cd96ebd32c3e78d7ece6123a9d2fb7c18b84dec61f27837d9e0c496c", size = 189679, upload-time = "2025-04-01T22:55:15.471Z" }, + { url = "https://files.pythonhosted.org/packages/a2/70/71ffd496067cba6ba352e10b89c0e9cee3e4bc4717ba866b6aa350f4c7ac/lz4-4.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30ebbc5b76b4f0018988825a7e9ce153be4f0d4eba34e6c1f2fcded120573e88", size = 1237940, upload-time = "2025-04-01T22:55:16.498Z" }, + { url = "https://files.pythonhosted.org/packages/6e/59/cf34d1e232b11e1ae7122300be00529f369a7cd80f74ac351d58c4c4eedf/lz4-4.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc64d6dfa7a89397529b22638939e70d85eaedc1bd68e30a29c78bfb65d4f715", size = 1264105, upload-time = "2025-04-01T22:55:17.606Z" }, + { url = "https://files.pythonhosted.org/packages/f9/f6/3a00a98ff5b872d572cc6e9c88e0f6275bea0f3ed1dc1b8f8b736c85784c/lz4-4.4.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a355223a284f42a723c120ce68827de66d5cb872a38732b3d5abbf544fa2fe26", 
size = 1184179, upload-time = "2025-04-01T22:55:19.206Z" }, + { url = "https://files.pythonhosted.org/packages/bc/de/6aeb602786174bad290609c0c988afb1077b74a80eaea23ebc3b5de6e2fa/lz4-4.4.4-cp310-cp310-win32.whl", hash = "sha256:b28228197775b7b5096898851d59ef43ccaf151136f81d9c436bc9ba560bc2ba", size = 88265, upload-time = "2025-04-01T22:55:20.215Z" }, + { url = "https://files.pythonhosted.org/packages/e4/b5/1f52c8b17d02ae637f85911c0135ca08be1c9bbdfb3e7de1c4ae7af0bac6/lz4-4.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:45e7c954546de4f85d895aa735989d77f87dd649f503ce1c8a71a151b092ed36", size = 99916, upload-time = "2025-04-01T22:55:21.332Z" }, + { url = "https://files.pythonhosted.org/packages/01/e7/123587e7dae6cdba48393e4fdad2b9412f43f51346afe9ca6f697029de11/lz4-4.4.4-cp310-cp310-win_arm64.whl", hash = "sha256:e3fc90f766401684740978cd781d73b9685bd81b5dbf7257542ef9de4612e4d2", size = 89746, upload-time = "2025-04-01T22:55:22.205Z" }, + { url = "https://files.pythonhosted.org/packages/28/e8/63843dc5ecb1529eb38e1761ceed04a0ad52a9ad8929ab8b7930ea2e4976/lz4-4.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ddfc7194cd206496c445e9e5b0c47f970ce982c725c87bd22de028884125b68f", size = 220898, upload-time = "2025-04-01T22:55:23.085Z" }, + { url = "https://files.pythonhosted.org/packages/e4/94/c53de5f07c7dc11cf459aab2a1d754f5df5f693bfacbbe1e4914bfd02f1e/lz4-4.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:714f9298c86f8e7278f1c6af23e509044782fa8220eb0260f8f8f1632f820550", size = 189685, upload-time = "2025-04-01T22:55:24.413Z" }, + { url = "https://files.pythonhosted.org/packages/fe/59/c22d516dd0352f2a3415d1f665ccef2f3e74ecec3ca6a8f061a38f97d50d/lz4-4.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8474c91de47733856c6686df3c4aca33753741da7e757979369c2c0d32918ba", size = 1239225, upload-time = "2025-04-01T22:55:25.737Z" }, + { url = 
"https://files.pythonhosted.org/packages/81/af/665685072e71f3f0e626221b7922867ec249cd8376aca761078c8f11f5da/lz4-4.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80dd27d7d680ea02c261c226acf1d41de2fd77af4fb2da62b278a9376e380de0", size = 1265881, upload-time = "2025-04-01T22:55:26.817Z" }, + { url = "https://files.pythonhosted.org/packages/90/04/b4557ae381d3aa451388a29755cc410066f5e2f78c847f66f154f4520a68/lz4-4.4.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b7d6dddfd01b49aedb940fdcaf32f41dc58c926ba35f4e31866aeec2f32f4f4", size = 1185593, upload-time = "2025-04-01T22:55:27.896Z" }, + { url = "https://files.pythonhosted.org/packages/7b/e4/03636979f4e8bf92c557f998ca98ee4e6ef92e92eaf0ed6d3c7f2524e790/lz4-4.4.4-cp311-cp311-win32.whl", hash = "sha256:4134b9fd70ac41954c080b772816bb1afe0c8354ee993015a83430031d686a4c", size = 88259, upload-time = "2025-04-01T22:55:29.03Z" }, + { url = "https://files.pythonhosted.org/packages/07/f0/9efe53b4945441a5d2790d455134843ad86739855b7e6199977bf6dc8898/lz4-4.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:f5024d3ca2383470f7c4ef4d0ed8eabad0b22b23eeefde1c192cf1a38d5e9f78", size = 99916, upload-time = "2025-04-01T22:55:29.933Z" }, + { url = "https://files.pythonhosted.org/packages/87/c8/1675527549ee174b9e1db089f7ddfbb962a97314657269b1e0344a5eaf56/lz4-4.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:6ea715bb3357ea1665f77874cf8f55385ff112553db06f3742d3cdcec08633f7", size = 89741, upload-time = "2025-04-01T22:55:31.184Z" }, + { url = "https://files.pythonhosted.org/packages/f7/2d/5523b4fabe11cd98f040f715728d1932eb7e696bfe94391872a823332b94/lz4-4.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:23ae267494fdd80f0d2a131beff890cf857f1b812ee72dbb96c3204aab725553", size = 220669, upload-time = "2025-04-01T22:55:32.032Z" }, + { url = 
"https://files.pythonhosted.org/packages/91/06/1a5bbcacbfb48d8ee5b6eb3fca6aa84143a81d92946bdb5cd6b005f1863e/lz4-4.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fff9f3a1ed63d45cb6514bfb8293005dc4141341ce3500abdfeb76124c0b9b2e", size = 189661, upload-time = "2025-04-01T22:55:33.413Z" }, + { url = "https://files.pythonhosted.org/packages/fa/08/39eb7ac907f73e11a69a11576a75a9e36406b3241c0ba41453a7eb842abb/lz4-4.4.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ea7f07329f85a8eda4d8cf937b87f27f0ac392c6400f18bea2c667c8b7f8ecc", size = 1238775, upload-time = "2025-04-01T22:55:34.835Z" }, + { url = "https://files.pythonhosted.org/packages/e9/26/05840fbd4233e8d23e88411a066ab19f1e9de332edddb8df2b6a95c7fddc/lz4-4.4.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ccab8f7f7b82f9fa9fc3b0ba584d353bd5aa818d5821d77d5b9447faad2aaad", size = 1265143, upload-time = "2025-04-01T22:55:35.933Z" }, + { url = "https://files.pythonhosted.org/packages/b7/5d/5f2db18c298a419932f3ab2023deb689863cf8fd7ed875b1c43492479af2/lz4-4.4.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e43e9d48b2daf80e486213128b0763deed35bbb7a59b66d1681e205e1702d735", size = 1185032, upload-time = "2025-04-01T22:55:37.454Z" }, + { url = "https://files.pythonhosted.org/packages/c4/e6/736ab5f128694b0f6aac58343bcf37163437ac95997276cd0be3ea4c3342/lz4-4.4.4-cp312-cp312-win32.whl", hash = "sha256:33e01e18e4561b0381b2c33d58e77ceee850a5067f0ece945064cbaac2176962", size = 88284, upload-time = "2025-04-01T22:55:38.536Z" }, + { url = "https://files.pythonhosted.org/packages/40/b8/243430cb62319175070e06e3a94c4c7bd186a812e474e22148ae1290d47d/lz4-4.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:d21d1a2892a2dcc193163dd13eaadabb2c1b803807a5117d8f8588b22eaf9f12", size = 99918, upload-time = "2025-04-01T22:55:39.628Z" }, + { url = 
"https://files.pythonhosted.org/packages/6c/e1/0686c91738f3e6c2e1a243e0fdd4371667c4d2e5009b0a3605806c2aa020/lz4-4.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:2f4f2965c98ab254feddf6b5072854a6935adab7bc81412ec4fe238f07b85f62", size = 89736, upload-time = "2025-04-01T22:55:40.5Z" }, + { url = "https://files.pythonhosted.org/packages/3b/3c/d1d1b926d3688263893461e7c47ed7382a969a0976fc121fc678ec325fc6/lz4-4.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ed6eb9f8deaf25ee4f6fad9625d0955183fdc90c52b6f79a76b7f209af1b6e54", size = 220678, upload-time = "2025-04-01T22:55:41.78Z" }, + { url = "https://files.pythonhosted.org/packages/26/89/8783d98deb058800dabe07e6cdc90f5a2a8502a9bad8c5343c641120ace2/lz4-4.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:18ae4fe3bafb344dbd09f976d45cbf49c05c34416f2462828f9572c1fa6d5af7", size = 189670, upload-time = "2025-04-01T22:55:42.775Z" }, + { url = "https://files.pythonhosted.org/packages/22/ab/a491ace69a83a8914a49f7391e92ca0698f11b28d5ce7b2ececa2be28e9a/lz4-4.4.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57fd20c5fc1a49d1bbd170836fccf9a338847e73664f8e313dce6ac91b8c1e02", size = 1238746, upload-time = "2025-04-01T22:55:43.797Z" }, + { url = "https://files.pythonhosted.org/packages/97/12/a1f2f4fdc6b7159c0d12249456f9fe454665b6126e98dbee9f2bd3cf735c/lz4-4.4.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9cb387c33f014dae4db8cb4ba789c8d2a0a6d045ddff6be13f6c8d9def1d2a6", size = 1265119, upload-time = "2025-04-01T22:55:44.943Z" }, + { url = "https://files.pythonhosted.org/packages/50/6e/e22e50f5207649db6ea83cd31b79049118305be67e96bec60becf317afc6/lz4-4.4.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d0be9f68240231e1e44118a4ebfecd8a5d4184f0bdf5c591c98dd6ade9720afd", size = 1184954, upload-time = "2025-04-01T22:55:46.161Z" }, + { url = 
"https://files.pythonhosted.org/packages/4c/c4/2a458039645fcc6324ece731d4d1361c5daf960b553d1fcb4261ba07d51c/lz4-4.4.4-cp313-cp313-win32.whl", hash = "sha256:e9ec5d45ea43684f87c316542af061ef5febc6a6b322928f059ce1fb289c298a", size = 88289, upload-time = "2025-04-01T22:55:47.601Z" }, + { url = "https://files.pythonhosted.org/packages/00/96/b8e24ea7537ab418074c226279acfcaa470e1ea8271003e24909b6db942b/lz4-4.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:a760a175b46325b2bb33b1f2bbfb8aa21b48e1b9653e29c10b6834f9bb44ead4", size = 99925, upload-time = "2025-04-01T22:55:48.463Z" }, + { url = "https://files.pythonhosted.org/packages/a5/a5/f9838fe6aa132cfd22733ed2729d0592259fff074cefb80f19aa0607367b/lz4-4.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:f4c21648d81e0dda38b4720dccc9006ae33b0e9e7ffe88af6bf7d4ec124e2fba", size = 89743, upload-time = "2025-04-01T22:55:49.716Z" }, +] + [[package]] name = "markdown-it-py" version = "4.0.0" @@ -2988,6 +3065,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/24/11df42593d1a6d10b3ffef049cec064832f108e77bc5cac12726e4ec1cb2/nvidia_nccl_cu12-2.28.3-py3-none-manylinux_2_18_x86_64.whl", hash = "sha256:79cf0412094e4a552889e5cb7757d92c010ead557ec722c5eebe6a94b1d8681c", size = 295901337, upload-time = "2025-09-06T00:32:01.348Z" }, ] +[[package]] +name = "oauthlib" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/5f/19930f824ffeb0ad4372da4812c50edbd1434f678c90c2733e1188edfc63/oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9", size = 185918, upload-time = "2025-06-19T22:48:08.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" }, +] + 
[[package]] name = "openai" version = "2.0.1" @@ -3007,6 +3093,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4d/5d/7b8dc822de474a283a190fe222d9a074e2fdecfbcb4a14ff49ad4d555404/openai-2.0.1-py3-none-any.whl", hash = "sha256:f0671423666cfd24c15010fd4732738f89f1b6d4f21c47f5c82db411cc2648d5", size = 956304, upload-time = "2025-10-01T19:49:07.497Z" }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + [[package]] name = "orjson" version = "3.11.3" @@ -3135,7 +3233,7 @@ wheels = [ [[package]] name = "pandas" -version = "2.3.3" +version = "2.2.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -3144,55 +3242,42 @@ dependencies = [ { name = "pytz" }, { name = "tzdata" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" }, - { url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" }, - { url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" }, - { url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" }, - { url = "https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" }, - { url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" }, - { url = 
"https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" }, - { url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" }, - { url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" }, - { url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" }, - { url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" }, - { url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" }, - { url = 
"https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" }, - { url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" }, - { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" }, - { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" }, - { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" }, - { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" }, - { url = 
"https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" }, - { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" }, - { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" }, - { url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" }, - { url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" }, - { url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" }, - { url = 
"https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" }, - { url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" }, - { url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" }, - { url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" }, - { url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" }, - { url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" }, - { url = 
"https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" }, - { url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" }, - { url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" }, - { url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" }, - { url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" }, - { url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" }, - { url = 
"https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" }, - { url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" }, - { url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" }, - { url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" }, - { url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" }, - { url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" }, - { url = 
"https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" }, - { url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" }, - { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" }, - { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" }, - { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213, upload-time = "2024-09-20T13:10:04.827Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827, upload-time = "2024-09-20T13:08:42.347Z" }, + { url = "https://files.pythonhosted.org/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897, upload-time = "2024-09-20T13:08:45.807Z" }, + { url = "https://files.pythonhosted.org/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908, upload-time = "2024-09-20T18:37:13.513Z" }, + { url = "https://files.pythonhosted.org/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210, upload-time = "2024-09-20T13:08:48.325Z" }, + { url = "https://files.pythonhosted.org/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292, upload-time = "2024-09-20T19:01:54.443Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379, upload-time = "2024-09-20T13:08:50.882Z" }, + { url = 
"https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471, upload-time = "2024-09-20T13:08:53.332Z" }, + { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222, upload-time = "2024-09-20T13:08:56.254Z" }, + { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274, upload-time = "2024-09-20T13:08:58.645Z" }, + { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836, upload-time = "2024-09-20T19:01:57.571Z" }, + { url = "https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505, upload-time = "2024-09-20T13:09:01.501Z" }, + { url = "https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420, upload-time = "2024-09-20T19:02:00.678Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457, upload-time = "2024-09-20T13:09:04.105Z" }, + { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166, upload-time = "2024-09-20T13:09:06.917Z" }, + { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893, upload-time = "2024-09-20T13:09:09.655Z" }, + { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475, upload-time = "2024-09-20T13:09:14.718Z" }, + { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645, upload-time = "2024-09-20T19:02:03.88Z" }, + { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445, upload-time = "2024-09-20T13:09:17.621Z" }, + { url = 
"https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235, upload-time = "2024-09-20T19:02:07.094Z" }, + { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756, upload-time = "2024-09-20T13:09:20.474Z" }, + { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248, upload-time = "2024-09-20T13:09:23.137Z" }, + { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643, upload-time = "2024-09-20T13:09:25.522Z" }, + { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573, upload-time = "2024-09-20T13:09:28.012Z" }, + { url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085, upload-time = "2024-09-20T19:02:10.451Z" }, + { url = 
"https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809, upload-time = "2024-09-20T13:09:30.814Z" }, + { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316, upload-time = "2024-09-20T19:02:13.825Z" }, + { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055, upload-time = "2024-09-20T13:09:33.462Z" }, + { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175, upload-time = "2024-09-20T13:09:35.871Z" }, + { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650, upload-time = "2024-09-20T13:09:38.685Z" }, + { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177, upload-time = "2024-09-20T13:09:41.141Z" }, + { url = 
"https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526, upload-time = "2024-09-20T19:02:16.905Z" }, + { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013, upload-time = "2024-09-20T13:09:44.39Z" }, + { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620, upload-time = "2024-09-20T19:02:20.639Z" }, + { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload-time = "2024-09-20T13:09:48.112Z" }, ] [[package]] @@ -3509,6 +3594,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, ] +[[package]] +name = "py4j" +version = "0.10.9.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/38/31/0b210511177070c8d5d3059556194352e5753602fa64b85b7ab81ec1a009/py4j-0.10.9.9.tar.gz", hash = "sha256:f694cad19efa5bd1dee4f3e5270eb406613c974394035e5bfc4ec1aba870b879", size = 761089, upload-time = 
"2025-01-15T03:53:18.624Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/db/ea0203e495be491c85af87b66e37acfd3bf756fd985f87e46fc5e3bf022c/py4j-0.10.9.9-py2.py3-none-any.whl", hash = "sha256:c7c26e4158defb37b0bb124933163641a2ff6e3a3913f7811b0ddbe07ed61533", size = 203008, upload-time = "2025-01-15T03:53:15.648Z" }, +] + [[package]] name = "pyaml" version = "25.7.0" @@ -3769,6 +3863,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/5e/1aa9a93198c6b64513c9d7752de7422c06402de6600a8767da1524f9570b/pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e", size = 113890, upload-time = "2025-09-21T04:11:04.117Z" }, ] +[[package]] +name = "pyspark" +version = "4.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "py4j" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ae/40/1414582f16c1d7b051c668c2e19c62d21a18bd181d944cb24f5ddbb2423f/pyspark-4.0.1.tar.gz", hash = "sha256:9d1f22d994f60369228397e3479003ffe2dd736ba79165003246ff7bd48e2c73", size = 434204896, upload-time = "2025-09-06T07:15:57.091Z" } + [[package]] name = "pytest" version = "8.4.2" @@ -4893,6 +4996,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/d9/13bdde6521f322861fab67473cec4b1cc8999f3871953531cf61945fad92/sqlalchemy-2.0.43-py3-none-any.whl", hash = "sha256:1681c21dd2ccee222c2fe0bef671d1aef7c504087c9c4e800371cfcc8ac966fc", size = 1924759, upload-time = "2025-08-11T15:39:53.024Z" }, ] +[[package]] +name = "sqlglot" +version = "27.20.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/76/9d/31eac378d98b5d7f945981dc27bc34a3959a60c09ed9ca78cf9c9f95fe52/sqlglot-27.20.0.tar.gz", hash = "sha256:92e7a93200eb588eb17cf19c813103160bd6c9b261ffd295eea79633657569d9", size = 5480742, upload-time = "2025-09-30T13:36:20.987Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/bf/53/4c87ef36f743d7b2a414839677f08ea0904400afd376b3785b6ed1593d0b/sqlglot-27.20.0-py3-none-any.whl", hash = "sha256:9c0b67bbb8e0a9300e34eb2984bf825bca6356cf3cdcc7637658058f2afe41ca", size = 520739, upload-time = "2025-09-30T13:36:18.322Z" }, +] + [[package]] name = "sse-starlette" version = "3.0.2" @@ -4971,6 +5083,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, ] +[[package]] +name = "thrift" +version = "0.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3c/2d/8946864f716ac82dcc88d290ed613cba7a80ec75df4f553ec3ff275f486e/thrift-0.20.0.tar.gz", hash = "sha256:4dd662eadf6b8aebe8a41729527bd69adf6ceaa2a8681cbef64d1273b3e8feba", size = 62295, upload-time = "2024-03-22T22:53:08.228Z" } + [[package]] name = "tiktoken" version = "0.11.0"