Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ __marimo__/
notes.txt

testing_base
models
/models
models_bak

settings.json
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ export OPENAI_API_KEY="your-openai-api-key"

For a detailed, hands-on introduction to the project, please see our quickstart notebooks:

* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset.
* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset
* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset.
* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset.

These datasets will take you through the following steps:

Expand Down
20 changes: 20 additions & 0 deletions docsite/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Dependencies
/node_modules

# Production
/build

# Generated files
.docusaurus
.cache-loader

# Misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local

npm-debug.log*
yarn-debug.log*
yarn-error.log*
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ dependencies = [

[project.scripts]
intugle-mcp = "intugle.mcp.server:main"
intugle-streamlit = "intugle.cli:export_data"

[dependency-groups]
test = [
Expand Down
11 changes: 11 additions & 0 deletions src/intugle/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from intugle.streamlit import StreamlitApp


def export_data():
    """Export the analysis results (profiles, links, glossary) to CSV files."""
    StreamlitApp().export_analysis_to_csv()


if __name__ == "__main__":
    export_data()
36 changes: 36 additions & 0 deletions src/intugle/exporters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os

import pandas as pd

from intugle.parser.manifest import Manifest


class CSVExporter:
    """Export manifest-derived analysis artifacts (column profiles, link
    predictions, business glossary) as CSV files under a project directory."""

    def __init__(self, manifest: Manifest, project_base: str):
        # manifest: parsed project manifest exposing the *_df DataFrame properties.
        # project_base: directory into which the CSV files are written.
        self.manifest = manifest
        self.project_base = project_base

    def _export_column_profiles(self, file_path: str):
        """Write column profiles, excluding glossary/tag columns (those are
        exported separately by ``_export_business_glossary``)."""
        df = self.manifest.profiles_df
        # errors="ignore" keeps this safe when the profile frame is empty
        # and therefore has none of these columns.
        df.drop(columns=["business_glossary", "business_tags"], errors="ignore").to_csv(
            file_path, index=False
        )

    def _export_link_predictions(self, file_path: str):
        """Write the predicted table-to-table links."""
        self.manifest.links_df.to_csv(file_path, index=False)

    def _export_business_glossary(self, file_path: str):
        """Write the per-column business glossary and tags."""
        self.manifest.business_glossary_df.to_csv(file_path, index=False)

    def export_all(
        self,
        column_profiles_file="column_profiles.csv",
        link_predictions_file="link_predictions.csv",
        business_glossary_file="business_glossary.csv",
    ):
        """Export all three artifacts into ``project_base``.

        Creates the target directory first so exporting into a fresh
        project base does not make ``to_csv`` fail with a missing-directory
        error (the original code assumed the directory already existed).
        """
        os.makedirs(self.project_base, exist_ok=True)
        self._export_column_profiles(os.path.join(self.project_base, column_profiles_file))
        self._export_link_predictions(os.path.join(self.project_base, link_predictions_file))
        self._export_business_glossary(os.path.join(self.project_base, business_glossary_file))
2 changes: 1 addition & 1 deletion src/intugle/link_predictor/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def _predict_for_pair(
]
return pair_links

def predict(self, filename='relationships.yml', save: bool = False) -> Self:
def predict(self, filename='__relationships__.yml', save: bool = False) -> Self:
"""
Iterates through all unique pairs of datasets, predicts the links for
each pair, and returns the aggregated results.
Expand Down
93 changes: 93 additions & 0 deletions src/intugle/models/manifest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pandas as pd

from pydantic import Field

from intugle.common.schema import SchemaBase
Expand All @@ -10,3 +12,94 @@ class Manifest(SchemaBase):
sources: dict[str, Source] = Field(default_factory=dict)
models: dict[str, Model] = Field(default_factory=dict)
relationships: dict[str, Relationship] = Field(default_factory=dict)

@property
def profiles_df(self) -> pd.DataFrame:
    """Generates a DataFrame with column profiling information."""
    rows = []
    for source in self.sources.values():
        table = source.table
        for col in table.columns:
            m = col.profiling_metrics
            total = m.count
            # Guard against division by zero on empty columns.
            rows.append({
                "table_name": table.name,
                "column_name": col.name,
                "data_type_l1": col.type,
                "data_type_l2": col.category,
                "count": total,
                "null_count": m.null_count,
                "distinct_count": m.distinct_count,
                "uniqueness": m.distinct_count / total if total else 0,
                "completeness": (total - m.null_count) / total if total else 0,
                "sample_values": m.sample_data,
                "business_glossary": col.description,
                "business_tags": col.tags,
            })
    return pd.DataFrame(rows)

@property
def links_df(self) -> pd.DataFrame:
    """Generates a DataFrame with link prediction information."""

    def _side_stats(prefix, table_name, column):
        # Per-side (left/right) stats for one endpoint of a relationship.
        m = column.profiling_metrics
        n = m.count
        return {
            f"{prefix}_table": table_name,
            f"{prefix}_column": column.name,
            f"{prefix}_data_type_l1": column.type,
            f"{prefix}_data_type_l2": column.category,
            f"{prefix}_count": n,
            f"{prefix}_uniqueness": m.distinct_count / n if n else 0,
            f"{prefix}_completeness": (n - m.null_count) / n if n else 0,
            f"{prefix}_sample_values": m.sample_data,
        }

    def _find_column(source, column_name):
        # First column with a matching name, or None when absent.
        return next((c for c in source.table.columns if c.name == column_name), None)

    rows = []
    for relationship in self.relationships.values():
        left_source = self.sources.get(relationship.source.table)
        right_source = self.sources.get(relationship.target.table)
        if not (left_source and right_source):
            # Skip relationships pointing at tables missing from the manifest.
            continue

        left_column = _find_column(left_source, relationship.source.column)
        right_column = _find_column(right_source, relationship.target.column)
        if left_column and right_column:
            rows.append({
                **_side_stats("left", relationship.source.table, left_column),
                **_side_stats("right", relationship.target.table, right_column),
            })
    return pd.DataFrame(rows)

@property
def business_glossary_df(self) -> pd.DataFrame:
    """Generates a DataFrame with business glossary information."""
    records = [
        {
            "table_name": source.table.name,
            "column_name": column.name,
            "business_glossary": column.description,
            "business_tags": column.tags,
        }
        for source in self.sources.values()
        for column in source.table.columns
    ]
    return pd.DataFrame(records)
32 changes: 32 additions & 0 deletions src/intugle/streamlit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

from intugle.analysis.models import DataSet
from intugle.core import settings
from intugle.exporters import CSVExporter
from intugle.parser.manifest import ManifestLoader


class StreamlitApp:
    """Loads a saved project manifest and re-materializes its datasets,
    exposing a CSV export of the analysis results."""

    def __init__(self, project_base: str = settings.PROJECT_BASE):
        # Parse the manifest stored under the project base directory.
        self.manifest_loader = ManifestLoader(project_base)
        self.manifest_loader.load()
        self.manifest = self.manifest_loader.manifest

        self.project_base = project_base

        self.load_all()

    def load_all(self):
        """Instantiate a DataSet for every source in the manifest.

        DataSet construction appears to be done for its side effects
        (registration/hydration); the instances are not retained here.
        """
        for source in self.manifest.sources.values():
            table_name = source.table.name
            details = source.table.details
            DataSet(data=details, name=table_name)

    def export_analysis_to_csv(self):
        """Exports the analysis results to CSV files."""
        exporter = CSVExporter(self.manifest, self.project_base)
        exporter.export_all()
        # Bug fix: corrected typo in user-facing message ("Succesfulluy").
        print("Successfully exported analysis results to CSV files.")


Loading