diff --git a/.gitignore b/.gitignore index a8e0c01..9ac7fef 100644 --- a/.gitignore +++ b/.gitignore @@ -208,7 +208,7 @@ __marimo__/ notes.txt testing_base -models +/models models_bak settings.json diff --git a/README.md b/README.md index 3168785..7779210 100644 --- a/README.md +++ b/README.md @@ -81,8 +81,8 @@ export OPENAI_API_KEY="your-openai-api-key" For a detailed, hands-on introduction to the project, please see our quickstart notebooks: -* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset. -* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset +* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset. 
+* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset These datasets will take you through the following steps: diff --git a/docsite/.gitignore b/docsite/.gitignore new file mode 100644 index 0000000..b2d6de3 --- /dev/null +++ b/docsite/.gitignore @@ -0,0 +1,20 @@ +# Dependencies +/node_modules + +# Production +/build + +# Generated files +.docusaurus +.cache-loader + +# Misc +.DS_Store +.env.local +.env.development.local +.env.test.local +.env.production.local + +npm-debug.log* +yarn-debug.log* +yarn-error.log* diff --git a/pyproject.toml b/pyproject.toml index 773fae7..b90c813 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ dependencies = [ [project.scripts] intugle-mcp = "intugle.mcp.server:main" +intugle-streamlit = "intugle.cli:export_data" [dependency-groups] test = [ diff --git a/src/intugle/cli.py b/src/intugle/cli.py new file mode 100644 index 0000000..bbd98e1 --- /dev/null +++ b/src/intugle/cli.py @@ -0,0 +1,11 @@ +from intugle.streamlit import StreamlitApp + + +def export_data(): + """Exports the analysis results to CSV files.""" + app = StreamlitApp() + app.export_analysis_to_csv() + + +if __name__ == "__main__": + export_data() diff --git a/src/intugle/exporters/__init__.py b/src/intugle/exporters/__init__.py new file mode 100644 index 0000000..744aa56 --- /dev/null +++ b/src/intugle/exporters/__init__.py @@ -0,0 +1,36 @@ +import os + +import pandas as pd + +from intugle.parser.manifest import Manifest + + +class CSVExporter: + def __init__(self, manifest: Manifest, project_base: str): + self.manifest = manifest + self.project_base = project_base + + def _export_column_profiles(self, file_path: str): + df = self.manifest.profiles_df + 
profile_columns_to_keep = [ + col for col in df.columns if col not in ["business_glossary", "business_tags"] + ] + df[profile_columns_to_keep].to_csv(file_path, index=False) + + def _export_link_predictions(self, file_path: str): + df = self.manifest.links_df + df.to_csv(file_path, index=False) + + def _export_business_glossary(self, file_path: str): + df = self.manifest.business_glossary_df + df.to_csv(file_path, index=False) + + def export_all( + self, + column_profiles_file="column_profiles.csv", + link_predictions_file="link_predictions.csv", + business_glossary_file="business_glossary.csv", + ): + self._export_column_profiles(os.path.join(self.project_base, column_profiles_file)) + self._export_link_predictions(os.path.join(self.project_base, link_predictions_file)) + self._export_business_glossary(os.path.join(self.project_base, business_glossary_file)) diff --git a/src/intugle/link_predictor/predictor.py b/src/intugle/link_predictor/predictor.py index c10410e..057a2a7 100644 --- a/src/intugle/link_predictor/predictor.py +++ b/src/intugle/link_predictor/predictor.py @@ -143,7 +143,7 @@ def _predict_for_pair( ] return pair_links - def predict(self, filename='relationships.yml', save: bool = False) -> Self: + def predict(self, filename='__relationships__.yml', save: bool = False) -> Self: """ Iterates through all unique pairs of datasets, predicts the links for each pair, and returns the aggregated results. 
diff --git a/src/intugle/models/manifest.py b/src/intugle/models/manifest.py index 17aa331..7b84b81 100644 --- a/src/intugle/models/manifest.py +++ b/src/intugle/models/manifest.py @@ -1,3 +1,5 @@ +import pandas as pd + from pydantic import Field from intugle.common.schema import SchemaBase @@ -10,3 +12,94 @@ class Manifest(SchemaBase): sources: dict[str, Source] = Field(default_factory=dict) models: dict[str, Model] = Field(default_factory=dict) relationships: dict[str, Relationship] = Field(default_factory=dict) + + @property + def profiles_df(self) -> pd.DataFrame: + """Generates a DataFrame with column profiling information.""" + all_profiles = [] + for source in self.sources.values(): + for column in source.table.columns: + metrics = column.profiling_metrics + profile_data = { + "table_name": source.table.name, + "column_name": column.name, + "data_type_l1": column.type, + "data_type_l2": column.category, + "count": metrics.count, + "null_count": metrics.null_count, + "distinct_count": metrics.distinct_count, + "uniqueness": metrics.distinct_count / metrics.count if metrics.count else 0, + "completeness": (metrics.count - metrics.null_count) / metrics.count if metrics.count else 0, + "sample_values": metrics.sample_data, + "business_glossary": column.description, + "business_tags": column.tags, + } + all_profiles.append(profile_data) + return pd.DataFrame(all_profiles) + + @property + def links_df(self) -> pd.DataFrame: + """Generates a DataFrame with link prediction information.""" + link_data = [] + for relationship in self.relationships.values(): + left_table_name = relationship.source.table + left_column_name = relationship.source.column + right_table_name = relationship.target.table + right_column_name = relationship.target.column + + left_source = self.sources.get(left_table_name) + right_source = self.sources.get(right_table_name) + + if left_source and right_source: + left_column = next((c for c in left_source.table.columns if c.name == 
left_column_name), None) + right_column = next((c for c in right_source.table.columns if c.name == right_column_name), None) + + if left_column and right_column: + left_metrics = left_column.profiling_metrics + right_metrics = right_column.profiling_metrics + link_data.append( + { + "left_table": left_table_name, + "left_column": left_column_name, + "left_data_type_l1": left_column.type, + "left_data_type_l2": left_column.category, + "left_count": left_metrics.count, + "left_uniqueness": left_metrics.distinct_count / left_metrics.count + if left_metrics.count + else 0, + "left_completeness": (left_metrics.count - left_metrics.null_count) / left_metrics.count + if left_metrics.count + else 0, + "left_sample_values": left_metrics.sample_data, + "right_table": right_table_name, + "right_column": right_column_name, + "right_data_type_l1": right_column.type, + "right_data_type_l2": right_column.category, + "right_count": right_metrics.count, + "right_uniqueness": right_metrics.distinct_count / right_metrics.count + if right_metrics.count + else 0, + "right_completeness": (right_metrics.count - right_metrics.null_count) + / right_metrics.count + if right_metrics.count + else 0, + "right_sample_values": right_metrics.sample_data, + } + ) + return pd.DataFrame(link_data) + + @property + def business_glossary_df(self) -> pd.DataFrame: + """Generates a DataFrame with business glossary information.""" + glossary_data = [] + for source in self.sources.values(): + for column in source.table.columns: + glossary_data.append( + { + "table_name": source.table.name, + "column_name": column.name, + "business_glossary": column.description, + "business_tags": column.tags, + } + ) + return pd.DataFrame(glossary_data) diff --git a/src/intugle/streamlit.py b/src/intugle/streamlit.py new file mode 100644 index 0000000..7fb6da2 --- /dev/null +++ b/src/intugle/streamlit.py @@ -0,0 +1,32 @@ + +from intugle.analysis.models import DataSet +from intugle.core import settings +from 
intugle.exporters import CSVExporter +from intugle.parser.manifest import ManifestLoader + + +class StreamlitApp: + + def __init__(self, project_base: str = settings.PROJECT_BASE): + self.manifest_loader = ManifestLoader(project_base) + self.manifest_loader.load() + self.manifest = self.manifest_loader.manifest + + self.project_base = project_base + + self.load_all() + + def load_all(self): + sources = self.manifest.sources + for source in sources.values(): + table_name = source.table.name + details = source.table.details + DataSet(data=details, name=table_name) + + def export_analysis_to_csv(self): + """Exports the analysis results to CSV files.""" + exporter = CSVExporter(self.manifest, self.project_base) + exporter.export_all() + print("Successfully exported analysis results to CSV files.") + +