diff --git a/docsite/docs/core-concepts/dataset.md b/docsite/docs/core-concepts/dataset.md index 78e4d10..ee3ef0c 100644 --- a/docsite/docs/core-concepts/dataset.md +++ b/docsite/docs/core-concepts/dataset.md @@ -40,10 +40,12 @@ The library organizes metadata using Pydantic models, but you can access it thro - `[column_name].tags: Optional[List[str]]` - `[column_name].profiling_metrics: Optional[ColumnProfilingMetrics]` - **`ColumnProfilingMetrics`**: Detailed statistics for a column. - - `count: Optional[int]` - - `null_count: Optional[int]` - - `distinct_count: Optional[int]` - - `sample_data: Optional[List[Any]]` + - `.count: Optional[int]` + - `.null_count: Optional[int]` + - `.distinct_count: Optional[int]` + - `.sample_data: Optional[List[Any]]` + - `.uniqueness: Optional[float]` (Read-only property) + - `.completeness: Optional[float]` (Read-only property) #### Example of accessing metadata @@ -64,6 +66,8 @@ print(f"Column Description: {email_column.description}") metrics = email_column.profiling_metrics if metrics: print(f"Distinct Count: {metrics.distinct_count}") + print(f"Uniqueness: {metrics.uniqueness}") + print(f"Completeness: {metrics.completeness}") ``` ### Automatic caching @@ -75,7 +79,7 @@ The `DataSet` object avoids redundant work. When you initialize a `DataSet`, it You can run the analysis pipeline step-by-step for more granular control. Each of these methods includes a `save=True` option to persist the results of that specific stage. 
```python -from intugle import DataSet +from intugle.analysis.models import DataSet  # Initialize the dataset data_source = {"path": "path/to/my_data.csv", "type": "csv"} @@ -124,4 +128,4 @@ profiles = dataset.profiling_df # Display the first 5 rows print(profiles.head()) -``` +``` \ No newline at end of file diff --git a/docsite/docs/core-concepts/knowledge-builder.md b/docsite/docs/core-concepts/knowledge-builder.md index 39c7b46..4168994 100644 --- a/docsite/docs/core-concepts/knowledge-builder.md +++ b/docsite/docs/core-concepts/knowledge-builder.md @@ -35,7 +35,8 @@ You can initialize the `KnowledgeBuilder` in two ways: 2. **With a List of `DataSet` Objects**: If you have already created `DataSet` objects, you can pass a list of them directly. ```python - from intugle import KnowledgeBuilder, DataSet + from intugle.analysis.models import DataSet + from intugle import KnowledgeBuilder # Create DataSet objects from file-based sources customers_data = {"path": "path/to/customers.csv", "type": "csv"} @@ -129,7 +130,41 @@ link_predictor = kb.link_predictor print(f"Primary Key for customers: {customers_dataset.source_table_model.description}") print("Discovered Links:") print(link_predictor.get_links_df()) ``` +    Learn more about what you can do with these objects. See the [DataSet](./dataset.md) and [Link Prediction](./link-prediction.md) documentation. +## Utility DataFrames +

+The `KnowledgeBuilder` provides three convenient properties that consolidate the results from all processed datasets into single Pandas DataFrames. + +### `profiling_df` + +Returns a DataFrame containing the full profiling metrics for every column across all datasets. + +```python +# Get a single DataFrame of all column profiles +all_profiles = kb.profiling_df +print(all_profiles.head()) +``` + +### `links_df` + +A shortcut to the `get_links_df()` method on the `LinkPredictor`, this property returns a DataFrame of all discovered relationships. 
+ +```python +# Get a DataFrame of all predicted links +all_links = kb.links_df +print(all_links) +``` + +### `glossary_df` + +Returns a DataFrame that serves as a consolidated business glossary, listing the table name, column name, description, and tags for every column across all datasets. + +```python +# Get a single, unified business glossary +full_glossary = kb.glossary_df +print(full_glossary.head()) +``` \ No newline at end of file diff --git a/docsite/docs/core-concepts/link-prediction.md b/docsite/docs/core-concepts/link-prediction.md index f95a4db..c5379e6 100644 --- a/docsite/docs/core-concepts/link-prediction.md +++ b/docsite/docs/core-concepts/link-prediction.md @@ -28,7 +28,9 @@ links_list = predictor_instance.links To use the `LinkPredictor` manually, you must give it a list of fully profiled `DataSet` objects. ```python -from intugle import DataSet, LinkPredictor +from intugle.analysis.models import DataSet +from intugle.link_predictor.predictor import LinkPredictor + # 1. 
Initialize and fully profile your DataSet objects first customers_data = {"path": "path/to/customers.csv", "type": "csv"} diff --git a/notebooks/quickstart_healthcare.ipynb b/notebooks/quickstart_healthcare.ipynb index 1e0f796..2aea4de 100644 --- a/notebooks/quickstart_healthcare.ipynb +++ b/notebooks/quickstart_healthcare.ipynb @@ -2916,7 +2916,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "0aa894eb", "metadata": {}, "outputs": [ @@ -3610,8 +3610,8 @@ } ], "source": [ - "allergies_dataset = kb.datasets['patients']\n", - "allergies_dataset.profiling_df\n" + "patients_dataset = kb.datasets['patients']\n", + "patients_dataset.profiling_df" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 9d7c850..49100cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "intugle" -version = "0.1.6" +version = "0.1.7" authors = [ { name="Intugle", email="hello@intugle.ai" }, ] diff --git a/src/intugle/knowledge_builder.py b/src/intugle/knowledge_builder.py index ba0b630..92a6326 100644 --- a/src/intugle/knowledge_builder.py +++ b/src/intugle/knowledge_builder.py @@ -4,6 +4,8 @@ from typing import TYPE_CHECKING, Any, Awaitable, Dict, List, TypeVar +import pandas as pd + from intugle.analysis.models import DataSet from intugle.core.console import console, success_style from intugle.link_predictor.predictor import LinkPredictor @@ -125,6 +127,35 @@ def build(self, force_recreate: bool = False): return self + @property + def profiling_df(self) -> pd.DataFrame: + """Returns a consolidated DataFrame of profiling metrics for all datasets.""" + all_profiles = [dataset.profiling_df for dataset in self.datasets.values()] + return pd.concat(all_profiles, ignore_index=True) + + @property + def links_df(self) -> pd.DataFrame: + """Returns the predicted links as a pandas DataFrame.""" + if hasattr(self, "link_predictor"): + return self.link_predictor.get_links_df() + return 
pd.DataFrame() + + @property + def glossary_df(self) -> pd.DataFrame: + """Returns a consolidated DataFrame of glossary information for all datasets.""" + glossary_data = [] + for dataset in self.datasets.values(): + for column in dataset.source_table_model.columns: + glossary_data.append( + { + "table_name": dataset.name, + "column_name": column.name, + "column_description": column.description, + "column_tags": column.tags, + } + ) + return pd.DataFrame(glossary_data) + def initialize_semantic_search(self): """Initialize the semantic search engine.""" try: @@ -150,5 +181,4 @@ def search(self, query: str): return _run_async_in_sync(search_client.search(query)) except Exception as e: log.error(f"Could not perform semantic search: {e}") - raise e - + raise e \ No newline at end of file diff --git a/src/intugle/models/resources/model.py b/src/intugle/models/resources/model.py index 4cff1da..0069a39 100644 --- a/src/intugle/models/resources/model.py +++ b/src/intugle/models/resources/model.py @@ -13,6 +13,20 @@ class ColumnProfilingMetrics(SchemaBase): sample_data: Optional[List[Any]] = Field(default_factory=list) dtype_sample: Optional[List[Any]] = Field(default_factory=list, exclude=True) + @property + def uniqueness(self) -> Optional[float]: + """The ratio of distinct values to total count.""" + if self.count is not None and self.distinct_count is not None and self.count > 0: + return self.distinct_count / self.count + return None + + @property + def completeness(self) -> Optional[float]: + """The ratio of non-null values to total count.""" + if self.count is not None and self.null_count is not None and self.count > 0: + return (self.count - self.null_count) / self.count + return None + class Column(SchemaBase): name: str @@ -37,4 +51,4 @@ class ModelProfilingMetrics(SchemaBase): class Model(BaseResource): resource_type: NodeType = NodeType.MODEL columns: List[Column] = Field(default_factory=list) - profiling_metrics: Optional[ModelProfilingMetrics] = None + 
profiling_metrics: Optional[ModelProfilingMetrics] = None \ No newline at end of file diff --git a/uv.lock b/uv.lock index 20bb35b..e66941f 100644 --- a/uv.lock +++ b/uv.lock @@ -1458,7 +1458,7 @@ wheels = [ [[package]] name = "intugle" -version = "0.1.6" +version = "0.1.7" source = { editable = "." } dependencies = [ { name = "asyncpg" },