16 changes: 10 additions & 6 deletions docsite/docs/core-concepts/dataset.md
@@ -40,10 +40,12 @@ The library organizes metadata using Pydantic models, but you can access it thro
  - `[column_name].tags: Optional[List[str]]`
  - `[column_name].profiling_metrics: Optional[ColumnProfilingMetrics]`
- **`ColumnProfilingMetrics`**: Detailed statistics for a column.
  - `count: Optional[int]`
  - `null_count: Optional[int]`
  - `distinct_count: Optional[int]`
  - `sample_data: Optional[List[Any]]`
  - `.count: Optional[int]`
  - `.null_count: Optional[int]`
  - `.distinct_count: Optional[int]`
  - `.sample_data: Optional[List[Any]]`
  - `.uniqueness: Optional[float]` (Read-only property)
  - `.completeness: Optional[float]` (Read-only property)

#### Example of accessing metadata

@@ -64,6 +66,8 @@ print(f"Column Description: {email_column.description}")
metrics = email_column.profiling_metrics
if metrics:
print(f"Distinct Count: {metrics.distinct_count}")
print(f"Uniqueness: {metrics.uniqueness}")
print(f"Completeness: {metrics.completeness}")
```
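
Both derived properties are plain ratios over the raw counts (see the `ColumnProfilingMetrics` properties added in `src/intugle/models/resources/model.py` further down this diff). A minimal sketch of the arithmetic, using made-up counts:

```python
# Hypothetical counts for illustration only.
count, null_count, distinct_count = 1000, 50, 920

uniqueness = distinct_count / count          # 920 / 1000 = 0.92
completeness = (count - null_count) / count  # 950 / 1000 = 0.95
```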

### Automatic caching
@@ -75,7 +79,7 @@ The `DataSet` object avoids redundant work. When you initialize a `DataSet`, it
You can run the analysis pipeline step-by-step for more granular control. Each of these methods includes a `save=True` option to persist the results of that specific stage.

```python
from intugle import DataSet
from intugle.analysis.models import DataSet

# Initialize the dataset
data_source = {"path": "path/to/my_data.csv", "type": "csv"}
@@ -124,4 +128,4 @@ profiles = dataset.profiling_df

# Display the first 5 rows
print(profiles.head())
```
38 changes: 36 additions & 2 deletions docsite/docs/core-concepts/knowledge-builder.md
@@ -35,7 +35,8 @@ You can initialize the `KnowledgeBuilder` in two ways:
2. **With a List of `DataSet` Objects**: If you have already created `DataSet` objects, you can pass a list of them directly.

```python
from intugle import KnowledgeBuilder, DataSet
from intugle.analysis.models import DataSet
from intugle import KnowledgeBuilder

# Create DataSet objects from file-based sources
customers_data = {"path": "path/to/customers.csv", "type": "csv"}
@@ -129,7 +130,40 @@ link_predictor = kb.link_predictor
print(f"Primary Key for customers: {customers_dataset.source_table_model.description}")
print("Discovered Links:")
print(link_predictor.get_links_df())

```

Learn more about what you can do with these objects in the [DataSet](./dataset.md) and [Link Prediction](./link-prediction.md) documentation.

## Utility DataFrames

The `KnowledgeBuilder` provides three convenient properties that consolidate the results from all processed datasets into single pandas DataFrames.

### `profiling_df`

Returns a DataFrame containing the full profiling metrics for every column across all datasets.

```python
# Get a single DataFrame of all column profiles
all_profiles = kb.profiling_df
print(all_profiles.head())
```
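
Because every dataset contributes rows to the same frame, quick shape-level checks become one-liners. A minimal sketch that assumes nothing about the frame beyond it being a regular pandas DataFrame:

```python
# How many column profiles were collected, and from how many datasets?
print(f"{len(all_profiles)} profiled columns across {len(kb.datasets)} datasets")
```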

### `links_df`

A shortcut to the `get_links_df()` method on the `LinkPredictor`, this property returns a DataFrame of all discovered relationships.

```python
# Get a DataFrame of all predicted links
all_links = kb.links_df
print(all_links)
```
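
Before `build()` has run there is no `LinkPredictor` attached, and the property falls back to an empty frame (see the `links_df` implementation in `src/intugle/knowledge_builder.py` below), so an emptiness check is a safe guard:

```python
# links_df returns an empty DataFrame until build() has populated the predictor.
if all_links.empty:
    print("No links yet; run kb.build() first.")
```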

### `glossary_df`

Returns a DataFrame that serves as a consolidated business glossary, listing the table name, column name, description, and tags for every column across all datasets.

```python
# Get a single, unified business glossary
full_glossary = kb.glossary_df
print(full_glossary.head())
```
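
The glossary columns are fixed by the implementation (`table_name`, `column_name`, `column_description`, `column_tags`; see `glossary_df` in `src/intugle/knowledge_builder.py` below), so tag-based filtering is straightforward. A sketch that looks for a hypothetical "PII" tag:

```python
# "PII" is an illustrative tag name, not one the library guarantees to emit.
pii_columns = full_glossary[
    full_glossary["column_tags"].apply(lambda tags: bool(tags) and "PII" in tags)
]
print(pii_columns[["table_name", "column_name"]])
```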
4 changes: 3 additions & 1 deletion docsite/docs/core-concepts/link-prediction.md
@@ -28,7 +28,9 @@ links_list = predictor_instance.links
To use the `LinkPredictor` manually, you must give it a list of fully profiled `DataSet` objects.

```python
from intugle import DataSet, LinkPredictor
from intugle.analysis.models import DataSet
from intugle.link_predictor.predictor import LinkPredictor

# 1. Initialize and fully profile your DataSet objects first
customers_data = {"path": "path/to/customers.csv", "type": "csv"}
6 changes: 3 additions & 3 deletions notebooks/quickstart_healthcare.ipynb
@@ -2916,7 +2916,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "0aa894eb",
"metadata": {},
"outputs": [
@@ -3610,8 +3610,8 @@
}
],
"source": [
"allergies_dataset = kb.datasets['patients']\n",
"allergies_dataset.profiling_df\n"
"patients_dataset = kb.datasets['patients']\n",
"patients_dataset.profiling_df"
]
},
{
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "intugle"
version = "0.1.6"
version = "0.1.7"
authors = [
{ name="Intugle", email="[email protected]" },
]
34 changes: 32 additions & 2 deletions src/intugle/knowledge_builder.py
@@ -4,6 +4,8 @@

from typing import TYPE_CHECKING, Any, Awaitable, Dict, List, TypeVar

import pandas as pd

from intugle.analysis.models import DataSet
from intugle.core.console import console, success_style
from intugle.link_predictor.predictor import LinkPredictor
@@ -125,6 +127,35 @@ def build(self, force_recreate: bool = False):

        return self

    @property
    def profiling_df(self) -> pd.DataFrame:
        """Returns a consolidated DataFrame of profiling metrics for all datasets."""
        all_profiles = [dataset.profiling_df for dataset in self.datasets.values()]
        if not all_profiles:
            # No datasets registered yet; pd.concat([]) would raise ValueError.
            return pd.DataFrame()
        return pd.concat(all_profiles, ignore_index=True)

    @property
    def links_df(self) -> pd.DataFrame:
        """Returns the predicted links as a pandas DataFrame."""
        if hasattr(self, "link_predictor"):
            return self.link_predictor.get_links_df()
        return pd.DataFrame()

    @property
    def glossary_df(self) -> pd.DataFrame:
        """Returns a consolidated DataFrame of glossary information for all datasets."""
        glossary_data = []
        for dataset in self.datasets.values():
            for column in dataset.source_table_model.columns:
                glossary_data.append(
                    {
                        "table_name": dataset.name,
                        "column_name": column.name,
                        "column_description": column.description,
                        "column_tags": column.tags,
                    }
                )
        return pd.DataFrame(glossary_data)

    def initialize_semantic_search(self):
        """Initialize the semantic search engine."""
        try:
@@ -150,5 +181,4 @@ def search(self, query: str):
            return _run_async_in_sync(search_client.search(query))
        except Exception as e:
            log.error(f"Could not perform semantic search: {e}")
            raise e
16 changes: 15 additions & 1 deletion src/intugle/models/resources/model.py
@@ -13,6 +13,20 @@ class ColumnProfilingMetrics(SchemaBase):
    sample_data: Optional[List[Any]] = Field(default_factory=list)
    dtype_sample: Optional[List[Any]] = Field(default_factory=list, exclude=True)

    @property
    def uniqueness(self) -> Optional[float]:
        """The ratio of distinct values to total count."""
        if self.count is not None and self.distinct_count is not None and self.count > 0:
            return self.distinct_count / self.count
        return None

    @property
    def completeness(self) -> Optional[float]:
        """The ratio of non-null values to total count."""
        if self.count is not None and self.null_count is not None and self.count > 0:
            return (self.count - self.null_count) / self.count
        return None


class Column(SchemaBase):
    name: str
@@ -37,4 +51,4 @@ class ModelProfilingMetrics(SchemaBase):
class Model(BaseResource):
    resource_type: NodeType = NodeType.MODEL
    columns: List[Column] = Field(default_factory=list)
    profiling_metrics: Optional[ModelProfilingMetrics] = None
2 changes: 1 addition & 1 deletion uv.lock
