Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions examples/redshift_adapter_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""
Example: Using Intugle with Amazon Redshift

This example demonstrates how to use Intugle to profile and analyze data in Amazon Redshift.

Prerequisites:
1. Install Intugle with Redshift support: pip install intugle[redshift]
2. Configure your profiles.yml with Redshift connection details
3. Have access to a Redshift cluster with sample data
"""

from intugle.adapters.types.redshift.models import RedshiftConfig
from intugle.adapters.types.redshift.redshift import RedshiftAdapter
from intugle.analysis.models import DataSet


def main():
    """Walk through the Redshift adapter's profiling and query features.

    Requires a reachable Redshift cluster configured in profiles.yml with
    sample ``customers`` and ``orders`` tables; every example talks to the
    live cluster, so there is no offline mode.
    """
    # The adapter reads its connection details from profiles.yml.
    adapter = RedshiftAdapter()

    print(f"Connected to Redshift source: {adapter.source_name}")
    print(f"Database: {adapter.database}")
    print(f"Schema: {adapter.schema}")
    print("-" * 50)

    # Example 1: Profile a table (row count, column names, dtypes).
    print("\n=== Example 1: Profile a Table ===")
    # ``type`` defaults to "redshift", so only the identifier is required
    # (kept consistent with the other configs below).
    customers_config = RedshiftConfig(identifier="customers")

    profile = adapter.profile(customers_config, "customers")
    print("Table: customers")  # plain string — was an f-string with no placeholders
    print(f"Total rows: {profile.count}")
    print(f"Columns: {', '.join(profile.columns)}")
    print(f"Data types: {profile.dtypes}")

    # Example 2: Detailed statistics for a single column.
    print("\n=== Example 2: Profile a Column ===")
    column_profile = adapter.column_profile(
        data=customers_config,
        table_name="customers",
        column_name="customer_id",
        total_count=profile.count,
        sample_limit=5,
    )

    print(f"Column: {column_profile.column_name}")
    print(f"Total count: {column_profile.count}")
    print(f"Null count: {column_profile.null_count}")
    print(f"Distinct count: {column_profile.distinct_count}")
    print(f"Uniqueness: {column_profile.uniqueness:.2%}")
    print(f"Completeness: {column_profile.completeness:.2%}")
    print(f"Sample data: {column_profile.sample_data}")

    # Example 3: Run arbitrary SQL and get a pandas DataFrame back.
    print("\n=== Example 3: Execute a Custom Query ===")
    query = """
    SELECT
        customer_segment,
        COUNT(*) as customer_count,
        AVG(lifetime_value) as avg_lifetime_value
    FROM customers
    GROUP BY customer_segment
    ORDER BY customer_count DESC
    """

    df = adapter.to_df_from_query(query)
    print("Customer segments:")
    print(df.to_string(index=False))

    # Example 4: Materialize a query as a view.
    print("\n=== Example 4: Create a View ===")
    view_query = """
    SELECT
        customer_id,
        customer_name,
        email,
        customer_segment,
        lifetime_value
    FROM customers
    WHERE customer_segment = 'Premium'
        AND lifetime_value > 10000
    """

    adapter.create_table_from_query(
        table_name="premium_customers",
        query=view_query,
        materialize="view",
    )
    print("Created view: premium_customers")

    # Verify the view was created by querying it.
    premium_df = adapter.to_df_from_query(
        "SELECT COUNT(*) as count FROM premium_customers"
    )
    print(f"Premium customers count: {premium_df['count'][0]}")

    # Example 5: Count overlapping key values between two tables.
    print("\n=== Example 5: Analyze Table Relationships ===")
    customers_dataset = DataSet(
        RedshiftConfig(identifier="customers"),
        name="customers",
    )

    orders_dataset = DataSet(
        RedshiftConfig(identifier="orders"),
        name="orders",
    )

    # How many customer IDs appear in both tables?
    intersection = adapter.intersect_count(
        table1=customers_dataset,
        column1_name="customer_id",
        table2=orders_dataset,
        column2_name="customer_id",
    )

    print(f"Customers with orders: {intersection}")

    # Example 6: Uniqueness of a multi-column (composite) key.
    print("\n=== Example 6: Composite Key Analysis ===")
    orders_config = RedshiftConfig(identifier="orders")

    composite_uniqueness = adapter.get_composite_key_uniqueness(
        table_name="orders",
        columns=["customer_id", "order_date"],
        dataset_data=orders_config,
    )

    print(f"Unique (customer_id, order_date) combinations: {composite_uniqueness}")

    # Example 7: Materialized views aggregate ahead of time but must be
    # refreshed manually in Redshift.
    print("\n=== Example 7: Create a Materialized View ===")
    materialized_query = """
    SELECT
        DATE_TRUNC('month', order_date) as month,
        customer_segment,
        COUNT(DISTINCT customer_id) as unique_customers,
        COUNT(*) as order_count,
        SUM(order_total) as total_revenue
    FROM orders o
    JOIN customers c ON o.customer_id = c.customer_id
    GROUP BY 1, 2
    """

    adapter.create_table_from_query(
        table_name="monthly_segment_metrics",
        query=materialized_query,
        materialize="materialized_view",
    )
    print("Created materialized view: monthly_segment_metrics")
    print("Note: Refresh with: REFRESH MATERIALIZED VIEW monthly_segment_metrics")

    print("\n" + "=" * 50)
    print("All examples completed successfully!")


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ postgres = [
"asyncpg>=0.30.0",
"sqlglot>=27.20.0",
]
redshift = [
"redshift-connector>=2.1.0",
"sqlglot>=27.20.0",
]
bigquery = [
"google-cloud-bigquery>=3.11.0",
"sqlglot>=27.20.0",
Expand Down
1 change: 1 addition & 0 deletions src/intugle/adapters/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def is_safe_plugin_name(plugin_name: str) -> bool:
"intugle.adapters.types.snowflake.snowflake",
"intugle.adapters.types.databricks.databricks",
"intugle.adapters.types.postgres.postgres",
"intugle.adapters.types.redshift.redshift",
"intugle.adapters.types.mysql.mysql",
"intugle.adapters.types.mariadb.mariadb",
"intugle.adapters.types.sqlserver.sqlserver",
Expand Down
182 changes: 182 additions & 0 deletions src/intugle/adapters/types/redshift/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# Amazon Redshift Adapter

The Redshift adapter allows Intugle to connect to and interact with Amazon Redshift data warehouses.

## Installation

To use the Redshift adapter, you need to install Intugle with the Redshift optional dependencies:

```bash
pip install 'intugle[redshift]'
```

This will install:
- `redshift-connector>=2.1.0` - The official Amazon Redshift Python connector
- `sqlglot>=27.20.0` - SQL transpilation library

## Configuration

Add a Redshift configuration to your `profiles.yml` file:

```yaml
redshift:
name: my_redshift_source
user: your_username
password: your_password
host: your-cluster.region.redshift.amazonaws.com
port: 5439 # Default Redshift port
database: your_database
schema: public
```

### Configuration Parameters

- **name** (optional): A friendly name for your Redshift source. Defaults to "my_redshift_source"
- **user**: Your Redshift username
- **password**: Your Redshift password
- **host**: The Redshift cluster endpoint (without the port)
- **port**: The port number (default: 5439)
- **database**: The database name to connect to
- **schema**: The schema to use for queries

## Usage

### Basic Example

```python
from intugle.adapters.types.redshift.models import RedshiftConfig
from intugle.adapters.types.redshift.redshift import RedshiftAdapter

# Create a config for your table
config = RedshiftConfig(
identifier="my_table",
type="redshift"
)

# Get the adapter instance
adapter = RedshiftAdapter()

# Profile the table
profile = adapter.profile(config, "my_table")
print(f"Total rows: {profile.count}")
print(f"Columns: {profile.columns}")
```

### Profile a Column

```python
# Get detailed profile for a specific column
column_profile = adapter.column_profile(
data=config,
table_name="my_table",
column_name="customer_id",
total_count=profile.count,
sample_limit=10,
dtype_sample_limit=10000
)

print(f"Distinct count: {column_profile.distinct_count}")
print(f"Null count: {column_profile.null_count}")
print(f"Uniqueness: {column_profile.uniqueness}")
print(f"Completeness: {column_profile.completeness}")
```

### Query and Create Views/Tables

```python
# Execute a query and get results as DataFrame
df = adapter.to_df_from_query("SELECT * FROM my_table LIMIT 100")

# Create a view from a query
query = """
SELECT customer_id, SUM(order_total) as total_spent
FROM orders
GROUP BY customer_id
HAVING SUM(order_total) > 1000
"""

adapter.create_table_from_query(
table_name="high_value_customers",
query=query,
materialize="view" # Options: "view", "table", "materialized_view"
)
```

### Analyzing Relationships

```python
from intugle.analysis.models import DataSet

# Create DataSet objects
customers = DataSet(
RedshiftConfig(identifier="customers"),
name="customers"
)

orders = DataSet(
RedshiftConfig(identifier="orders"),
name="orders"
)

# Find intersection count between tables
intersection = adapter.intersect_count(
table1=customers,
column1_name="customer_id",
table2=orders,
column2_name="customer_id"
)

print(f"Common customer IDs: {intersection}")
```

## Features

The Redshift adapter supports all standard Intugle adapter operations:

- **Profiling**: Get row counts, column lists, and data types
- **Column Profiling**: Detailed statistics including null counts, distinct values, uniqueness, and completeness
- **Query Execution**: Run arbitrary SQL queries
- **DataFrame Conversion**: Convert query results to Pandas DataFrames
- **Table/View Creation**: Create tables, views, or materialized views from queries
- **Relationship Analysis**: Calculate intersections between tables
- **Composite Key Analysis**: Analyze uniqueness of composite keys

## SQL Dialect

The adapter uses SQLGlot to transpile queries to Redshift's SQL dialect. This means you can write queries in a more standard SQL format, and they will be automatically converted to Redshift-compatible SQL.

## Notes

- The Redshift adapter is based on the PostgreSQL adapter since Redshift is built on PostgreSQL
- Some SQL features may differ from standard PostgreSQL due to Redshift's columnar storage and distributed architecture
- For best performance, consider Redshift's distribution keys and sort keys when creating tables
- Materialized views in Redshift need to be manually refreshed using `REFRESH MATERIALIZED VIEW`

## Troubleshooting

### Connection Issues

If you're having trouble connecting:

1. Verify your cluster endpoint and credentials
2. Check that your IP is whitelisted in the Redshift security group
3. Ensure the cluster is publicly accessible (if connecting from outside AWS)
4. Verify the database and schema exist

### Missing Dependencies

If you see an error about missing dependencies:

```
ImportError: Redshift dependencies are not installed. Please run 'pip install intugle[redshift]'.
```

Install the required dependencies:

```bash
pip install 'intugle[redshift]'
```

## License

The `redshift-connector` package is licensed under the Apache License 2.0, which is compatible with this project's Apache 2.0 license.
1 change: 1 addition & 0 deletions src/intugle/adapters/types/redshift/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Redshift adapter module
19 changes: 19 additions & 0 deletions src/intugle/adapters/types/redshift/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import Literal

from pydantic import Field

from intugle.common.schema import SchemaBase


class RedshiftConnectionConfig(SchemaBase):
    """Connection settings for an Amazon Redshift cluster, read from profiles.yml."""

    # Friendly display name for the source. The adapter README documents this
    # key as optional with default "my_redshift_source"; the field was missing
    # from the model, so a profiles.yml using it would not round-trip.
    # TODO(review): confirm the adapter reads config.name and not a hardcoded value.
    name: str = "my_redshift_source"
    # Redshift username and password.
    user: str
    password: str
    # Cluster endpoint hostname (without the port).
    host: str
    port: int = 5439  # Default Redshift port
    # Database to connect to.
    database: str
    # Aliased because "schema" collides with pydantic's BaseModel namespace;
    # YAML/JSON input still uses the key "schema".
    schema_: str = Field(..., alias="schema")


class RedshiftConfig(SchemaBase):
    """Identifies a single Redshift table or view for profiling and analysis."""

    # Name of the table/view within the configured database and schema.
    identifier: str
    # Discriminator used by the adapter factory; fixed to "redshift".
    type: Literal["redshift"] = "redshift"
Loading