Skip to content

Commit 4cc5d48

Browse files
committed
MNT: upgrade docstring (gemini)
1 parent d03ec96 commit 4cc5d48

File tree

4 files changed

+131
-40
lines changed

4 files changed

+131
-40
lines changed

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ data-scribe dbt --project-dir /path/to/your/dbt/project --update
5555

5656
# Check for documentation drift against the live database
5757
data-scribe dbt --project-dir /path/to/your/dbt/project --db your_db_profile --drift
58+
59+
# Generate a global, end-to-end lineage graph
60+
data-scribe lineage --project-dir /path/to/your/dbt/project --db your_db_profile --output your_mermaid_profile
5861
```
5962

6063
**For a database:**
@@ -76,6 +79,7 @@ data-scribe db --output my_markdown
7679
- **Documentation Drift Detection**: Use the `--drift` flag to compare your existing documentation against the live database, catching descriptions that have become inconsistent with reality.
7780
- **🔒 Security-Aware**: The `init` wizard helps you store sensitive keys (passwords, API tokens) in a `.env` file, not in `config.yaml`.
7881
- **🔌 Extensible by Design**: A pluggable architecture supports multiple backends.
82+
- **🌐 Global End-to-End Lineage**: Generate a single, project-wide lineage graph that combines physical database foreign keys with logical dbt `ref` and `source` dependencies.
7983
- **🚀 Web API Server**: Launch a FastAPI server to trigger documentation workflows programmatically. Includes built-in API documentation via Swagger/ReDoc.
8084

8185
---
@@ -118,6 +122,14 @@ Scans a dbt project's `manifest.json` file.
118122

119123
**Note:** `--update`, `--check`, `--interactive`, and `--drift` flags are mutually exclusive. Choose only one.
120124

125+
### `data-scribe lineage`
126+
127+
Generates a global, end-to-end lineage graph for a dbt project.
128+
129+
- `--project-dir TEXT`: **(Required)** Path to the dbt project directory.
130+
- `--db TEXT`: **(Required)** The database profile to scan for physical Foreign Keys.
131+
- `--output TEXT`: **(Required)** The output profile to write the `.md` file to. It should be of type 'mermaid'; a profile of any other type logs a warning and the Mermaid writer is used anyway.
132+
121133
### `data-scribe serve`
122134

123135
Launches the FastAPI web server.

data_scribe/app.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -432,8 +432,15 @@ def generate_lineage(
432432
),
433433
):
434434
"""
435-
Generates a Global End-to-End lineage graph (Mermaid) by combining
436-
physical DB Foreign Keys with logical dbt dependencies (refs/sources).
435+
Generates a global end-to-end lineage graph for a dbt project.
436+
437+
This command creates a comprehensive, project-wide lineage graph by
438+
combining two sources of information:
439+
1. **Physical Lineage**: Foreign key relationships from the live database.
440+
2. **Logical Lineage**: `ref()` and `source()` dependencies from the dbt project.
441+
442+
The resulting output is a single Mermaid.js graph, saved to a Markdown file,
443+
that shows the complete data flow from source tables to final models.
437444
"""
438445
LineageWorkflow(
439446
config_path=config_path,

data_scribe/components/writers/mermaid_writer.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
2-
This module provides a simple writer for saving a raw Mermaid
3-
string to a Markdown file.
2+
This module provides a specialized writer for saving a Mermaid.js graph
3+
string to a Markdown file, formatted for rendering.
44
"""
55

66
from typing import Dict, Any
@@ -13,21 +13,31 @@
1313

1414
class MermaidWriter(BaseWriter):
1515
"""
16-
Handles writing a single Mermaid chart string to a .md file.
16+
Handles writing a single Mermaid graph string to a Markdown file.
17+
18+
This writer is designed to take a complete Mermaid graph definition
19+
and save it within a Markdown code block, ready for rendering in
20+
supported platforms like GitHub or GitLab.
1721
"""
1822

1923
def write(self, catalog_data: Dict[str, Any], **kwargs):
2024
"""
21-
Writes the Mermaid chart to a Markdown file.
25+
Writes the Mermaid graph from catalog_data to a Markdown file.
2226
2327
Args:
24-
catalog_data: A dictionary expected to have a "mermaid_graph" key.
25-
**kwargs: Expects 'output_filename'.
28+
catalog_data: A dictionary expected to contain the key
29+
`"mermaid_graph"` with the full Mermaid string.
30+
**kwargs: Expects the `output_filename` key, which specifies
31+
the path to the output `.md` file.
32+
33+
Raises:
34+
ConfigError: If `output_filename` is not provided in kwargs.
35+
WriterError: If there is an error writing the file to disk.
2636
"""
2737
output_filename = kwargs.get("output_filename")
2838
if not output_filename:
2939
logger.error("MermaidWriter 'write' method missing 'output_filename'.")
30-
raise ConfigError("Missing required kwargs for MermaidWriter.")
40+
raise ConfigError("Missing required kwarg 'output_filename' for MermaidWriter.")
3141

3242
mermaid_graph = catalog_data.get("mermaid_graph")
3343
if not mermaid_graph:

data_scribe/core/lineage_workflow.py

Lines changed: 93 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
"""
2-
This module defines the workflow for the 'lineage' command,
3-
which combines physical DB lineage (FKs) with logical
4-
dbt lineage (refs/sources).
2+
This module defines the workflow for the 'lineage' command.
3+
4+
It combines physical database lineage (from foreign keys) with logical dbt
5+
project lineage (from refs and sources) to generate a single, comprehensive
6+
end-to-end data lineage graph.
57
"""
68
import typer
79
from typing import List, Dict, Any, Set
@@ -15,66 +17,120 @@
1517

1618
class GlobalLineageGenerator:
1719
"""
18-
Merges DB foreign keys and dbt dependencies into a single
19-
Mermaid graph string.
20+
Builds a global lineage graph from multiple sources.
21+
22+
This class merges physical foreign key relationships from a database with
23+
logical dependencies from a dbt project (`ref` and `source` calls) into a
24+
single graph structure. It intelligently assigns and prioritizes styles to
25+
nodes to ensure, for example, that a dbt model is always styled as a model,
26+
even if it's also a plain database table.
2027
"""
2128
def __init__(self, db_fks: List[Dict[str, str]], dbt_models: List[Dict[str, Any]]):
29+
"""
30+
Initializes the GlobalLineageGenerator.
31+
32+
Args:
33+
db_fks: A list of foreign key relationships from the database.
34+
dbt_models: A list of parsed dbt models, including their dependencies.
35+
"""
2236
self.db_fks = db_fks
2337
self.dbt_models = dbt_models
24-
self.nodes: Set[str] = set()
25-
self.edges: List[str] = []
38+
39+
# Stores nodes and their assigned style, e.g., {"stg_orders": "box"}
40+
self.nodes: Dict[str, str] = {}
41+
# Stores unique edges to prevent duplicates in the graph
42+
self.edges: Set[str] = set()
43+
44+
def _get_style_priority(self, style: str) -> int:
45+
"""Assigns a priority to a node style. Higher numbers win."""
46+
if style == "box": return 3 # dbt model (highest priority)
47+
if style == "source": return 2 # dbt source
48+
if style == "db": return 1 # db table (lowest priority)
49+
return 0
2650

2751
def _add_node(self, name: str, style: str = "box"):
28-
"""Adds a node to the graph if it doesn't exist."""
29-
if name not in self.nodes:
30-
if style == "box":
31-
self.nodes.add(f' {name}["{name}"]') # dbt model
32-
elif style == "db":
33-
self.nodes.add(f' {name}[("{name}")]') # DB table
34-
elif style == "source":
35-
self.nodes.add(f' {name}(("{name}"))') # dbt source
36-
self.nodes.add(name)
52+
"""
53+
Adds a node to the graph, applying style based on priority.
54+
55+
If the node already exists, its style is only updated if the new
56+
style has a higher priority than the current one. This ensures a
57+
dbt model is always styled as a model, not as a generic DB table.
58+
"""
59+
current_style = self.nodes.get(name)
60+
current_priority = self._get_style_priority(current_style) if current_style else -1
61+
new_priority = self._get_style_priority(style)
62+
63+
if new_priority > current_priority:
64+
self.nodes[name] = style
65+
66+
def _add_edge(self, from_node: str, to_node: str, label: str = ""):
67+
"""Adds a unique, formatted edge to the graph's edge set."""
68+
if label:
69+
self.edges.add(f' {from_node} -- "{label}" --> {to_node}')
70+
else:
71+
self.edges.add(f' {from_node} --> {to_node}')
3772

3873
def generate_graph(self) -> str:
39-
"""Generates the full Mermaid graph string."""
74+
"""
75+
Generates the complete Mermaid.js graph string.
76+
77+
It processes database foreign keys first, then dbt dependencies;
78+
the style priority in `_add_node` keeps the highest-ranked style for
79+
each node regardless of that order. Finally, it sorts the node
80+
definitions and the unique edges and joins them into a single string.
81+
82+
Returns:
83+
A string containing the full Mermaid.js graph definition.
84+
"""
4085
logger.info("Generating global lineage graph...")
4186

4287
# 1. Process DB Foreign Keys (Physical Lineage)
4388
for fk in self.db_fks:
4489
from_table = fk["from_table"]
4590
to_table = fk["to_table"]
4691

47-
# Style DB tables
92+
# Add nodes with 'db' style (lowest priority)
4893
self._add_node(from_table, "db")
4994
self._add_node(to_table, "db")
50-
51-
self.edges.append(f' {from_table} -- FK --> {to_table}')
95+
self._add_edge(from_table, to_table, "FK")
5296

5397
# 2. Process dbt Model Dependencies (Logical Lineage)
5498
for model in self.dbt_models:
5599
model_name = model["name"]
56-
self._add_node(model_name, "box") # Style dbt models
100+
self._add_node(model_name, "box") # Style dbt models (highest priority)
57101

58102
for dep in model.get("dependencies", []):
59-
if "." in dep: # This is a source (e.g., 'jaffle_shop.customers')
103+
# A dependency with a dot is a source (e.g., 'jaffle_shop.customers')
104+
if "." in dep:
60105
self._add_node(dep, "source")
61-
self.edges.append(f' {dep} --> {model_name}')
62-
else: # This is another dbt model (a ref)
106+
self._add_edge(dep, model_name)
107+
else: # Otherwise, it's another dbt model (a ref)
63108
self._add_node(dep, "box")
64-
self.edges.append(f' {dep} --> {model_name}')
109+
self._add_edge(dep, model_name)
65110

66111
# 3. Combine into a Mermaid string
67112
graph_lines = ["graph TD;"]
68-
graph_lines.extend(sorted(list(self.nodes))) # Add all unique node definitions
69-
graph_lines.append("") # Spacer
70-
graph_lines.extend(sorted(list(self.edges))) # Add all unique edges
113+
114+
# Define all nodes with their final, prioritized styles
115+
node_definitions = []
116+
for name, style in self.nodes.items():
117+
if style == "box":
118+
node_definitions.append(f' {name}["{name}"]') # dbt model
119+
elif style == "db":
120+
node_definitions.append(f' {name}[("{name}")]') # DB table
121+
elif style == "source":
122+
node_definitions.append(f' {name}(("{name}"))') # dbt source
123+
124+
graph_lines.extend(sorted(node_definitions))
125+
graph_lines.append("") # Spacer for readability
126+
graph_lines.extend(sorted(list(self.edges)))
71127

72128
return "\n".join(graph_lines)
73129

74130

75131
class LineageWorkflow:
76132
"""
77-
Manages the workflow for the 'lineage' command.
133+
Manages the end-to-end workflow for the `data-scribe lineage` command.
78134
"""
79135
def __init__(
80136
self,
@@ -83,14 +139,19 @@ def __init__(
83139
dbt_project_dir: str,
84140
output_profile: str,
85141
):
142+
"""
143+
Initializes the LineageWorkflow with parameters from the CLI.
144+
"""
86145
self.config_path = config_path
87146
self.db_profile_name = db_profile
88147
self.dbt_project_dir = dbt_project_dir
89148
self.output_profile_name = output_profile
90149
self.config = load_config(config_path)
91150

92151
def run(self):
93-
"""Executes the lineage generation workflow."""
152+
"""
153+
Executes the full lineage generation and writing workflow.
154+
"""
94155

95156
# 1. Get Physical Lineage (FKs) from DB
96157
db_connector = None
@@ -112,7 +173,7 @@ def run(self):
112173
# 2. Get Logical Lineage (refs) from dbt
113174
logger.info(f"Parsing dbt project at '{self.dbt_project_dir}' for dependencies...")
114175
parser = DbtManifestParser(self.dbt_project_dir)
115-
dbt_models = parser.models # This now contains 'dependencies'
176+
dbt_models = parser.models
116177
logger.info(f"Parsed {len(dbt_models)} dbt models.")
117178

118179
# 3. Generate Graph
@@ -125,6 +186,7 @@ def run(self):
125186
try:
126187
writer_params = self.config["output_profiles"][self.output_profile_name]
127188
writer_type = writer_params.pop("type")
189+
# A 'mermaid' writer type is expected; any other type only triggers the warning below.
128190
if writer_type != "mermaid":
129191
logger.warning(f"Output profile '{self.output_profile_name}' is not type 'mermaid'. Using MermaidWriter anyway.")
130192

0 commit comments

Comments
 (0)