-
Notifications
You must be signed in to change notification settings - Fork 4
feat(sql): chain dataframe-mode SQL blocks via a query registry #104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,30 @@ | |
| from deepnote_toolkit.sql.query_preview import DeepnoteQueryPreview | ||
| from deepnote_toolkit.sql.sql_utils import is_single_select_query | ||
|
|
||
| # Maps a SQL block's result variable name to the source query that produced it. | ||
| # deepnote-internal calls register_sql_query after a dataframe-mode SQL block | ||
| # runs, so that subsequent SQL blocks can reference the variable as a table and | ||
| # have it expanded into a CTE. This is the same chaining DeepnoteQueryPreview | ||
| # enables for query-preview-mode blocks, but without changing the result type | ||
| # (which would show a preview banner and limit the row count). | ||
| _sql_query_registry: dict = {} | ||
|
|
||
|
|
||
| def register_sql_query(variable_name, query): | ||
| """Register a SQL block's source query so downstream blocks can chain on it. | ||
|
|
||
| Only single SELECT statements can be expanded into a CTE, so anything else | ||
| (or an empty query) clears any previous entry for the variable rather than | ||
| storing it. | ||
| """ | ||
| if variable_name is None: | ||
| return | ||
|
|
||
| if query is not None and is_single_select_query(query): | ||
| _sql_query_registry[variable_name] = query | ||
| else: | ||
| _sql_query_registry.pop(variable_name, None) | ||
|
|
||
|
|
||
| def add_limit_clause(query: str, limit: int = 100): | ||
| class ExecuteSqlError(Exception): | ||
|
|
@@ -137,16 +161,33 @@ def find_query_preview_references( | |
|
|
||
| # Check each table reference | ||
| for table_reference in table_references: | ||
| # Check if the reference exists in the main module | ||
| variable_name = table_reference | ||
|
|
||
| # Skip references we've already resolved (dedupe + cycle guard by name) | ||
| if variable_name in query_preview_references: | ||
| continue | ||
|
|
||
| # 1) Dataframe-mode chaining. deepnote-internal registers a block's | ||
| # source query keyed by its result variable name. We only trust the | ||
| # entry while the variable is still bound in __main__, so a deleted or | ||
| # rebound variable doesn't produce a stale CTE expansion. | ||
| if variable_name in _sql_query_registry and hasattr(__main__, variable_name): | ||
| registered_source = _sql_query_registry[variable_name] | ||
| if registered_source: | ||
| query_preview_references[variable_name] = registered_source | ||
| # Recursively resolve the registered query's own references | ||
| find_query_preview_references( | ||
| registered_source, | ||
| query_preview_references, | ||
| processed_queries, | ||
| ) | ||
| continue | ||
|
Comment on lines
+170
to
+184
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rebinding the same name still expands stale SQL. This branch only checks that the name still exists in 🤖 Prompt for AI Agents |
||
|
|
||
| # 2) Query-preview-mode chaining. The variable itself is a | ||
| # DeepnoteQueryPreview that carries its source query. | ||
| if hasattr(__main__, table_reference): | ||
| variable_name = table_reference | ||
| variable = getattr(__main__, table_reference) | ||
| # If it's a QueryPreview object and not already in our list | ||
| # Use any() with a generator expression to check if the variable is already in the list | ||
| # This avoids using the pandas object in a boolean context | ||
| if isinstance(variable, DeepnoteQueryPreview) and not any( | ||
| id(variable) == id(ref) for ref in query_preview_references | ||
| ): | ||
| if isinstance(variable, DeepnoteQueryPreview): | ||
| # Add it to our list | ||
| query_preview_source = variable._deepnote_query | ||
| query_preview_references[variable_name] = query_preview_source | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,16 @@ | ||
| from unittest import TestCase | ||
|
|
||
| import __main__ | ||
| import sqlparse | ||
| from sqlparse.tokens import Keyword | ||
|
|
||
| from deepnote_toolkit.sql import sql_query_chaining | ||
| from deepnote_toolkit.sql.sql_query_chaining import ( | ||
| add_limit_clause, | ||
| extract_table_reference_from_token, | ||
| extract_table_references, | ||
| find_query_preview_references, | ||
| register_sql_query, | ||
| unchain_sql_query, | ||
| ) | ||
|
|
||
|
|
@@ -714,3 +717,80 @@ def test_non_single_select_query_raises_exception(self): | |
| "Invalid query type: Query Preview supports only a single SELECT statement", | ||
| str(context.exception), | ||
| ) | ||
|
|
||
|
|
||
| class TestRegistryBasedChaining(TestCase): | ||
| """Tests for dataframe-mode chaining via register_sql_query / the registry. | ||
|
|
||
| deepnote-internal registers a SQL block's source query keyed by its result | ||
| variable name; downstream blocks then reference the variable as a table and | ||
| it is expanded into a CTE - without the result becoming a DeepnoteQueryPreview. | ||
| """ | ||
|
|
||
| def setUp(self): | ||
| sql_query_chaining._sql_query_registry.clear() | ||
| self._original_vars = vars(__main__).copy() | ||
|
|
||
| def tearDown(self): | ||
| sql_query_chaining._sql_query_registry.clear() | ||
| for key in list(vars(__main__).keys()): | ||
| if key not in self._original_vars: | ||
| delattr(__main__, key) | ||
|
|
||
| def _bind(self, name, value="bound"): | ||
| """Bind a variable in __main__ (simulates the block having executed).""" | ||
| setattr(__main__, name, value) | ||
|
Comment on lines
+730
to
+742
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Type-annotate the new test helpers. The new helper/lifecycle methods should carry explicit As per coding guidelines, "Use type hints consistently" and "Use explicit type hints for function parameters and return values". 🧰 Tools🪛 Ruff (0.15.14)[warning] 740-740: Missing return type annotation for private function Add return type annotation: (ANN202) 🤖 Prompt for AI Agents |
||
|
|
||
| def test_register_stores_single_select(self): | ||
| register_sql_query("df_1", "SELECT * FROM users") | ||
| self.assertEqual( | ||
| sql_query_chaining._sql_query_registry, {"df_1": "SELECT * FROM users"} | ||
| ) | ||
|
|
||
| def test_register_non_select_clears_entry(self): | ||
| register_sql_query("df_1", "SELECT * FROM users") | ||
| register_sql_query("df_1", "INSERT INTO users VALUES (1)") | ||
| self.assertNotIn("df_1", sql_query_chaining._sql_query_registry) | ||
|
|
||
| def test_register_none_query_clears_entry(self): | ||
| register_sql_query("df_1", "SELECT * FROM users") | ||
| register_sql_query("df_1", None) | ||
| self.assertNotIn("df_1", sql_query_chaining._sql_query_registry) | ||
|
|
||
| def test_find_references_via_registry(self): | ||
| self._bind("df_1") | ||
| register_sql_query("df_1", "SELECT * FROM users") | ||
| refs = find_query_preview_references("SELECT * FROM df_1") | ||
| self.assertEqual(refs, {"df_1": "SELECT * FROM users"}) | ||
|
|
||
| def test_unchain_via_registry(self): | ||
| self._bind("df_1") | ||
| register_sql_query("df_1", "SELECT * FROM users") | ||
| result = unchain_sql_query("SELECT * FROM df_1") | ||
| self.assertIn("WITH", result) | ||
| self.assertIn("df_1 AS", result) | ||
| self.assertIn("SELECT * FROM users", result) | ||
|
|
||
| def test_multi_level_registry_chaining(self): | ||
| self._bind("df_1") | ||
| self._bind("df_2") | ||
| register_sql_query("df_1", "SELECT * FROM users") | ||
| register_sql_query("df_2", "SELECT * FROM df_1 WHERE active") | ||
| result = unchain_sql_query("SELECT * FROM df_2") | ||
| # Both levels expanded, with the dependency (df_1) defined before df_2 | ||
| self.assertIn("df_1 AS", result) | ||
| self.assertIn("df_2 AS", result) | ||
| self.assertIn("SELECT * FROM users", result) | ||
| self.assertLess(result.find("df_1 AS"), result.find("df_2 AS")) | ||
|
|
||
| def test_stale_entry_ignored_when_variable_not_bound(self): | ||
| # Registered but the variable is no longer bound in __main__ (e.g. deleted | ||
| # cell or kernel restart) -> the entry must not produce a CTE. | ||
| register_sql_query("df_1", "SELECT * FROM users") | ||
| refs = find_query_preview_references("SELECT * FROM df_1") | ||
| self.assertEqual(refs, {}) | ||
|
|
||
| def test_empty_registry_is_safe(self): | ||
| refs = find_query_preview_references("SELECT * FROM df_1") | ||
| self.assertEqual(refs, {}) | ||
| self.assertEqual(unchain_sql_query("SELECT * FROM df_1"), "SELECT * FROM df_1") | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion | 🟠 Major | ⚡ Quick win
Add a concrete type contract to the new registry API.
_sql_query_registry: dictandregister_sql_query(variable_name, query)currently hide the actualOptional[str] -> Nonecontract. Please annotate the registry and this public function explicitly so callers and static checks enforce the same rules as the implementation.As per coding guidelines, "Always use
Optional[T]for parameters that can be None (notT = None)" and "Use explicit type hints for function parameters and return values".🤖 Prompt for AI Agents