|
56 | 56 | from datachain.utils import JSONSerialize |
57 | 57 |
|
58 | 58 | if TYPE_CHECKING: |
59 | | - from sqlalchemy import Delete, Insert, Select, Update |
| 59 | + from sqlalchemy import CTE, Delete, Insert, Select, Subquery, Update |
60 | 60 | from sqlalchemy.schema import SchemaItem |
| 61 | + from sqlalchemy.sql.elements import ColumnElement |
61 | 62 |
|
62 | 63 | from datachain.data_storage import schema |
63 | 64 | from datachain.data_storage.db_engine import DatabaseEngine |
64 | 65 |
|
65 | 66 | logger = logging.getLogger("datachain") |
| 67 | +DEPTH_LIMIT_DEFAULT = 100 |
66 | 68 |
|
67 | 69 |
|
68 | 70 | class AbstractMetastore(ABC, Serializable): |
@@ -1463,6 +1465,18 @@ def _dataset_dependencies_select_columns(self) -> list["SchemaItem"]: |
1463 | 1465 | Returns a list of columns to select in a query for fetching dataset dependencies |
1464 | 1466 | """ |
1465 | 1467 |
|
| 1468 | + @abstractmethod |
| 1469 | + def _dataset_dependency_nodes_select_columns( |
| 1470 | + self, |
| 1471 | + namespaces_subquery: "Subquery", |
| 1472 | + dependency_tree_cte: "CTE", |
| 1473 | + datasets_subquery: "Subquery", |
| 1474 | + ) -> list["ColumnElement"]: |
| 1475 | + """ |
| 1476 | + Returns a list of columns to select in a query for fetching |
| 1477 | + dataset dependency nodes. |
| 1478 | + """ |
| 1479 | + |
1466 | 1480 | def get_direct_dataset_dependencies( |
1467 | 1481 | self, dataset: DatasetRecord, version: str |
1468 | 1482 | ) -> list[DatasetDependency | None]: |
@@ -1493,7 +1507,7 @@ def get_direct_dataset_dependencies( |
1493 | 1507 | return [self.dependency_class.parse(*r) for r in self.db.execute(query)] |
1494 | 1508 |
|
1495 | 1509 | def get_dataset_dependency_nodes( |
1496 | | - self, dataset_id: int, version_id: int |
| 1510 | + self, dataset_id: int, version_id: int, depth_limit: int = DEPTH_LIMIT_DEFAULT |
1497 | 1511 | ) -> list[DatasetDependencyNode | None]: |
1498 | 1512 | n = self._namespaces_select().subquery() |
1499 | 1513 | p = self._projects |
@@ -1522,33 +1536,31 @@ def get_dataset_dependency_nodes( |
1522 | 1536 | cte = base_query.cte(name="dependency_tree", recursive=True) |
1523 | 1537 |
|
1524 | 1538 | # Recursive case: dependencies of dependencies |
1525 | | - recursive_query = select( |
1526 | | - *dep_fields, |
1527 | | - (cte.c.depth + 1).label("depth"), |
1528 | | - ).select_from( |
1529 | | - cte.join( |
1530 | | - dd, |
1531 | | - (cte.c.dataset_id == dd.c.source_dataset_id) |
1532 | | - & (cte.c.dataset_version_id == dd.c.source_dataset_version_id), |
| 1539 | +        # Limit depth to depth_limit (default 100) to prevent infinite loops in case of circular dependencies |
| 1540 | + recursive_query = ( |
| 1541 | + select( |
| 1542 | + *dep_fields, |
| 1543 | + (cte.c.depth + 1).label("depth"), |
1533 | 1544 | ) |
| 1545 | + .select_from( |
| 1546 | + cte.join( |
| 1547 | + dd, |
| 1548 | + (cte.c.dataset_id == dd.c.source_dataset_id) |
| 1549 | + & (cte.c.dataset_version_id == dd.c.source_dataset_version_id), |
| 1550 | + ) |
| 1551 | + ) |
| 1552 | + .where(cte.c.depth < depth_limit) |
1534 | 1553 | ) |
1535 | 1554 |
|
1536 | 1555 | cte = cte.union(recursive_query) |
1537 | 1556 |
|
1538 | 1557 | # Fetch all with full details |
1539 | | - final_query = select( |
1540 | | - n.c.name, |
1541 | | - p.c.name, |
1542 | | - cte.c.id, |
1543 | | - cte.c.dataset_id, |
1544 | | - cte.c.dataset_version_id, |
1545 | | - d.c.name, |
1546 | | - dv.c.version, |
1547 | | - dv.c.created_at, |
1548 | | - cte.c.source_dataset_id, |
1549 | | - cte.c.source_dataset_version_id, |
1550 | | - cte.c.depth, |
1551 | | - ).select_from( |
| 1558 | + select_cols = self._dataset_dependency_nodes_select_columns( |
| 1559 | + namespaces_subquery=n, |
| 1560 | + dependency_tree_cte=cte, |
| 1561 | + datasets_subquery=d, |
| 1562 | + ) |
| 1563 | + final_query = self._datasets_dependencies_select(*select_cols).select_from( |
1552 | 1564 | # Use outer joins to handle cases where dependent datasets have been |
1553 | 1565 | # physically deleted. This allows us to return dependency records with |
1554 | 1566 | # None values instead of silently omitting them, making broken |
|
0 commit comments