diff --git a/.gitattributes b/.gitattributes index 5f1a2af022fd4..bd0b27a927aae 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,3 +3,4 @@ *.tsx text eol=lf gradlew text eol=lf metadata-utils/src/test/resources/filterQuery/* text eol=lf + diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 0ae0346fe21b7..729f2948fe89a 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -33,8 +33,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ### Breaking Changes -- #16341 (Ingestion) SQL parsing: View query IDs are now generated using a SHA-256 hash instead of URL-encoding the view URN. This affects all connectors that use view lineage tracking (Snowflake, Oracle, BigQuery, Postgres, MySQL, Hive, Trino, ClickHouse, DB2, Dremio, SQL Server, and others). Previously, query entities had URNs like `urn:li:query:view_urn%3Ali%3Adataset%3A%28...%29`; they now use `urn:li:query:view_`. After upgrading, the old URL-encoded query entities will become stale and orphaned. To clean them up, enable stateful ingestion with stale entity removal in your recipe and re-run ingestion. - +- #16685:(Ingestion) PowerBI M-Query lineage extraction has been rewritten using Microsoft's official `@microsoft/powerquery-parser`. As part of this change, the `native_query_parsing: false` configuration flag now suppresses only expressions containing `Value.NativeQuery`. Previously it suppressed all M-Query lineage extraction. Users who set this flag to block all lineage extraction will now see lineage produced for non-NativeQuery sources (Snowflake, PostgreSQL, MSSQL, etc.). To restore the suppress-all behaviour, add a `table_pattern.deny` rule in your recipe. +- #16341:(Ingestion) SQL parsing: View query IDs are now generated using a SHA-256 hash instead of URL-encoding the view URN. 
This affects all connectors that use view lineage tracking (Snowflake, Oracle, BigQuery, Postgres, MySQL, Hive, Trino, ClickHouse, DB2, Dremio, SQL Server, and others). Previously, query entities had URNs like `urn:li:query:view_urn%3Ali%3Adataset%3A%28...%29`; they now use `urn:li:query:view_<sha256_hash>`. After upgrading, the old URL-encoded query entities will become stale and orphaned. To clean them up, enable stateful ingestion with stale entity removal in your recipe and re-run ingestion. - #16396: Oracle connector: When connecting via `service_name` to a multitenant Oracle database, the database name used in URNs will now reflect the Pluggable Database (PDB) name instead of the Container Database (CDB) name. In Oracle Multitenant architecture, a CDB is the top-level container (e.g. `cdb`) and a PDB is an individual tenant database within it (e.g. `mypdb`); `service_name` typically routes to the PDB, so the PDB name is the correct identifier for your datasets. This affects both dataset URNs (when `add_database_name_to_urn: true`) and database/schema container URNs (always, since containers always include the database name). If your existing metadata was ingested with the old CDB-based URNs, re-ingesting will create new entities under the corrected URNs. To preserve the old URN shape and avoid re-creating entities, set `urn_db_name` explicitly in your recipe to match your previous CDB name. - #16628 (Ingestion) Fabric OneLake source: Workspace containers now use the `fabric` platform instead of `fabric-onelake`. This changes workspace container URNs and the `dataPlatformInstance.platform` emitted for workspace entities. Lakehouse, warehouse, schema, and dataset entities remain on `fabric-onelake`. - **Retention service disabled: only current version retained.** When the retention service is not enabled (not configured or unavailable), the write path now retains only the current version (version 0) and does not create version-history rows. 
Previously, version history was still written when retention was disabled. **Impact:** Deployments that run without retention enabled will no longer accumulate aspect version history; only the latest aspect value is stored. **Migration:** Enable and configure the retention service (e.g. ingest retention policies from `boot/retention.yaml`) if you need version history for any entity/aspect. diff --git a/metadata-ingestion/constraints.txt b/metadata-ingestion/constraints.txt index a40ed0a76249c..0dff3546a738b 100644 --- a/metadata-ingestion/constraints.txt +++ b/metadata-ingestion/constraints.txt @@ -782,8 +782,6 @@ langcodes==3.5.1 # via spacy langdetect==1.0.9 # via unstructured -lark==1.1.4 - # via acryl-datahub leb128==1.0.9 # via asynch linear-tsv==1.1.0 @@ -841,6 +839,8 @@ mdurl==0.1.2 # via markdown-it-py memray==1.19.1 # via acryl-datahub +mini-racer==0.14.1 + # via acryl-datahub mistune==3.2.0 # via # acryl-great-expectations @@ -1370,7 +1370,6 @@ referencing==0.37.0 # jupyter-events regex==2026.2.28 # via - # lark # nltk # tiktoken requests==2.32.5 diff --git a/metadata-ingestion/pyproject.toml b/metadata-ingestion/pyproject.toml index 104f64236ed3c..ca807aea3c9c2 100644 --- a/metadata-ingestion/pyproject.toml +++ b/metadata-ingestion/pyproject.toml @@ -943,7 +943,7 @@ postgres = [ ] powerbi = [ - "lark[regex]==1.1.4", + "mini-racer==0.14.1", "more-itertools<11.0.0", "msal>=1.31.1,<2.0.0", "patchy==2.8.0", @@ -1419,10 +1419,10 @@ all = [ "jsonpath-ng==1.7.0", "jupyter_server>=2.14.1,<3.0.0", "kerberos>=1.3.0,<2.0.0", - "lark[regex]==1.1.4", "litellm==1.80.5", "lkml>=1.3.4,<2.0.0", "looker-sdk>=23.0.0,<26.0.0", + "mini-racer==0.14.1", "mlflow-skinny>=2.3.0,<2.21.0", "more-itertools>=8.12.0,<11.0.0", "moto[s3]>=5.0.0,<6.0.0", @@ -1570,10 +1570,10 @@ dev = [ "jsonschema<5.0.0", "jupyter_server>=2.14.1,<3.0.0", "kerberos>=1.3.0,<2.0.0", - "lark[regex]==1.1.4", "litellm==1.80.5", "lkml>=1.3.4,<2.0.0", "looker-sdk>=23.0.0,<26.0.0", + "mini-racer==0.14.1", 
"mixpanel>=4.9.0,<6.0.0", "mlflow-skinny>=2.3.0,<2.21.0", "more-itertools>=8.12.0,<11.0.0", @@ -1761,10 +1761,10 @@ docs = [ "jsonschema<5.0.0", "jupyter_server>=2.14.1,<3.0.0", "kerberos>=1.3.0,<2.0.0", - "lark[regex]==1.1.4", "litellm==1.80.5", "lkml>=1.3.4,<2.0.0", "looker-sdk>=23.0.0,<26.0.0", + "mini-racer==0.14.1", "mixpanel>=4.9.0,<6.0.0", "mlflow-skinny>=2.3.0,<2.21.0", "more-itertools>=8.12.0,<11.0.0", @@ -2147,7 +2147,7 @@ datahub = ["py.typed"] "datahub.cli.gql" = ["*.gql"] "datahub.cli.resources" = ["*.md"] "datahub.ingestion.autogenerated" = ["*.json"] -"datahub.ingestion.source.powerbi" = ["powerbi-lexical-grammar.rule"] +"datahub.ingestion.source.powerbi.m_query.mquery_bridge" = ["bundle.js.gz"] "datahub.metadata" = ["schema.avsc"] "datahub.metadata.schemas" = ["*.avsc"] diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 14d822c34e564..9fc880c6bfb70 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -744,7 +744,7 @@ "nifi": {"requests<3.0.0", "packaging<26.0.0", "requests-gssapi<2.0.0"}, "powerbi": ( microsoft_common - | {"lark[regex]==1.1.4", "sqlparse<1.0.0", "more-itertools<11.0.0"} + | {"sqlparse<1.0.0", "more-itertools<11.0.0", "mini-racer==0.14.1"} | sqlglot_lib | threading_timeout_common ), @@ -1204,7 +1204,7 @@ "datahub": ["py.typed"], "datahub.metadata": ["schema.avsc"], "datahub.metadata.schemas": ["*.avsc"], - "datahub.ingestion.source.powerbi": ["powerbi-lexical-grammar.rule"], + "datahub.ingestion.source.powerbi.m_query.mquery_bridge": ["bundle.js.gz"], "datahub.ingestion.autogenerated": ["*.json"], "datahub.cli.gql": ["*.gql"], "datahub.cli.resources": ["*.md"], diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 511f9da148c6b..6883f30258a80 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -247,8 +247,14 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport): m_query_parse_attempts: int = 0 m_query_parse_successes: int = 0 m_query_parse_timeouts: int = 0 + m_query_native_query_skipped: int = 0 + # Expressions that reached the parser but are not M-Query at all + # (e.g. DAX computed-table expressions, empty strings, label rows). + # These fail with MQueryParseError but are expected and logged at INFO. + m_query_non_mquery_expressions: int = 0 m_query_parse_validation_errors: int = 0 m_query_parse_unexpected_character_errors: int = 0 + # Genuine M-Query expressions that the parser could not handle. m_query_parse_unknown_errors: int = 0 m_query_resolver_errors: int = 0 m_query_resolver_no_lineage: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/_bridge.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/_bridge.py new file mode 100644 index 0000000000000..cacc4a55216a1 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/_bridge.py @@ -0,0 +1,156 @@ +import gzip +import json +import logging +import threading +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + +NodeIdMap = dict[int, dict] + +_BUNDLE_PATH = Path(__file__).parent / "mquery_bridge" / "bundle.js.gz" + + +class MQueryBridgeError(RuntimeError): + """V8 context error or malformed response from the M-Query bridge.""" + + pass + + +class MQueryParseError(RuntimeError): + """Parser returned a structured parse error for a specific expression.""" + + def __init__(self, message: str, expression: str = "") -> None: + super().__init__(message) + self.expression = expression + + +class MQueryBridge: + def __init__(self) -> None: + if not _BUNDLE_PATH.exists(): + raise ImportError( + f"M-Query bridge bundle not found at {_BUNDLE_PATH}. " + "Re-installing acryl-datahub[powerbi] may fix this." 
+ ) + try: + from py_mini_racer import MiniRacer + except ImportError as e: + raise ImportError( + "PowerBI M-Query parsing requires 'mini-racer'. " + "Install it with: pip install 'acryl-datahub[powerbi]'" + ) from e + + # Decompress bundle.js.gz in memory — fast (~50ms for 500KB) and happens once per process. + try: + bundle_js = gzip.decompress(_BUNDLE_PATH.read_bytes()).decode("utf-8") + except (gzip.BadGzipFile, OSError, EOFError) as e: + raise ImportError( + f"M-Query bridge bundle at {_BUNDLE_PATH} appears to be corrupt: {e}. " + "Re-installing acryl-datahub[powerbi] may fix this." + ) from e + self._ctx = MiniRacer() + self._ctx.eval(bundle_js) + + def parse(self, expression: str) -> NodeIdMap: + """ + Parse an M-Query expression and return a flat node map. + + Each key is a node ID (int); each value is a node dict with at least + ``kind`` (NodeKind string) and ``id``. Child nodes are embedded inline, + not as ID references, so you can walk them directly or look up any + node by ID via the returned map. + + Example — the LetExpression root for ``let x = 1 in x`` is at the + root of the returned dict and looks roughly like:: + + {1: {"kind": "LetExpression", "id": 1, "variableList": {...}, ...}, + 2: {"kind": "ArrayWrapper", "id": 2, ...}, + ...} + + Not thread-safe — callers must be single-threaded. + + Raises: + MQueryParseError: parser returned a structured error for this expression. + MQueryBridgeError: V8 context error or malformed response. + """ + # JSPromise is available: __init__ already guaranteed py_mini_racer is installed. + from py_mini_racer import JSPromise + + try: + # parseExpression is async, so ctx.call() returns an unresolved plain dict. + # Use ctx.eval() instead, which returns a JSPromise; call .get() to await it. 
+ result = self._ctx.eval(f"parseExpression({json.dumps(expression)})") + if not isinstance(result, JSPromise): + raise MQueryBridgeError( + f"M-Query bridge: expected JSPromise from parseExpression, got {type(result).__name__}" + ) + raw = result.get() + except MQueryBridgeError: + raise + except Exception as e: + # Catches all py_mini_racer errors (JSEvalException, JSTimeoutException, etc.) + # MiniRacerBaseException is not exported from the top-level namespace in mini-racer. + raise MQueryBridgeError(f"M-Query bridge V8 error: {e}") from e + + if not isinstance(raw, str): + raise MQueryBridgeError( + f"M-Query bridge returned non-string result: {type(raw).__name__}" + ) + + try: + result = json.loads(raw) + except (json.JSONDecodeError, TypeError) as e: + raise MQueryBridgeError( + f"M-Query bridge returned malformed JSON: {e}. Raw: {raw!r}" + ) from e + + if not result.get("ok"): + raise MQueryParseError( + result.get("error", "unknown error"), expression=expression + ) + + node_id_map = result.get("nodeIdMap") + if node_id_map is None: + raise MQueryBridgeError( + "M-Query bridge returned ok=true but 'nodeIdMap' is missing from response" + ) + + return {int(node_id): node for node_id, node in node_id_map} + + +_bridge_instance: Optional[MQueryBridge] = None +_bridge_lock = threading.Lock() + + +def get_bridge() -> MQueryBridge: + """Return the process-wide MQueryBridge, creating it on first call.""" + global _bridge_instance + if _bridge_instance is None: + with _bridge_lock: + if _bridge_instance is None: + _bridge_instance = MQueryBridge() + return _bridge_instance + + +def _clear_bridge() -> None: + """Drop the singleton so the next call to get_bridge() starts fresh. + + Called after a V8 crash to avoid reusing a broken context, and in tests + to ensure each test module gets an isolated bridge. + """ + global _bridge_instance + with _bridge_lock: + if _bridge_instance is not None: + # Explicitly close the V8 context before dropping the reference. 
+ # If we just set _bridge_instance = None here, Python's GC decides + # when to finalize the MiniRacer object. If that happens while other + # threads are active (e.g. in a later, unrelated test), MiniRacer's + # __del__ -> close() path segfaults. Closing synchronously here, while + # the lock is held and no concurrent parse() calls are in flight, + # shuts down V8 cleanly. + try: + _bridge_instance._ctx.close() + except Exception: + pass + _bridge_instance = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/ast_utils.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/ast_utils.py new file mode 100644 index 0000000000000..9acff88f147c2 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/ast_utils.py @@ -0,0 +1,188 @@ +""" +Utilities for navigating a powerquery-parser NodeIdMap. + +A NodeIdMap is dict[int, dict] — the deserialized output of the TypeScript +bridge. Each node dict has at minimum: + "kind": str — NodeKind enum value (e.g. "LetExpression") + "id": int — unique node identifier + +Child nodes are embedded directly as nested dicts, not as integer ID references. +The nodeIdMap provides a flat index to find any node by ID. +""" + +from typing import Dict, Optional + +NodeIdMap = dict[int, dict] + + +def find_nodes_by_kind(node_map: NodeIdMap, kind: str) -> list[dict]: + """Return all nodes of the given NodeKind at any depth in the map.""" + return [node for node in node_map.values() if node.get("kind") == kind] + + +def get_literal_value(node: dict) -> Optional[str]: + """ + Return the string content of a Text LiteralExpression with quotes stripped. + Returns None for non-Text literals or non-LiteralExpression nodes. 
+ """ + if node.get("kind") != "LiteralExpression": + return None + if node.get("literalKind") != "Text": + return None + literal = node.get("literal", "") + if not (literal.startswith('"') and literal.endswith('"')): + return None + return literal[1:-1] + + +def get_invoke_callee_name(node_map: NodeIdMap, invoke_node: dict) -> Optional[str]: + """ + Resolve the callee name of an InvokeExpression. + + The powerquery-parser AST encodes ``Snowflake.Databases(...)`` as:: + + RecursivePrimaryExpression + head: IdentifierExpression → identifier.literal = "Snowflake.Databases" + recursiveExpressions: ArrayWrapper + elements: [InvokeExpression ← this is the node passed in] + + The function name lives on the *parent*, not the invoke node itself, so we + scan the flat map for a RecursivePrimaryExpression whose element list + contains this invoke node's ID. + + Returns None if the node is not an InvokeExpression or the callee cannot be resolved. + """ + if invoke_node.get("kind") != "InvokeExpression": + return None + + invoke_id = invoke_node.get("id") + if invoke_id is None: + return None + + # Find the parent RecursivePrimaryExpression that contains this InvokeExpression + for node in node_map.values(): + if node.get("kind") != "RecursivePrimaryExpression": + continue + rec_exprs = node.get("recursiveExpressions", {}) + if not isinstance(rec_exprs, dict) or rec_exprs.get("kind") != "ArrayWrapper": + continue + for elem in rec_exprs.get("elements", []): + if isinstance(elem, dict) and elem.get("id") == invoke_id: + head = node.get("head", {}) + if head.get("kind") == "IdentifierExpression": + identifier = head.get("identifier", {}) + return identifier.get("literal") + + return None + + +def resolve_identifier( + node_map: NodeIdMap, + let_node: dict, + name: str, +) -> Optional[dict]: + """ + Look up a variable name in the given LetExpression's variable list. + Returns the value node assigned to `name`, or None if not found. 
+ + Structure: LetExpression.variableList (ArrayWrapper) → elements (Csv[]) → node (IdentifierPairedExpression) + """ + if let_node.get("kind") != "LetExpression": + return None + + var_list = let_node.get("variableList", {}) + if not isinstance(var_list, dict) or var_list.get("kind") != "ArrayWrapper": + return None + + for elem in var_list.get("elements", []): + # elements are Csv nodes wrapping IdentifierPairedExpression + if isinstance(elem, dict) and elem.get("kind") == "Csv": + inner = elem.get("node", {}) + else: + inner = elem + + if not isinstance(inner, dict): + continue + if inner.get("kind") not in ( + "IdentifierPairedExpression", + "GeneralizedIdentifierPairedExpression", + ): + continue + + key_node = inner.get("key", {}) + if not isinstance(key_node, dict): + continue + key_literal = key_node.get("literal", "") + # Normalize: strip #"..." quoting and compare case-insensitively + # (lenient match: M identifiers are actually case-sensitive, so names that differ only by case resolve to the first one listed) + key_bare = key_literal + if key_bare.startswith('#"') and key_bare.endswith('"'): + key_bare = key_bare[2:-1] + name_bare = name + if name_bare.startswith('#"') and name_bare.endswith('"'): + name_bare = name_bare[2:-1] + if key_bare.lower() == name_bare.lower(): + return inner.get("value") + + return None + + +def get_record_field_values( + node_map: NodeIdMap, + record_node: dict, + parameters: Optional[Dict[str, str]] = None, +) -> dict[str, str]: + """ + Extract key-value pairs from a RecordExpression where values are Text literals. + Keys: GeneralizedIdentifier literals. + Values: Text LiteralExpression values (quotes stripped). + Non-string values are omitted unless resolvable via parameters. 
+ + Structure: RecordExpression.content (ArrayWrapper) → elements (Csv[]) → node (GeneralizedIdentifierPairedExpression) + """ + parameters = parameters or {} + result: dict[str, str] = {} + if record_node.get("kind") != "RecordExpression": + return result + + content = record_node.get("content", {}) + if not isinstance(content, dict) or content.get("kind") != "ArrayWrapper": + return result + + for elem in content.get("elements", []): + if isinstance(elem, dict) and elem.get("kind") == "Csv": + inner = elem.get("node", {}) + else: + inner = elem + + if not isinstance(inner, dict): + continue + if inner.get("kind") not in ( + "GeneralizedIdentifierPairedExpression", + "IdentifierPairedExpression", + ): + continue + + key_node = inner.get("key", {}) + value_node = inner.get("value", {}) + if not isinstance(key_node, dict) or not isinstance(value_node, dict): + continue + + key = key_node.get("literal", "") + value = get_literal_value(value_node) + if ( + value is None + and parameters + and value_node.get("kind") == "IdentifierExpression" + ): + # Resolve identifier references using parameters + ident = value_node.get("identifier", {}) + ref_name = ident.get("literal", "") + if ref_name.startswith('#"') and ref_name.endswith('"'): + ref_name = ref_name[2:-1] + if ref_name in parameters: + value = parameters[ref_name] + if value is not None: + result[key] = value + + return result diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py index bb76b70c111f7..8e449202ce365 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py @@ -1,14 +1,12 @@ -import os -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum from typing import Any, Dict, List, Optional -from lark import Tree - +from 
datahub.configuration.env_vars import get_trace_powerbi_mquery_parser from datahub.ingestion.source.powerbi.config import DataPlatformPair from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo -TRACE_POWERBI_MQUERY_PARSER = os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", False) +TRACE_POWERBI_MQUERY_PARSER: bool = get_trace_powerbi_mquery_parser() @dataclass @@ -34,9 +32,13 @@ class IdentifierAccessor: @dataclass class DataAccessFunctionDetail: - arg_list: Tree - data_access_function_name: str + arg_list: dict # InvokeExpression node dict from NodeIdMap + data_access_function_name: ( + str # matches FunctionName.value (e.g. "Snowflake.Databases") + ) identifier_accessor: Optional[IdentifierAccessor] + node_map: Dict[int, dict] # full NodeIdMap for ast_utils navigation + parameters: Dict[str, str] = field(default_factory=dict) @dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/.gitignore b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/.gitignore new file mode 100644 index 0000000000000..e79fc0f1c915e --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/.gitignore @@ -0,0 +1,5 @@ +node_modules/ +dist/ +sea-prep.blob +# Uncompressed bundle — only commit the .gz +bundle.js diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/build.sh b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/build.sh new file mode 100755 index 0000000000000..b1c90336a8e17 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/build.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Builds bundle.js.gz for 
the PyMiniRacer M-Query bridge. +# Requires Node.js 16+ and npm. +# Run manually after bumping @microsoft/powerquery-parser in package.json. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +echo "==> Installing dependencies" +npm ci + +echo "==> Compiling TypeScript" +npx tsc + +echo "==> Bundling with esbuild (no --minify: keeps LexError/ParseError .name for bridge messages; gzip shrinks the artifact)" +# Use --platform=browser (IIFE output) so the bundle runs in py_mini_racer's +# V8 context, which has no Node.js built-ins (no exports, require, process). +npx esbuild dist/index.js \ + --bundle \ + --platform=browser \ + --target=es2020 \ + --format=iife \ + --outfile=bundle.js + +echo "==> Compressing bundle" +gzip -9 -f bundle.js # produces bundle.js.gz; -f overwrites existing + +echo " Written: bundle.js.gz ($(du -sh bundle.js.gz | cut -f1))" +echo "" +echo "==> Done. Commit bundle.js.gz and package-lock.json." +echo " Tests no longer depend on committed fixtures — they generate ASTs at test time." +echo " generate_fixtures.py is kept as an optional tool for manual AST inspection only." 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/bundle.js.gz b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/bundle.js.gz new file mode 100644 index 0000000000000..7ddec01934e3e Binary files /dev/null and b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/bundle.js.gz differ diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/generate_fixtures.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/generate_fixtures.py new file mode 100644 index 0000000000000..5966f8b056e33 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/generate_fixtures.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +""" +Optional dev-only tool for manually inspecting the parsed AST of M_QUERIES entries. + +This script is no longer needed for tests (tests generate ASTs at test time now). +It is kept as an optional tool for manual AST inspection during development. 
+ +Usage (from the repo root, output defaults to ./ast_fixtures_debug/): + PYTHONPATH=metadata-ingestion/src python \ + metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/generate_fixtures.py + +To override the output directory: + PYTHONPATH=metadata-ingestion/src python \ + metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/generate_fixtures.py \ + --output /path/to/output/ +""" + +import argparse +import json +import sys +from pathlib import Path + +# generate_fixtures.py is at: +# metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/ +# parents: [0]=mquery_bridge [1]=m_query [2]=powerbi [3]=source [4]=ingestion +# [5]=datahub [6]=src [7]=metadata-ingestion +TESTS_DIR = Path(__file__).resolve().parents[7] / "tests" / "integration" / "powerbi" +sys.path.insert(0, str(TESTS_DIR)) + +from test_m_parser import M_QUERIES # type: ignore # noqa: E402 + + +def slugify(text: str) -> str: + """Create a safe filename from the first ~40 chars of an expression.""" + slug = text[:40].replace(" ", "_").replace("\n", "").replace('"', "") + return "".join(c for c in slug if c.isalnum() or c in "_-") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--output", + required=False, + type=Path, + default=Path("./ast_fixtures_debug/"), + ) + args = parser.parse_args() + + # Import bridge here so PYTHONPATH errors surface clearly. 
+ from datahub.ingestion.source.powerbi.m_query._bridge import ( + _clear_bridge, + get_bridge, + ) + + _clear_bridge() + bridge = get_bridge() + + args.output.mkdir(parents=True, exist_ok=True) + + for i, expression in enumerate(M_QUERIES): + try: + node_map = bridge.parse(expression) + result = {"ok": True, "nodeIdMap": list(node_map.items())} + status = "ok" + except Exception as e: + result = {"ok": False, "error": str(e)} + status = "ERROR" + + slug = slugify(expression) + out_path = args.output / f"{i:02d}_{slug}.json" + out_path.write_text(json.dumps(result, indent=2)) + print(f"[{status}] {out_path.name}") + + _clear_bridge() + print(f"\nGenerated {len(M_QUERIES)} fixtures in {args.output}") + + +if __name__ == "__main__": + main() diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/index.ts b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/index.ts new file mode 100644 index 0000000000000..baba0f8a5c3e3 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/index.ts @@ -0,0 +1,57 @@ +import { DefaultSettings, ResultKind, Task, TaskUtils } from "@microsoft/powerquery-parser"; + +/** + * LexError from @microsoft/powerquery-parser often fails `instanceof Error` (legacy prototype + * chain), but always exposes string `.name` / `.message`. Using `String(err)` on those values + * throws `TypeError: Function.prototype.toString requires that 'this' be a Function` in V8. + */ +function formatTaskError(err: unknown): string { + if (typeof err === "object" && err !== null) { + const e = err as { name?: unknown; message?: unknown }; + if (typeof e.message === "string") { + const name = typeof e.name === "string" && e.name.length > 0 ? 
e.name : "Error"; + return `${name}: ${e.message}`; + } + } + if (err instanceof Error) { + return `${err.name}: ${err.message}`; + } + try { + return String(err); + } catch { + return "Unknown error"; + } +} + +// Expose parseExpression on globalThis so py_mini_racer can call it. +// Bare V8 has neither Node.js `global` nor browser `window`, only globalThis. +// +// Wire protocol (always returns a JSON string, never throws into Python): +// +// success: { ok: true, nodeIdMap: [[id, node], ...] } +// failure: { ok: false, error: "Lex: ..." | "Parse: ErrorName: message" } +// +// nodeIdMap is an array of [number, object] pairs rather than a plain object +// because JSON object keys must be strings, and converting integer keys to +// strings and back on the Python side is error-prone. The Python caller +// reconstructs dict[int, dict] from these pairs. +// +// Example node (LetExpression root of `let Source = Sql.Database(...) in Source`): +// [1, { kind: "LetExpression", id: 1, variableList: { kind: "ArrayWrapper", elements: [...] }, ... }] +(globalThis as Record<string, unknown>).parseExpression = async function (text: unknown): Promise<string> { + if (typeof text !== "string") { + return JSON.stringify({ ok: false, error: "parseExpression: 'text' must be a string" }); + } + try { + const result = await TaskUtils.tryLexParse(DefaultSettings, text); + if (result.resultKind === ResultKind.Error) { + const err = result.error; + const stage = result.stage === Task.TaskStage.Lex ? 
"Lex" : "Parse"; + return JSON.stringify({ ok: false, error: `${stage}: ${formatTaskError(err)}` }); + } + const nodeIdMap = [...result.nodeIdMapCollection.astNodeById.entries()]; + return JSON.stringify({ ok: true, nodeIdMap }); + } catch (e) { + return JSON.stringify({ ok: false, error: formatTaskError(e) }); // String(e) can itself throw on parser error objects (see formatTaskError) + } +}; diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/package-lock.json b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/package-lock.json new file mode 100644 index 0000000000000..405138dbf438c --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/package-lock.json @@ -0,0 +1,506 @@ +{ + "name": "datahub-mquery-bridge", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "datahub-mquery-bridge", + "version": "1.0.0", + "dependencies": { + "@microsoft/powerquery-parser": "0.18.4" + }, + "devDependencies": { + "@types/node": "^20.0.0", + "esbuild": "^0.20.0", + "typescript": "^5.4.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.20.2.tgz", + "integrity": "sha512-D+EBOJHXdNZcLJRBkhENNG8Wji2kgc9AZ9KiPr1JuZjsNtyHzrsfLRrY0tk2H2aoFu6RANO1y1iPPUCDYWkb5g==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.20.2.tgz", + "integrity": "sha512-t98Ra6pw2VaDhqNWO2Oph2LXbz/EJcnLmKLGBJwEwXX/JAN83Fym1rU8l0JUWK6HkIbWONCSSatf4sf2NBRx/w==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + 
"node_modules/@esbuild/android-arm64": { + "version": "0.20.2", + "resolved": 
"https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.20.2.tgz", + "integrity": "sha512-mRzjLacRtl/tWU0SvD8lUEwb61yP9cqQo6noDZP/O8VkwafSYwZ4yWy24kan8jE/IMERpYncRt2dw438LP3Xmg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.20.2.tgz", + "integrity": "sha512-btzExgV+/lMGDDa194CcUQm53ncxzeBrWJcncOBxuC6ndBkKxnHdFJn86mCIgTELsooUmwUm9FkhSp5HYu00Rg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.20.2.tgz", + "integrity": "sha512-4J6IRT+10J3aJH3l1yzEg9y3wkTDgDk7TSDFX+wKFiWjqWp/iCfLIYzGyasx9l0SAFPT1HwSCR+0w/h1ES/MjA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.20.2.tgz", + "integrity": "sha512-tBcXp9KNphnNH0dfhv8KYkZhjc+H3XBkF5DKtswJblV7KlT9EI2+jeA8DgBjp908WEuYll6pF+UStUCfEpdysA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.20.2.tgz", + "integrity": "sha512-d3qI41G4SuLiCGCFGUrKsSeTXyWG6yem1KcGZVS+3FYlYhtNoNgYrWcvkOoaqMhwXSMrZRl69ArHsGJ9mYdbbw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": 
">=12" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.20.2.tgz", + "integrity": "sha512-d+DipyvHRuqEeM5zDivKV1KuXn9WeRX6vqSqIDgwIfPQtwMP4jaDsQsDncjTDDsExT4lR/91OLjRo8bmC1e+Cw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.20.2.tgz", + "integrity": "sha512-VhLPeR8HTMPccbuWWcEUD1Az68TqaTYyj6nfE4QByZIQEQVWBB8vup8PpR7y1QHL3CpcF6xd5WVBU/+SBEvGTg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.20.2.tgz", + "integrity": "sha512-9pb6rBjGvTFNira2FLIWqDk/uaf42sSyLE8j1rnUpuzsODBq7FvpwHYZxQ/It/8b+QOS1RYfqgGFNLRI+qlq2A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.20.2.tgz", + "integrity": "sha512-o10utieEkNPFDZFQm9CoP7Tvb33UutoJqg3qKf1PWVeeJhJw0Q347PxMvBgVVFgouYLGIhFYG0UGdBumROyiig==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.20.2.tgz", + "integrity": "sha512-PR7sp6R/UC4CFVomVINKJ80pMFlfDfMQMYynX7t1tNTeivQ6XdX5r2XovMmha/VjR1YN/HgHWsVcTRIMkymrgQ==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": 
"MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.20.2.tgz", + "integrity": "sha512-4BlTqeutE/KnOiTG5Y6Sb/Hw6hsBOZapOVF6njAESHInhlQAghVVZL1ZpIctBOoTFbQyGW+LsVYZ8lSSB3wkjA==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.20.2.tgz", + "integrity": "sha512-rD3KsaDprDcfajSKdn25ooz5J5/fWBylaaXkuotBDGnMnDP1Uv5DLAN/45qfnf3JDYyJv/ytGHQaziHUdyzaAg==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.20.2.tgz", + "integrity": "sha512-snwmBKacKmwTMmhLlz/3aH1Q9T8v45bKYGE3j26TsaOVtjIag4wLfWSiZykXzXuE1kbCE+zJRmwp+ZbIHinnVg==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.20.2.tgz", + "integrity": "sha512-wcWISOobRWNm3cezm5HOZcYz1sKoHLd8VL1dl309DiixxVFoFe/o8HnwuIwn6sXre88Nwj+VwZUvJf4AFxkyrQ==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.20.2.tgz", + "integrity": 
"sha512-1MdwI6OOTsfQfek8sLwgyjOXAu+wKhLEoaOLTjbijk6E2WONYpH9ZU2mNtR+lZ2B4uwr+usqGuVfFT9tMtGvGw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.20.2.tgz", + "integrity": "sha512-K8/DhBxcVQkzYc43yJXDSyjlFeHQJBiowJ0uVL6Tor3jGQfSGHNNJcWxNbOI8v5k82prYqzPuwkzHt3J1T1iZQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.20.2.tgz", + "integrity": "sha512-eMpKlV0SThJmmJgiVyN9jTPJ2VBPquf6Kt/nAoo6DgHAoN57K15ZghiHaMvqjCye/uU4X5u3YSMgVBI1h3vKrQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.20.2.tgz", + "integrity": "sha512-2UyFtRC6cXLyejf/YEld4Hajo7UHILetzE1vsRcGL3earZEW77JxrFjH4Ez2qaTiEfMgAXxfAZCm1fvM/G/o8w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.20.2.tgz", + "integrity": "sha512-GRibxoawM9ZCnDxnP3usoUDO9vUkpAxIIZ6GQI+IlVmr5kP3zUq+l17xELTHMWTWzjxa2guPNyrpq1GWmPvcGQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.20.2", + "resolved": 
"https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.20.2.tgz", + "integrity": "sha512-HfLOfn9YWmkSKRQqovpnITazdtquEW8/SoHW7pWpuEeguaZI4QnCRW6b+oZTztdBnZOS2hqJ6im/D5cPzBTTlQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.20.2.tgz", + "integrity": "sha512-N49X4lJX27+l9jbLKSqZ6bKNjzQvHaT8IIFUy+YIqmXQdjYCToGWwOItDrfby14c78aDd5NHQl29xingXfCdLQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@microsoft/powerquery-parser": { + "version": "0.18.4", + "resolved": "https://registry.npmjs.org/@microsoft/powerquery-parser/-/powerquery-parser-0.18.4.tgz", + "integrity": "sha512-NV+vGcTLWIsWXEaKoLDHqThPEUWPGm8A1CK2rcdJtzAA2e30mf8svrujzOhQ1T0AErhMqjKz69Gsoa5cds6qbw==", + "license": "MIT", + "dependencies": { + "grapheme-splitter": "^1.0.4", + "performance-now": "^2.1.0" + }, + "engines": { + "node": ">=16.13.1" + } + }, + "node_modules/@types/node": { + "version": "20.19.37", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.37.tgz", + "integrity": "sha512-8kzdPJ3FsNsVIurqBs7oodNnCEVbni9yUEkaHbgptDACOPW04jimGagZ51E6+lXUwJjgnBw+hyko/lkFWCldqw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/esbuild": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz", + "integrity": "sha512-WdOOppmUNU+IbZ0PaDiTst80zjnrOkyJNHoKupIcVyU8Lvla3Ugx94VzkQ32Ijqd7UhHJy75gNWDMUekcrSJ6g==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=12" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.20.2", + "@esbuild/android-arm": 
"0.20.2", + "@esbuild/android-arm64": "0.20.2", + "@esbuild/android-x64": "0.20.2", + "@esbuild/darwin-arm64": "0.20.2", + "@esbuild/darwin-x64": "0.20.2", + "@esbuild/freebsd-arm64": "0.20.2", + "@esbuild/freebsd-x64": "0.20.2", + "@esbuild/linux-arm": "0.20.2", + "@esbuild/linux-arm64": "0.20.2", + "@esbuild/linux-ia32": "0.20.2", + "@esbuild/linux-loong64": "0.20.2", + "@esbuild/linux-mips64el": "0.20.2", + "@esbuild/linux-ppc64": "0.20.2", + "@esbuild/linux-riscv64": "0.20.2", + "@esbuild/linux-s390x": "0.20.2", + "@esbuild/linux-x64": "0.20.2", + "@esbuild/netbsd-x64": "0.20.2", + "@esbuild/openbsd-x64": "0.20.2", + "@esbuild/sunos-x64": "0.20.2", + "@esbuild/win32-arm64": "0.20.2", + "@esbuild/win32-ia32": "0.20.2", + "@esbuild/win32-x64": "0.20.2" + } + }, + "node_modules/grapheme-splitter": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/grapheme-splitter/-/grapheme-splitter-1.0.4.tgz", + "integrity": "sha512-bzh50DW9kTPM00T8y4o8vQg89Di9oLJVLW/KaOGIXJWP/iqCN6WKYkbNOF04vFLJhwcpYUh9ydh/+5vpOqV4YQ==", + "license": "MIT" + }, + "node_modules/performance-now": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz", + "integrity": "sha512-7EAHlyLHI56VEIdK57uwHdHKIaAGbnXPiw0yWbarQZOKaKpvUIgW0jWRVLiatnM+XXlSwsanIBH/hzGMJulMow==", + "license": "MIT" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + 
"dev": true, + "license": "MIT" + } + } +} diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/package.json b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/package.json new file mode 100644 index 0000000000000..9b55982bc8831 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/package.json @@ -0,0 +1,17 @@ +{ + "name": "datahub-mquery-bridge", + "version": "1.0.0", + "private": true, + "main": "dist/bundle.js", + "scripts": { + "build": "tsc && npx esbuild dist/index.js --bundle --platform=node --target=node20 --outfile=dist/bundle.js" + }, + "dependencies": { + "@microsoft/powerquery-parser": "0.18.4" + }, + "devDependencies": { + "typescript": "^5.4.0", + "esbuild": "^0.20.0", + "@types/node": "^20.0.0" + } +} diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/tsconfig.json b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/tsconfig.json new file mode 100644 index 0000000000000..c352aa4592bd4 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/mquery_bridge/tsconfig.json @@ -0,0 +1,15 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "commonjs", + "lib": ["ES2020"], + "outDir": "./dist", + "rootDir": ".", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "resolveJsonModule": true + }, + "include": ["index.ts"], + "exclude": ["node_modules", "dist"] +} diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index aaa5b8084409c..e68f9f9f32cb4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -1,13 +1,6 @@ -import functools -import importlib.resources as pkg_resource import logging from typing 
import Dict, List, Optional -import lark -from lark import Lark, Tree - -import datahub.ingestion.source.powerbi.m_query.data_classes -from datahub.configuration.env_vars import get_powerbi_m_query_parse_timeout from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( PowerBiDashboardSourceConfig, @@ -16,47 +9,42 @@ from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( AbstractDataPlatformInstanceResolver, ) -from datahub.ingestion.source.powerbi.m_query import resolver, validator +from datahub.ingestion.source.powerbi.m_query import ( + pattern_handler, + resolver as mquery_resolver, +) +from datahub.ingestion.source.powerbi.m_query._bridge import ( + MQueryBridgeError, + MQueryParseError, + _clear_bridge, + get_bridge, +) from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, + Lineage, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table from datahub.utilities.threading_timeout import TimeoutException, threading_timeout logger = logging.getLogger(__name__) -_M_QUERY_PARSE_TIMEOUT = get_powerbi_m_query_parse_timeout() - - -@functools.lru_cache(maxsize=1) -def get_lark_parser() -> Lark: - # Read lexical grammar as text - grammar: str = pkg_resource.read_text( - "datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule" - ) - # Create lark parser for the grammar text - return Lark(grammar, start="let_expression", regex=True) - - -def _parse_expression(expression: str, parse_timeout: int = 60) -> Tree: - lark_parser: Lark = get_lark_parser() - - # Replace U+00a0 NO-BREAK SPACE with a normal space. - # Sometimes PowerBI returns expressions with this character and it breaks the parser. 
- expression = expression.replace("\u00a0", " ") - - # Parser resolves the variable=null value to variable='', and in the Tree we get empty string - # to distinguish between an empty and null set =null to ="null" - expression = expression.replace("=null", '="null"') - - logger.debug(f"Parsing expression = {expression}") - with threading_timeout(parse_timeout): - parse_tree: Tree = lark_parser.parse(expression) - - if TRACE_POWERBI_MQUERY_PARSER: - logger.debug(parse_tree.pretty()) - return parse_tree +def _parse_with_bridge(expression: str, timeout: int) -> Dict[int, dict]: + """Call the bridge and return the NodeIdMap dict. + Clears the singleton on bridge crash or timeout so the next call gets a fresh context. + """ + try: + with threading_timeout(timeout): + return get_bridge().parse(expression) + except MQueryBridgeError: + _clear_bridge() + raise + except TimeoutException: + # The timeout interrupts the Python thread mid-V8-eval, leaving the MiniRacer + # context in an undefined state. Clear the singleton so the next call gets a + # fresh context rather than reusing the potentially-corrupted one. + _clear_bridge() + raise def get_upstream_tables( @@ -66,94 +54,150 @@ def get_upstream_tables( ctx: PipelineContext, config: PowerBiDashboardSourceConfig, parameters: Optional[Dict[str, str]] = None, -) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]: +) -> List[Lineage]: + """Parse the M-Query expression on *table* and return upstream lineage. + + Returns an empty list when the expression is absent, empty, a DAX + computed-table expression (no ``let`` keyword), or a NativeQuery that the + caller has opted out of (``native_query_parsing=False``). 
+ """ parameters = parameters or {} + if table.expression is None: - logger.debug(f"There is no M-Query expression in table {table.full_name}") + logger.debug("There is no M-Query expression in table %s", table.full_name) return [] - parameters = parameters or {} + expression = table.expression - logger.debug( - f"Processing {table.full_name} m-query expression for lineage extraction. Expression = {table.expression}" - ) + if not expression.strip(): + logger.debug("Empty M-Query expression in table %s — skipping", table.full_name) + return [] - try: - valid, message = validator.validate_parse_tree( - table.expression, native_query_enabled=config.native_query_parsing + if TRACE_POWERBI_MQUERY_PARSER: + logger.debug( + "Processing %s m-query expression for lineage extraction. Expression = %s", + table.full_name, + expression, ) - if valid is False: - assert message is not None - logger.debug(f"Validation failed: {message}") - reporter.info( - title="Non-Data Platform Expression", - message=message, - context=f"table-full-name={table.full_name}, expression={table.expression}, message={message}", - ) - reporter.m_query_parse_validation_errors += 1 - return [] - with reporter.m_query_parse_timer: - reporter.m_query_parse_attempts += 1 - parse_tree: Tree = _parse_expression( - table.expression, parse_timeout=config.m_query_parse_timeout - ) + # Replaces validator.py — correctly suppresses only NativeQuery expressions, + # fixing the prior bug where native_query_parsing=False suppressed all parsing. 
+ if not config.native_query_parsing and "Value.NativeQuery" in expression: + logger.debug( + "Skipping NativeQuery expression (native_query_parsing=False) for %s", + table.full_name, + ) + reporter.m_query_native_query_skipped += 1 + return [] - except KeyboardInterrupt: - raise + reporter.m_query_parse_attempts += 1 + + try: + with reporter.m_query_parse_timer: + node_map = _parse_with_bridge(expression, config.m_query_parse_timeout) except TimeoutException: reporter.m_query_parse_timeouts += 1 reporter.warning( title="M-Query Parsing Timeout", message=f"M-Query parsing timed out after {config.m_query_parse_timeout} seconds. Lineage for this table will not be extracted.", - context=f"table-full-name={table.full_name}, expression={table.expression}", + context=f"table-full-name={table.full_name}, expression={expression}", ) return [] - except ( - BaseException - ) as e: # TODO: Debug why BaseException is needed here and below. - if isinstance(e, lark.exceptions.UnexpectedCharacters): - error_type = "Unexpected Character Error" - reporter.m_query_parse_unexpected_character_errors += 1 + except MQueryParseError as e: + # Expressions without a `let` keyword are almost certainly not M-Query + # (e.g. DAX computed-table expressions like SUMMARIZE(...)). The old + # Lark parser happened to parse these and then logged INFO "Non-Data + # Platform Expression". Preserve that behaviour: only warn when the + # expression looks like it was intended to be M-Query. + if "let" not in expression.lower(): + reporter.m_query_non_mquery_expressions += 1 + logger.info( + "Non-M-Query expression in table %s — skipping lineage extraction " + "(expression does not contain 'let'). Expression: %s. Error: %s", + table.full_name, + expression, + e, + ) else: - error_type = "Unknown Parsing Error" reporter.m_query_parse_unknown_errors += 1 - + reporter.warning( + title="Unable to parse M-Query expression", + message="Got a parse error while parsing the expression. 
Lineage will be missing for this table.", + context=f"table-full-name={table.full_name}, expression={expression}", + exc=e, + ) + return [] + except MQueryBridgeError as e: + reporter.m_query_parse_unknown_errors += 1 reporter.warning( title="Unable to parse M-Query expression", - message=f"Got an '{error_type}' while parsing the expression. Lineage will be missing for this table.", - context=f"table-full-name={table.full_name}, expression={table.expression}", + message="Got a parse error while parsing the expression. Lineage will be missing for this table.", + context=f"table-full-name={table.full_name}", exc=e, ) return [] + reporter.m_query_parse_successes += 1 try: - lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = ( - resolver.MQueryResolver( - table=table, - parse_tree=parse_tree, - reporter=reporter, - parameters=parameters, - ).resolve_to_lineage( + data_access_func_details = mquery_resolver.resolve_to_data_access_functions( + node_map, parameters=parameters + ) + + if not data_access_func_details: + logger.debug( + "No recognized data-access function found in expression for table %s." + " Expression may use an unsupported source (e.g. Web.Contents," + " Excel.Workbook). 
To add support, reproduce with: %r", + table.full_name, + expression, + ) + + lineages: List[Lineage] = [] + for f_detail in data_access_func_details: + supported_pattern = pattern_handler.SupportedPattern.get_pattern_handler( + f_detail.data_access_function_name + ) + if supported_pattern is None: + logger.debug( + "No handler for data access function %s", + f_detail.data_access_function_name, + ) + continue + lineage = supported_pattern.handler()( ctx=ctx, + table=table, config=config, + reporter=reporter, platform_instance_resolver=platform_instance_resolver, - ) - ) + ).create_lineage(f_detail) + if lineage.upstreams: + lineages.append(lineage) - if lineage: + if lineages: reporter.m_query_resolver_successes += 1 else: reporter.m_query_resolver_no_lineage += 1 - return lineage - - except BaseException as e: + if data_access_func_details: + # Function(s) were recognized but all handlers returned empty — + # the per-handler debug logs above explain why. Log the expression + # here so it can be copy-pasted into a local test for investigation. + logger.debug( + "Recognized function(s) %s but no lineage extracted for table %s." 
+ " To reproduce locally: %r", + [f.data_access_function_name for f in data_access_func_details], + table.full_name, + expression, + ) + + return lineages + + except Exception as e: reporter.m_query_resolver_errors += 1 reporter.warning( title="Unknown M-Query Pattern", message="Encountered an unknown M-Query Expression", - context=f"table-full-name={table.full_name}, expression={table.expression}, message={e}", + context=f"table-full-name={table.full_name}, expression={expression}, message={e}", exc=e, ) return [] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py index fedade3c9bc10..2e5688db34e48 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py @@ -1,10 +1,9 @@ import logging from abc import ABC, abstractmethod from enum import Enum -from typing import Callable, Dict, List, Optional, Tuple, Type, cast +from typing import Callable, Dict, List, Optional, Tuple, Type import sqlglot -from lark import Tree from sqlglot import ParseError, expressions as exp from datahub.configuration.source_common import PlatformDetail @@ -22,7 +21,13 @@ from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( AbstractDataPlatformInstanceResolver, ) -from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function +from datahub.ingestion.source.powerbi.m_query import native_sql_parser +from datahub.ingestion.source.powerbi.m_query.ast_utils import ( + find_nodes_by_kind, + get_literal_value, + get_record_field_values, + resolve_identifier, +) from datahub.ingestion.source.powerbi.m_query.data_classes import ( DataAccessFunctionDetail, DataPlatformTable, @@ -50,6 +55,114 @@ logger = logging.getLogger(__name__) +def _unwrap_csv(elem: dict) -> dict: + """Unwrap a Csv wrapper node, 
returning the inner node.""" + if isinstance(elem, dict) and elem.get("kind") == "Csv": + return elem.get("node", elem) + return elem + + +def _get_invoke_elements(invoke_node: dict) -> List[dict]: + """Return the unwrapped argument elements from an InvokeExpression.""" + content = invoke_node.get("content", {}) + if not isinstance(content, dict) or content.get("kind") != "ArrayWrapper": + return [] + return [_unwrap_csv(e) for e in content.get("elements", []) if isinstance(e, dict)] + + +def _get_arg_values( + node_map: Dict[int, dict], + invoke_node: dict, + parameters: Optional[Dict[str, str]] = None, +) -> List[Optional[str]]: + """Extract positional string arguments from an InvokeExpression node. + + Returns a list of Optional[str] -- one per argument. + RecordExpression arguments return None. + IdentifierExpression arguments are resolved via parameters dict. + """ + parameters = parameters or {} + values: List[Optional[str]] = [] + for inner in _get_invoke_elements(invoke_node): + val = get_literal_value(inner) + if val is None and isinstance(inner, dict): + if inner.get("kind") == "IdentifierExpression": + ref_name = inner.get("identifier", {}).get("literal", "") + if ref_name.startswith('#"') and ref_name.endswith('"'): + ref_name = ref_name[2:-1] + if ref_name in parameters: + val = parameters[ref_name] + else: + logger.debug( + "Argument '%s' is an unresolved parameter reference" + " — not found in dataset parameters", + ref_name, + ) + values.append(val) + return values + + +def _get_record_args(node_map: Dict[int, dict], invoke_node: dict) -> Dict[str, str]: + """Extract all key-value pairs from RecordExpression arguments in an InvokeExpression.""" + result: Dict[str, str] = {} + for inner in _get_invoke_elements(invoke_node): + if isinstance(inner, dict) and inner.get("kind") == "RecordExpression": + result.update(get_record_field_values(node_map, inner)) + return result + + +def _get_data_source_tokens(node_map: Dict[int, dict], arg_node: dict) -> 
List[str]: + """Extract [platform_name, server, ...other_args] from a data source node. + + If arg_node is an IdentifierExpression, resolves it through the let scope. + """ + # Resolve through let scope if identifier + if arg_node.get("kind") == "IdentifierExpression": + name = arg_node.get("identifier", {}).get("literal", "") + let_nodes = sorted( + find_nodes_by_kind(node_map, "LetExpression"), + key=lambda n: n.get("id", 0), + ) + if let_nodes: + resolved = resolve_identifier(node_map, let_nodes[0], name) + if resolved is not None: + arg_node = resolved + + if arg_node.get("kind") != "RecursivePrimaryExpression": + return [] + + head = arg_node.get("head", {}) + platform_name = "" + if head.get("kind") == "IdentifierExpression": + platform_name = head.get("identifier", {}).get("literal", "") + + tokens: List[str] = [platform_name] + + rec_exprs = arg_node.get("recursiveExpressions", {}) + elements = rec_exprs.get("elements", []) if isinstance(rec_exprs, dict) else [] + + for elem in elements: + if elem.get("kind") != "InvokeExpression": + continue + content = elem.get("content", {}) + if not isinstance(content, dict) or content.get("kind") != "ArrayWrapper": + continue + for arg_elem in content.get("elements", []): + inner = _unwrap_csv(arg_elem) + if not isinstance(inner, dict): + continue + val = get_literal_value(inner) + if val is not None: + tokens.append(val) + elif inner.get("kind") == "RecordExpression": + kv = get_record_field_values(node_map, inner) + for k, v in kv.items(): + tokens.append(k) + tokens.append(v) + + return tokens + + def get_next_item(items: List[str], item: str) -> Optional[str]: if item in items: try: @@ -157,62 +270,67 @@ def get_platform_pair(self) -> DataPlatformPair: @staticmethod def get_db_detail_from_argument( - arg_list: Tree, + arg_list: dict, + node_map: Optional[Dict[int, dict]] = None, + parameters: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[str]]: - # TODO: tree_function.token_values turns nulls 
into empty strings, - # which then get removed by remove_whitespaces_from_list. We would - # prefer to pass them along as None to give callers an accurate view - # of the arguments. - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(arg_list) - ), - ) - logger.debug(f"DB Details: {arguments}") + node_map = node_map or {} + args = _get_arg_values(node_map, arg_list, parameters=parameters) + logger.debug(f"DB Details: {args}") return ( - arguments[0] if len(arguments) > 0 else None, - arguments[1] if len(arguments) > 1 else None, + args[0] if len(args) > 0 else None, + args[1] if len(args) > 1 else None, ) @staticmethod def create_reference_table( - arg_list: Tree, + arg_list: dict, table_detail: Dict[str, str], + node_map: Optional[Dict[int, dict]] = None, + parameters: Optional[Dict[str, str]] = None, ) -> Optional[ReferencedTable]: - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(arg_list) - ), - ) + node_map = node_map or {} + args = _get_arg_values(node_map, arg_list, parameters=parameters) + record_fields = _get_record_args(node_map, arg_list) - logger.debug(f"Processing arguments {arguments}") + logger.debug(f"Processing arguments {args}, record_fields {record_fields}") - if ( - len(arguments) >= 4 # [0] is warehouse FQDN. - # [1] is endpoint, we are not using it. 
- # [2] is "Catalog" key - # [3] is catalog's value - ): + warehouse = args[0] if args else None + if warehouse is None: + logger.debug( + "No warehouse/host argument resolved from %s — skipping lineage", + args, + ) + return None + + catalog = record_fields.get("Catalog") + if catalog is not None: return ReferencedTable( - warehouse=arguments[0], - catalog=arguments[3], - # As per my observation, database and catalog names are same in M-Query + warehouse=warehouse, + catalog=catalog, database=table_detail["Database"] if table_detail.get("Database") - else arguments[3], + else catalog, schema=table_detail["Schema"], table=table_detail.get("Table") or table_detail["View"], ) - elif len(arguments) == 2: + elif len(args) >= 2: return ReferencedTable( - warehouse=arguments[0], + warehouse=warehouse, database=table_detail["Database"], schema=table_detail["Schema"], table=table_detail.get("Table") or table_detail["View"], catalog=None, ) + logger.debug( + "Insufficient arguments to build table reference" + " (warehouse=%s, args=%s, record_fields=%s) — skipping lineage", + warehouse, + args, + record_fields, + ) return None @staticmethod @@ -361,7 +479,11 @@ def create_lineage( f"Processing AmazonAthena data-access function detail {data_access_func_detail}" ) - server, _ = self.get_db_detail_from_argument(data_access_func_detail.arg_list) + server, _ = self.get_db_detail_from_argument( + data_access_func_detail.arg_list, + node_map=data_access_func_detail.node_map, + parameters=data_access_func_detail.parameters, + ) if server is None: logger.debug("Server/region not found in Athena data access function") return Lineage.empty() @@ -481,19 +603,31 @@ def create_lineage( ) server, db_name = self.get_db_detail_from_argument( - data_access_func_detail.arg_list + data_access_func_detail.arg_list, + node_map=data_access_func_detail.node_map, + parameters=data_access_func_detail.parameters, ) if db_name is None or server is None: - return Lineage.empty() # Return an empty list + 
logger.debug( + "Server or database argument not resolved for Redshift table %s" + " (server=%s, db=%s) — skipping lineage", + self.table.full_name, + server, + db_name, + ) + return Lineage.empty() - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Name"] + accessor = data_access_func_detail.identifier_accessor + if accessor is None or accessor.next is None: + logger.debug( + "Incomplete accessor chain for Redshift table %s" + " — expected two navigation steps (schema then table)", + self.table.full_name, + ) + return Lineage.empty() - table_name: str = cast( - IdentifierAccessor, - cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, - ).items["Name"] + schema_name: str = accessor.items["Name"] + table_name: str = accessor.next.items["Name"] qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" @@ -535,7 +669,7 @@ def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]: db_name = splitter_result[1].split(".")[0] - return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name + return splitter_result[0].strip('"'), db_name def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail @@ -544,23 +678,26 @@ def create_lineage( f"Processing Oracle data-access function detail {data_access_func_detail}" ) - arguments: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_func_detail.arg_list) + args = _get_arg_values( + data_access_func_detail.node_map, + data_access_func_detail.arg_list, + parameters=data_access_func_detail.parameters, ) - server, db_name = self._get_server_and_db_name(arguments[0]) + if not args or args[0] is None: + return Lineage.empty() + + server, db_name = self._get_server_and_db_name(args[0]) if db_name is None or server is None: return Lineage.empty() - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Schema"] 
+ accessor = data_access_func_detail.identifier_accessor + if accessor is None or accessor.next is None: + return Lineage.empty() - table_name: str = cast( - IdentifierAccessor, - cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, - ).items["Name"] + schema_name: Optional[str] = accessor.items.get("Schema") + table_name: Optional[str] = accessor.next.items.get("Name") qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" @@ -643,6 +780,8 @@ def create_lineage( table_reference = self.create_reference_table( arg_list=data_access_func_detail.arg_list, table_detail=table_detail, + node_map=data_access_func_detail.node_map, + parameters=data_access_func_detail.parameters, ) if table_reference: @@ -695,18 +834,41 @@ def two_level_access_pattern( ) server, db_name = self.get_db_detail_from_argument( - data_access_func_detail.arg_list + data_access_func_detail.arg_list, + node_map=data_access_func_detail.node_map, + parameters=data_access_func_detail.parameters, ) - if server is None or db_name is None: - return Lineage.empty() # Return an empty list + if db_name is None: + logger.debug( + "No database argument resolved for %s (%s) — skipping lineage", + self.get_platform_pair().powerbi_data_platform_name, + self.table.full_name, + ) + return Lineage.empty() + if server is None: + # Server argument is an unresolved parameter reference (e.g. Sql.Database(ServerName, "db")). + # Fall back to empty-string server to preserve the pre-v2-parser behavior (partial lineage). + logger.info( + "Server argument not resolved from dataset parameters for table %s" + " — emitting partial lineage without server host." 
+ " Add the server parameter to the dataset to resolve fully.", + self.table.full_name, + ) + server = "" - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Schema"] + accessor = data_access_func_detail.identifier_accessor + if accessor is None: + logger.debug( + "No accessor chain for %s (%s) — expression may reference the source" + " directly without a table navigation step (e.g. missing" + " Source{[Schema=...,Item=...]}[Data])", + self.get_platform_pair().powerbi_data_platform_name, + self.table.full_name, + ) + return Lineage.empty() - table_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Item"] + schema_name: Optional[str] = accessor.items.get("Schema") + table_name: Optional[str] = accessor.items.get("Item") qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" @@ -744,18 +906,31 @@ def create_lineage( ) server, db_name = self.get_db_detail_from_argument( - data_access_func_detail.arg_list + data_access_func_detail.arg_list, + node_map=data_access_func_detail.node_map, + parameters=data_access_func_detail.parameters, ) if server is None or db_name is None: - return Lineage.empty() # Return an empty list + logger.debug( + "Server or database argument not resolved for MySQL table %s" + " (server=%s, db=%s) — skipping lineage", + self.table.full_name, + server, + db_name, + ) + return Lineage.empty() - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Schema"] + accessor = data_access_func_detail.identifier_accessor + if accessor is None: + logger.debug( + "No accessor chain for MySQL table %s" + " — expected Source{[Schema=...,Item=...]} navigation step", + self.table.full_name, + ) + return Lineage.empty() - table_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Item"] + schema_name: Optional[str] = accessor.items.get("Schema") + table_name: 
Optional[str] = accessor.items.get("Item") qualified_table_name: str = f"{schema_name}.{table_name}" @@ -853,22 +1028,34 @@ def create_urn_using_old_parser( def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail ) -> Lineage: - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_func_detail.arg_list) - ), - ) + node_map = data_access_func_detail.node_map server, database = self.get_db_detail_from_argument( - data_access_func_detail.arg_list + data_access_func_detail.arg_list, + node_map=node_map, + parameters=data_access_func_detail.parameters, ) - if server is None or database is None: - return Lineage.empty() # Return an empty list - - assert server - assert database # to silent the lint + if database is None: + logger.debug( + "No database argument resolved for MSSql table %s — skipping lineage", + self.table.full_name, + ) + return Lineage.empty() + if server is None: + # Server argument is an unresolved parameter reference (e.g. Sql.Database(ServerName, "db")). + # The parameter is not in the dataset's parameters dict, so we can't resolve the host. + # Fall back to empty-string server to preserve the pre-v2-parser behavior (partial lineage). + logger.info( + "Server argument not resolved from dataset parameters for table %s" + " — emitting partial lineage without server host." + " Add the server parameter to the dataset to resolve fully.", + self.table.full_name, + ) + server = "" - query: Optional[str] = get_next_item(arguments, "Query") + # Check for inline SQL query in record arguments (e.g. 
[Query="SELECT ..."]) + record_fields = _get_record_args(node_map, data_access_func_detail.arg_list) + query: Optional[str] = record_fields.get("Query") if query: if self.config.enable_advance_lineage_sql_construct is False: # Use previous parser to generate URN to keep backward compatibility @@ -895,9 +1082,11 @@ def create_lineage( class ThreeStepDataAccessPattern(AbstractLineage, ABC): def get_datasource_server( - self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail + self, + args: List[Optional[str]], + data_access_func_detail: DataAccessFunctionDetail, ) -> str: - return tree_function.strip_char_from_list([arguments[0]])[0] + return args[0].strip('"') if args and args[0] else "" def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail @@ -906,21 +1095,22 @@ def create_lineage( f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}" ) - arguments: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_func_detail.arg_list) + args = _get_arg_values( + data_access_func_detail.node_map, + data_access_func_detail.arg_list, + parameters=data_access_func_detail.parameters, ) + + accessor = data_access_func_detail.identifier_accessor + if accessor is None or accessor.next is None or accessor.next.next is None: + return Lineage.empty() + # First is database name - db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore + db_name: str = accessor.items["Name"] # Second is schema name - schema_name: str = cast( - IdentifierAccessor, - data_access_func_detail.identifier_accessor.next, # type: ignore - ).items["Name"] + schema_name: str = accessor.next.items["Name"] # Third is table name - table_name: str = cast( - IdentifierAccessor, - data_access_func_detail.identifier_accessor.next.next, # type: ignore - ).items["Name"] + table_name: str = accessor.next.next.items["Name"] qualified_table_name: str = 
f"{db_name}.{schema_name}.{table_name}" @@ -928,7 +1118,7 @@ def create_lineage( f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" ) - server: str = self.get_datasource_server(arguments, data_access_func_detail) + server: str = self.get_datasource_server(args, data_access_func_detail) urn = make_urn( config=self.config, @@ -961,10 +1151,11 @@ def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.GOOGLE_BIGQUERY.value def get_datasource_server( - self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail + self, + args: List[Optional[str]], + data_access_func_detail: DataAccessFunctionDetail, ) -> str: # In Google BigQuery server is project-name - # condition to silent lint, it is not going to be None return ( data_access_func_detail.identifier_accessor.items["Name"] if data_access_func_detail.identifier_accessor is not None @@ -973,10 +1164,11 @@ def get_datasource_server( class NativeQueryLineage(AbstractLineage): + # Maps the full data-access function name (e.g. "Snowflake.Databases") to its platform. 
SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = { - SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE, - SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT, - SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL, + FunctionName.SNOWFLAKE_DATA_ACCESS.value: SupportedDataPlatform.SNOWFLAKE, + FunctionName.AMAZON_REDSHIFT_DATA_ACCESS.value: SupportedDataPlatform.AMAZON_REDSHIFT, + FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS.value: SupportedDataPlatform.DatabricksMultiCloud_SQL, } current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE @@ -1027,7 +1219,7 @@ def create_urn_using_old_parser(self, query: str, server: str) -> Lineage: def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: if ( data_access_tokens[0] - != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name + != FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS.value ): return None @@ -1050,54 +1242,53 @@ def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail ) -> Lineage: - t1: Optional[Tree] = tree_function.first_arg_list_func( - data_access_func_detail.arg_list - ) - assert t1 is not None - flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1) + node_map = data_access_func_detail.node_map + invoke_node = data_access_func_detail.arg_list - if len(flat_argument_list) != 2: + elements = _get_invoke_elements(invoke_node) + + if len(elements) < 2: logger.debug( - f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}" + "Expecting at least 2 arguments for Value.NativeQuery, got %d", + len(elements), ) - logger.debug(f"Flat argument list = {flat_argument_list}") return Lineage.empty() - data_access_tokens: List[str] = 
tree_function.remove_whitespaces_from_list( - tree_function.token_values(flat_argument_list[0]) - ) + source_node = elements[0] + sql_node = elements[1] - if not self.is_native_parsing_supported(data_access_tokens[0]): - logger.debug( - f"Unsupported native-query data-platform = {data_access_tokens[0]}" - ) + # Extract SQL query from second arg + sql_query = get_literal_value(sql_node) + if sql_query is None: + logger.debug("Could not extract SQL query from second argument") + return Lineage.empty() + + # Extract data source tokens from first arg + data_access_tokens = _get_data_source_tokens(node_map, source_node) + + if not data_access_tokens or not self.is_native_parsing_supported( + data_access_tokens[0] + ): logger.debug( - f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}" + "Unsupported native-query data-platform = %s", + data_access_tokens[0] if data_access_tokens else "none", ) - return Lineage.empty() - if len(data_access_tokens[0]) < 3: + if len(data_access_tokens) < 2: logger.debug( - f"Server is not available in argument list for data-platform {data_access_tokens[0]}. 
Returning empty " - "list" + "Server not available in data source tokens for %s", + data_access_tokens[0], ) return Lineage.empty() self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[ data_access_tokens[0] ] - # The First argument is the query - sql_query: str = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(flat_argument_list[1]) - ), - )[0] # Remove any whitespaces and double quotes character - - server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] + # data_access_tokens[0] = platform name, [1] = first literal arg = server + server = data_access_tokens[1] if self.config.enable_advance_lineage_sql_construct is False: - # Use previous parser to generate URN to keep backward compatibility return self.create_urn_using_old_parser( query=sql_query, server=server, @@ -1123,7 +1314,9 @@ def create_lineage( ) connect_string, query = self.get_db_detail_from_argument( - data_access_func_detail.arg_list + data_access_func_detail.arg_list, + node_map=data_access_func_detail.node_map, + parameters=data_access_func_detail.parameters, ) if not connect_string: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index e098deaa949d4..926ca6833e39c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -1,413 +1,358 @@ -import logging -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Tuple, Union +""" +M-Query resolver: walks a powerquery-parser NodeIdMap to find DataAccessFunctionDetail +entries (recognized data-source function calls with their navigation chain). 
+""" -from lark import Tree +import logging +from typing import Dict, FrozenSet, List, Optional, Set, Tuple -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.source.powerbi.config import ( - PowerBiDashboardSourceConfig, - PowerBiDashboardSourceReport, +from datahub.ingestion.source.powerbi.m_query.ast_utils import ( + NodeIdMap, + get_record_field_values, + resolve_identifier, ) -from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( - AbstractDataPlatformInstanceResolver, -) -from datahub.ingestion.source.powerbi.m_query import tree_function from datahub.ingestion.source.powerbi.m_query.data_classes import ( - TRACE_POWERBI_MQUERY_PARSER, DataAccessFunctionDetail, + FunctionName, IdentifierAccessor, - Lineage, -) -from datahub.ingestion.source.powerbi.m_query.pattern_handler import ( - AbstractLineage, - SupportedPattern, ) -from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table logger = logging.getLogger(__name__) +_RECOGNIZED_FUNCTIONS: FrozenSet[str] = frozenset(f.value for f in FunctionName) -class AbstractDataAccessMQueryResolver(ABC): - table: Table - parse_tree: Tree - parameters: Dict[str, str] - reporter: PowerBiDashboardSourceReport - data_access_functions: List[str] - - def __init__( - self, - table: Table, - parse_tree: Tree, - reporter: PowerBiDashboardSourceReport, - parameters: Dict[str, str], - ): - self.table = table - self.parse_tree = parse_tree - self.reporter = reporter - self.parameters = parameters - self.data_access_functions = SupportedPattern.get_function_names() - - @abstractmethod - def resolve_to_lineage( - self, - ctx: PipelineContext, - config: PowerBiDashboardSourceConfig, - platform_instance_resolver: AbstractDataPlatformInstanceResolver, - ) -> List[Lineage]: - pass - - -class MQueryResolver(AbstractDataAccessMQueryResolver, ABC): - """ - This class parses the M-Query recursively to generate DataAccessFunctionDetail (see method 
create_data_access_functional_detail). - - This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail. - - Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator - (see method resolve_to_lineage). - - Classes which extended from AbstractDataPlatformTableCreator know how to convert generated DataAccessFunctionDetail instance - to the respective DataPlatformTable instance as per dataplatform. +def resolve_to_data_access_functions( + node_map: NodeIdMap, + parameters: Optional[Dict[str, str]] = None, +) -> List[DataAccessFunctionDetail]: """ - - def get_item_selector_tokens( - self, - expression_tree: Tree, - ) -> Tuple[Optional[str], Optional[Dict[str, str]]]: - item_selector: Optional[Tree] = tree_function.first_item_selector_func( - expression_tree + Entry point: walk the NodeIdMap and return all DataAccessFunctionDetail entries + for recognized data-access function calls in the expression. 
+ """ + parameters = parameters or {} + let_nodes = [ + (k, v) for k, v in node_map.items() if v.get("kind") == "LetExpression" + ] + if not let_nodes: + logger.debug("No LetExpression found in node map") + return [] + + # Use the outermost let (smallest id = parsed first / outermost scope) + root_let_id, root_let = min(let_nodes, key=lambda kv: kv[0]) + + # LetExpression.expression is embedded -- not an ID + output_node = root_let.get("expression") + if output_node is None: + logger.debug( + "LetExpression (id=%d) has no output expression — cannot resolve lineage", + root_let_id, ) - if item_selector is None: - logger.debug("Item Selector not found in tree") - logger.debug(expression_tree.pretty()) - return None, None - - identifier_tree: Optional[Tree] = tree_function.first_identifier_func( - expression_tree + return [] + + results: List[DataAccessFunctionDetail] = [] + seen: Set[Tuple[int, str]] = set() + + _walk( + node_map=node_map, + node=output_node, + current_let=root_let, + current_let_id=root_let_id, + accessor_chain=None, + results=results, + seen=seen, + parameters=parameters, + ) + return results + + +def _walk( + node_map: NodeIdMap, + node: Optional[dict], + current_let: dict, + current_let_id: int, + accessor_chain: Optional[IdentifierAccessor], + results: List[DataAccessFunctionDetail], + seen: Set[Tuple[int, str]], + parameters: Optional[Dict[str, str]] = None, +) -> None: + if node is None: + return + + kind = node.get("kind", "") + + # -- IdentifierExpression (wraps Identifier) -- + if kind == "IdentifierExpression": + identifier = node.get("identifier", {}) + name = identifier.get("literal", "") + # Strip quoted identifier prefix/suffix (#"name" → name) + if name.startswith('#"') and name.endswith('"'): + name = name[2:-1] + _walk_identifier_name( + node_map, + name, + current_let, + current_let_id, + accessor_chain, + results, + seen, + parameters, ) - if identifier_tree is None: - logger.debug("Identifier not found in tree") - 
logger.debug(item_selector.pretty()) - return None, None - - # remove whitespaces and quotes from token - tokens: List[str] = tree_function.strip_char_from_list( - tree_function.remove_whitespaces_from_list( - tree_function.token_values(item_selector, parameters=self.parameters) - ), + return + + # -- Identifier -- + if kind == "Identifier": + name = node.get("literal", "") + if name.startswith('#"') and name.endswith('"'): + name = name[2:-1] + _walk_identifier_name( + node_map, + name, + current_let, + current_let_id, + accessor_chain, + results, + seen, + parameters, ) - identifier: List[str] = tree_function.token_values( - identifier_tree, parameters={} + return + + # -- LetExpression (nested let scope) -- + if kind == "LetExpression": + inner_let_id = node.get("id", -1) + inner_output = node.get("expression") # embedded node + _walk( + node_map, + inner_output, + node, + inner_let_id, + accessor_chain, + results, + seen, + parameters, ) - - # convert tokens to dict - iterator = iter(tokens) - - return "".join(identifier), dict(zip(iterator, iterator, strict=False)) - - @staticmethod - def get_argument_list(invoke_expression: Tree) -> Optional[Tree]: - argument_list: Optional[Tree] = tree_function.first_arg_list_func( - invoke_expression + return + + # -- RecursivePrimaryExpression -- + # Covers both function calls (head + InvokeExpression) and + # accessor chains (head + ItemAccessExpression + FieldSelector) + if kind == "RecursivePrimaryExpression": + _walk_recursive_primary( + node_map, + node, + current_let, + current_let_id, + accessor_chain, + results, + seen, + parameters, ) - if argument_list is None: - logger.debug("First argument-list rule not found in input tree") - return None - - return argument_list - - def take_first_argument(self, expression: Tree) -> Optional[Tree]: - # function is not data-access function, lets process function argument - first_arg_tree: Optional[Tree] = tree_function.first_arg_list_func(expression) - - if first_arg_tree is 
None: - logger.debug( - f"Function invocation without argument in expression = {expression.pretty()}" - ) - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "Function invocation without argument", - ) - return None - return first_arg_tree - - def _process_invoke_expression( - self, invoke_expression: Tree - ) -> Union[DataAccessFunctionDetail, List[str], None]: - letter_tree: Tree = invoke_expression.children[0] - data_access_func: str = tree_function.make_function_name(letter_tree) - # The invoke function is either DataAccess function like PostgreSQL.Database() or - # some other function like Table.AddColumn or Table.Combine and so on - - logger.debug(f"function-name: {data_access_func}") - - if data_access_func in self.data_access_functions: - arg_list: Optional[Tree] = MQueryResolver.get_argument_list( - invoke_expression - ) - if arg_list is None: - self.reporter.report_warning( - title="M-Query Resolver Error", - message="Unable to extract lineage from parsed M-Query expression (missing argument list)", - context=f"{self.table.full_name}: argument list not found for data-access-function {data_access_func}", + return + + # -- ListExpression (Table.Combine sources) -- + if kind == "ListExpression": + content = node.get("content", {}) + if isinstance(content, dict) and content.get("kind") == "ArrayWrapper": + for elem in content.get("elements", []): + inner = _unwrap_csv(elem) + # Use a copy of seen for each list element so sibling paths + # sharing common ancestors don't trigger false circular refs + _walk( + node_map, + inner, + current_let, + current_let_id, + accessor_chain, + results, + seen.copy(), + parameters, ) - return None - - return DataAccessFunctionDetail( - arg_list=arg_list, - data_access_function_name=data_access_func, - identifier_accessor=None, + return + + # -- FunctionExpression (each / anonymous function body) -- + if kind == "FunctionExpression": + body = node.get("expression") + if body is not None: + _walk( 
+ node_map, + body, + current_let, + current_let_id, + accessor_chain, + results, + seen, + parameters, ) - - first_arg_tree: Optional[Tree] = self.take_first_argument(invoke_expression) - if first_arg_tree is None: - return None - - flat_arg_list: List[Tree] = tree_function.flat_argument_list(first_arg_tree) - if len(flat_arg_list) == 0: - logger.debug("flat_arg_list is zero") - return None - - first_argument: Tree = flat_arg_list[0] # take first argument only - - # Detect nested function calls in the first argument - # M-Query's data transformation pipeline: - # 1. Functions typically operate on tables/columns - # 2. First argument must be either: - # - A table variable name (referencing data source) - # - Another function that eventually leads to a table - # - # Example of nested functions: - # #"Split Column by Delimiter2" = Table.SplitColumn( - # Table.TransformColumnTypes(#"Removed Columns1", "KB") - # ) - # - # In this example: - # - The inner function Table.TransformColumnTypes takes #"Removed Columns1" - # (a table reference) as its first argument - # - Its result is then passed as the first argument to Table.SplitColumn - second_invoke_expression: Optional[Tree] = ( - tree_function.first_invoke_expression_func(first_argument) + return + + logger.debug("Unhandled node kind '%s', returning empty for this branch", kind) + + +def _walk_recursive_primary( + node_map: NodeIdMap, + node: dict, + current_let: dict, + current_let_id: int, + accessor_chain: Optional[IdentifierAccessor], + results: List[DataAccessFunctionDetail], + seen: Set[Tuple[int, str]], + parameters: Optional[Dict[str, str]] = None, +) -> None: + head = node.get("head") # embedded IdentifierExpression + rec_exprs = node.get("recursiveExpressions", {}) + elements = rec_exprs.get("elements", []) if isinstance(rec_exprs, dict) else [] + + if not elements: + _walk( + node_map, + head, + current_let, + current_let_id, + accessor_chain, + results, + seen, + parameters, ) - if 
second_invoke_expression: - # 1. The First argument is function call - # 2. That function's first argument references next table variable - first_arg_tree = self.take_first_argument(second_invoke_expression) - if first_arg_tree is None: - return None - - flat_arg_list = tree_function.flat_argument_list(first_arg_tree) - if len(flat_arg_list) == 0: - logger.debug("flat_arg_list is zero") - return None - - first_argument = flat_arg_list[0] # take first argument only - - expression: Optional[Tree] = tree_function.first_list_expression_func( - first_argument + return + + first = elements[0] + + # Function call: Snowflake.Databases(...), Table.RenameColumns(...), etc. + if first.get("kind") == "InvokeExpression": + _walk_invoke( + node_map, + head, + first, + current_let, + current_let_id, + accessor_chain, + results, + seen, + parameters, ) - - if TRACE_POWERBI_MQUERY_PARSER: - logger.debug(f"Extracting token from tree {first_argument.pretty()}") - else: - logger.debug(f"Extracting token from tree {first_argument}") - if expression is None: - expression = tree_function.first_type_expression_func(first_argument) - if expression is None: - logger.debug( - f"Either list_expression or type_expression is not found = {invoke_expression.pretty()}" - ) - self.reporter.report_warning( - title="M-Query Resolver Error", - message="Unable to extract lineage from parsed M-Query expression (function argument expression is not supported)", - context=f"{self.table.full_name}: function argument expression is not supported", - ) - return None - - tokens: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(expression) + return + + # Accessor chain step: Source{[Name="mydb", Kind="Database"]}[Data] + if first.get("kind") == "ItemAccessExpression": + content = first.get("content", {}) # RecordExpression + kv: Dict[str, str] = {} + if isinstance(content, dict): + kv = get_record_field_values(node_map, content, parameters=parameters) + + new_accessor = 
IdentifierAccessor( + identifier=kv.get("Name", ""), + items=kv, + next=accessor_chain, ) - - logger.debug(f"Tokens in invoke expression are {tokens}") - return tokens - - def _process_item_selector_expression( - self, rh_tree: Tree - ) -> Tuple[Optional[str], Optional[Dict[str, str]]]: - first_expression: Optional[Tree] = tree_function.first_expression_func(rh_tree) - assert first_expression is not None - - new_identifier, key_vs_value = self.get_item_selector_tokens(first_expression) - return new_identifier, key_vs_value - - @staticmethod - def _create_or_update_identifier_accessor( - identifier_accessor: Optional[IdentifierAccessor], - new_identifier: str, - key_vs_value: Dict[str, Any], - ) -> IdentifierAccessor: - # It is first identifier_accessor - if identifier_accessor is None: - return IdentifierAccessor( - identifier=new_identifier, items=key_vs_value, next=None - ) - - new_identifier_accessor: IdentifierAccessor = IdentifierAccessor( - identifier=new_identifier, items=key_vs_value, next=identifier_accessor + _walk( + node_map, + head, + current_let, + current_let_id, + new_accessor, + results, + seen, + parameters, ) - - return new_identifier_accessor - - def create_data_access_functional_detail( - self, identifier: str - ) -> List[DataAccessFunctionDetail]: - table_links: List[DataAccessFunctionDetail] = [] - - def internal( - current_identifier: str, - identifier_accessor: Optional[IdentifierAccessor], - ) -> None: - """ - 1) Find statement where identifier appear in the left-hand side i.e. identifier = expression - 2) Check expression is function invocation i.e. invoke_expression or item_selector - 3) if it is function invocation and this function is not the data-access function then take first argument - i.e. 
identifier and call the function recursively - 4) if it is item_selector then take identifier and key-value pair, - add identifier and key-value pair in current_selector and call the function recursively - 5) This recursion will continue till we reach to data-access function and during recursion we will fill - token_dict dictionary for all item_selector we find during traversal. - - :param current_identifier: variable to look for - :param identifier_accessor: - :return: None - """ - # Grammar of variable_statement is = - # Examples: Source = PostgreSql.Database() - # public_order_date = Source{[Schema="public",Item="order_date"]}[Data] - v_statement: Optional[Tree] = tree_function.get_variable_statement( - self.parse_tree, current_identifier - ) - if v_statement is None: - self.reporter.report_warning( - title="Unable to extract lineage from M-Query expression", - message="Lineage will be incomplete.", - context=f"table-full-name={self.table.full_name}, expression = {self.table.expression}, output-variable={current_identifier} not found in table expression", - ) - return None - - # Any expression after "=" sign of variable-statement - rh_tree: Optional[Tree] = tree_function.first_expression_func(v_statement) - if rh_tree is None: - logger.debug("Expression tree not found") - logger.debug(v_statement.pretty()) - return None - - invoke_expression: Optional[Tree] = ( - tree_function.first_invoke_expression_func(rh_tree) - ) - - if invoke_expression is not None: - result: Union[DataAccessFunctionDetail, List[str], None] = ( - self._process_invoke_expression(invoke_expression) - ) - if result is None: - return None # No need to process some un-expected grammar found while processing invoke_expression - if isinstance(result, DataAccessFunctionDetail): - result.identifier_accessor = identifier_accessor - table_links.append(result) # Link of a table is completed - identifier_accessor = ( - None # reset the identifier_accessor for other table - ) - return None - # Process 
first argument of the function. - # The first argument can be a single table argument or list of table. - # For example Table.Combine({t1,t2},....), here first argument is list of table. - # Table.AddColumn(t1,....), here first argument is single table. - for token in result: - internal(token, identifier_accessor) - - else: - new_identifier, key_vs_value = self._process_item_selector_expression( - rh_tree - ) - if new_identifier is None or key_vs_value is None: - logger.debug("Required information not found in rh_tree") - return None - new_identifier_accessor: IdentifierAccessor = ( - self._create_or_update_identifier_accessor( - identifier_accessor, new_identifier, key_vs_value - ) - ) - - return internal(new_identifier, new_identifier_accessor) - - internal(identifier, None) - - return table_links - - def resolve_to_lineage( - self, - ctx: PipelineContext, - config: PowerBiDashboardSourceConfig, - platform_instance_resolver: AbstractDataPlatformInstanceResolver, - ) -> List[Lineage]: - lineage: List[Lineage] = [] - - # Find out output variable as we are doing backtracking in M-Query - output_variable: Optional[str] = tree_function.get_output_variable( - self.parse_tree - ) - - if output_variable is None: - logger.debug( - f"Table: {self.table.full_name}: output-variable not found in tree" - ) - self.reporter.report_warning( - f"{self.table.full_name}-output-variable", - "output-variable not found in table expression", + return + + # FieldSelector or other -- just walk the head + _walk( + node_map, + head, + current_let, + current_let_id, + accessor_chain, + results, + seen, + parameters, + ) + + +def _walk_invoke( + node_map: NodeIdMap, + head: Optional[dict], + invoke_node: dict, + current_let: dict, + current_let_id: int, + accessor_chain: Optional[IdentifierAccessor], + results: List[DataAccessFunctionDetail], + seen: Set[Tuple[int, str]], + parameters: Optional[Dict[str, str]] = None, +) -> None: + callee = None + if isinstance(head, dict) and 
head.get("kind") == "IdentifierExpression": + callee = head.get("identifier", {}).get("literal") + + if callee and callee in _RECOGNIZED_FUNCTIONS: + results.append( + DataAccessFunctionDetail( + arg_list=invoke_node, + data_access_function_name=callee, + identifier_accessor=accessor_chain, + node_map=node_map, + parameters=parameters or {}, ) - return lineage - - # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail - table_links: List[DataAccessFunctionDetail] = ( - self.create_data_access_functional_detail(output_variable) ) - - # Each item is data-access function - for f_detail in table_links: - logger.debug( - f"Processing data-access-function {f_detail.data_access_function_name}" - ) - # Get & Check if we support data-access-function available in M-Query - supported_resolver = SupportedPattern.get_pattern_handler( - f_detail.data_access_function_name - ) - if supported_resolver is None: - logger.debug( - f"Resolver not found for the data-access-function {f_detail.data_access_function_name}" - ) - self.reporter.report_warning( - f"{self.table.full_name}-data-access-function", - f"Resolver not found for data-access-function = {f_detail.data_access_function_name}", + return + + # Unrecognized wrapper (Table.RenameColumns, Table.AddColumn, etc.) 
+ # Recurse into first argument + if callee: + content = invoke_node.get("content", {}) + if isinstance(content, dict) and content.get("kind") == "ArrayWrapper": + for elem in content.get("elements", []): + inner = _unwrap_csv(elem) + _walk( + node_map, + inner, + current_let, + current_let_id, + accessor_chain, + results, + seen, + parameters, ) - continue - - # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it - # & also pass additional information that will be need to generate lineage - logger.debug( - f"Creating instance of {supported_resolver.handler().__name__} " - f"for data-access-function {f_detail.data_access_function_name}" - ) - pattern_handler: AbstractLineage = supported_resolver.handler()( - ctx=ctx, - table=self.table, - config=config, - reporter=self.reporter, - platform_instance_resolver=platform_instance_resolver, - ) - - lineage.append(pattern_handler.create_lineage(f_detail)) - - return lineage + return # only first arg + + +def _unwrap_csv(elem: object) -> Optional[dict]: + """Unwrap a Csv wrapper node, returning the inner node.""" + if isinstance(elem, dict) and elem.get("kind") == "Csv": + return elem.get("node") + if isinstance(elem, dict): + return elem + return None + + +def _walk_identifier_name( + node_map: NodeIdMap, + name: str, + current_let: dict, + current_let_id: int, + accessor_chain: Optional[IdentifierAccessor], + results: List[DataAccessFunctionDetail], + seen: Set[Tuple[int, str]], + parameters: Optional[Dict[str, str]] = None, +) -> None: + """Resolve a variable name in the current let scope and continue walking.""" + if not name: + return + # Circular reference guard: (let_id, variable_name) pair + guard_key = (current_let_id, name) + if guard_key in seen: + logger.warning("Circular reference detected for variable '%s', stopping", name) + return + seen.add(guard_key) + + resolved = resolve_identifier(node_map, current_let, name) + _walk( + 
node_map, + resolved, + current_let, + current_let_id, + accessor_chain, + results, + seen, + parameters, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py deleted file mode 100644 index 81b3c6d440720..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py +++ /dev/null @@ -1,188 +0,0 @@ -import logging -from functools import partial -from typing import Any, Dict, List, Optional, Union - -from lark import Token, Tree - -from datahub.ingestion.source.powerbi.m_query.data_classes import ( - TRACE_POWERBI_MQUERY_PARSER, -) - -logger = logging.getLogger(__name__) - - -def get_output_variable(root: Tree) -> Optional[str]: - in_expression_tree: Optional[Tree] = get_first_rule(root, "in_expression") - if in_expression_tree is None: - return None - # Get list of terminal value - # Remove any whitespaces - # Remove any spaces - return "".join( - strip_char_from_list( - remove_whitespaces_from_list(token_values(in_expression_tree)), " " - ) - ) - - -def get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: - _filter = parse_tree.find_data("variable") - # filter will return statement of the form = - # We are searching for Tree where variable-name is matching with provided variable - for tree in _filter: - values: List[str] = token_values(tree.children[0]) - actual_value: str = "".join(strip_char_from_list(values, " ")) - if TRACE_POWERBI_MQUERY_PARSER: - logger.debug(f"Actual Value = {actual_value}") - logger.debug(f"Expected Value = {variable}") - - if actual_value.lower() == variable.lower(): - return tree - - logger.debug(f"Provided variable({variable}) not found in variable rule") - - return None - - -def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]: - """ - Lark library doesn't have an advance search function. 
- This function will return the first tree of the provided rule - :param tree: Tree to search for the expression rule - :return: Tree - """ - - def internal(node: Union[Tree, Token]) -> Optional[Tree]: - if isinstance(node, Tree) and node.data == rule: - return node - if isinstance(node, Token): - return None - - for child in node.children: - child_node: Optional[Tree] = internal(child) - if child_node is not None: - return child_node - - return None - - expression_tree: Optional[Tree] = internal(tree) - - return expression_tree - - -def token_values(tree: Tree, parameters: Optional[Dict[str, str]] = None) -> List[str]: - """ - :param tree: Tree to traverse - :param parameters: If parameters is not an empty dict, it will try to resolve identifier variable references - using the values in 'parameters'. - :return: List of leaf token data - """ - parameters = parameters or {} - values: List[str] = [] - - def internal(node: Union[Tree, Token]) -> None: - if parameters and isinstance(node, Tree) and node.data == "identifier": - # This is the case where they reference a variable using - # the `#"Name of variable"` or `Variable` syntax. It can be - # a quoted_identifier or a regular_identifier. - - ref = make_function_name(node) - - # For quoted_identifier, ref will have quotes around it. - if ref.startswith('"') and ref[1:-1] in parameters: - resolved = parameters[ref[1:-1]] - values.append(resolved) - elif ref in parameters: - resolved = parameters[ref] - values.append(resolved) - else: - # If we can't resolve, fall back to the name of the variable. 
- logger.debug(f"Unable to resolve parameter reference to {ref}") - values.append(ref) - elif isinstance(node, Token): - values.append(node.value) - return - else: - for child in node.children: - internal(child) - - internal(tree) - - return values - - -def remove_whitespaces_from_list(values: List[str]) -> List[str]: - result: List[str] = [] - for item in values: - if item.strip() not in ("", "\n", "\t"): - result.append(item) - - return result - - -def strip_char(value: str, char: str = '"') -> str: - return value.strip(char) - - -def strip_char_from_list(values: List[str], char: str = '"') -> List[str]: - result: List[str] = [] - for item in values: - result.append(strip_char(item.strip(char), char=char)) - - return result - - -def make_function_name(tree: Tree) -> str: - values: List[str] = token_values(tree) - return ".".join(values) - - -def get_all_function_name(tree: Tree) -> List[str]: - """ - Returns all function name present in an input tree - :param tree: Input lexical tree - :return: list of function name - """ - functions: List[str] = [] - - # List the all invoke_expression in the Tree - _filter: Any = tree.find_data("invoke_expression") - - for node in _filter: - if TRACE_POWERBI_MQUERY_PARSER: - logger.debug(f"Tree = {node.pretty()}") - primary_expression_node: Optional[Tree] = first_primary_expression_func(node) - if primary_expression_node is None: - continue - - identifier_node: Optional[Tree] = first_identifier_func(primary_expression_node) - if identifier_node is None: - continue - - functions.append(make_function_name(identifier_node)) - - return functions - - -def flat_argument_list(tree: Tree) -> List[Tree]: - values: List[Tree] = [] - - for child in tree.children: - if isinstance(child, Token): - continue - if isinstance(child, Tree) and ( - child.data == "argument_list" or child.data == "expression" - ): - values.append(child) - - return values - - -first_expression_func = partial(get_first_rule, rule="expression") 
-first_item_selector_func = partial(get_first_rule, rule="item_selector") -first_arg_list_func = partial(get_first_rule, rule="argument_list") -first_identifier_func = partial(get_first_rule, rule="identifier") -first_primary_expression_func = partial(get_first_rule, rule="primary_expression") -first_invoke_expression_func = partial(get_first_rule, rule="invoke_expression") -first_type_expression_func = partial(get_first_rule, rule="type_expression") -first_list_expression_func = partial(get_first_rule, rule="list_expression") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py deleted file mode 100644 index b52977aaa41fb..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py +++ /dev/null @@ -1,34 +0,0 @@ -import logging -from typing import Optional, Tuple - -import datahub.ingestion.source.powerbi.m_query.data_classes - -logger = logging.getLogger(__name__) - - -def validate_parse_tree( - expression: str, native_query_enabled: bool = True -) -> Tuple[bool, Optional[str]]: - """ - :param expression: M-Query expression to check if supported data-function is present in expression - :param native_query_enabled: Whether user want to extract lineage from native query - :return: True or False. - """ - function_names = [ - fun.value - for fun in datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName - ] - if not any(fun in expression for fun in function_names): - return False, "DataAccess function is not present in M-Query expression." - - if native_query_enabled is False: - if ( - datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName.NATIVE_QUERY.value - in function_names - ): - return ( - False, - "Lineage extraction from native query is disabled. 
Enable native_query_parsing in recipe", - ) - - return True, None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule deleted file mode 100644 index 6df3fab732fdc..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +++ /dev/null @@ -1,634 +0,0 @@ -// MODIFICATIONS: -// - The let_expression definition has added the whitespace rule instead of the required newline. -// This allows the parser to be less strict about whitespace. -// - Add inline whitespace to item_selection and optional_item_selection. -// - Tweak unary_expression to allow arbitrary operators within it. -// This is necessary because unary_expression is the base for the -// whole relational_expression parse tree. -// - Added letter_character_and_decimal_digit phrase and updated keyword_or_identifier phrase -// - Added below pattern in argument_list -// | WS_INLINE? sql_string -// | WS_INLINE? sql_string "," argument_list -// - Added subtract_expression -// - Updated relational_expression, here below are the updates -// | subtract_expression -// | subtract_expression "<" relational_expression -// | subtract_expression ">" relational_expression -// | subtract_expression "<=" relational_expression -// | subtract_expression ">=" relational_expression -// - Added empty_string -// - Updated argument_list, below are the updates -// | empty_string -// | empty_string "," argument_list -// - Added sql_string in any_literal -// - Added WS_INLINE? in field expression -// Added to ignore any comments -// %ignore WS // Ignore whitespace -// %ignore CPP_COMMENT // Ignore single-line comments -// %ignore C_COMMENT // Ignore multi-line comments -// - Removed optional whitespace in rules, it's handled by %ignore - -lexical_unit: lexical_elements? - -lexical_elements: lexical_element - | lexical_elements? 
- -lexical_element: whitespace - | token comment - -whitespace: WS - | new_line_character - -new_line_character: CR - | LF - | NEWLINE - -comment: single_line_comment - | delimited_comment - - -single_line_comment: single_line_comment_characters? - -single_line_comment_characters: single_line_comment_character - | single_line_comment_characters? - -single_line_comment_character: CPP_COMMENT - -delimited_comment: C_COMMENT - -asterisks: "*" - | asterisks? - -token: identifier - | keyword - | literal - | operator_or_punctuator - -character_escape_sequence: "#(" escape_sequence_list ")" - -escape_sequence_list: single_escape_sequence - | escape_sequence_list "," single_escape_sequence - -single_escape_sequence: long_unicode_escape_sequence - | short_unicode_escape_sequence - | control_character_escape_sequence - | escape_escape - -long_unicode_escape_sequence: hex_digit hex_digit hex_digit hex_digit hex_digit hex_digit hex_digit hex_digit - -short_unicode_escape_sequence: hex_digit hex_digit hex_digit hex_digit - -control_character_escape_sequence: control_character - -control_character: CR - | LF - | /\t/ - -escape_escape: "#" - -literal: logical_literal - | number_literal - | text_literal - | null_literal - | verbatim_literal - -logical_literal: "true" - | "false" - -number_literal: decimal_number_literal - | hexadecimal_number_literal - -decimal_digits: decimal_digit - | decimal_digits? - -decimal_digit: /\d+/ - -hexadecimal_number_literal: "0x" hex_digits - | "0X" hex_digits - -hex_digits: hex_digit - | hex_digits? - -hex_digit: HEXDIGIT - -decimal_number_literal: decimal_digits - | decimal_digits "." decimal_digits exponent_part? - | decimal_digits exponent_part? - | decimal_digits exponent_part? - -exponent_part: "e" sign? decimal_digits - | "E" sign? decimal_digits - -sign: ["+"|"-"] - -text_literal: ESCAPED_STRING - -text_literal_characters: text_literal_character - | text_literal_characters? 
- -text_literal_character: single_text_character - | character_escape_sequence - | double_quote_escape_sequence - -single_text_character: /./ - | /[^#]/ - -double_quote_escape_sequence: "\"\"" - -null_literal: "null" - -verbatim_literal: "#!\"" text_literal_characters? "\"" - -identifier: regular_identifier - | quoted_identifier - - -regular_identifier: available_identifier - | available_identifier dot_character regular_identifier - -available_identifier: keyword_or_identifier - -keyword_or_identifier: letter_character - | letter_character_and_decimal_digit - | underscore_character - | identifier_start_character identifier_part_characters - -identifier_start_character: letter_character - | underscore_character - -identifier_part_characters: identifier_part_character identifier_part_characters? - -identifier_part_character: letter_character - | decimal_digit_character - | underscore_character - | connecting_character - | combining_character - | formatting_character - -generalized_identifier: generalized_identifier_part - | generalized_identifier WS_INLINE generalized_identifier_part - -generalized_identifier_part: generalized_identifier_segment - | decimal_digit_character generalized_identifier_segment - -generalized_identifier_segment: keyword_or_identifier - | keyword_or_identifier dot_character keyword_or_identifier - -dot_character: "." 
- -underscore_character: "_" - -letter_character: /[_\-\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]+/ - -letter_character_and_decimal_digit: /[_\-\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}\p{Nd}]+/ - -combining_character: /[_\p{Mn}\p{Mc}]+/ - -decimal_digit_character: /[\p{Nd}]+/ - -connecting_character: /[\p{Pc}]+/ - -formatting_character: /[\p{Cf}]+/ - -quoted_identifier: "#" ESCAPED_STRING - -keyword: "and" - | "as" - | "each" - | "else" - | "error" - | "false" - | "if" - | "in" - | "is" - | "let" - | "meta" - | "not" - | "null" - | "or" - | "otherwise" - | "section" - | "shared" - | "then" - | "true" - | "try" - | "type" - | "#binary" - | "#date" - | "#datetime" - | "#datetimezone" - | "#duration" - | "#infinity" - | "#nan" - | "#sections" - | "#shared" - | "#table" - | "#time" - - -operator_or_punctuator: "," - | ";" - | "=" - | "<" - | "<=" - | ">" - | ">=" - | "<>" - | "+" - | "_" - | "*" - | "/" - | "&" - | "(" - | ")" - | "[" - | "]" - | "{" - | "}" - | "@" - | "?" - | "??" - | "=>" - | ".." - | "..." - | "{{" - | "}}" - -document: section_document - | expression_document - -section_document: section - -section: literal_attributes? - | section - | section_name ";" section_members? - -section_name: identifier - -section_members: section_member - | section_members? - -section_member: literal_attributes? - | "shared"? - | section_member_name "=" expression ";" - -section_member_name: identifier - -expression_document: expression - -expression: logical_or_expression - | each_expression - | function_expression - | let_expression - | if_expression - | error_raising_expression - | error_handling_expression - | outer_expression - - -logical_or_expression: logical_and_expression - | logical_and_expression "or" logical_or_expression - - -logical_and_expression: is_expression - | logical_and_expression "and" is_expression - -is_expression: as_expression - | is_expression "is" nullable_primitive_type - -nullable_primitive_type: "nullable"? 
primitive_type - -as_expression: equality_expression - | as_expression "as" nullable_primitive_type - | multiplicative_expression - -equality_expression: relational_expression - | relational_expression "=" equality_expression - | relational_expression "<>" equality_expression - -relational_expression: additive_expression - | subtract_expression - | additive_expression "<" relational_expression - | additive_expression ">" relational_expression - | additive_expression "<=" relational_expression - | additive_expression ">=" relational_expression - | subtract_expression "<" relational_expression - | subtract_expression ">" relational_expression - | subtract_expression "<=" relational_expression - | subtract_expression ">=" relational_expression - - -additive_expression: multiplicative_expression - | multiplicative_expression "+" additive_expression - | multiplicative_expression "_" additive_expression - | multiplicative_expression "&" additive_expression - - -subtract_expression: multiplicative_expression - | multiplicative_expression "-" additive_expression - | multiplicative_expression "_" additive_expression - | multiplicative_expression "&" additive_expression - -multiplicative_expression: metadata_expression - | metadata_expression "*" multiplicative_expression - | metadata_expression "/" multiplicative_expression - -metadata_expression: unary_expression - | unary_expression - | "meta" - | unary_expression - -unary_expression: type_expression - | "+" unary_expression - | "_" unary_expression - | "not" unary_expression - | expression - -primary_expression: literal_expression - | list_expression - | record_expression - | identifier_expression - | section_access_expression - | parenthesized_expression - | field_access_expression - | item_access_expression - | invoke_expression - | not_implemented_expression - -literal_expression: literal - -identifier_expression: identifier_reference - -identifier_reference: exclusive_identifier_reference - | 
inclusive_identifier_reference - -exclusive_identifier_reference: identifier - -inclusive_identifier_reference: "@" identifier - -section_access_expression: identifier "!" identifier - -parenthesized_expression: "(" expression ")" - -not_implemented_expression: "..." - -invoke_expression: "#"? primary_expression "(" argument_list? ")" - -empty_string: /"([^"]|\\")*"/ - -// SQL String specific rules -sql_content: /(?:[^\"\\]|\\[\"]|\"\"|\#\(lf\))+/ - -sql_string: "\"" sql_content "\"" - -outer_expression: "{{" expression "}}" - -argument_list: expression - | expression "," argument_list - | sql_string - | sql_string "," argument_list - | "\"" identifier "\"" - | "\"" identifier "\"" "," argument_list - | "[" identifier "]" - | "[" identifier "]" "," argument_list - | empty_string - | empty_string "," argument_list - | ESCAPED_STRING - | ESCAPED_STRING "," argument_list - | record_literal - | record_literal "," argument_list - | null_literal - | null_literal "," argument_list - - -list_expression: "{" item_list? "}" - - -item_list: item - | item "," item_list - -item: expression - | expression ".." expression - -record_expression: "[" field_list? "]" - -field_list: field - | field "," field_list - -field: field_name "=" expression - -field_name: generalized_identifier - | quoted_identifier - -item_access_expression: item_selection - | optional_item_selection - -item_selection: primary_expression "{" item_selector "}" - -optional_item_selection: primary_expression "{" item_selector "}" "?" - -item_selector: expression - -field_access_expression: field_selection - | implicit_target_field_selection - | projection - | implicit_target_projection - -field_selection: primary_expression field_selector - -field_selector: required_field_selector - | optional_field_selector - -required_field_selector: "[" field_name "]" - -optional_field_selector: "[" field_name "]" "?" 
- -implicit_target_field_selection: field_selector - -projection: primary_expression required_projection - | primary_expression optional_projection - -required_projection: "[" required_selector_list "]" - -optional_projection: "[" required_selector_list "]" "?" - -required_selector_list: required_field_selector - | required_field_selector "," required_selector_list - -implicit_target_projection: required_projection - | optional_projection - -function_expression: "(" parameter_list? ")" return_type? "=>" function_body - -function_body: expression - -parameter_list: fixed_parameter_list - | fixed_parameter_list "," optional_parameter_list - | optional_parameter_list - -fixed_parameter_list: parameter - | parameter "," fixed_parameter_list - -parameter: parameter_name parameter_type? - -parameter_name: identifier - -parameter_type: assertion - -return_type: assertion - -assertion: "as" nullable_primitive_type - -optional_parameter_list: optional_parameter - | optional_parameter "," optional_parameter_list - -optional_parameter: "optional" parameter - -each_expression: "each" each_expression_body - -each_expression_body: function_body - -let_expression: "let" variable_list in_expression - -in_expression: "in" expression - -variable_list: variable - | variable "," variable_list - -variable: variable_name "=" expression - -variable_name: identifier - -if_expression: "if" if_condition "then" true_expression "else" false_expression - -if_condition: expression - -true_expression: expression - | multiplicative_expression - -false_expression: expression - -type_expression: primary_expression - | "type" primary_type - -type: parenthesized_expression - | primary_type - -primary_type: primitive_type - | record_type - | list_type - | function_type - | table_type - | nullable_type - -primitive_type: "any" - | "anynonnull" - | "binary" - | "date" - | "datetime" - | "datetimezone" - | "duration" - | "function" - | "list" - | "logical" - | "none" - | "null" - | "number" - | "record" 
- | "table" - | "text" - | "time" - | "type" - -record_type: "[" open_record_marker "]" - | "[" field_specification_list? "]" - | "[" field_specification_list "," open_record_marker "]" - -field_specification_list: field_specification - | field_specification "," field_specification_list - -field_specification: "optional"? field_name field_type_specification? - -field_type_specification: "=" field_type - -field_type: type - -open_record_marker: "..." - -list_type: "{" item_type "}" - -item_type: type - -function_type: "function" "(" parameter_specification_list? ")" return_type - -parameter_specification_list: required_parameter_specification_list - | required_parameter_specification_list "," optional_parameter_specification_list - | optional_parameter_specification_list - -required_parameter_specification_list: required_parameter_specification - | required_parameter_specification "," required_parameter_specification_list - -required_parameter_specification: parameter_specification - -optional_parameter_specification_list: optional_parameter_specification - | optional_parameter_specification "," optional_parameter_specification_list - -optional_parameter_specification: "optional" parameter_specification - -parameter_specification: parameter_name parameter_type - -table_type: "table" row_type - -row_type: "[" field_specification_list? "]" - -nullable_type: "nullable" type - -error_raising_expression: "error" expression "_" - -error_handling_expression: "try" protected_expression otherwise_clause? - -protected_expression: expression - -otherwise_clause: "otherwise" default_expression - -default_expression: expression - -literal_attributes: record_literal - -record_literal: "[" literal_field_list? "]" - -literal_field_list: literal_field - | literal_field "," literal_field_list - -literal_field: field_name "=" any_literal - | field_name "=" invoke_expression - -list_literal: "{" literal_item_list? 
"}" - -literal_item_list: any_literal - | any_literal "," literal_item_list - -any_literal: record_literal - | list_literal - | logical_literal - | number_literal - | text_literal - | null_literal - | sql_string - - -%import common.WORD -%import common.WS_INLINE -%import common.CPP_COMMENT -%import common.C_COMMENT -%import common.WS -%import common.NEWLINE -%import common.HEXDIGIT -%import common.DIGIT -%import common.LF -%import common.CR -%import common.ESCAPED_STRING - -%ignore WS // Ignore whitespace -%ignore CPP_COMMENT // Ignore single-line comments -%ignore C_COMMENT // Ignore multi-line comments \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 277808fa50065..69501f847ded3 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -5,7 +5,6 @@ from unittest.mock import MagicMock, patch import pytest -from lark import Tree import datahub.ingestion.source.powerbi.m_query.data_classes import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes @@ -19,7 +18,7 @@ AbstractDataPlatformInstanceResolver, create_dataplatform_instance_resolver, ) -from datahub.ingestion.source.powerbi.m_query import parser, tree_function +from datahub.ingestion.source.powerbi.m_query import parser from datahub.ingestion.source.powerbi.m_query.data_classes import ( DataPlatformTable, Lineage, @@ -129,97 +128,6 @@ def combine_upstreams_from_lineage(lineage: List[Lineage]) -> List[DataPlatformT return data_platforms -@pytest.mark.integration -def test_parse_m_query1(): - expression: str = M_QUERIES[0] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == "TESTTABLE_Table" - - -@pytest.mark.integration -def test_parse_m_query2(): - expression: str = M_QUERIES[1] - parse_tree: Tree = 
parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Custom2"' - - -@pytest.mark.integration -def test_parse_m_query3(): - expression: str = M_QUERIES[2] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Conditional Column"' - - -@pytest.mark.integration -def test_parse_m_query4(): - expression: str = M_QUERIES[3] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Changed Type"' - - -@pytest.mark.integration -def test_parse_m_query5(): - expression: str = M_QUERIES[4] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Renamed Columns"' - - -@pytest.mark.integration -def test_parse_m_query6(): - expression: str = M_QUERIES[5] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' - - -@pytest.mark.integration -def test_parse_m_query7(): - expression: str = M_QUERIES[6] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == "Source" - - -@pytest.mark.integration -def test_parse_m_query8(): - expression: str = M_QUERIES[7] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' - - -@pytest.mark.integration -def test_parse_m_query9(): - expression: str = M_QUERIES[8] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' - - -@pytest.mark.integration -def test_parse_m_query10(): - expression: str = M_QUERIES[9] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Changed Type1"' - - -@pytest.mark.integration -def test_parse_m_query11(): - expression: str = 
M_QUERIES[10] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == "Source" - - -@pytest.mark.integration -def test_parse_m_query12(): - expression: str = M_QUERIES[11] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' - - -@pytest.mark.integration -def test_parse_m_query13(): - expression: str = M_QUERIES[12] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == "two_source_table" - - @pytest.mark.integration def test_snowflake_regular_case(): q: str = M_QUERIES[0] @@ -836,6 +744,10 @@ def test_databricks_multi_cloud(): ) +@pytest.mark.xfail( + reason="M_QUERIES[26] has a dangling comma that the strict Microsoft parser rejects", + strict=True, +) def test_databricks_catalog_pattern_1(): q = M_QUERIES[26] @@ -1045,6 +957,8 @@ def test_mssql_drop_with_select(): def test_unsupported_data_platform(): + # LOAD_DATA(SOURCE) is not a recognized data-platform expression — the bridge + # parses it successfully but no pattern handler matches, so lineage is empty. 
q = M_QUERIES[34] table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -1071,20 +985,6 @@ def test_unsupported_data_platform(): == [] ) - info_entries: dict = reporter._structured_logs._entries.get( - StructuredLogLevel.INFO, {} - ) # type :ignore - - is_entry_present: bool = False - for entry in info_entries.values(): - if entry.title == "Non-Data Platform Expression": - is_entry_present = True - break - - assert is_entry_present, ( - 'Info message "Non-Data Platform Expression" should be present in reporter' - ) - def test_empty_string_in_m_query(): # TRIM(TRIM(TRIM(AGENT_NAME, '\"\"'), '+'), '\\'') is in Query @@ -1130,8 +1030,8 @@ def test_double_quotes_in_alias(): ) -@patch("datahub.ingestion.source.powerbi.m_query.parser.get_lark_parser") -def test_m_query_timeout(mock_get_lark_parser): +@patch("datahub.ingestion.source.powerbi.m_query.parser.get_bridge") +def test_m_query_timeout(mock_get_bridge): q = 'let\n Source = Value.NativeQuery(Snowflake.Databases("0DD93C6BD5A6.snowflakecomputing.com","sales_analytics_warehouse_prod",[Role="sales_analytics_member_ad"]){[Name="SL_OPERATIONS"]}[Data], "select SALE_NO AS ""\x1b[4mSaleNo\x1b[0m""#(lf) ,CODE AS ""Code""#(lf) ,ENDDATE AS ""end_date""#(lf) from SL_OPERATIONS.SALE.REPORTS#(lf) where ENDDATE > \'2024-02-03\'", null, [EnableFolding=true]),\n #"selected Row" = Table.SelectRows(Source)\nin\n #"selected Row"' table: powerbi_data_classes.Table = powerbi_data_classes.Table( @@ -1150,11 +1050,11 @@ def test_m_query_timeout(mock_get_lark_parser): config.m_query_parse_timeout = 1 - mock_lark_instance = MagicMock() + mock_bridge_instance = MagicMock() - mock_get_lark_parser.return_value = mock_lark_instance + mock_get_bridge.return_value = mock_bridge_instance # sleep for 5 seconds to trigger timeout - mock_lark_instance.parse.side_effect = lambda expression: time.sleep(5) + mock_bridge_instance.parse.side_effect = lambda expression: time.sleep(5) parser.get_upstream_tables( table, diff --git 
a/metadata-ingestion/tests/unit/test_ast_utils.py b/metadata-ingestion/tests/unit/test_ast_utils.py new file mode 100644 index 0000000000000..5e0f8ac69973b --- /dev/null +++ b/metadata-ingestion/tests/unit/test_ast_utils.py @@ -0,0 +1,146 @@ +"""Tests for ast_utils.py — NodeIdMap navigation helpers. + +All tests are bridge-backed: the Snowflake M-Query expression is parsed at +module scope via the JS bridge (bundle.js.gz + py_mini_racer). No static JSON +fixtures are committed or loaded. +""" + +import pytest + +from datahub.ingestion.source.powerbi.m_query._bridge import NodeIdMap +from datahub.ingestion.source.powerbi.m_query.ast_utils import ( + find_nodes_by_kind, + get_invoke_callee_name, + get_literal_value, + get_record_field_values, + resolve_identifier, +) + +# M_QUERIES[0] from tests/integration/powerbi/test_m_parser.py — the canonical +# Snowflake three-step let expression used across PowerBI parser tests. +_SNOWFLAKE_M_EXPRESSION = ( + "let\n" + ' Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n' + ' PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n' + ' TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n' + ' TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\n' + "in\n" + " TESTTABLE_Table" +) + + +@pytest.fixture(scope="module") +def snowflake_node_map() -> NodeIdMap: + from datahub.ingestion.source.powerbi.m_query._bridge import ( + _clear_bridge, + get_bridge, + ) + + _clear_bridge() + bridge = get_bridge() + result = bridge.parse(_SNOWFLAKE_M_EXPRESSION) + _clear_bridge() + return result + + +# ── find_nodes_by_kind ──────────────────────────────────────────────────────── + + +def test_find_nodes_by_kind_returns_all_matching(snowflake_node_map: NodeIdMap) -> None: + let_nodes = find_nodes_by_kind(snowflake_node_map, "LetExpression") + assert len(let_nodes) >= 1 + assert all(n["kind"] == "LetExpression" for n in 
let_nodes) + + +def test_find_nodes_by_kind_returns_deep_nodes(snowflake_node_map: NodeIdMap) -> None: + identifiers = find_nodes_by_kind(snowflake_node_map, "Identifier") + assert len(identifiers) > 1 + + +def test_find_nodes_by_kind_empty_for_missing_kind( + snowflake_node_map: NodeIdMap, +) -> None: + assert find_nodes_by_kind(snowflake_node_map, "NonExistentKind") == [] + + +# ── get_invoke_callee_name ──────────────────────────────────────────────────── + + +def test_get_invoke_callee_name_snowflake(snowflake_node_map: NodeIdMap) -> None: + invokes = find_nodes_by_kind(snowflake_node_map, "InvokeExpression") + names = [get_invoke_callee_name(snowflake_node_map, n) for n in invokes] + assert "Snowflake.Databases" in names + + +def test_get_invoke_callee_name_returns_none_for_non_invoke() -> None: + fake_node = { + "kind": "LiteralExpression", + "literal": '"hello"', + "literalKind": "Text", + } + assert get_invoke_callee_name({}, fake_node) is None + + +# ── get_literal_value ───────────────────────────────────────────────────────── + + +def test_get_literal_value_text() -> None: + node = { + "kind": "LiteralExpression", + "literal": '"my_database"', + "literalKind": "Text", + } + assert get_literal_value(node) == "my_database" + + +def test_get_literal_value_non_text_returns_none() -> None: + node = {"kind": "LiteralExpression", "literal": "42", "literalKind": "Number"} + assert get_literal_value(node) is None + + +def test_get_literal_value_null_returns_none() -> None: + node = {"kind": "LiteralExpression", "literal": "null", "literalKind": "Null"} + assert get_literal_value(node) is None + + +def test_get_literal_value_non_literal_returns_none() -> None: + assert get_literal_value({"kind": "Identifier", "literal": "Source"}) is None + + +# ── resolve_identifier ──────────────────────────────────────────────────────── + + +def test_resolve_identifier_finds_variable(snowflake_node_map: NodeIdMap) -> None: + """resolve_identifier returns the value node for a 
named variable in a let expression.""" + let_nodes = find_nodes_by_kind(snowflake_node_map, "LetExpression") + assert let_nodes + let_node = let_nodes[0] + result = resolve_identifier(snowflake_node_map, let_node, "Source") + assert result is not None + + +def test_resolve_identifier_returns_none_for_unknown( + snowflake_node_map: NodeIdMap, +) -> None: + let_node = find_nodes_by_kind(snowflake_node_map, "LetExpression")[0] + assert resolve_identifier(snowflake_node_map, let_node, "DoesNotExist") is None + + +# ── get_record_field_values ─────────────────────────────────────────────────── + + +def test_get_record_field_values_extracts_name_and_kind( + snowflake_node_map: NodeIdMap, +) -> None: + """For a Snowflake three-step pattern, item selectors have Name and Kind fields.""" + records = find_nodes_by_kind(snowflake_node_map, "RecordExpression") + assert records, "Expected RecordExpression nodes in a Snowflake expression" + all_fields = [get_record_field_values(snowflake_node_map, r) for r in records] + named = [f for f in all_fields if f] # any non-empty record + assert named, f"No non-empty records found. 
Fields seen: {all_fields}" + + +def test_get_record_field_values_empty_record() -> None: + # Node with no content field → treated as empty + empty_record = {"kind": "RecordExpression"} + assert get_record_field_values({}, empty_record) == {} diff --git a/metadata-ingestion/tests/unit/test_mquery_bridge.py b/metadata-ingestion/tests/unit/test_mquery_bridge.py new file mode 100644 index 0000000000000..57f52a09d9ab4 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_mquery_bridge.py @@ -0,0 +1,120 @@ +"""Tests for the M-Query V8 bridge (py_mini_racer + bundle.js.gz).""" + +import pytest + + +def test_bridge_starts_on_current_platform(): + """Bridge binary exists and starts without error.""" + from datahub.ingestion.source.powerbi.m_query._bridge import ( + _clear_bridge, + get_bridge, + ) + + _clear_bridge() + bridge = get_bridge() + assert bridge is not None + _clear_bridge() + + +def test_bridge_parses_simple_let_expression(): + """A valid M-Query let expression returns a nodeIdMap with a LetExpression root.""" + from datahub.ingestion.source.powerbi.m_query._bridge import ( + _clear_bridge, + get_bridge, + ) + + _clear_bridge() + bridge = get_bridge() + node_map = bridge.parse("let Source = 1 in Source") + kinds = {node["kind"] for node in node_map.values()} + assert "LetExpression" in kinds + _clear_bridge() + + +def test_bridge_parses_minimal_section_document(): + """DefaultSettings use ParseEitherExpressionOrSection — a bare section parses.""" + from datahub.ingestion.source.powerbi.m_query._bridge import ( + _clear_bridge, + get_bridge, + ) + + _clear_bridge() + bridge = get_bridge() + node_map = bridge.parse("section;") + kinds = {node["kind"] for node in node_map.values()} + assert "Section" in kinds + _clear_bridge() + + +@pytest.mark.parametrize( + ("expression", "required_substrings"), + [ + # Lex stage (tokenization) — inputs that never reach the parser + ( + "###", + ("lex:", "line"), + ), + ( + '"unterminated', + ("lex:", "unterminated"), + ), + # 
Parse stage + ( + "let x =", + ("parse:", "end-of-stream"), + ), + ( + "", + ("parse:", "end-of-stream"), + ), + ( + "let x = ) in x", + ("parse:", "parenthesis"), + ), + ( + "let Source = 1 in Source extra", + ("parse:", "tokens remain"), + ), + ( + "let Source = Sql.Database( in Source", + ("parse:", "keyword"), + ), + ( + "shared x = 1;", + ("parse:", "shared"), + ), + ], +) +def test_bridge_errors_include_stage_and_details( + expression: str, required_substrings: tuple[str, ...] +) -> None: + """Lex/Parse failures surface as MQueryParseError with Lex:/Parse: prefix and message text.""" + from datahub.ingestion.source.powerbi.m_query._bridge import ( + MQueryParseError, + _clear_bridge, + get_bridge, + ) + + _clear_bridge() + bridge = get_bridge() + with pytest.raises(MQueryParseError) as exc_info: + bridge.parse(expression) + message = str(exc_info.value).lower() + for needle in required_substrings: + assert needle in message, f"expected {needle!r} in {message!r}" + _clear_bridge() + + +def test_bridge_restart_after_clear(): + """After _clear_bridge(), a fresh get_bridge() call succeeds.""" + from datahub.ingestion.source.powerbi.m_query._bridge import ( + _clear_bridge, + get_bridge, + ) + + _clear_bridge() + b1 = get_bridge() + _clear_bridge() + b2 = get_bridge() + assert b2 is not b1 + _clear_bridge() diff --git a/metadata-ingestion/tests/unit/test_native_query_flag.py b/metadata-ingestion/tests/unit/test_native_query_flag.py new file mode 100644 index 0000000000000..e5fe18d51c9a3 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_native_query_flag.py @@ -0,0 +1,134 @@ +""" +Verifies the corrected semantics of native_query_parsing=False. + +Old (buggy) behaviour: native_query_parsing=False suppressed ALL M-Query parsing. +New (correct) behaviour: native_query_parsing=False suppresses only expressions + containing Value.NativeQuery; all other expressions are parsed normally. 
+""" + +import pytest + +import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, + create_dataplatform_instance_resolver, +) + + +def _make_table(expression: str) -> powerbi_data_classes.Table: + """Create a Table with the given M-Query expression.""" + return powerbi_data_classes.Table( + columns=[], + measures=[], + expression=expression, + name="test_table", + full_name="test_dataset.test_table", + ) + + +def _get_instances( + native_query_parsing: bool, +) -> tuple[ + PipelineContext, + PowerBiDashboardSourceConfig, + AbstractDataPlatformInstanceResolver, + PowerBiDashboardSourceReport, +]: + config = PowerBiDashboardSourceConfig.model_validate( + { + "tenant_id": "fake", + "client_id": "foo", + "client_secret": "bar", + "enable_advance_lineage_sql_construct": False, + "extract_column_level_lineage": False, + "native_query_parsing": native_query_parsing, + } + ) + platform_instance_resolver = create_dataplatform_instance_resolver(config) + ctx = PipelineContext(run_id="fake") + reporter = PowerBiDashboardSourceReport() + return ctx, config, platform_instance_resolver, reporter + + +SNOWFLAKE_EXPR = ( + "let\n" + ' Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n' + ' PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n' + ' TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n' + ' TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\n' + "in\n" + " TESTTABLE_Table" +) + +NATIVE_QUERY_EXPR = ( + "let\n" + ' Source = Snowflake.Databases("account"),\n' + ' q = Value.NativeQuery(Source, "SELECT 1")\n' + "in\n" 
+ " q" +) + + +@pytest.mark.integration_batch_5 +def test_native_query_false_skips_native_query_expression(): + """With native_query_parsing=False, NativeQuery expressions return empty lineage.""" + from datahub.ingestion.source.powerbi.m_query.parser import get_upstream_tables + + ctx, config, platform_instance_resolver, reporter = _get_instances( + native_query_parsing=False + ) + result = get_upstream_tables( + table=_make_table(NATIVE_QUERY_EXPR), + reporter=reporter, + platform_instance_resolver=platform_instance_resolver, + ctx=ctx, + config=config, + ) + assert result == [] + assert reporter.m_query_native_query_skipped == 1 + + +@pytest.mark.integration_batch_5 +def test_native_query_false_still_parses_non_native_expression(): + """With native_query_parsing=False, non-NativeQuery expressions still produce lineage.""" + from datahub.ingestion.source.powerbi.m_query.parser import get_upstream_tables + + ctx, config, platform_instance_resolver, reporter = _get_instances( + native_query_parsing=False + ) + result = get_upstream_tables( + table=_make_table(SNOWFLAKE_EXPR), + reporter=reporter, + platform_instance_resolver=platform_instance_resolver, + ctx=ctx, + config=config, + ) + # Should return non-empty lineage (Snowflake table found) + assert result != [] + upstreams = result[0].upstreams + assert len(upstreams) == 1 + assert "snowflake" in upstreams[0].urn + + +@pytest.mark.integration_batch_5 +def test_native_query_true_parses_both(): + """With native_query_parsing=True (default), both expression types are attempted.""" + from datahub.ingestion.source.powerbi.m_query.parser import get_upstream_tables + + # Both should be attempted without raising an exception + for expr in [SNOWFLAKE_EXPR, NATIVE_QUERY_EXPR]: + ctx, config, platform_instance_resolver, reporter = _get_instances( + native_query_parsing=True + ) + get_upstream_tables( + table=_make_table(expr), + reporter=reporter, + platform_instance_resolver=platform_instance_resolver, + ctx=ctx, + 
config=config, + ) diff --git a/metadata-ingestion/tests/unit/test_powerbi_parser.py b/metadata-ingestion/tests/unit/test_powerbi_parser.py index ef20e7b023043..fb4d39408a200 100644 --- a/metadata-ingestion/tests/unit/test_powerbi_parser.py +++ b/metadata-ingestion/tests/unit/test_powerbi_parser.py @@ -1,5 +1,4 @@ import pytest -from lark import Token, Tree from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( @@ -133,15 +132,25 @@ def test_athena_lineage_valid_three_level_hierarchy(athena_lineage): identifier="catalog", items={"Name": "awsdatacatalog"}, next=db_accessor ) - # Mock argument list (Tree) with region - arg_list: Tree = Tree( - "arg_list", [Tree("string", [Token("STRING", '"us-east-1"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": '"us-east-1"', + } + ], + }, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -163,12 +172,16 @@ def test_athena_lineage_missing_server(athena_lineage): ) # Empty argument list (no region) - arg_list: Tree = Tree("arg_list", []) + arg_list: dict = { + "kind": "InvokeExpression", + "content": {"kind": "ArrayWrapper", "elements": []}, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -179,14 +192,25 @@ def test_athena_lineage_missing_server(athena_lineage): def test_athena_lineage_missing_identifier_accessor(athena_lineage): """Test Athena lineage returns empty when identifier accessor is None.""" - arg_list: Tree = Tree( - "arg_list", 
[Tree("string", [Token("STRING", '"us-east-1"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": '"us-east-1"', + } + ], + }, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=None, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -201,14 +225,25 @@ def test_athena_lineage_incomplete_hierarchy_missing_database(athena_lineage): identifier="catalog", items={"Name": "awsdatacatalog"}, next=None ) - arg_list: Tree = Tree( - "arg_list", [Tree("string", [Token("STRING", '"us-west-2"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": '"us-west-2"', + } + ], + }, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -225,14 +260,25 @@ def test_athena_lineage_incomplete_hierarchy_missing_table(athena_lineage): identifier="catalog", items={"Name": "awsdatacatalog"}, next=db_accessor ) - arg_list: Tree = Tree( - "arg_list", [Tree("string", [Token("STRING", '"eu-west-1"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": '"eu-west-1"', + } + ], + }, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -252,14 +298,25 @@ def 
test_athena_lineage_malformed_items_missing_name_key(athena_lineage): identifier="catalog", items={"Name": "awsdatacatalog"}, next=db_accessor ) - arg_list: Tree = Tree( - "arg_list", [Tree("string", [Token("STRING", '"us-east-1"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": '"us-east-1"', + } + ], + }, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -283,14 +340,25 @@ def test_athena_lineage_different_regions(athena_lineage): identifier="catalog", items={"Name": "awsdatacatalog"}, next=db_accessor ) - arg_list: Tree = Tree( - "arg_list", [Tree("string", [Token("STRING", f'"{region}"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": f'"{region}"', + } + ], + }, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -322,14 +390,25 @@ def test_athena_custom_catalog_name(athena_lineage): identifier="catalog", items={"Name": "my_glue_catalog"}, next=db_accessor ) - arg_list: Tree = Tree( - "arg_list", [Tree("string", [Token("STRING", '"us-west-2"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": '"us-west-2"', + } + ], + }, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", 
identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -353,14 +432,25 @@ def test_athena_empty_database_name(athena_lineage): identifier="catalog", items={"Name": "awsdatacatalog"}, next=db_accessor ) - arg_list: Tree = Tree( - "arg_list", [Tree("string", [Token("STRING", '"us-east-1"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": '"us-east-1"', + } + ], + }, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -381,14 +471,25 @@ def test_athena_empty_table_name(athena_lineage): identifier="catalog", items={"Name": "awsdatacatalog"}, next=db_accessor ) - arg_list: Tree = Tree( - "arg_list", [Tree("string", [Token("STRING", '"us-east-1"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": '"us-east-1"', + } + ], + }, + } data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) @@ -409,14 +510,25 @@ def test_athena_whitespace_only_names(athena_lineage): identifier="catalog", items={"Name": "awsdatacatalog"}, next=db_accessor ) - arg_list: Tree = Tree( - "arg_list", [Tree("string", [Token("STRING", '"us-east-1"')])] - ) + arg_list: dict = { + "kind": "InvokeExpression", + "content": { + "kind": "ArrayWrapper", + "elements": [ + { + "kind": "LiteralExpression", + "literalKind": "Text", + "literal": '"us-east-1"', + } + ], + }, + } 
data_access_func_detail = DataAccessFunctionDetail( arg_list=arg_list, data_access_function_name="AmazonAthena.Databases", identifier_accessor=catalog_accessor, + node_map={}, ) lineage = athena_lineage.create_lineage(data_access_func_detail) diff --git a/metadata-ingestion/uv.lock b/metadata-ingestion/uv.lock index 8146f6a2f94b1..f5e017e0ebdbb 100644 --- a/metadata-ingestion/uv.lock +++ b/metadata-ingestion/uv.lock @@ -134,10 +134,10 @@ all = [ { name = "jsonpath-ng" }, { name = "jupyter-server" }, { name = "kerberos" }, - { name = "lark", extra = ["regex"] }, { name = "litellm" }, { name = "lkml" }, { name = "looker-sdk" }, + { name = "mini-racer" }, { name = "mlflow-skinny" }, { name = "more-itertools" }, { name = "moto", extra = ["s3"] }, @@ -596,10 +596,10 @@ dev = [ { name = "jsonschema" }, { name = "jupyter-server" }, { name = "kerberos" }, - { name = "lark", extra = ["regex"] }, { name = "litellm" }, { name = "lkml" }, { name = "looker-sdk" }, + { name = "mini-racer" }, { name = "mixpanel" }, { name = "mlflow-skinny" }, { name = "more-itertools" }, @@ -787,10 +787,10 @@ docs = [ { name = "jsonschema" }, { name = "jupyter-server" }, { name = "kerberos" }, - { name = "lark", extra = ["regex"] }, { name = "litellm" }, { name = "lkml" }, { name = "looker-sdk" }, + { name = "mini-racer" }, { name = "mixpanel" }, { name = "mlflow-skinny" }, { name = "more-itertools" }, @@ -1462,7 +1462,7 @@ postgres = [ { name = "urllib3" }, ] powerbi = [ - { name = "lark", extra = ["regex"] }, + { name = "mini-racer" }, { name = "more-itertools" }, { name = "msal" }, { name = "patchy" }, @@ -2471,10 +2471,6 @@ requires-dist = [ { name = "kerberos", marker = "extra == 'docs'", specifier = ">=1.3.0,<2.0.0" }, { name = "kerberos", marker = "extra == 'hive-metastore'", specifier = ">=1.3.0,<2.0.0" }, { name = "kerberos", marker = "extra == 'integration-tests'", specifier = ">=1.3.0,<2.0.0" }, - { name = "lark", extras = ["regex"], marker = "extra == 'all'", specifier = "==1.1.4" }, 
- { name = "lark", extras = ["regex"], marker = "extra == 'dev'", specifier = "==1.1.4" }, - { name = "lark", extras = ["regex"], marker = "extra == 'docs'", specifier = "==1.1.4" }, - { name = "lark", extras = ["regex"], marker = "extra == 'powerbi'", specifier = "==1.1.4" }, { name = "litellm", marker = "extra == 'all'", specifier = "==1.80.5" }, { name = "litellm", marker = "extra == 'confluence'", specifier = "==1.80.5" }, { name = "litellm", marker = "extra == 'datahub-documents'", specifier = "==1.80.5" }, @@ -2493,6 +2489,10 @@ requires-dist = [ { name = "looker-sdk", marker = "extra == 'looker'", specifier = ">=23.0.0,<26.0.0" }, { name = "looker-sdk", marker = "extra == 'lookml'", specifier = ">=23.0.0,<26.0.0" }, { name = "memray", marker = "extra == 'debug'", specifier = "<2.0.0" }, + { name = "mini-racer", marker = "extra == 'all'", specifier = "==0.14.1" }, + { name = "mini-racer", marker = "extra == 'dev'", specifier = "==0.14.1" }, + { name = "mini-racer", marker = "extra == 'docs'", specifier = "==0.14.1" }, + { name = "mini-racer", marker = "extra == 'powerbi'", specifier = "==0.14.1" }, { name = "mixpanel", specifier = ">=4.9.0,<6.0.0" }, { name = "mixpanel", marker = "extra == 'dev'", specifier = ">=4.9.0,<6.0.0" }, { name = "mixpanel", marker = "extra == 'docs'", specifier = ">=4.9.0,<6.0.0" }, @@ -7767,20 +7767,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/0e/72/a3add0e4eec4eb9e2569554f7c70f4a3c27712f40e3284d483e88094cc0e/langdetect-1.0.9.tar.gz", hash = "sha256:cbc1fef89f8d062739774bd51eda3da3274006b3661d199c2655f6b3f6d605a0", size = 981474, upload-time = "2021-05-07T07:54:13.562Z" } -[[package]] -name = "lark" -version = "1.1.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1d/08/eb2590d4f824b5a947d866c603799fa70278f1372cbf8d15b7b4823dfb2b/lark-1.1.4.tar.gz", hash = "sha256:eee86062b149600ef62de0d8dfd38cf85ffc737e16911e7d8c18880f8c5b1333", size = 
246520, upload-time = "2022-11-02T01:46:14.707Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/10/d95c9fe3d1c0d71c6fb46fee3ffa199febeeeb71094b03028ea4b264bfc4/lark-1.1.4-py3-none-any.whl", hash = "sha256:a42f9f18bdc9d5571a371ae658548e81e78d1642c2145cc3b663e0bf2e9e7eae", size = 107845, upload-time = "2022-11-02T01:46:12.486Z" }, -] - -[package.optional-dependencies] -regex = [ - { name = "regex" }, -] - [[package]] name = "leb128" version = "1.0.9" @@ -8231,6 +8217,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/45/4bf72fff0070f1760e0ead5fc78b50e2bef35be702cb7c3cd5b33aa776d5/memray-1.19.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6a193cc20bbe60eccee8c9b4d9eb78ccd69a3248f0291d5d1a7fdda62aa19b53", size = 12148593, upload-time = "2025-10-16T02:26:29.708Z" }, ] +[[package]] +name = "mini-racer" +version = "0.14.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/71/b0/5b200bdf093433f1933f783fbc908c23349a1a575c28c81aff75b609c7c1/mini_racer-0.14.1.tar.gz", hash = "sha256:0df25889b7c4e753520324a1687d85e41f9f64984efa81963339ed400f004d49", size = 41771, upload-time = "2026-02-01T05:53:27.408Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/2c/5857ee4e1714db8956aa878dc90255557458c124f93163704e7de948be03/mini_racer-0.14.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:a401ecdf5f73d4714b76dc3a4c9b5780059cf4c59a5b64cff7497dc6c219d5a0", size = 19988474, upload-time = "2026-02-01T05:53:03.998Z" }, + { url = "https://files.pythonhosted.org/packages/09/bf/ecaad0c208b9d8bd8f2141f7fa5e520b66915945a2ed56c520524df75fcb/mini_racer-0.14.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:56cc6965a1665a50d8d613bc43aa83b4cec6b1f09acc69dc4c2845dacb201463", size = 18511166, upload-time = "2026-02-01T05:53:07.059Z" }, + { url = 
"https://files.pythonhosted.org/packages/56/0c/5260cc29908777c91391dd8b61be9042a3d1c089e6bfc798cd403e44e87d/mini_racer-0.14.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:7f93d91973ddb2da4e899e06ecb426bfe7ea103cd1c92687ec9c38f206f375eb", size = 21929528, upload-time = "2026-02-01T05:53:10.22Z" }, + { url = "https://files.pythonhosted.org/packages/c2/3c/c5bd479784826bbbc69f713aae2bcfd5ef353ba4e5e0e661666938474535/mini_racer-0.14.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:cdf3a088e1363f16a695288f882abf76b3705b8e1df21418208b87ed010037a4", size = 22178205, upload-time = "2026-02-01T05:53:13.407Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d4/89905a4238be7ea9afec2f10533668f68898c82914ddd91b2729be547eca/mini_racer-0.14.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6788f31608c6c5d0faac4e6b4f36fc5498885f951e510717e8825eaebae6a4c5", size = 21837359, upload-time = "2026-02-01T05:53:16.762Z" }, + { url = "https://files.pythonhosted.org/packages/11/1d/735c5d74239bd0d7d3b271d7f3c91f337c7f937d748dadc2c1c9579df475/mini_racer-0.14.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:ad4b5c15993caab571fac3d67e0d55157b358bd439e629e0fef26c45a12c8bab", size = 22178200, upload-time = "2026-02-01T05:53:19.414Z" }, + { url = "https://files.pythonhosted.org/packages/78/60/e0708ea8533e928f10f985be35af4c02dd2b61f4a7501dc88e8c927d85e5/mini_racer-0.14.1-py3-none-win_amd64.whl", hash = "sha256:4abd58c62c9955988dbc0cbf5a798914334fe570d2b192f8890bec20135ab6d1", size = 15515732, upload-time = "2026-02-01T05:53:22.276Z" }, + { url = "https://files.pythonhosted.org/packages/99/fd/7fb43e269c44e5d44ef02fbd164fc11833d8293b47ddcd48e4fb1649f4d2/mini_racer-0.14.1-py3-none-win_arm64.whl", hash = "sha256:440bef1269655b1da94b550612b50669de9881c3d88b805c139f7f1b5ec8ae7b", size = 14829128, upload-time = "2026-02-01T05:53:24.983Z" }, +] + [[package]] name = "mistune" version = "3.2.0"