
Commit 680d16f

feat(ingestion/sql-queries): add performance optimizations, S3 support, temp table patterns (#14757)
Co-authored-by: Sergio Gómez Villamor <[email protected]>
1 parent f05f3e4 commit 680d16f

File tree: 15 files changed (+2458 −266 lines)

metadata-ingestion/src/datahub/ingestion/graph/client.py

Lines changed: 5 additions & 1 deletion
@@ -102,6 +102,7 @@
 from datahub.sql_parsing.schema_resolver import (
     GraphQLSchemaMetadata,
     SchemaResolver,
+    SchemaResolverReport,
 )
 from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
 
@@ -1543,6 +1544,7 @@ def _make_schema_resolver(
         platform_instance: Optional[str],
         env: str,
         include_graph: bool = True,
+        report: Optional["SchemaResolverReport"] = None,
     ) -> "SchemaResolver":
         from datahub.sql_parsing.schema_resolver import SchemaResolver
 
@@ -1551,6 +1553,7 @@ def _make_schema_resolver(
             platform_instance=platform_instance,
             env=env,
             graph=self if include_graph else None,
+            report=report,
         )
 
     def initialize_schema_resolver_from_datahub(
@@ -1559,10 +1562,11 @@ def initialize_schema_resolver_from_datahub(
         platform_instance: Optional[str],
         env: str,
         batch_size: int = 100,
+        report: Optional["SchemaResolverReport"] = None,
     ) -> "SchemaResolver":
         logger.info("Initializing schema resolver")
         schema_resolver = self._make_schema_resolver(
-            platform, platform_instance, env, include_graph=False
+            platform, platform_instance, env, include_graph=False, report=report
         )
 
         logger.info(f"Fetching schemas for platform {platform}, env {env}")

metadata-ingestion/src/datahub/ingestion/run/pipeline.py

Lines changed: 1 addition & 0 deletions
@@ -558,6 +558,7 @@ def run(self) -> None:
 
             self.process_commits()
             self.final_status = PipelineStatus.COMPLETED
+
         except (SystemExit, KeyboardInterrupt):
             self.final_status = PipelineStatus.CANCELLED
             logger.error("Caught error", exc_info=True)

metadata-ingestion/src/datahub/ingestion/source/sql_queries.py

Lines changed: 164 additions & 15 deletions
@@ -1,11 +1,13 @@
 import json
 import logging
 import os
-from dataclasses import dataclass
+import re
+from dataclasses import dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import ClassVar, Iterable, List, Optional, Union
+from typing import ClassVar, Iterable, List, Optional, Union, cast
 
+import smart_open
 from pydantic import BaseModel, Field, validator
 
 from datahub.configuration.common import HiddenFromDocs
@@ -36,12 +38,13 @@
     SourceCapability,
     SourceReport,
 )
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import auto_workunit, auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn, DatasetUrn
-from datahub.sql_parsing.schema_resolver import SchemaResolver
+from datahub.sql_parsing.schema_resolver import SchemaResolver, SchemaResolverReport
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
     ObservedQuery,
@@ -82,15 +85,38 @@ class SqlQueriesSourceConfig(
         None,
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
     )
+    temp_table_patterns: List[str] = Field(
+        description="Regex patterns for temporary tables to filter in lineage ingestion. "
+        "Specify regex to match the entire table name. This is useful for platforms like Athena "
+        "that don't have native temp tables but use naming patterns for fake temp tables.",
+        default=[],
+    )
+
+    enable_lazy_schema_loading: bool = Field(
+        default=True,
+        description="Enable lazy schema loading for better performance. When enabled, schemas are fetched on-demand "
+        "instead of bulk loading all schemas upfront, reducing startup time and memory usage.",
+    )
+
+    # AWS/S3 configuration
+    aws_config: Optional[AwsConnectionConfig] = Field(
+        default=None,
+        description="AWS configuration for S3 access. Required when query_file is an S3 URI (s3://).",
+    )
 
 
 @dataclass
 class SqlQueriesSourceReport(SourceReport):
     num_entries_processed: int = 0
     num_entries_failed: int = 0
     num_queries_aggregator_failures: int = 0
+    num_queries_processed_sequential: int = 0
+    num_temp_tables_detected: int = 0
+    temp_table_patterns_used: List[str] = field(default_factory=list)
+    peak_memory_usage_mb: float = 0.0
 
     sql_aggregator: Optional[SqlAggregatorReport] = None
+    schema_resolver_report: Optional[SchemaResolverReport] = None
 
 
 @platform_name("SQL Queries", id="sql-queries")
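
As a rough illustration of how these new options fit together, a recipe-style source config might look like the sketch below. This is a hedged example: only temp_table_patterns, enable_lazy_schema_loading, and aws_config are introduced by this commit, platform and query_file are pre-existing fields of the source, and all values and the AwsConnectionConfig contents are placeholders.

# Hypothetical SQL Queries source configuration, expressed as a plain dict.
sql_queries_source_config = {
    "platform": "athena",                              # existing field
    "query_file": "s3://example-bucket/queries.json",  # local path or s3:// URI
    "enable_lazy_schema_loading": True,                # new: fetch schemas on demand
    "temp_table_patterns": [r".*\.tmp_.*"],            # new: regexes marking fake temp tables
    "aws_config": {                                    # new: required for s3:// query files
        # AwsConnectionConfig options (region, credentials, profile, ...) go here.
    },
}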
@@ -115,6 +141,18 @@ class SqlQueriesSource(Source):
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
       used if the query can't be parsed.
 
+    **Lazy Schema Loading**:
+    - Fetches schemas on-demand during query parsing instead of bulk loading all schemas upfront
+    - Caches fetched schemas for future lookups to avoid repeated network requests
+    - Reduces initial startup time and memory usage significantly
+    - Automatically handles large platforms efficiently without memory issues
+
+    **Query Processing**:
+    - Loads the entire query file into memory at once
+    - Processes all queries sequentially before generating metadata work units
+    - Preserves temp table mappings and lineage relationships to ensure consistent lineage tracking
+    - Query deduplication is handled automatically by the SQL parsing aggregator
+
     ### Incremental Lineage
     When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
     This allows you to add lineage edges without removing existing ones, which is useful for:
@@ -124,6 +162,12 @@ class SqlQueriesSource(Source):
 
     Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
     statistics will still be emitted normally.
+
+    ### Temporary Table Support
+    For platforms like Athena that don't have native temporary tables, you can use the `temp_table_patterns`
+    configuration to specify regex patterns that identify fake temporary tables. This allows the source to
+    process these tables like other sources that support native temp tables, enabling proper lineage tracking
+    across temporary table operations.
     """
 
     schema_resolver: Optional[SchemaResolver]
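
Since the docstring above only mentions the query file format in passing, here is a small sketch of producing such a newline-delimited JSON file. Only the required query field and the optional upstream_tables fallback come from the documentation above; the table names and file name are illustrative, and any other optional fields are omitted.

import json

# Each line of the query file is one JSON object.
entries = [
    {"query": "CREATE TABLE tmp_orders AS SELECT * FROM raw.orders"},
    {
        "query": "INSERT INTO analytics.orders SELECT * FROM tmp_orders",
        # Optional fallback, used only if the query cannot be parsed.
        "upstream_tables": ["tmp_orders"],
    },
]

with open("queries.json", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")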
@@ -141,13 +185,19 @@ def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig):
         self.report = SqlQueriesSourceReport()
 
         if self.config.use_schema_resolver:
-            # TODO: `initialize_schema_resolver_from_datahub` does a bulk initialization by fetching all schemas
-            # for the given platform, platform instance, and env. Instead this should be configurable:
-            # bulk initialization vs lazy on-demand schema fetching.
-            self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub(
+            # Create schema resolver report for tracking
+            self.report.schema_resolver_report = SchemaResolverReport()
+
+            # Use lazy loading - schemas will be fetched on-demand and cached
+            logger.info(
+                "Using lazy schema loading - schemas will be fetched on-demand and cached"
+            )
+            self.schema_resolver = SchemaResolver(
                 platform=self.config.platform,
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
+                graph=self.graph,
+                report=self.report.schema_resolver_report,
             )
         else:
             self.schema_resolver = None
@@ -156,7 +206,9 @@ def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig):
             platform=self.config.platform,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
-            schema_resolver=self.schema_resolver,
+            schema_resolver=cast(SchemaResolver, self.schema_resolver)
+            if self.schema_resolver
+            else None,
             eager_graph_load=False,
             generate_lineage=True,  # TODO: make this configurable
             generate_queries=True,  # TODO: make this configurable
@@ -165,7 +217,9 @@ def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig):
             generate_usage_statistics=True,
             generate_operations=True,  # TODO: make this configurable
             usage_config=self.config.usage,
-            is_temp_table=None,
+            is_temp_table=self.is_temp_table
+            if self.config.temp_table_patterns
+            else None,
             is_allowed_table=None,
             format_queries=False,
        )
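
To make the lazy-loading change concrete, a minimal standalone sketch contrasting the previous bulk initialization with the on-demand resolver the source now builds directly. The SchemaResolver keyword arguments mirror the diff above; the connection setup (DatahubClientConfig, the server URL, and the "athena" platform) is an assumption for illustration only.

from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.sql_parsing.schema_resolver import SchemaResolver, SchemaResolverReport

# Assumed connection details; replace with your own DataHub endpoint.
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
report = SchemaResolverReport()

# Previous behavior: bulk-fetch every schema for the platform up front.
# bulk_resolver = graph.initialize_schema_resolver_from_datahub(
#     "athena", platform_instance=None, env="PROD", report=report
# )

# New behavior: pass the graph so schemas are fetched lazily, the first time
# a query references them, and then cached for later lookups.
lazy_resolver = SchemaResolver(
    platform="athena",
    platform_instance=None,
    env="PROD",
    graph=graph,
    report=report,
)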
@@ -193,20 +247,73 @@ def get_workunits_internal(
     ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")
 
+        logger.info("Processing all queries in batch mode")
+        yield from self._process_queries_batch()
+
+    def _process_queries_batch(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
+        """Process all queries in memory (original behavior)."""
         with self.report.new_stage("Collecting queries from file"):
             queries = list(self._parse_query_file())
             logger.info(f"Collected {len(queries)} queries for processing")
 
         with self.report.new_stage("Processing queries through SQL parsing aggregator"):
-            for query_entry in queries:
-                self._add_query_to_aggregator(query_entry)
+            logger.info("Using sequential processing")
+            self._process_queries_sequential(queries)
 
         with self.report.new_stage("Generating metadata work units"):
             logger.info("Generating workunits from SQL parsing aggregator")
-            yield from self.aggregator.gen_metadata()
+            yield from auto_workunit(self.aggregator.gen_metadata())
 
-    def _parse_query_file(self) -> Iterable["QueryEntry"]:
-        """Parse the query file and yield QueryEntry objects."""
+    def _is_s3_uri(self, path: str) -> bool:
+        """Check if the path is an S3 URI."""
+        return path.startswith("s3://")
+
+    def _parse_s3_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse query file from S3 using smart_open."""
+        if not self.config.aws_config:
+            raise ValueError("AWS configuration required for S3 file access")
+
+        logger.info(f"Reading query file from S3: {self.config.query_file}")
+
+        try:
+            # Use smart_open for efficient S3 streaming, similar to S3FileSystem
+            s3_client = self.config.aws_config.get_s3_client()
+
+            with smart_open.open(
+                self.config.query_file, mode="r", transport_params={"client": s3_client}
+            ) as file_stream:
+                for line in file_stream:
+                    if line.strip():
+                        try:
+                            query_dict = json.loads(line, strict=False)
+                            entry = QueryEntry.create(query_dict, config=self.config)
+                            self.report.num_entries_processed += 1
+                            if self.report.num_entries_processed % 1000 == 0:
+                                logger.info(
+                                    f"Processed {self.report.num_entries_processed} query entries from S3"
+                                )
+                            yield entry
+                        except Exception as e:
+                            self.report.num_entries_failed += 1
+                            self.report.warning(
+                                title="Error processing query from S3",
+                                message="Query skipped due to parsing error",
+                                context=line.strip(),
+                                exc=e,
+                            )
+        except Exception as e:
+            self.report.warning(
+                title="Error reading S3 file",
+                message="Failed to read S3 file",
+                context=self.config.query_file,
+                exc=e,
+            )
+            raise
+
+    def _parse_local_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse local query file (existing logic)."""
         with open(self.config.query_file) as f:
             for line in f:
                 try:
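
Outside the source, the same smart_open streaming pattern used by _parse_s3_query_file can be sketched as follows. The bucket, key, and a plain boto3 client (standing in for AwsConnectionConfig.get_s3_client()) are assumptions; credentials are expected to come from the environment.

import json

import boto3
import smart_open

# Stand-in for the client returned by AwsConnectionConfig.get_s3_client().
s3_client = boto3.client("s3")

# Stream the newline-delimited JSON file line by line instead of downloading it whole.
with smart_open.open(
    "s3://example-bucket/queries.json",
    mode="r",
    transport_params={"client": s3_client},
) as file_stream:
    for line in file_stream:
        if line.strip():
            query_dict = json.loads(line, strict=False)
            print(query_dict.get("query"))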
@@ -227,6 +334,30 @@ def _parse_query_file(self) -> Iterable["QueryEntry"]:
                         exc=e,
                     )
 
+    def _parse_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse the query file and yield QueryEntry objects."""
+        if self._is_s3_uri(self.config.query_file):
+            yield from self._parse_s3_query_file()
+        else:
+            yield from self._parse_local_query_file()
+
+    def _process_queries_sequential(self, queries: List["QueryEntry"]) -> None:
+        """Process queries sequentially."""
+        total_queries = len(queries)
+        logger.info(f"Processing {total_queries} queries sequentially")
+
+        # Process each query sequentially
+        for i, query_entry in enumerate(queries):
+            self._add_query_to_aggregator(query_entry)
+            self.report.num_queries_processed_sequential += 1
+
+            # Simple progress reporting every 1000 queries
+            if (i + 1) % 1000 == 0:
+                progress_pct = ((i + 1) / total_queries) * 100
+                logger.info(
+                    f"Processed {i + 1}/{total_queries} queries ({progress_pct:.1f}%)"
+                )
+
     def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
         """Add a query to the SQL parsing aggregator."""
         try:
@@ -285,6 +416,24 @@ def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
                 exc=e,
             )
 
+    def is_temp_table(self, name: str) -> bool:
+        """Check if a table name matches any of the configured temp table patterns."""
+        if not self.config.temp_table_patterns:
+            return False
+
+        try:
+            for pattern in self.config.temp_table_patterns:
+                if re.match(pattern, name, flags=re.IGNORECASE):
+                    logger.debug(
+                        f"Table '{name}' matched temp table pattern: {pattern}"
+                    )
+                    self.report.num_temp_tables_detected += 1
+                    return True
+        except re.error as e:
+            logger.warning(f"Invalid regex pattern '{pattern}': {e}")
+
+        return False
+
 
 class QueryEntry(BaseModel):
     query: str
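
To illustrate the matching behavior of is_temp_table above (re.match anchors each pattern at the start of the table name and re.IGNORECASE makes it case-insensitive), here is a small standalone check; the pattern and table names are a made-up Athena-style naming convention, not part of the commit.

import re

# Hypothetical convention: any table whose name (after the database prefix) starts with "tmp_".
temp_table_patterns = [r".*\.tmp_.*"]

def matches_temp_pattern(name: str) -> bool:
    # Mirrors the source's per-pattern check.
    return any(re.match(p, name, flags=re.IGNORECASE) for p in temp_table_patterns)

print(matches_temp_pattern("analytics_db.TMP_daily_orders"))  # True
print(matches_temp_pattern("analytics_db.daily_orders"))      # False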
