Skip to content
Open
Show file tree
Hide file tree
Changes from 173 commits
Commits
Show all changes
174 commits
Select commit Hold shift + click to select a range
64e48a4
Bq multi partitions (#12824)
acrylJonny Mar 10, 2025
48f41b2
updates for missed partition tables
acrylJonny Mar 10, 2025
9d7a83e
upping timeout to 5m
acrylJonny Mar 10, 2025
a025b85
improvements to pick up non-null partitions
acrylJonny Mar 10, 2025
023365c
testing updates
acrylJonny Mar 11, 2025
3078e25
Update profiler.py
acrylJonny Mar 11, 2025
3f3151a
removing timeouts
acrylJonny Mar 11, 2025
664b217
updating logic ordering
acrylJonny Mar 11, 2025
4020201
trying cheaper sampling methods
acrylJonny Mar 11, 2025
bad6a87
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Mar 11, 2025
d462564
efficiency improvements
acrylJonny Mar 11, 2025
e95d9df
Update profiler.py
acrylJonny Mar 11, 2025
2af9681
improvements
acrylJonny Mar 13, 2025
cd9eac3
linting
acrylJonny Mar 13, 2025
72cb157
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Mar 13, 2025
910c7e3
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Mar 14, 2025
20acb75
Update profiler.py
acrylJonny Mar 14, 2025
b177908
Update profiler.py
acrylJonny Mar 14, 2025
18379ea
Update profiler.py
acrylJonny Mar 14, 2025
bc18e1b
Update profiler.py
acrylJonny Mar 14, 2025
62bafec
improved sql queries
acrylJonny Mar 14, 2025
d30a5ac
Update profiler.py
acrylJonny Mar 14, 2025
3394b49
Update profiler.py
acrylJonny Mar 14, 2025
c4b4ac1
Update profiler.py
acrylJonny Mar 14, 2025
7d93a00
Update profiler.py
acrylJonny Mar 14, 2025
c65aa97
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Mar 14, 2025
2613db5
Update profiler.py
acrylJonny Mar 14, 2025
0470262
updates
acrylJonny Mar 14, 2025
6f42717
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny May 20, 2025
1f781d9
split out code
acrylJonny May 20, 2025
fdbd9a4
mocking update fixing tests
acrylJonny May 23, 2025
fa9b7bd
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny May 23, 2025
b2f1d4a
error fix
acrylJonny May 28, 2025
cc3b879
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny May 28, 2025
be6c6cf
improved profile filter
acrylJonny May 28, 2025
6f158de
Merge branch 'bq-multi-partition-profiling' of https://github.com/dat…
acrylJonny May 28, 2025
edc1412
last resort improvements
acrylJonny May 28, 2025
bdde914
profiling fixes
acrylJonny May 28, 2025
f1e271a
remove invalid profile pieces
acrylJonny May 28, 2025
6e36ebb
Revert "remove invalid profile pieces"
acrylJonny May 28, 2025
0924ca2
Revert "profiling fixes"
acrylJonny May 28, 2025
3eaec08
profiling changes
acrylJonny May 28, 2025
0ebde38
profiler fix
acrylJonny May 28, 2025
2d127c2
query fix
acrylJonny May 28, 2025
50c1d40
further fixes
acrylJonny May 28, 2025
a8cf6d8
get_client
acrylJonny May 28, 2025
f9f3326
move more to geprofiler
acrylJonny May 28, 2025
1cb293b
sqlalchemy addition
acrylJonny May 28, 2025
cc49774
Update profiler.py
acrylJonny May 28, 2025
1afefc1
Update profiler.py
acrylJonny May 28, 2025
72b748a
integrate in other classes
acrylJonny May 28, 2025
f8f8ef1
improvements
acrylJonny May 28, 2025
176c21b
Revert "improvements"
acrylJonny May 28, 2025
4eca691
Revert "integrate in other classes"
acrylJonny May 28, 2025
ee8435e
Revert "Update profiler.py"
acrylJonny May 28, 2025
997093d
Revert "Update profiler.py"
acrylJonny May 28, 2025
39253d3
Revert "sqlalchemy addition"
acrylJonny May 28, 2025
530fc2f
Revert "move more to geprofiler"
acrylJonny May 28, 2025
9a34d88
Revert "get_client"
acrylJonny May 28, 2025
90cc051
Revert "further fixes"
acrylJonny May 28, 2025
5306afa
Revert "query fix"
acrylJonny May 28, 2025
e6d9f5b
Revert "profiler fix"
acrylJonny May 28, 2025
d5589e6
Revert "profiling changes"
acrylJonny May 28, 2025
1f1db13
Reapply "profiling fixes"
acrylJonny May 28, 2025
3ed1317
Reapply "remove invalid profile pieces"
acrylJonny May 28, 2025
9fc76f0
Revert "remove invalid profile pieces"
acrylJonny May 28, 2025
a5ea147
Revert "profiling fixes"
acrylJonny May 28, 2025
0186eca
profiling fixes
acrylJonny May 28, 2025
970cabe
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny May 28, 2025
c4e2fe1
remove columns
acrylJonny May 28, 2025
dffca72
histogram cleanup
acrylJonny May 28, 2025
3c6d810
fuller profiling
acrylJonny May 28, 2025
e367b83
profiler timeout
acrylJonny May 28, 2025
6d03e7f
use geprofiler
acrylJonny May 28, 2025
dc6cf0a
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny May 28, 2025
042106c
dataset name fix
acrylJonny May 28, 2025
b3fe0a9
partition improvements
acrylJonny May 28, 2025
59eea5b
temp table approach
acrylJonny May 29, 2025
50de915
change profiler config
acrylJonny May 29, 2025
0b3be93
cache reduction code
acrylJonny May 29, 2025
20ec768
better handling of errors
acrylJonny May 29, 2025
72301cf
ge profiler changes
acrylJonny May 29, 2025
117ee53
additions of filters to ge
acrylJonny May 29, 2025
4ea2c1f
bq updates
acrylJonny May 29, 2025
0e180dc
profiling options
acrylJonny May 29, 2025
51bab3e
hardcoded fallbacks
acrylJonny May 30, 2025
0c74a47
Revert "hardcoded fallbacks"
acrylJonny Jun 2, 2025
959f16e
fallback
acrylJonny Jun 2, 2025
2ed4a4e
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Jun 2, 2025
fab0d88
fixes
acrylJonny Jun 2, 2025
e5f662e
Merge branch 'bq-multi-partition-profiling' of https://github.com/dat…
acrylJonny Jun 2, 2025
f82f6ff
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Jun 2, 2025
758bd6c
day improvements
acrylJonny Jun 2, 2025
4dd8a45
fallback dates as strings
acrylJonny Jun 2, 2025
e8d6e95
Update profiler.py
acrylJonny Jun 3, 2025
ea3c27d
unit tests
acrylJonny Jun 3, 2025
b689a66
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Jun 3, 2025
ca0f5b8
move extra profiling config
acrylJonny Jun 3, 2025
f03dee7
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Jun 26, 2025
d4d99d4
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Jul 14, 2025
44394cf
addressing comments
acrylJonny Jul 14, 2025
2430b95
Update profiler.py
acrylJonny Jul 14, 2025
5391b9d
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Jul 14, 2025
dde0c1f
Update profiler.py
acrylJonny Jul 14, 2025
75948a1
Update profiler.py
acrylJonny Jul 14, 2025
caaa132
Update profiler.py
acrylJonny Jul 14, 2025
73c402b
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Aug 1, 2025
a478eee
clean up
acrylJonny Aug 1, 2025
7bb8f53
sql injection prevention
acrylJonny Aug 1, 2025
18b7dd6
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Aug 4, 2025
ff0f68e
Update profiler.py
acrylJonny Aug 4, 2025
8f2c9f6
Merge branch 'bq-multi-partition-profiling' of https://github.com/dat…
acrylJonny Aug 4, 2025
e050c82
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Aug 4, 2025
56f3278
improvements
acrylJonny Aug 4, 2025
5ef7193
Update profiler.py
acrylJonny Aug 4, 2025
97b08f6
profiler update
acrylJonny Aug 5, 2025
f668b82
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Aug 28, 2025
433499c
Merge branch 'bq-multi-partition-profiling' of https://github.com/dat…
acrylJonny Aug 28, 2025
ff9f440
simpler layout of code
acrylJonny Aug 28, 2025
d01a48f
reduce verbose comments and further tests
acrylJonny Aug 28, 2025
1229470
date bug fix
acrylJonny Aug 28, 2025
1f468cd
improved date handling
acrylJonny Aug 28, 2025
7346d9b
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Aug 28, 2025
7b2d92e
pass partitions as strings
acrylJonny Aug 28, 2025
ed5fe1f
profiling improvements
acrylJonny Aug 28, 2025
6ea3734
quick fix
acrylJonny Aug 28, 2025
ea1b5ec
remove redundant methods and parallel process
acrylJonny Aug 28, 2025
8bcc5bf
unit tests
acrylJonny Aug 29, 2025
72d8e80
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Aug 29, 2025
cfd9daa
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Sep 8, 2025
d8d6937
simplify sql injection logic
acrylJonny Sep 8, 2025
cb1f271
latest date for all date types
acrylJonny Sep 8, 2025
019e3c6
prioritize dates in partition detection
acrylJonny Sep 8, 2025
d4d4bcc
remove incorrect check
acrylJonny Sep 8, 2025
0f3a486
further unit tests
acrylJonny Sep 8, 2025
8ddd48e
some unittests to pytest conversion
acrylJonny Sep 8, 2025
664acad
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Sep 8, 2025
2035447
improve efficiency
acrylJonny Sep 9, 2025
cebb323
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Sep 9, 2025
a27b15e
better testing and more efficiency
acrylJonny Sep 11, 2025
15057cf
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Sep 11, 2025
a34d18b
linting
acrylJonny Sep 11, 2025
70f0a78
fix broken quicktests
acrylJonny Sep 11, 2025
2b62832
Update partition_discovery.py
acrylJonny Sep 12, 2025
b2a7dc3
removing date conversion
acrylJonny Sep 16, 2025
9801af0
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Sep 16, 2025
1b562bf
all partitions as strings
acrylJonny Sep 16, 2025
5800e29
latest dates for internal tables
acrylJonny Sep 17, 2025
fe46c40
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Sep 26, 2025
f257e08
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Oct 1, 2025
288c80c
improvements
acrylJonny Oct 1, 2025
da71c3e
updates
acrylJonny Oct 1, 2025
bb1d31e
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Oct 1, 2025
dce8b7b
imp
acrylJonny Oct 1, 2025
5f1d75e
Update partition_discovery.py
acrylJonny Oct 1, 2025
e3f618c
Update profiler.py
acrylJonny Oct 2, 2025
6085903
Update partition_discovery.py
acrylJonny Oct 28, 2025
4b2db6b
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Oct 28, 2025
661ad8c
addressing feedback
acrylJonny Nov 20, 2025
d636322
further unit tests
acrylJonny Nov 20, 2025
3646cd1
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Nov 20, 2025
01c7b9c
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Nov 24, 2025
93270e7
refactor: BigQuery partition discovery and Pydantic model updates
acrylJonny Jan 13, 2026
8f77cc9
Merge remote-tracking branch 'origin/master' into bq-multi-partition-…
acrylJonny Jan 13, 2026
fcd50b4
comments clean up
acrylJonny Jan 13, 2026
6928acd
test fixes
acrylJonny Jan 13, 2026
93745e1
fixes for numeric partitions
acrylJonny Jan 14, 2026
8b39c9c
hive partition data type fixes
acrylJonny Jan 14, 2026
99d88d3
small issue fixes
acrylJonny Jan 21, 2026
3ddafbc
unit test improvements
acrylJonny Jan 22, 2026
f5c0cfe
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Feb 16, 2026
d874fcf
Update bigquery_schema_gen.py
acrylJonny Feb 17, 2026
1f2ab13
Merge branch 'master' into bq-multi-partition-profiling
acrylJonny Feb 17, 2026
465775f
Update discovery.py
acrylJonny Feb 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
BigQueryIdentifierBuilder,
)
from datahub.ingestion.source.bigquery_v2.lineage import BigqueryLineageExtractor
from datahub.ingestion.source.bigquery_v2.profiler import BigqueryProfiler
from datahub.ingestion.source.bigquery_v2.profiling.profiler import BigqueryProfiler
from datahub.ingestion.source.bigquery_v2.queries_extractor import (
BigQueryQueriesExtractor,
BigQueryQueriesExtractorConfig,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
BigQueryConnectionConfig,
)
from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
from datahub.ingestion.source.state.stateful_ingestion_base import (
StatefulLineageConfigMixin,
Expand All @@ -50,6 +51,49 @@
)


class BigQueryProfilingConfig(GEProfilingConfig):
fallback_partition_values: Dict[str, Any] = Field(
default_factory=dict,
description="Fallback values for partition columns when timeout occurs. Keys are column names, "
"values are the fallback values to use. For non-date columns, the values are used directly. "
"Example: {'batch': 'default', 'region': 'us-east-1'}",
)

partition_fetch_timeout: int = Field(
default=30,
description="Timeout in seconds for partition value fetch operations. If exceeded, fallback "
"partition values will be used.",
)

profiling_row_limit: int = Field(
default=1000000,
description="The number of rows to sample for profiling. This is a low level config property which "
"should be touched with care. This restriction is needed because excessively wide tables can "
"result in failure to ingest the schema.",
)

skip_stale_tables: bool = Field(
default=True,
description="Skip profiling for tables that haven't been modified in over a year. "
"Uses last_altered timestamp (which contains BigQuery's last_modified_time) for both regular and external tables. "
"This helps avoid profiling abandoned or archived tables.",
)

staleness_threshold_days: int = Field(
default=365,
description="Number of days after which a table is considered stale and profiling will be skipped "
"if skip_stale_tables is enabled.",
)

partition_datetime_window_days: Optional[int] = Field(
default=30,
description="Limit profiling to partitions within this many days from the selected partition date. "
"For example, if set to 30 and the selected partition is '2025-08-15', only partitions from "
"'2025-07-16' to '2025-08-15' will be included in profiling. Set to None to disable date windowing. "
"This helps focus profiling on recent data patterns and improves performance.",
)


class BigQueryBaseConfig(ConfigModel):
rate_limit: bool = Field(
default=False, description="Should we rate limit requests made to API."
Expand Down Expand Up @@ -482,6 +526,11 @@ def have_table_data_read_permission(self) -> bool:
"See [this](https://cloud.google.com/bigquery/docs/information-schema-jobs#scope_and_syntax) for details.",
)

profiling: BigQueryProfilingConfig = Field(
default=BigQueryProfilingConfig(),
description="Profiling related configs",
)

pushdown_deny_usernames: List[str] = Field(
default=[],
description="List of user email patterns using SQL LIKE syntax (e.g., 'bot_%', '%@%.iam.gserviceaccount.com') "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
List,
Optional,
Set,
Tuple,
)

from google.api_core import retry
Expand Down Expand Up @@ -70,22 +71,29 @@ class BigqueryTableConstraint:
RANGE_PARTITION_NAME: str = "RANGE"


@dataclass
@dataclass(frozen=True)
class PartitionInfo:
field: str
# Data type is optional as we not have it when we set it from TimePartitioning
column: Optional[BigqueryColumn] = None
fields: Tuple[str, ...]
columns: Optional[Tuple[BigqueryColumn, ...]] = None
type: str = TimePartitioningType.DAY
expiration_ms: Optional[int] = None
require_partition_filter: bool = False

# TimePartitioning field doesn't provide data_type so we have to add it afterwards
def __post_init__(self) -> None:
if not self.fields:
raise ValueError("PartitionInfo must have at least one field")
if self.columns is not None and len(self.fields) != len(self.columns):
raise ValueError(
f"fields/columns length mismatch: {len(self.fields)} fields vs {len(self.columns)} columns"
)

@classmethod
def from_time_partitioning(
cls, time_partitioning: TimePartitioning
) -> "PartitionInfo":
"""Convert BigQuery time partitioning to PartitionInfo."""
return cls(
field=time_partitioning.field or "_PARTITIONTIME",
fields=(time_partitioning.field or "_PARTITIONTIME",),
type=time_partitioning.type_,
expiration_ms=time_partitioning.expiration_ms,
require_partition_filter=time_partitioning.require_partition_filter,
Expand All @@ -100,7 +108,7 @@ def from_range_partitioning(
return None

return cls(
field=field,
fields=(field,),
type=RANGE_PARTITION_NAME,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
from base64 import b32decode
from collections import defaultdict
from dataclasses import replace
from typing import Dict, Iterable, List, Optional, Set, Tuple, Type, Union, cast

from google.cloud.bigquery.table import TableListItem
Expand Down Expand Up @@ -55,7 +56,7 @@
BigQueryFilter,
BigQueryIdentifierBuilder,
)
from datahub.ingestion.source.bigquery_v2.profiler import BigqueryProfiler
from datahub.ingestion.source.bigquery_v2.profiling.profiler import BigqueryProfiler
from datahub.ingestion.source.common.subtypes import (
DatasetContainerSubTypes,
DatasetSubTypes,
Expand Down Expand Up @@ -765,15 +766,16 @@ def _process_table(
)

# If table has time partitioning, set the data type of the partitioning field
if table.partition_info:
table.partition_info.column = next(
(
column
for column in columns
if column.name == table.partition_info.field
),
None,
if table.partition_info and table.partition_info.fields:
matching_columns = tuple(
column
for column in columns
if column.name in table.partition_info.fields
)
if matching_columns:
table.partition_info = replace(
table.partition_info, columns=matching_columns
)
yield from self.gen_table_dataset_workunits(
table, columns, project_id, dataset_name
)
Expand Down
Loading
Loading