@@ -36,6 +36,7 @@
     AuditEvent,
     AuditLogEntry,
     BigQueryAuditMetadata,
+    BigqueryTableIdentifier,
     BigQueryTableRef,
     QueryEvent,
     ReadEvent,
@@ -53,6 +54,7 @@
     make_usage_workunit,
 )
 from datahub.metadata.schema_classes import OperationClass, OperationTypeClass
+from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser
 from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -142,7 +144,7 @@ def bigquery_audit_metadata_query_template(
     AND
     (
         (
-            JSON_EXTRACT_SCALAR(protopayload_auditlog.methodName) IN
+            protopayload_auditlog.methodName IN
             (
                 "google.cloud.bigquery.v2.JobService.Query",
                 "google.cloud.bigquery.v2.JobService.InsertJob"
@@ -184,6 +186,7 @@ def __init__(self, config: BigQueryV2Config):
                     e.timestamp, config.bucket_duration
                 ),
                 "user": lambda e: e.actor_email,
189+ "from_query" : lambda e : int (e .from_query ),
187190 },
188191 cache_max_size = config .file_backed_cache_size ,
189192 # Evict entire cache to reduce db calls.
@@ -198,6 +201,7 @@ def __init__(self, config: BigQueryV2Config):
             extra_columns={
                 "query": lambda e: e.query,
                 "is_read": lambda e: int(e.statementType in READ_STATEMENT_TYPES),
204+ "on_view" : lambda e : int (e .query_on_view ),
201205 },
202206 cache_max_size = config .file_backed_cache_size ,
203207 cache_eviction_batch_size = max (int (config .file_backed_cache_size * 0.9 ), 1 ),
@@ -328,6 +332,20 @@ def usage_statistics(self, top_n: int) -> Iterator[UsageStatistic]:
                 column_freq=json.loads(row["column_freq"] or "[]"),
             )
 
+    def delete_original_read_events_for_view_query_events(self) -> None:
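+        # For queries on views, the Read Events generated from the query text
+        # supersede the ones BigQuery logged against the views' base tables;
+        # delete the originals so usage is not double counted.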
+        self.read_events.sql_query(
+            """
+                DELETE FROM
+                    read_events
+                WHERE
+                    read_events.from_query = 0 AND
+                    read_events.name in (
+                        SELECT q.key FROM query_events q WHERE q.on_view = 1
+                    )
+            """,
+            refs=[self.query_events],
+        )
+
     def report_disk_usage(self, report: BigQueryV2Report) -> None:
         report.usage_state_size = str(
             {
@@ -342,7 +360,7 @@ def report_disk_usage(self, report: BigQueryV2Report) -> None:
 class BigQueryUsageExtractor:
     """
     This plugin extracts the following:
-    * Statistics on queries issued and tables and columns accessed (excludes views)
+    * Statistics on queries issued and tables and columns accessed
     * Aggregation of these statistics into buckets, by day or hour granularity
 
     :::note
@@ -389,6 +407,26 @@ def _run(
             logger.error("Error processing usage", exc_info=True)
             self.report.report_warning("usage-ingestion", str(e))
 
+    def generate_read_events_from_query(
+        self, query_event_on_view: QueryEvent
+    ) -> Iterable[AuditEvent]:
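+        # Parse the view query's SQL to find every table and view it reads,
+        # then emit a synthetic ReadEvent for each of them.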
+        try:
+            tables = self.get_tables_from_query(
+                query_event_on_view.project_id,
+                query_event_on_view.query,
+            )
+            assert tables is not None and len(tables) != 0
+            for table in tables:
+                yield AuditEvent.create(
+                    ReadEvent.from_query_event(table, query_event_on_view)
+                )
+        except Exception as ex:
+            logger.debug(
+                f"Failed to generate read events for this view query: {query_event_on_view.query}. "
+                f"Usage won't be added. The error was: {ex}."
+            )
+            self.report.num_view_query_events_failed_sql_parsing += 1
+
     def _ingest_events(
         self,
         events: Iterable[AuditEvent],
@@ -397,8 +435,33 @@ def _ingest_events(
     ) -> None:
         """Read log and store events in usage_state."""
         num_aggregated = 0
+        num_generated = 0
         for audit_event in events:
             try:
+                # Note on view usage:
+                # If a Query Event references a view, the BigQuery audit logs
+                # contain Read Events only for the view's base tables, not for
+                # the view itself. To extract usage for views, we parse the SQL
+                # query to find the tables and views it reads and generate Read
+                # Events for them ourselves (`from_query`=True). For such Query
+                # Events, we delete the original Read Events coming from the
+                # BigQuery audit logs and keep only the generated ones.
+
+                # Caveats of the SQL parsing approach used here:
+                # 1. If query parsing fails, usage for that query is not counted.
+                # 2. Due to limitations of query parsing, field-level usage is
+                #    not available.
+                # To limit the impact, query parsing is used only for queries that
+                # reference at least one view. For all other queries, field-level
+                # usage is available through the BigQuery audit logs.
+                if (
+                    audit_event.query_event
+                    and audit_event.query_event.query_on_view
+                    and not self.config.usage.apply_view_usage_to_tables
+                ):
+                    query_event = audit_event.query_event
+                    self.report.num_view_query_events += 1
+
+                    for new_event in self.generate_read_events_from_query(query_event):
+                        num_generated += self._store_usage_event(
+                            new_event, usage_state, table_refs
+                        )
                 num_aggregated += self._store_usage_event(
                     audit_event, usage_state, table_refs
                 )
@@ -409,6 +472,10 @@ def _ingest_events(
                 self._report_error("store-event", e)
         logger.info(f"Total number of events aggregated = {num_aggregated}.")
 
+        if self.report.num_view_query_events > 0:
+            logger.info(f"Total number of read events generated = {num_generated}.")
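+            # Remove the audit-log Read Events that the generated events replace,
+            # so base-table usage from view queries is not double counted.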
+            usage_state.delete_original_read_events_for_view_query_events()
+
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
@@ -903,6 +970,56 @@ def _get_parsed_bigquery_log_events(
                 f"log-parse-{project_id}", e, group="usage-log-parse"
             )
 
+    def get_tables_from_query(
+        self, default_project: str, query: str
+    ) -> Optional[List[BigQueryTableRef]]:
+        """
+        Attempts to parse the BigQuery objects read by the query.
+        """
+        if not query:
+            return None
+
+        parsed_tables = set()
+        try:
+            parser = BigQuerySQLParser(
+                query,
+                self.config.sql_parser_use_external_process,
+                use_raw_names=self.config.lineage_sql_parser_use_raw_names,
+            )
+            tables = parser.get_tables()
+        except Exception as ex:
+            logger.debug(
+                f"SQL parsing failed for this view query: {query}. "
+                f"Usage won't be added. The error was: {ex}."
+            )
+            return None
+
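+        # A parsed table name is either dataset-qualified (dataset.table) or
+        # fully qualified (project.dataset.table); two-part names fall back to
+        # the query's default project.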
+        for table in tables:
+            parts = table.split(".")
+            if len(parts) == 2:
+                parsed_tables.add(
+                    BigQueryTableRef(
+                        BigqueryTableIdentifier(
+                            project_id=default_project, dataset=parts[0], table=parts[1]
+                        )
+                    ).get_sanitized_table_ref()
+                )
+            elif len(parts) == 3:
+                parsed_tables.add(
+                    BigQueryTableRef(
+                        BigqueryTableIdentifier(
+                            project_id=parts[0], dataset=parts[1], table=parts[2]
+                        )
+                    ).get_sanitized_table_ref()
+                )
+            else:
+                logger.debug(
+                    f"Invalid table identifier {table} when parsing view query {query}"
+                )
+                self.report.num_view_query_events_failed_table_identification += 1
+
+        return list(parsed_tables)
+
     def _report_error(
         self, label: str, e: Exception, group: Optional[str] = None
     ) -> None: