Commit 9e59bf9

mayurinehate authored and yoonhyejin committed
feat(classification): allow parallelisation to reduce time (#8368)
1 parent bf5f6f5 commit 9e59bf9

File tree

9 files changed, +232 -57 lines changed

metadata-ingestion/docs/dev_guides/classification.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -10,6 +10,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
 | ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
 | enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False |
 | sample_size | | int | Number of sample values used for classification. | 100 |
+| max_workers | | int | Number of worker threads to use for classification. Set to 1 to disable. | Number of cpu cores or 4 |
 | info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. |
 | classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedence. | [{'type': 'datahub', 'config': None}] |
 | table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
```
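
For illustration, a minimal sketch of setting these options from Python rather than from a recipe; it assumes `ClassificationConfig` is importable from `datahub.ingestion.glossary.classifier` (the module changed below), and the values are examples only:

```python
from datahub.ingestion.glossary.classifier import ClassificationConfig

# Example values only. max_workers=1 disables parallelisation; the default
# is the machine's CPU core count, or 4 when it cannot be detected.
config = ClassificationConfig(
    enabled=True,
    sample_size=100,
    max_workers=4,
)
print(config.max_workers)
```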

metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py

Lines changed: 67 additions & 24 deletions
```diff
@@ -1,6 +1,8 @@
+import concurrent.futures
 import logging
 from dataclasses import dataclass, field
-from typing import Dict, List
+from math import ceil
+from typing import Dict, Iterable, List, Optional
 
 from datahub_classify.helper_classes import ColumnInfo, Metadata
 from pydantic import Field
@@ -108,15 +110,23 @@ def classify_schema_fields(
             return None
 
         logger.debug(f"Classifying Table {dataset_name}")
+
         self.report.num_tables_classification_attempted += 1
         field_terms: Dict[str, str] = {}
         with PerfTimer() as timer:
             try:
                 for classifier in self.classifiers:
-                    column_info_with_proposals = classifier.classify(column_infos)
-                    self.extract_field_wise_terms(
-                        field_terms, column_info_with_proposals
-                    )
+                    column_infos_with_proposals: Iterable[ColumnInfo]
+                    if self.config.classification.max_workers > 1:
+                        column_infos_with_proposals = self.async_classify(
+                            classifier, column_infos
+                        )
+                    else:
+                        column_infos_with_proposals = classifier.classify(column_infos)
+
+                    for column_info_proposal in column_infos_with_proposals:
+                        self.update_field_terms(field_terms, column_info_proposal)
+
             except Exception:
                 self.report.num_tables_classification_failed += 1
                 raise
@@ -130,6 +140,44 @@ def classify_schema_fields(
         self.report.num_tables_classified += 1
         self.populate_terms_in_schema_metadata(schema_metadata, field_terms)
 
+    def update_field_terms(
+        self, field_terms: Dict[str, str], col_info: ColumnInfo
+    ) -> None:
+        term = self.get_terms_for_column(col_info)
+        if term:
+            field_terms[col_info.metadata.name] = term
+
+    def async_classify(
+        self, classifier: Classifier, columns: List[ColumnInfo]
+    ) -> Iterable[ColumnInfo]:
+        num_columns = len(columns)
+        BATCH_SIZE = 5  # Number of columns passed to classify api at a time
+
+        logger.debug(
+            f"Will Classify {num_columns} column(s) with {self.config.classification.max_workers} worker(s) with batch size {BATCH_SIZE}."
+        )
+
+        with concurrent.futures.ProcessPoolExecutor(
+            max_workers=self.config.classification.max_workers,
+        ) as executor:
+            column_info_proposal_futures = [
+                executor.submit(
+                    classifier.classify,
+                    columns[
+                        (i * BATCH_SIZE) : min(i * BATCH_SIZE + BATCH_SIZE, num_columns)
+                    ],
+                )
+                for i in range(ceil(num_columns / BATCH_SIZE))
+            ]
+
+            return [
+                column_with_proposal
+                for proposal_future in concurrent.futures.as_completed(
+                    column_info_proposal_futures
+                )
+                for column_with_proposal in proposal_future.result()
+            ]
+
     def populate_terms_in_schema_metadata(
         self,
         schema_metadata: SchemaMetadata,
@@ -154,25 +202,20 @@ def populate_terms_in_schema_metadata(
             ),
         )
 
-    def extract_field_wise_terms(
-        self,
-        field_terms: Dict[str, str],
-        column_info_with_proposals: List[ColumnInfo],
-    ) -> None:
-        for col_info in column_info_with_proposals:
-            if not col_info.infotype_proposals:
-                continue
-            infotype_proposal = max(
-                col_info.infotype_proposals, key=lambda p: p.confidence_level
-            )
-            self.report.info_types_detected.setdefault(
-                infotype_proposal.infotype, LossyList()
-            ).append(f"{col_info.metadata.dataset_name}.{col_info.metadata.name}")
-            field_terms[
-                col_info.metadata.name
-            ] = self.config.classification.info_type_to_term.get(
-                infotype_proposal.infotype, infotype_proposal.infotype
-            )
+    def get_terms_for_column(self, col_info: ColumnInfo) -> Optional[str]:
+        if not col_info.infotype_proposals:
+            return None
+        infotype_proposal = max(
+            col_info.infotype_proposals, key=lambda p: p.confidence_level
+        )
+        self.report.info_types_detected.setdefault(
+            infotype_proposal.infotype, LossyList()
+        ).append(f"{col_info.metadata.dataset_name}.{col_info.metadata.name}")
+        term = self.config.classification.info_type_to_term.get(
+            infotype_proposal.infotype, infotype_proposal.infotype
+        )
+
+        return term
 
     def get_columns_to_classify(
         self,
```
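
For readers skimming the diff, the following is a minimal, self-contained sketch of the batched fan-out pattern that `async_classify` uses; `classify_batch` is a stand-in for `classifier.classify`, and all names here are illustrative rather than taken from the commit:

```python
import concurrent.futures
from math import ceil
from typing import List

BATCH_SIZE = 5  # mirrors the batch size hard-coded in async_classify

def classify_batch(columns: List[str]) -> List[str]:
    # Stand-in for classifier.classify; must be a top-level function so the
    # process pool can pickle it.
    return [f"{col}:classified" for col in columns]

def parallel_classify(columns: List[str], max_workers: int = 4) -> List[str]:
    num_batches = ceil(len(columns) / BATCH_SIZE)
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(classify_batch, columns[i * BATCH_SIZE : (i + 1) * BATCH_SIZE])
            for i in range(num_batches)
        ]
        # as_completed yields futures in finish order, so results are unordered;
        # that is acceptable here because each ColumnInfo carries its own name.
        return [
            result
            for future in concurrent.futures.as_completed(futures)
            for result in future.result()
        ]

if __name__ == "__main__":  # required for ProcessPoolExecutor on spawn platforms
    print(parallel_classify([f"col_{i}" for i in range(12)]))
```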

metadata-ingestion/src/datahub/ingestion/glossary/classifier.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -1,3 +1,4 @@
+import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
@@ -36,6 +37,11 @@ class ClassificationConfig(ConfigModel):
         default=100, description="Number of sample values used for classification."
     )
 
+    max_workers: int = Field(
+        default=(os.cpu_count() or 4),
+        description="Number of worker threads to use for classification. Set to 1 to disable.",
+    )
+
     table_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
```
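
The `os.cpu_count() or 4` default exists because `os.cpu_count()` is documented to return `None` when the core count cannot be determined; the fallback keeps the default a concrete `int`. (Note that although the description says worker threads, the executor in `classification_mixin.py` is a process pool.) A tiny sketch of the fallback:

```python
import os

# os.cpu_count() may return None on platforms where the core count is
# undeterminable, so `or 4` guarantees a concrete integer default.
default_workers = os.cpu_count() or 4
print(f"default max_workers: {default_workers}")
```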

metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -173,4 +173,5 @@ def classify(self, columns: List[ColumnInfo]) -> List[ColumnInfo]:
             infotypes=self.config.info_types,
             minimum_values_threshold=self.config.minimum_values_threshold,
         )
+
         return columns
```

metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py

Lines changed: 12 additions & 4 deletions
```diff
@@ -872,8 +872,8 @@ def _process_table(
         self.gen_schema_metadata(table, schema_name, db_name)
 
     def fetch_sample_data_for_classification(
-        self, table, schema_name, db_name, dataset_name
-    ):
+        self, table: SnowflakeTable, schema_name: str, db_name: str, dataset_name: str
+    ) -> None:
         if (
             table.columns
             and self.config.classification.enabled
@@ -1225,7 +1225,12 @@ def build_foreign_keys(self, table, dataset_urn, foreign_keys):
         )
         return foreign_keys
 
-    def classify_snowflake_table(self, table, dataset_name, schema_metadata):
+    def classify_snowflake_table(
+        self,
+        table: Union[SnowflakeTable, SnowflakeView],
+        dataset_name: str,
+        schema_metadata: SchemaMetadata,
+    ) -> None:
         if (
             isinstance(table, SnowflakeTable)
             and self.config.classification.enabled
@@ -1255,6 +1260,9 @@ def classify_snowflake_table(self, table, dataset_name, schema_metadata):
                 "Failed to classify table columns",
                 dataset_name,
             )
+        finally:
+            # Cleaning up sample_data fetched for classification
+            table.sample_data = None
 
     def get_report(self) -> SourceReport:
         return self.report
@@ -1470,7 +1478,7 @@ def get_sample_values_for_table(self, table_name, schema_name, db_name):
             df = pd.DataFrame(dat, columns=[col.name for col in cur.description])
         time_taken = timer.elapsed_seconds()
         logger.debug(
-            f"Finished collecting sample values for table {db_name}.{schema_name}.{table_name}; took {time_taken:.3f} seconds"
+            f"Finished collecting sample values for table {db_name}.{schema_name}.{table_name};{df.shape[0]} rows; took {time_taken:.3f} seconds"
         )
 
         return df
```
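
The new `finally` block releases the sampled rows whether or not classification succeeds, so large sample DataFrames are not kept alive for the rest of the ingestion run. A minimal sketch of the same pattern, using a hypothetical `Table` as a stand-in for `SnowflakeTable`:

```python
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class Table:  # hypothetical stand-in for SnowflakeTable
    name: str
    sample_data: Optional[Any] = None

def classify_table(table: Table) -> None:
    try:
        if table.sample_data is None:
            raise ValueError("no sample data fetched")
        print(f"classifying {table.name} from {len(table.sample_data)} sampled rows")
    finally:
        # Release the sample whether or not classification raised.
        table.sample_data = None

t = Table("TABLE_1", sample_data=[{"col_1": 1}])
classify_table(t)
assert t.sample_data is None
```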

metadata-ingestion/tests/integration/snowflake/common.py

Lines changed: 28 additions & 22 deletions
```diff
@@ -14,7 +14,13 @@
 FROZEN_TIME = "2022-06-07 17:00:00"
 
 
-def default_query_results(query):  # noqa: C901
+def default_query_results(  # noqa: C901
+    query,
+    num_tables=NUM_TABLES,
+    num_views=NUM_VIEWS,
+    num_cols=NUM_COLS,
+    num_ops=NUM_OPS,
+):
     if query == SnowflakeQuery.current_account():
         return [{"CURRENT_ACCOUNT()": "ABC12345"}]
     if query == SnowflakeQuery.current_region():
@@ -79,7 +85,7 @@ def default_query_results(query):  # noqa: C901
                 "COMMENT": "Comment for Table",
                 "CLUSTERING_KEY": None,
             }
-            for tbl_idx in range(1, NUM_TABLES + 1)
+            for tbl_idx in range(1, num_tables + 1)
         ]
     elif query == SnowflakeQuery.show_views_for_schema("TEST_SCHEMA", "TEST_DB"):
         return [
@@ -90,7 +96,7 @@ def default_query_results(query):  # noqa: C901
                 "comment": "Comment for View",
                 "text": None,
             }
-            for view_idx in range(1, NUM_VIEWS + 1)
+            for view_idx in range(1, num_views + 1)
         ]
     elif query == SnowflakeQuery.columns_for_schema("TEST_SCHEMA", "TEST_DB"):
         raise Exception("Information schema query returned too much data")
@@ -99,13 +105,13 @@ def default_query_results(query):  # noqa: C901
             SnowflakeQuery.columns_for_table(
                 "TABLE_{}".format(tbl_idx), "TEST_SCHEMA", "TEST_DB"
             )
-            for tbl_idx in range(1, NUM_TABLES + 1)
+            for tbl_idx in range(1, num_tables + 1)
         ],
         *[
             SnowflakeQuery.columns_for_table(
                 "VIEW_{}".format(view_idx), "TEST_SCHEMA", "TEST_DB"
             )
-            for view_idx in range(1, NUM_VIEWS + 1)
+            for view_idx in range(1, num_views + 1)
         ],
     ]:
         return [
@@ -122,7 +128,7 @@ def default_query_results(query):  # noqa: C901
                 "NUMERIC_PRECISION": None if col_idx > 1 else 38,
                 "NUMERIC_SCALE": None if col_idx > 1 else 0,
             }
-            for col_idx in range(1, NUM_COLS + 1)
+            for col_idx in range(1, num_cols + 1)
         ]
     elif query in (
         SnowflakeQuery.use_database("TEST_DB"),
@@ -158,7 +164,7 @@ def default_query_results(query):  # noqa: C901
             {
                 "columns": [
                     {"columnId": 0, "columnName": "COL_{}".format(col_idx)}
-                    for col_idx in range(1, NUM_COLS + 1)
+                    for col_idx in range(1, num_cols + 1)
                 ],
                 "objectDomain": "Table",
                 "objectId": 0,
@@ -167,7 +173,7 @@ def default_query_results(query):  # noqa: C901
             {
                 "columns": [
                     {"columnId": 0, "columnName": "COL_{}".format(col_idx)}
-                    for col_idx in range(1, NUM_COLS + 1)
+                    for col_idx in range(1, num_cols + 1)
                 ],
                 "objectDomain": "Table",
                 "objectId": 0,
@@ -176,7 +182,7 @@ def default_query_results(query):  # noqa: C901
             {
                 "columns": [
                     {"columnId": 0, "columnName": "COL_{}".format(col_idx)}
-                    for col_idx in range(1, NUM_COLS + 1)
+                    for col_idx in range(1, num_cols + 1)
                 ],
                 "objectDomain": "Table",
                 "objectId": 0,
@@ -189,7 +195,7 @@ def default_query_results(query):  # noqa: C901
             {
                 "columns": [
                     {"columnId": 0, "columnName": "COL_{}".format(col_idx)}
-                    for col_idx in range(1, NUM_COLS + 1)
+                    for col_idx in range(1, num_cols + 1)
                 ],
                 "objectDomain": "Table",
                 "objectId": 0,
@@ -198,7 +204,7 @@ def default_query_results(query):  # noqa: C901
             {
                 "columns": [
                     {"columnId": 0, "columnName": "COL_{}".format(col_idx)}
-                    for col_idx in range(1, NUM_COLS + 1)
+                    for col_idx in range(1, num_cols + 1)
                 ],
                 "objectDomain": "Table",
                 "objectId": 0,
@@ -207,7 +213,7 @@ def default_query_results(query):  # noqa: C901
             {
                 "columns": [
                     {"columnId": 0, "columnName": "COL_{}".format(col_idx)}
-                    for col_idx in range(1, NUM_COLS + 1)
+                    for col_idx in range(1, num_cols + 1)
                 ],
                 "objectDomain": "Table",
                 "objectId": 0,
@@ -231,7 +237,7 @@ def default_query_results(query):  # noqa: C901
                         }
                     ],
                 }
-                for col_idx in range(1, NUM_COLS + 1)
+                for col_idx in range(1, num_cols + 1)
             ],
             "objectDomain": "Table",
             "objectId": 0,
@@ -246,7 +252,7 @@ def default_query_results(query):  # noqa: C901
             "EMAIL": "[email protected]",
             "ROLE_NAME": "ACCOUNTADMIN",
         }
-        for op_idx in range(1, NUM_OPS + 1)
+        for op_idx in range(1, num_ops + 1)
     ]
     elif (
         query
@@ -276,7 +282,7 @@ def default_query_results(query):  # noqa: C901
                 "UPSTREAM_TABLE_COLUMNS": json.dumps(
                     [
                         {"columnId": 0, "columnName": "COL_{}".format(col_idx)}
-                        for col_idx in range(1, NUM_COLS + 1)
+                        for col_idx in range(1, num_cols + 1)
                     ]
                 ),
                 "DOWNSTREAM_TABLE_COLUMNS": json.dumps(
@@ -293,11 +299,11 @@ def default_query_results(query):  # noqa: C901
                             }
                         ],
                     }
-                    for col_idx in range(1, NUM_COLS + 1)
+                    for col_idx in range(1, num_cols + 1)
                 ]
             ),
         }
-        for op_idx in range(1, NUM_OPS + 1)
+        for op_idx in range(1, num_ops + 1)
     ] + [
         {
             "DOWNSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_1",
@@ -371,7 +377,7 @@ def default_query_results(query):  # noqa: C901
                 ]
             ],
         }
-        for col_idx in range(1, NUM_COLS + 1)
+        for col_idx in range(1, num_cols + 1)
     ]
     + (  # This additional upstream is only for TABLE_1
         [
@@ -393,7 +399,7 @@ def default_query_results(query):  # noqa: C901
                 )
             ),
         }
-        for op_idx in range(1, NUM_OPS + 1)
+        for op_idx in range(1, num_ops + 1)
     ]
     elif query in (
         snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2(
@@ -426,7 +432,7 @@ def default_query_results(query):  # noqa: C901
                 )
             ),
         }
-        for op_idx in range(1, NUM_OPS + 1)
+        for op_idx in range(1, num_ops + 1)
     ]
    elif query == snowflake_query.SnowflakeQuery.external_table_lineage_history(
        1654499820000,
@@ -479,7 +485,7 @@ def default_query_results(query):  # noqa: C901
             "VIEW_COLUMNS": json.dumps(
                 [
                     {"columnId": 0, "columnName": "COL_{}".format(col_idx)}
-                    for col_idx in range(1, NUM_COLS + 1)
+                    for col_idx in range(1, num_cols + 1)
                 ]
             ),
             "DOWNSTREAM_TABLE_DOMAIN": "TABLE",
@@ -497,7 +503,7 @@ def default_query_results(query):  # noqa: C901
                         }
                     ],
                 }
-                for col_idx in range(1, NUM_COLS + 1)
+                for col_idx in range(1, num_cols + 1)
             ]
         ),
     }
```
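
A sketch of how a test might use the new parameters, assuming this module's `default_query_results` is importable as below and the Snowflake cursor is mocked the way these integration tests usually do; the binding is illustrative:

```python
from functools import partial
from unittest import mock

from tests.integration.snowflake.common import default_query_results  # assumed path

# Bind a larger column count; the mocked cursor then answers every query
# from the parametrised fixture.
query_results_wide = partial(default_query_results, num_cols=30)

sf_cursor = mock.MagicMock()
sf_cursor.execute.side_effect = query_results_wide
```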
