
Commit 798ce3d

feat(classification): configurable sample size (#8096)
Co-authored-by: david-leifker <[email protected]>
1 parent 8357fc8 commit 798ce3d

File tree

4 files changed (+15, -6 lines)


metadata-ingestion/docs/dev_guides/classification.md

Lines changed: 3 additions & 2 deletions

@@ -9,6 +9,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
 | Field | Required | Type | Description | Default |
 | --- | --- | --- | --- | --- |
 | enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False |
+| sample_size | | int | Number of sample values used for classification. | 100 |
 | info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. |
 | classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier is configured, infotype predictions from the classifier defined later in the sequence take precedence. | [{'type': 'datahub', 'config': None}] |
 | table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in the parent config. Specify a regex to match the entire table name in `database.schema.table` format, e.g. to match all tables starting with customer in the Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
@@ -28,8 +29,8 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d

 | Field | Required | Type | Description | Default |
 | --- | --- | --- | --- | --- |
-| confidence_level_threshold | | number | | 0.6 |
-| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified, this should be a subset of ['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age', 'IBAN', 'US_Social_Security_Number', 'Vehicle_Identification_Number', 'IP_Address_v4', 'IP_Address_v6', 'US_Driving_License_Number', 'Swift_Code'] | None |
+| confidence_level_threshold | | number | | 0.68 |
+| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified, this should be a subset of `['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age', 'IBAN', 'US_Social_Security_Number', 'Vehicle_Identification_Number', 'IP_Address_v4', 'IP_Address_v6', 'US_Driving_License_Number', 'Swift_Code']` | None |
 | info_types_config | | Dict[str, InfoTypeConfig] | Configuration details for infotypes | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
 | info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | |
 | info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | |
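The `info_types` field above must be a subset of the supported infotypes. A minimal sketch of that subset rule (`validate_info_types` is a hypothetical helper for illustration, not the library's actual validation code; the supported list is copied from the table above):

```python
# Hypothetical helper illustrating the subset rule for `info_types`.
SUPPORTED_INFOTYPES = {
    "Email_Address", "Gender", "Credit_Debit_Card_Number", "Phone_Number",
    "Street_Address", "Full_Name", "Age", "IBAN", "US_Social_Security_Number",
    "Vehicle_Identification_Number", "IP_Address_v4", "IP_Address_v6",
    "US_Driving_License_Number", "Swift_Code",
}

def validate_info_types(requested):
    """Reject any requested infotype that is not in the supported set."""
    unknown = set(requested) - SUPPORTED_INFOTYPES
    if unknown:
        raise ValueError(f"Unsupported infotypes: {sorted(unknown)}")
    return list(requested)
```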

metadata-ingestion/src/datahub/ingestion/glossary/classifier.py

Lines changed: 5 additions & 0 deletions

@@ -31,6 +31,11 @@ class ClassificationConfig(ConfigModel):
         default=False,
         description="Whether classification should be used to auto-detect glossary terms",
     )
+
+    sample_size: int = Field(
+        default=100, description="Number of sample values used for classification."
+    )
+
     table_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
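The change above amounts to one more field on `ClassificationConfig`. A minimal stand-in sketch of how the defaults behave (a plain dataclass trimmed to the relevant fields; the real class is a pydantic-based `ConfigModel`):

```python
from dataclasses import dataclass

# Stand-in for the pydantic ConfigModel, trimmed to the fields
# touched by this commit.
@dataclass
class ClassificationConfig:
    enabled: bool = False   # whether to auto-detect glossary terms
    sample_size: int = 100  # new in this commit; sample values per column

# Omitting sample_size keeps the default of 100; a recipe can override it.
cfg = ClassificationConfig(sample_size=250)
```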

metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ class Config:
 # TODO: Generate Classification doc (classification.md) from python source.
 class DataHubClassifierConfig(ConfigModel):
     confidence_level_threshold: float = Field(
-        default=0.6,
+        default=0.68,
         init=False,
         description="The confidence threshold above which the prediction is considered as a proposal",
     )
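The effect of raising the default threshold from 0.6 to 0.68 can be shown with a toy filtering step (hypothetical prediction scores, not the classifier's actual API; the "at or above passes" comparison is an illustrative choice):

```python
# Toy illustration: predictions meeting the threshold become proposals.
predictions = {"Email_Address": 0.91, "Full_Name": 0.65, "Age": 0.70}

old_threshold, new_threshold = 0.6, 0.68
old_proposals = {t for t, c in predictions.items() if c >= old_threshold}
new_proposals = {t for t, c in predictions.items() if c >= new_threshold}
# Full_Name (0.65) cleared the old 0.6 default but not the new 0.68 one,
# so the raised default trims lower-confidence proposals.
```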

metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py

Lines changed: 6 additions & 3 deletions

@@ -1436,17 +1436,20 @@ def inspect_session_metadata(self) -> None:

     # Ideally we do not want null values in sample data for a column.
     # However that would require separate query per column and
-    # that would be expensive, hence not done.
+    # that would be expensive, hence not done. To compensate for the
+    # possibility of some null values in the collected sample, we fetch
+    # extra (20% more) rows than the configured sample_size.
     def get_sample_values_for_table(self, table_name, schema_name, db_name):
         # Create a cursor object.
         logger.debug(
             f"Collecting sample values for table {db_name}.{schema_name}.{table_name}"
         )
+
+        actual_sample_size = self.config.classification.sample_size * 1.2
         with PerfTimer() as timer:
             cur = self.get_connection().cursor()
-            NUM_SAMPLED_ROWS = 1000
             # Execute a statement that will generate a result set.
-            sql = f'select * from "{db_name}"."{schema_name}"."{table_name}" sample ({NUM_SAMPLED_ROWS} rows);'
+            sql = f'select * from "{db_name}"."{schema_name}"."{table_name}" sample ({actual_sample_size} rows);'

             cur.execute(sql)
             # Fetch the result set from the cursor and deliver it as the Pandas DataFrame.
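The 20% oversampling above is simple arithmetic; a small standalone sketch of what gets interpolated into the query (note that `sample_size * 1.2` is a float, and the diff interpolates it into the SQL as-is):

```python
# Sketch of the oversampling: fetch 20% extra rows so that null values
# in the sample still leave roughly sample_size usable values per column.
sample_size = 100                        # ClassificationConfig default
actual_sample_size = sample_size * 1.2   # a float, exactly as in the diff
sql = f'select * from "DB"."SCH"."TBL" sample ({actual_sample_size} rows);'
# If the SAMPLE (<n> ROWS) clause requires an integer count, a caller
# would want to round first, e.g. int(actual_sample_size).
```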

0 commit comments
