Merge pull request #767 from broadinstitute/dm/sklearn_warning

mike-w-wilson · web-flow · commit 897d750d8399 · 2025-09-17T11:34:22.000-04:00
Note difference in rounding behavior between Sklearn and Onnx models
diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py
@@ -170,6 +170,10 @@ def apply_sklearn_classification_model(
     except TypeError:
         raise TypeError("The supplied model is not an sklearn model!")
 
+    logger.warning(
+        "The use of .onnx files and apply_onnx_classification_model is recommended."
+    )
+
     classification = fit.predict(data_pd)
     probs = fit.predict_proba(data_pd)
     probs = pd.DataFrame(probs, columns=[f"prob_{p}" for p in fit.classes_])
@@ -194,6 +198,13 @@ def convert_sklearn_rf_to_onnx(
     except TypeError:
         raise TypeError("The supplied model is not an sklearn model!")
 
+    logger.warning(
+        "sklearn models have different rounding behavior than ONNX models. Use of sklearn "
+        "rf models rounds probabilities to two decimal places when used in assign_genetic_ancestry_pcs(), "
+        "while use of onnx rf models does not. This may lead to subtly different assignment results "
+        "for samples around probability cutoffs."
+    )
+
     initial_type = [("float_input", FloatTensorType([None, fit.n_features_in_]))]
     onx = convert_sklearn(fit, initial_types=initial_type, target_opset=target_opset)