Support for Score Ranges with an Unspecified Classification

bencap · bencap · commit acf5a6a48567 · 2025-03-26T15:16:05.000-07:00
To support all pillar project data sets, it is necessary to support score ranges without an explicit
classification. This requires some changes to existing validation logic:
- The wild type score is no longer always required. If you provide a score range with a `normal` classification,
the wild type score is required and must fall within a `normal` range.
- If you do provide a wild type score, you must also provide at least one score range with a `normal` classification.
- Users may provide a new `Not Specified` classification, which comes free of normal and abnormal connotations.
- All other validation restrictions remain in place and also apply to the new classification.

As part of these changes, a new file `utils.py` has been added to the mavedb lib code. At present, this file contains only
one new function, which helps with string sanitization for score ranges, but it should also be the home for other shared
library utilities. At some point, we should make an effort to refactor existing shared utilities into it.
diff --git a/src/mavedb/lib/utils.py b/src/mavedb/lib/utils.py
@@ -0,0 +1,11 @@
+import re
+
+
+def sanitize_string(s: str):
+    """
+    Sanitize a string to a consistent format:
+    - Strip leading and trailing whitespace
+    - Convert to lowercase
+    - Replace internal whitespace with underscores
+    """
+    return re.sub(r"\s+", "_", s.strip().lower())
diff --git a/src/mavedb/lib/validation/constants/score_set.py b/src/mavedb/lib/validation/constants/score_set.py
@@ -1 +1 @@
-default_ranges = ["normal", "abnormal"]
+default_ranges = ["normal", "abnormal", "not_specified"]
diff --git a/src/mavedb/view_models/score_set.py b/src/mavedb/view_models/score_set.py
@@ -11,6 +11,7 @@
 from mavedb.lib.validation.constants.score_set import default_ranges
 from mavedb.lib.validation.exceptions import ValidationError
 from mavedb.lib.validation.utilities import inf_or_float, is_null
+from mavedb.lib.utils import sanitize_string
 from mavedb.models.enums.mapping_state import MappingState
 from mavedb.models.enums.processing_state import ProcessingState
 from mavedb.view_models import PublicationIdentifiersGetter, record_type_validator, set_record_type
@@ -64,7 +65,7 @@ class ScoreRange(BaseModel):
 
     @validator("classification")
     def range_classification_value_is_accepted(cls, field_value: str):
-        classification = field_value.strip().lower()
+        classification = sanitize_string(field_value)
         if classification not in default_ranges:
             raise ValidationError(
                 f"Unexpected classification value(s): {classification}. Permitted values: {default_ranges}"
@@ -89,7 +90,7 @@ def ranges_are_not_backwards(cls, field_value: tuple[Any]):
 
 
 class ScoreRanges(BaseModel):
-    wt_score: float
+    wt_score: Optional[float]
     ranges: list[ScoreRange]  # type: ignore
 
 
@@ -209,17 +210,16 @@ def score_range_labels_must_be_unique(cls, field_value: Optional[ScoreRanges]):
         return field_value
 
     @validator("score_ranges")
-    def ranges_contain_normal_and_abnormal(cls, field_value: Optional[ScoreRanges]):
+    def score_range_normal_classification_exists_if_wild_type_score_provided(cls, field_value: Optional[ScoreRanges]):
         if field_value is None:
             return None
 
-        ranges = set([range_model.classification for range_model in field_value.ranges])
-        if not set(default_ranges).issubset(ranges):
-            raise ValidationError(
-                "Both `normal` and `abnormal` ranges must be provided.",
-                # Raise this error inside the first classification provided by the model.
-                custom_loc=["body", "scoreRanges", "ranges", 0, "classification"],
-            )
+        if field_value.wt_score is not None:
+            if not any([range_model.classification == "normal" for range_model in field_value.ranges]):
+                raise ValidationError(
+                    "A wild type score has been provided, but no normal classification range exists.",
+                    custom_loc=["body", "scoreRanges", "wtScore"],
+                )
 
         return field_value
 
@@ -264,6 +264,16 @@ def wild_type_score_in_normal_range(cls, field_value: Optional[ScoreRanges]):
         normal_ranges = [
             range_model.range for range_model in field_value.ranges if range_model.classification == "normal"
         ]
+
+        if normal_ranges and field_value.wt_score is None:
+            raise ValidationError(
+                "A normal range has been provided, but no wild type score has been provided.",
+                custom_loc=["body", "scoreRanges", "wtScore"],
+            )
+
+        if field_value.wt_score is None:
+            return field_value
+
         for range in normal_ranges:
             if field_value.wt_score >= inf_or_float(range[0], lower=True) and field_value.wt_score < inf_or_float(
                 range[1], lower=False
diff --git a/tests/view_models/test_score_set.py b/tests/view_models/test_score_set.py
@@ -440,20 +440,35 @@ def test_cannot_create_score_set_with_wild_type_outside_normal_range():
     )
 
 
-@pytest.mark.parametrize("present_name", default_ranges)
-def test_cannot_create_score_set_without_default_range(present_name):
+def test_cannot_create_score_set_with_wild_type_score_and_no_normal_range():
+    wt_score = -0.5
     score_set_test = TEST_MINIMAL_SEQ_SCORESET.copy()
     score_set_test["score_ranges"] = {
-        "wt_score": -1.5,
+        "wt_score": wt_score,
         "ranges": [
-            {"label": "range_2", "classification": f"{present_name}", "range": (-3, -1)},
+            {"label": "range_1", "classification": "abnormal", "range": (-1, 0)},
+        ],
+    }
+
+    with pytest.raises(ValueError) as exc_info:
+        ScoreSetModify(**jsonable_encoder(score_set_test))
+
+    assert "A wild type score has been provided, but no normal classification range exists." in str(exc_info.value)
+
+
+def test_cannot_create_score_set_with_normal_range_and_no_wild_type_score():
+    score_set_test = TEST_MINIMAL_SEQ_SCORESET.copy()
+    score_set_test["score_ranges"] = {
+        "wt_score": None,
+        "ranges": [
+            {"label": "range_1", "classification": "normal", "range": (-1, 0)},
         ],
     }
 
     with pytest.raises(ValueError) as exc_info:
         ScoreSetModify(**jsonable_encoder(score_set_test))
 
-    assert "Both `normal` and `abnormal` ranges must be provided." in str(exc_info.value)
+    assert "A normal range has been provided, but no wild type score has been provided." in str(exc_info.value)
 
 
 def test_cannot_create_score_set_without_default_ranges():
@@ -468,4 +483,21 @@ def test_cannot_create_score_set_without_default_ranges():
     with pytest.raises(ValueError) as exc_info:
         ScoreSetModify(**jsonable_encoder(score_set_test))
 
-    assert "Unexpected classification value(s): other. Permitted values: ['normal', 'abnormal']" in str(exc_info.value)
+    assert (
+        "Unexpected classification value(s): other. Permitted values: ['normal', 'abnormal', 'not_specified']"
+        in str(exc_info.value)
+    )
+
+
+@pytest.mark.parametrize("classification", default_ranges)
+def test_can_create_score_set_with_any_range_classification(classification):
+    wt_score = -0.5 if classification == "normal" else None
+    score_set_test = TEST_MINIMAL_SEQ_SCORESET.copy()
+    score_set_test["score_ranges"] = {
+        "wt_score": wt_score,
+        "ranges": [
+            {"label": "range_1", "classification": classification, "range": (-1, 0)},
+        ],
+    }
+
+    ScoreSetModify(**jsonable_encoder(score_set_test))

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-default_ranges = ["normal", "abnormal"]`
	`1`	`+default_ranges = ["normal", "abnormal", "not_specified"]`