Make type checking stricter (#634)

cthoyt · web-flow · commit 589da9ab932e · 2025-10-28T18:40:18.000+02:00
diff --git a/src/sssom/cli.py b/src/sssom/cli.py
@@ -533,8 +533,8 @@ def correlations(input: str, output: TextIO, transpose: bool, fields: Tuple[str,
         for j, v in row.items():
             logging.info(f"{i} x {j} = {v}")
             rows.append((v, i, j))
-    for row in sorted(rows, key=itemgetter(0)):
-        print(*row, sep="\t")
+    for rrow in sorted(rows, key=itemgetter(0)):
+        print(*rrow, sep="\t")
 
 
 @main.command()
diff --git a/src/sssom/cliques.py b/src/sssom/cliques.py
@@ -30,12 +30,12 @@
     import networkx
 
 
-def to_digraph(msdf: MappingSetDataFrame) -> "networkx.DiGraph":
+def to_digraph(msdf: MappingSetDataFrame) -> "networkx.DiGraph[str]":
     """Convert to a graph where the nodes are entities' CURIEs and edges are their mappings."""
     import networkx as nx
 
     doc = to_mapping_set_document(msdf)
-    g = nx.DiGraph()
+    g: "networkx.DiGraph[str]" = nx.DiGraph()
     if doc.mapping_set.mappings is not None:
         for mapping in doc.mapping_set.mappings:
             if not isinstance(mapping, Mapping):
diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
@@ -538,9 +538,9 @@ def _get_mapping_dict(
     # only if the value exists, is not NaN, and the key is in the schema's mapping slots.
     # The value could be a string or a list and is handled accordingly via _address_multivalued_slot().
 
-    mdict = {
+    mdict: dict[str, str | list[str]] = {
         k: _address_multivalued_slot(k, v)
-        for k, v in row.items()
+        for k, v in row.to_dict().items()
         if v and pd.notna(v) and k in mapping_slots
     }
 
@@ -1002,7 +1002,7 @@ def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = No
 
     mapping_slots = set(_get_sssom_schema_object().mapping_slots)
 
-    df.apply(
+    df.apply(  # type:ignore
         lambda row: _add_valid_mapping_to_list(
             _get_mapping_dict(row, bad_attrs, mapping_slots), mapping_set.mappings
         ),
diff --git a/src/sssom/util.py b/src/sssom/util.py
@@ -426,7 +426,7 @@ def infer_cardinality(self, scope: Optional[List[str]] = None) -> None:
         # Helper function to transform a row into a string that represents
         # a subject (or object) in a given scope; `side` is either `subject`
         # or `object`.
-        def _to_string(row: dict[str, Any], side: str) -> str:
+        def _to_string(row: pd.Series, side: str) -> str:
             # We prepend a one-letter code (`L` or `E`) to the actual subject
             # or object so that literal and non-literal mapping records are
             # always distinguishable and can be counted separately.
@@ -718,7 +718,6 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate: bool = False) -> p
         key = [SUBJECT_ID, OBJECT_ID]
     else:
         key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID]
-    dfmax: pd.DataFrame
     if not df.empty:
         dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
         max_conf: Dict[Tuple[str, ...], float] = {}
@@ -1197,7 +1196,9 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
 
     # GroupBy and SELECT ONLY maximum confidence
     max_confidence_df: pd.DataFrame
-    max_confidence_df = combined_normalized_subset.groupby(TRIPLES_IDS, as_index=False)[
+    max_confidence_df = combined_normalized_subset.groupby(
+        TRIPLES_IDS, as_index=False
+    )[  # type:ignore
         CONFIDENCE
     ].max()
 
@@ -1267,14 +1268,14 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
     # This needs to happen because the columns in df
     # not in reconciled_df_subset will be NaN otherwise
     # which is incorrect.
-    reconciled_df = df.merge(
+    reconciled_df: pd.DataFrame = df.merge(
         reconciled_df_subset, how="right", on=list(reconciled_df_subset.columns)
     ).fillna(df)
 
     if nan_df.empty:
         return_df = reconciled_df
     else:
-        return_df = reconciled_df.append(nan_df).drop_duplicates()
+        return_df = reconciled_df.append(nan_df).drop_duplicates()  # type:ignore
 
     if not confidence_in_original:
         return_df = return_df.drop(columns=[CONFIDENCE], axis=1)
diff --git a/src/sssom/validators.py b/src/sssom/validators.py
@@ -208,7 +208,7 @@ def check_strict_curie_format(
 
     for column in entity_reference_slots:
         if column in msdf.df.columns:
-            for index, value in msdf.df[column].items():
+            for index, value in msdf.df[column].to_dict().items():
                 if pd.notna(value) and "|" in str(value):
                     message = f"{value} contains a pipe ('|') character (row {index + 1}, column '{column}')."
                     validation_results.append(
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
@@ -326,19 +326,20 @@ def test_read_sssom_table(self) -> None:
             "mapping_justification",
         ]
         for idx, row in msdf.df.iterrows():
-            for k, v in row.items():
+            for k, v in row.to_dict().items():
+                xxx = imported_df.iloc[idx][k]  # type:ignore
                 if v == np.nan:
-                    self.assertTrue(math.isnan(imported_df.iloc[idx][k]))
+                    self.assertTrue(math.isnan(xxx))
                 else:
                     if k not in list_cols:
                         if v is np.nan:
-                            self.assertTrue(imported_df.iloc[idx][k] is v)
+                            self.assertTrue(xxx is v)
                         else:
-                            self.assertEqual(imported_df.iloc[idx][k], v)
+                            self.assertEqual(xxx, v)
                     elif k == "mapping_justification":
-                        self.assertEqual(imported_df.iloc[idx][k], v)
+                        self.assertEqual(xxx, v)
                     else:
-                        self.assertEqual(imported_df.iloc[idx][k], v)
+                        self.assertEqual(xxx, v)
 
     def test_parse_obographs_merged(self) -> None:
         """Test parsing OBO Graph JSON using custom prefix_map."""
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -208,9 +208,9 @@ def test_invert_asymmetric_nodes(self) -> None:
         msdf = parse_sssom_table(f"{data_dir}/asymmetric.tsv")
         inverted_df = invert_mappings(msdf.df, merge_inverted=False)
         self.assertEqual(len(inverted_df), len(msdf.df))
-        original_subject_labels = msdf.df["subject_label"].values
-        inverted_object_labels = inverted_df["object_label"].values
-        self.assertNotIn(False, original_subject_labels == inverted_object_labels)
+        original_subject_labels = msdf.df["subject_label"]
+        inverted_object_labels = inverted_df["object_label"]
+        self.assertTrue((original_subject_labels == inverted_object_labels).all())
 
 
 class TestUtils(unittest.TestCase):
diff --git a/tox.ini b/tox.ini
@@ -104,9 +104,14 @@ deps =
     types-requests
     click-types
     linkml-runtime
+    pandas-stubs
+    scipy-stubs
+    types-jsonschema
+    types-networkx
 extras =
     rdflib-endpoint
-commands = mypy --install-types --non-interactive --ignore-missing-imports --strict src/sssom tests/
+commands =
+    mypy --ignore-missing-imports --strict src/ tests/
 description = Run the mypy tool to check static typing on the project.
 
 [testenv:docstr-coverage]