
Commit a84f41c

try out ty for type checking (#176)
* build: add ty to dev deps for type checking
* types: fix some typing issues in lib code
* tests: fix type issue
* fix: typing issues and minor bugs in extractors
* build: configure ty tool
* ci: run ty in checks
* fix: call ty correctly in ci
* build: exclude nbs from ty check
* build: remove mypy from dev deps and ci
1 parent e856aa7 commit a84f41c

9 files changed (+72 lines, -70 lines)

.github/workflows/checks.yml

Lines changed: 2 additions & 2 deletions

@@ -103,6 +103,6 @@ jobs:
         uses: ./.github/actions/setup-python-env
         with:
           python-version: "3.11"
-      - name: Check types with mypy
+      - name: Check types with ty
         run: |
-          uv run python -m mypy --install-types --non-interactive colandr
+          uv run python -m ty check

colandr/lib/extractors/locations.py

Lines changed: 5 additions & 2 deletions

@@ -2,6 +2,7 @@

 import collections
 import logging
+import typing as t

 from spacy.tokens import Span

@@ -50,7 +51,9 @@ def extract_locations(self, record_id: int, text: str) -> list[Metadata]:
             return []

         processed_docs_iter = process_texts_into_docs([text], max_len=None)
-        doc = next(processed_docs_iter, None)
+        doc = next(iter(processed_docs_iter), None)
+        if doc is None:
+            return []

         # Get all sentences
         sentences = list(doc.sents)

@@ -90,7 +93,7 @@ def extract_locations(self, record_id: int, text: str) -> list[Metadata]:
         return self._group_locations(record_id, locations)

     def _group_locations(
-        self, record_id: int, locations: list[Metadata]
+        self, record_id: int, locations: list[dict[str, t.Any]]
     ) -> list[Metadata]:
         """
         Group locations by name and sort by frequency.
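
The switch from next(processed_docs_iter, None) to next(iter(processed_docs_iter), None), plus the explicit None guard, makes the "first doc or nothing" lookup safe whatever kind of iterable process_texts_into_docs returns. A minimal sketch of the pattern in isolation (the helper name is hypothetical, not part of the codebase):

    import typing as t

    def first_or_none(items: t.Iterable[t.Any]) -> t.Any | None:
        # iter() is a no-op on iterators and wraps lists/tuples/generators,
        # so next() with a default never raises, whatever the caller passes in
        return next(iter(items), None)

    print(first_or_none([]))         # None
    print(first_or_none(["doc_1"]))  # doc_1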

colandr/lib/extractors/review_model.py

Lines changed: 2 additions & 3 deletions

@@ -80,7 +80,7 @@ def transform(self, x: pd.Series) -> np.ndarray:
         Returns:
             A 2D NumPy array of shape (n_samples, n_features).
         """
-        return np.vstack(x)
+        return np.vstack(x.tolist())


 class ReviewModel:

@@ -447,8 +447,7 @@ def _process_text(self, text_content: str) -> tuple[pd.DataFrame, list[dict]]:
         processed_docs_iter = process_texts_into_docs(
             [main_content], max_len=None, exclude=("ner",)
         )
-        doc = next(processed_docs_iter, None)
-
+        doc = next(iter(processed_docs_iter), None)
         return self._extract_features_from_doc(doc)

     def _is_valid_sentence(self, sent: Optional[Span]) -> bool:
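
The change from np.vstack(x) to np.vstack(x.tolist()) is presumably about typing rather than behavior: stacking a pandas Series of per-sample vectors generally works at runtime either way, but converting to a plain list first matches what the numpy stubs expect. A small sketch under that assumption (toy data, not the real feature vectors):

    import numpy as np
    import pandas as pd

    # a Series whose elements are 1-D feature vectors, as in transform()
    s = pd.Series([np.array([1.0, 2.0]), np.array([3.0, 4.0])])

    # .tolist() hands np.vstack a plain sequence of array-likes,
    # which type checkers accept without complaint
    stacked = np.vstack(s.tolist())
    print(stacked.shape)  # (2, 2)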

colandr/lib/fileio/studies/base.py

Lines changed: 2 additions & 2 deletions

@@ -112,7 +112,7 @@ def _from_stream(self, stream: t.IO[bytes], encoding: str) -> str:
         # ).read()
         return data

-    def _standardize_field_names(self, record: dict[str, object]) -> dict[str, object]:
+    def _standardize_field_names(self, record: dict[str, t.Any]) -> dict[str, t.Any]:
         record = {key.lower().replace(" ", "_"): value for key, value in record.items()}
         if self.field_alt_names:
             # only one alt name per field? take this faster path

@@ -127,7 +127,7 @@ def _standardize_field_names(self, record: dict[str, object]) -> dict[str, object]:
                 break
         return record

-    def _sanitize_field_values(self, record: dict[str, object]) -> dict[str, object]:
+    def _sanitize_field_values(self, record: dict[str, t.Any]) -> dict[str, t.Any]:
         if self.field_sanitizers:
             for field, sanitizers in self.field_sanitizers.items():
                 if field in record:
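
Loosening dict[str, object] to dict[str, t.Any] matters to a strict checker: a value typed as object supports almost no attribute access or indexing, while Any defers those checks to runtime. A tiny illustration (hypothetical helper, not from the codebase):

    import typing as t

    def first_author(record: dict[str, t.Any]) -> str:
        # with dict[str, object], the [0] subscript and .strip() would both be
        # flagged, since object supports neither; Any defers the checks to runtime
        return record["authors"][0].strip()

    print(first_author({"authors": ["  Doe, J.  ", "Roe, R."]}))  # Doe, J.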

colandr/lib/nlp/representations.py

Lines changed: 10 additions & 6 deletions

@@ -148,8 +148,8 @@ def __init__(
         self._avg_doc_length = None

     def _validate_vocabulary(
-        self, vocabulary: dict[str, int] | Iterable[str]
-    ) -> tuple[dict[str, int], bool]:
+        self, vocabulary: t.Optional[dict[str, int] | Iterable[str]]
+    ) -> tuple[dict[str, int] | None, bool]:
         """
         Validate an input vocabulary. If it's a mapping, ensure that term ids
         are unique and compact (i.e. without any gaps between 0 and the number

@@ -191,7 +191,7 @@ def _validate_vocabulary(
             is_fixed = True
         else:
             is_fixed = False
-        return (vocabulary, is_fixed)
+        return (vocabulary, is_fixed)  # ty: ignore[invalid-return-type]

     def _check_vocabulary(self):
         """

@@ -211,9 +211,11 @@ def id_to_term(self) -> dict[int, str]:
         generated if needed, and it is automatically kept in sync with the
         corresponding vocabulary.
         """
+        self._check_vocabulary()
         if len(self.id_to_term_) != self.vocabulary_terms:
             self.id_to_term_ = {
-                term_id: term_str for term_str, term_id in self.vocabulary_terms.items()
+                term_id: term_str
+                for term_str, term_id in self.vocabulary_terms.items()  # ty: ignore[possibly-missing-attribute]
             }
         return self.id_to_term_

@@ -235,7 +237,8 @@ def terms_list(self) -> list[str]:
         return [
             term_str
             for term_str, _ in sorted(
-                self.vocabulary_terms.items(), key=operator.itemgetter(1)
+                self.vocabulary_terms.items(),  # ty: ignore[possibly-missing-attribute]
+                key=operator.itemgetter(1),
             )
         ]

@@ -389,6 +392,7 @@ def _count_terms(
             vocabulary.default_factory = vocabulary.__len__
         else:
             vocabulary = self.vocabulary_terms
+        assert vocabulary is not None

         indices = array(str("i"))
         indptr = array(str("i"), [0])

@@ -421,7 +425,7 @@
         # pretty sure this is a good thing to do... o_O
         doc_term_matrix.sort_indices()

-        return doc_term_matrix, vocabulary
+        return (doc_term_matrix, vocabulary)

     def _filter_terms(
         self, doc_term_matrix: sp.csr_matrix, vocabulary: dict[str, int]
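
Two recurring moves in this file: an assert x is not None statement narrows an Optional value for the checker before it is used, and an inline "# ty: ignore[rule-name]" comment silences one named rule on a single line. A minimal sketch of the narrowing idiom (names are illustrative, not the real attributes):

    import typing as t

    def build_index(
        fixed_vocab: t.Optional[dict[str, int]],
        learned_vocab: t.Optional[dict[str, int]],
    ) -> int:
        vocabulary = fixed_vocab if fixed_vocab is not None else learned_vocab
        # both inputs are Optional, so the checker still sees dict[str, int] | None;
        # the assert narrows it to dict[str, int] (and fails fast if both were None)
        assert vocabulary is not None
        return len(vocabulary)

    print(build_index(None, {"term": 0}))  # 1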

colandr/tasks.py

Lines changed: 5 additions & 2 deletions

@@ -12,6 +12,7 @@
 from celery.utils.log import get_task_logger
 from flask import current_app
 from flask_mail import Message
+from sqlalchemy.dialects import postgresql as pg

 from . import models
 from .api.v1 import schemas

@@ -158,7 +159,9 @@ def deduplicate_citations(review_id: int):
         sa.select(models.Study.id)
         .where(models.Study.review_id == review_id)
         # .where(models.Study.citation_status.in_(["included", "excluded"]))
-        .where(models.Study.citation_status == sa.any_(["included", "excluded"]))
+        .where(
+            models.Study.citation_status == sa.any_(pg.array(["included", "excluded"]))
+        )
     )
     incl_excl_sids = set(db.session.execute(stmt).scalars().all())

@@ -197,7 +200,7 @@ def deduplicate_citations(review_id: int):
         )
         .where(models.Study.review_id == review_id)
         # .where(models.Study.id.in_(int_sids))
-        .where(models.Study.id == sa.any_(int_sids))
+        .where(models.Study.id == sa.any_(pg.array(int_sids)))
         .order_by(sa.text("n_null_cols ASC"))
         .limit(1)
     )
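
Wrapping the plain Python list in pg.array(...) gives sa.any_() an ARRAY-typed SQL expression to compare against, i.e. the PostgreSQL col = ANY (ARRAY[...]) idiom used in place of the commented-out .in_() filters. A small standalone sketch (the bare column is illustrative; the real queries filter models.Study):

    import sqlalchemy as sa
    from sqlalchemy.dialects import postgresql as pg

    citation_status = sa.column("citation_status", sa.String)

    # membership test written the Postgres way: col = ANY (ARRAY[...])
    expr = citation_status == sa.any_(pg.array(["included", "excluded"]))
    print(expr.compile(dialect=pg.dialect()))
    # roughly: citation_status = ANY (ARRAY[%(param_1)s, %(param_2)s])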

pyproject.toml

Lines changed: 15 additions & 10 deletions

@@ -62,10 +62,11 @@ Repository = "https://github.com/datakind/permanent-colandr-back"
 [dependency-groups]
 dev = [
     "httpx~=0.28.0",
-    "mypy~=1.0",
     "pytest~=9.0",
     "pytest-postgresql~=7.0",
     "SQLAlchemy-Utils~=0.42.0",
+    # TODO: update ty once officially out of beta
+    "ty~=0.0.7",
     "ruff~=0.14.0",
 ]

@@ -80,15 +81,6 @@ required-version = ">=0.8.0,<0.10.0"
 module-name = "colandr"
 module-root = ""

-[tool.mypy]
-files = ["colandr/**/*.py"]
-python_version = "3.12"
-pretty = true
-ignore_errors = true
-allow_redefinition = true
-ignore_missing_imports = true
-follow_imports = "silent"
-
 [tool.pytest]
 minversion = "9.0"
 addopts = ["--verbose"]

@@ -124,3 +116,16 @@ ignore = ["E501", "E711", "F401", "PLW2901"]
 lines-after-imports = 2
 known-first-party = ["colandr"]
 known-third-party = ["alembic"]
+
+[tool.ty.environment]
+root = ["./colandr"]
+
+[tool.ty.rules]
+# ty appears to be struggling with relative imports :shrug:
+unresolved-import = "ignore"
+
+[tool.ty.src]
+exclude = ["migrations", "notebooks", "tests"]
+
+[tool.ty.terminal]
+output-format = "full"

tests/lib/nlp/test_utils.py

Lines changed: 4 additions & 1 deletion

@@ -1,4 +1,5 @@
 import pytest
+from spacy.language import Language
 from spacy.tokens import Doc

 from colandr.lib.nlp import utils

@@ -125,4 +126,6 @@ def test_process_texts_into_docs(texts, max_len, fallback_lang, app):
     spacy_lang = utils.load_spacy_lang(
         utils.get_lang_to_models()["en"][0], exclude=("parser", "ner")
     )
-    assert spacy_lang(texts[0]).to_bytes() == docs[0].to_bytes()
+    doc = docs[0]
+    assert isinstance(spacy_lang, Language) and isinstance(doc, Doc)  # type guards
+    assert spacy_lang(texts[0]).to_bytes() == doc.to_bytes()
