Commit f30d7f7

ARROW-189 Update to match PyMongo configuration settings (#177)
* ARROW-189 Update to match PyMongo configuration settings
* fixups
* fix config
* try again
* try again
* try again
1 parent ac2c5e7 commit f30d7f7

35 files changed: +355 −214 lines

.pre-commit-config.yaml

Lines changed: 55 additions & 29 deletions
```diff
@@ -1,12 +1,13 @@
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.1.0
+  rev: v4.5.0
   hooks:
     - id: check-added-large-files
     - id: check-case-conflict
     - id: check-toml
     - id: check-yaml
+      exclude: template.yaml
     - id: debug-statements
     - id: end-of-file-fixer
       exclude: WHEEL
@@ -16,55 +17,80 @@ repos:
       exclude: .patch
       exclude_types: [json]
 
-- repo: https://github.com/psf/black
-  rev: 22.3.0
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  # Ruff version.
+  rev: v0.1.3
   hooks:
-    - id: black
-      files: \.py$
-      args: [--line-length=100]
+    - id: ruff
+      args: ["--fix", "--show-fixes"]
+    - id: ruff-format
 
-- repo: https://github.com/PyCQA/isort
-  rev: 5.12.0
+- repo: https://github.com/adamchainz/blacken-docs
+  rev: "1.16.0"
   hooks:
-    - id: isort
-      files: \.py$
-      args: [--profile=black]
+    - id: blacken-docs
+      additional_dependencies:
+        - black==22.3.0
 
-- repo: https://github.com/PyCQA/flake8
-  rev: 3.9.2
+- repo: https://github.com/pre-commit/pygrep-hooks
+  rev: "v1.10.0"
   hooks:
-    - id: flake8
-      args: [--config=bindings/python/.flake8]
-      types: [file]
-      files: \.py$
-      additional_dependencies: [
-        'flake8-bugbear==20.1.4',
-        'flake8-logging-format==0.6.0',
-        'flake8-implicit-str-concat==0.2.0',
-      ]
+    - id: rst-backticks
+    - id: rst-directive-colons
+    - id: rst-inline-touching-normal
 
+- repo: https://github.com/rstcheck/rstcheck
+  rev: v6.2.0
+  hooks:
+    - id: rstcheck
+      additional_dependencies: [sphinx]
+      args: ["--ignore-directives=doctest,testsetup,todo,automodule","--ignore-substitutions=release", "--report-level=error"]
 
 # We use the Python version instead of the original version which seems to require Docker
 # https://github.com/koalaman/shellcheck-precommit
 - repo: https://github.com/shellcheck-py/shellcheck-py
-  rev: v0.8.0.4
+  rev: v0.9.0.6
   hooks:
     - id: shellcheck
       name: shellcheck
       args: ["--severity=warning"]
+      stages: [manual]
+
+- repo: https://github.com/PyCQA/doc8
+  rev: v1.1.1
+  hooks:
+    - id: doc8
+      args: ["--ignore=D001"]  # ignore line length
+      stages: [manual]
 
 - repo: https://github.com/sirosen/check-jsonschema
-  rev: 0.14.1
+  rev: 0.27.0
   hooks:
     - id: check-jsonschema
       name: "Check GitHub Workflows"
       files: ^\.github/workflows/
       types: [yaml]
       args: ["--schemafile", "https://json.schemastore.org/github-workflow"]
+      stages: [manual]
 
-- repo: https://github.com/adamchainz/blacken-docs
-  rev: "1.13.0"
+- repo: https://github.com/ariebovenberg/slotscheck
+  rev: v0.17.0
   hooks:
-    - id: blacken-docs
-      additional_dependencies:
-        - black==22.3.0
+    - id: slotscheck
+      files: \.py$
+      exclude: "^(bindings/python/test|bindings/python)/"
+      stages: [manual]
+      args: ["--no-strict-imports"]
+
+- repo: https://github.com/codespell-project/codespell
+  rev: "v2.2.6"
+  hooks:
+    - id: codespell
+      # Examples of errors or updates to justify the exceptions:
+      # - test/test_on_demand_csfle.py:44: FLE ==> FILE
+      # - test/test_bson.py:1043: fo ==> of, for, to, do, go
+      # - test/bson_corpus/decimal128-4.json:98: Infinit ==> Infinite
+      # - test/test_bson.py:267: isnt ==> isn't
+      # - test/versioned-api/crud-api-version-1-strict.json:514: nin ==> inn, min, bin, nine
+      # - test/test_client.py:188: te ==> the, be, we, to
+      args: ["-L", "fle,fo,infinit,isnt,nin,te"]
```
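The net effect of this stanza is replacing black, isort, and flake8 with ruff. One visible consequence in this commit's Python files is modernizing printf-style `%` formatting to f-strings, in line with ruff's pyupgrade-derived (`UP`) rules. A minimal sketch of the equivalence; the names and values here are illustrative, not taken from the commit:

```python
# Hypothetical values for illustration only.
name = "pymongoarrow"
version = "1.0.0"

# printf-style formatting, as the pre-ruff code read:
old_style = "package %s, version %s" % (name, version)

# the f-string form the updated code uses:
new_style = f"package {name}, version {version}"

print(old_style == new_style)  # -> True
```

Both forms render identical strings; the f-string is simply the more idiomatic modern spelling.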

README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -31,7 +31,7 @@ df = production.invoices.find_pandas_all({'amount': {'$gt': 100.00}}, schema=inv
 ```
 
 Since PyMongoArrow can automatically infer the schema from the first batch of data, this can be
-further simplifed to:
+further simplified to:
 
 ```
 df = production.invoices.find_pandas_all({'amount': {'$gt': 100.00}})
````

bindings/python/.flake8

Lines changed: 0 additions & 15 deletions
This file was deleted.

bindings/python/RELEASE.rst

Lines changed: 4 additions & 4 deletions
```diff
@@ -25,13 +25,13 @@ Release Process
 
 #. Check JIRA to ensure all the tickets in this version have been completed.
 
-#. Add release notes to `doc/source/changelog.rst`. Generally just summarize/clarify
+#. Add release notes to ``doc/source/changelog.rst``. Generally just summarize/clarify
    the git log, but you might add some more long form notes for big changes.
 
-#. Replace the `devN` version number w/ the new version number (see
+#. Replace the ``devN`` version number w/ the new version number (see
    note above in `Versioning`_). Make sure version number is updated in
-   `pymongoarrow/version.py`. Commit the change and tag the release.
-   Immediately bump the version number to `dev0` in a new commit::
+   ``pymongoarrow/version.py``. Commit the change and tag the release.
+   Immediately bump the version number to ``dev0`` in a new commit::
 
       $ # Bump to release version number
      $ git commit -a -m "BUMP <release version number>"
```
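The backtick changes above are not cosmetic. Unlike Markdown, reStructuredText treats a single-backtick span as "interpreted text" (subject to the default role, and not rendered as code); an inline code literal requires double backticks, which is what the `rst-backticks` hook added in this commit enforces:

```rst
`devN`   -- interpreted text, not rendered as code
``devN`` -- inline literal, rendered as code
```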

bindings/python/addtags.py

Lines changed: 10 additions & 6 deletions
```diff
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Dependencies:
 # - auditwheel>=5,<6
 # Requires AUDITWHEEL_PLAT to be set (e.g. manylinux2014_x86_64)
@@ -24,7 +23,8 @@ def repair_wheel(wheel_path, abi, wheel_dir):
 
 def main(wheel_path, abi, wheel_dir):
     if not isfile(wheel_path):
-        raise FileNotFoundError("cannot access wheel file %s" % (wheel_path,))
+        msg = f"cannot access wheel file {wheel_path}"
+        raise FileNotFoundError(msg)
 
     if not exists(wheel_dir):
         os.makedirs(wheel_dir)
@@ -37,12 +37,12 @@ def main(wheel_path, abi, wheel_dir):
     if reqd_tag < get_priority_by_name(analyzed_tag):
         print(
             "Wheel is eligible for a higher priority tag. "
-            "You requested %s but I have found this wheel is "
-            "eligible for %s." % (abi, analyzed_tag)
+            f"You requested {abi} but I have found this wheel is "
+            f"eligible for {analyzed_tag}."
         )
         out_wheel = repair_wheel(wheel_path, analyzed_tag, wheel_dir)
 
-    print("Fixed-up wheel written to %s" % (out_wheel,))
+    print(f"Fixed-up wheel written to {out_wheel}")
 
 
 if __name__ == "__main__":
@@ -51,4 +51,8 @@ def main(wheel_path, abi, wheel_dir):
     print(f"wheel path: {WHEEL_PATH}")
     print(f"target platform: {TARGET_PLATFORM}")
     print(f"wheel dir: {WHEEL_DIR}")
-    main(wheel_path=abspath(WHEEL_PATH), abi=TARGET_PLATFORM, wheel_dir=abspath(WHEEL_DIR))
+    main(
+        wheel_path=abspath(WHEEL_PATH),
+        abi=TARGET_PLATFORM,
+        wheel_dir=abspath(WHEEL_DIR),
+    )
```
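The `msg = ...` / `raise FileNotFoundError(msg)` rewrite follows the pattern suggested by ruff's flake8-errmsg (`EM`) rules: bind the message to a variable before raising rather than constructing it inline, so tracebacks don't repeat the f-string expression. The exception text is unchanged; a minimal sketch using a hypothetical path:

```python
wheel_path = "/tmp/example.whl"  # hypothetical path for illustration

# Bind the message first (the EM-rule-friendly form), then raise.
msg = f"cannot access wheel file {wheel_path}"
try:
    raise FileNotFoundError(msg)
except FileNotFoundError as exc:
    captured = str(exc)

print(captured)  # -> cannot access wheel file /tmp/example.whl
```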

bindings/python/benchmarks/benchmarks.py

Lines changed: 37 additions & 21 deletions
```diff
@@ -18,9 +18,10 @@
 
 import numpy as np
 import pandas as pd
-import pyarrow
+import pyarrow as pa
 import pymongo
 from bson import BSON, Binary, Decimal128
+
 from pymongoarrow.api import (
     Schema,
     find_arrow_all,
@@ -31,7 +32,7 @@
 from pymongoarrow.types import BinaryType, Decimal128Type
 
 N_DOCS = int(os.environ.get("N_DOCS"))
-assert pymongo.has_c()
+assert pymongo.has_c()  # noqa: S101
 db = pymongo.MongoClient().pymongoarrow_test
 
 LARGE_DOC_SIZE = 20
@@ -49,7 +50,11 @@ class Insert(ABC):
 
     timeout = 100000  # The setup sometimes times out.
     number = 1
-    repeat = (1, 10, 30.0)  # Min repeat, max repeat, time limit (will stop sampling after this)
+    repeat = (
+        1,
+        10,
+        30.0,
+    )  # Min repeat, max repeat, time limit (will stop sampling after this)
     rounds = 1
 
     @abc.abstractmethod
@@ -90,15 +95,19 @@ class Read(ABC):
 
     timeout = 100000  # The setup sometimes times out.
     number = 3
-    repeat = (1, 10, 30.0)  # Min repeat, max repeat, time limit (will stop sampling after this)
+    repeat = (
+        1,
+        10,
+        30.0,
+    )  # Min repeat, max repeat, time limit (will stop sampling after this)
     rounds = 1
 
     @abc.abstractmethod
     def setup(self):
         raise NotImplementedError
 
     # We need this because the naive methods don't always convert nested objects.
-    @staticmethod
+    @staticmethod  # noqa: B027
     def exercise_table(table):
         pass
 
@@ -107,7 +116,10 @@ def time_conventional_ndarray(self):
         cursor = collection.find(projection={"_id": 0})
         dtype = self.dtypes
         if "Large" in type(self).__name__:
-            np.array([tuple(doc[k] for k in self.large_doc_keys) for doc in cursor], dtype=dtype)
+            np.array(
+                [tuple(doc[k] for k in self.large_doc_keys) for doc in cursor],
+                dtype=dtype,
+            )
         else:
             np.array([(doc["x"], doc["y"]) for doc in cursor], dtype=dtype)
 
@@ -132,7 +144,7 @@ def time_to_arrow(self):
     def time_conventional_arrow(self):
         c = db.benchmark
         f = list(c.find({}, projection={"_id": 0}))
-        table = pyarrow.Table.from_pylist(f)
+        table = pa.Table.from_pylist(f)
         self.exercise_table(table)
 
     def peakmem_to_numpy(self):
@@ -154,17 +166,21 @@ def peakmem_conventional_arrow(self):
 class ProfileReadArray(Read):
     schema = Schema(
         {
-            "x": pyarrow.int64(),
-            "y": pyarrow.float64(),
-            "emb": pyarrow.list_(pyarrow.float64()),
+            "x": pa.int64(),
+            "y": pa.float64(),
+            "emb": pa.list_(pa.float64()),
         }
     )
 
     def setup(self):
         coll = db.benchmark
         coll.drop()
         base_dict = dict(
-            [("x", 1), ("y", math.pi), ("emb", [math.pi for _ in range(EMBEDDED_OBJECT_SIZE)])]
+            [
+                ("x", 1),
+                ("y", math.pi),
+                ("emb", [math.pi for _ in range(EMBEDDED_OBJECT_SIZE)]),
+            ]
         )
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
@@ -176,7 +192,7 @@ def setup(self):
     @staticmethod
     def exercise_table(table):
         [
-            [[n for n in i.values] if isinstance(i, pyarrow.ListScalar) else i for i in column]
+            [[n for n in i.values] if isinstance(i, pa.ListScalar) else i for i in column]
             for column in table.columns
         ]
 
@@ -197,10 +213,10 @@ def time_conventional_pandas(self):
 class ProfileReadDocument(Read):
     schema = Schema(
         {
-            "x": pyarrow.int64(),
-            "y": pyarrow.float64(),
-            "emb": pyarrow.struct(
-                [pyarrow.field(f"a{i}", pyarrow.float64()) for i in range(EMBEDDED_OBJECT_SIZE)]
+            "x": pa.int64(),
+            "y": pa.float64(),
+            "emb": pa.struct(
+                [pa.field(f"a{i}", pa.float64()) for i in range(EMBEDDED_OBJECT_SIZE)]
             ),
         }
     )
@@ -225,7 +241,7 @@ def setup(self):
     @staticmethod
     def exercise_table(table):
         [
-            [[n for n in i.values()] if isinstance(i, pyarrow.StructScalar) else i for i in column]
+            [[n for n in i.values()] if isinstance(i, pa.StructScalar) else i for i in column]
             for column in table.columns
         ]
 
@@ -244,7 +260,7 @@ def time_conventional_pandas(self):
 
 
 class ProfileReadSmall(Read):
-    schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
+    schema = Schema({"x": pa.int64(), "y": pa.float64()})
     dtypes = np.dtype(np.dtype([("x", np.int64), ("y", np.float64)]))
 
     def setup(self):
@@ -265,7 +281,7 @@ def setup(self):
 
 class ProfileReadLarge(Read):
     large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
-    schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
+    schema = Schema({k: pa.float64() for k in large_doc_keys})
     dtypes = np.dtype([(k, np.float64) for k in large_doc_keys])
 
     def setup(self):
@@ -333,7 +349,7 @@ def time_insert_conventional(self):
 
 class ProfileInsertSmall(Insert):
     large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
-    schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
+    schema = Schema({"x": pa.int64(), "y": pa.float64()})
     dtypes = np.dtype([("x", np.int64), ("y", np.float64)])
 
     def setup(self):
@@ -352,7 +368,7 @@ def setup(self):
 
 class ProfileInsertLarge(Insert):
     large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
-    schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
+    schema = Schema({k: pa.float64() for k in large_doc_keys})
     dtypes = np.dtype([(k, np.float64) for k in large_doc_keys])
 
     def setup(self):
```
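The `# noqa: S101` added above silences ruff's S101 rule (from flake8-bandit), which flags `assert` statements. The rule exists because `assert` is stripped entirely when Python runs with `-O`, so it must never guard required behavior; the benchmark keeps the assert deliberately and opts out. A small sketch demonstrating the underlying behavior:

```python
import subprocess
import sys

# A failing assert normally exits non-zero (uncaught AssertionError)...
normal = subprocess.run([sys.executable, "-c", "assert False"], capture_output=True)

# ...but under -O, Python removes assert statements entirely, so the
# same program exits cleanly.
optimized = subprocess.run([sys.executable, "-O", "-c", "assert False"], capture_output=True)

print(normal.returncode, optimized.returncode)  # -> 1 0
```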

bindings/python/docs/source/changelog.rst

Lines changed: 2 additions & 2 deletions
```diff
@@ -67,13 +67,13 @@ Changes in Version 0.4.0
 
 Changes in Version 0.3.0
 ------------------------
-- Support for `PyArrow` 7.0.
+- Support for ``PyArrow`` 7.0.
 - Support for :class:`~bson.objectid.ObjectId` type.
 - Improve error message when schema contains an unsupported type.
 - Add support for BSON string type.
 - Add support for BSON boolean type.
 - Upgraded to bundle `libbson <http://mongoc.org/libbson/current/index.html>`_ 1.21.1. If installing from source, the minimum supported ``libbson`` version is now 1.21.0.
-- Dropped Python 3.6 support (it was dropped in `PyArrow` 7.0).
+- Dropped Python 3.6 support (it was dropped in ``PyArrow`` 7.0).
 
 Changes in Version 0.2.0
 ------------------------
```
