Kinto · sambhav · Feb 15, 2026 · Feb 15, 2026 · Feb 15, 2026 · Feb 15, 2026
diff --git a/kinto/core/storage/postgresql/__init__.py b/kinto/core/storage/postgresql/__init__.py
@@ -75,6 +75,86 @@ class Storage(StorageBase, MigratorMixin):
         recommended to allow load balancing, replication or limit the number
         of connections used in a multi-process deployment.
 
+    **Optional: GIN index for JSONB queries**
+
+    For large collections (100K+ records), equality filters (``?field=value``)
+    and array containment filters (``?contains_field=value``) can be
+    accelerated by adding a GIN index on the JSONB ``data`` column.
+
+    This index is **not created automatically** because it can take significant
+    time on large tables. Create it manually using ``CONCURRENTLY`` to avoid
+    blocking reads and writes.
+
+    **Recommended index** (scoped to records only)::
+
+        CREATE INDEX CONCURRENTLY idx_objects_data_gin
+            ON objects USING gin (data jsonb_path_ops)
+            WHERE resource_name = 'record' AND NOT deleted;
+
+    This is the smallest and most efficient option. The
+    ``resource_name = 'record'`` partial condition excludes bucket, collection,
+    and group metadata objects (which are never filtered by JSONB fields),
+    keeping the index focused on actual record data. This works because
+    psycopg2 performs client-side parameter interpolation, so PostgreSQL's
+    planner sees ``resource_name = 'record'`` as a literal and can match it
+    against the partial index condition.
+
+    .. note::
+
+        If you switch to a driver that uses server-side parameter binding
+        (e.g. psycopg3 defaults), the planner may not be able to prove the
+        partial condition is satisfied. In that case, fall back to the
+        basic index below.
+
+    **Basic index** (driver-independent)::
+
+        CREATE INDEX CONCURRENTLY idx_objects_data_gin
+            ON objects USING gin (data jsonb_path_ops)
+            WHERE NOT deleted;
+
+    Slightly larger (includes non-record objects) but works regardless of
+    driver parameter binding behavior. The ``WHERE NOT deleted`` condition
+    matches the literal ``AND NOT deleted`` clause present in ``list_all``,
+    ``count_all``, and ``delete_all`` queries.
+
+    **Composite index** (single index scan, no BitmapAnd)::
+
+        CREATE EXTENSION IF NOT EXISTS btree_gin;
+
+        CREATE INDEX CONCURRENTLY idx_objects_data_gin
+            ON objects USING gin (parent_id, resource_name, data jsonb_path_ops)
+            WHERE NOT deleted;
+
+    The ``btree_gin`` extension (ships with PostgreSQL, just needs
+    ``CREATE EXTENSION``) allows including B-tree-compatible columns in a GIN
+    index. This lets PostgreSQL satisfy ``parent_id``, ``resource_name``, and
+    ``data @>`` conditions in a single index scan, instead of combining
+    separate GIN and B-tree scans via BitmapAnd. Trade-off: the index is
+    larger (includes ``parent_id`` and ``resource_name`` values) and has
+    higher write overhead.
+
+    All three options use the ``jsonb_path_ops`` operator class, which is ~60%
+    smaller than the default GIN class and supports the ``@>`` containment
+    operator used by Kinto's filter queries.
+
+    **What this index accelerates:**
+
+    - Equality filters: ``?status=active`` → ``data @> '{"status": "active"}'``
+    - Nested field equality: ``?person.name=Alice`` → ``data @> '{"person": {"name": "Alice"}}'``
+    - Array contains: ``?contains_colors=red`` → ``data @> '{"colors": ["red"]}'``
+
+    **What this index does NOT accelerate:**
+
+    - Range filters (``min_``, ``max_``, ``gt_``, ``lt_``)
+    - LIKE/text search filters
+    - ``contains_any_`` filters (uses ``&&`` array overlap operator)
+    - Sorting on JSONB fields (requires B-tree expression indexes)
+
+    **Approximate index sizes** (for the recommended index):
+
+    - 1M records, 200B avg JSON: ~300-500 MB
+    - 1M records, 1KB avg JSON: ~800 MB - 1.5 GB
+
     """  # NOQA
 
     # MigratorMixin attributes.
@@ -926,11 +1006,25 @@ def _format_conditions(self, filters, id_field, modified_field, prefix="filters"
 
             elif filtr.operator == COMPARISON.CONTAINS:
                 value_holder = f"{prefix}_value_{i}"
-                holders[value_holder] = value
-                # In case the field is not a sequence, we ignore the object.
-                is_json_sequence = f"jsonb_typeof({sql_field}) = 'array'"
-                sql_operator = operators[filtr.operator]
-                cond = f"{is_json_sequence} AND {sql_field} {sql_operator} :{value_holder}"
+                # Use top-level containment (data @> '{"field": [values]}')
+                # instead of sub-expression containment (data->'field' @> '[values]').
+                # This allows a GIN index on data to accelerate the query.
+                # Top-level containment is semantically equivalent and already
+                # returns false when the field is not an array, so no
+                # jsonb_typeof guard is needed.
+                is_data_field = filtr.field not in (id_field, modified_field)
+                if is_data_field:
+                    subfields = filtr.field.split(".")
+                    containment_obj = filtr.value
+                    for subfield in reversed(subfields):
+                        containment_obj = {subfield: containment_obj}
+                    holders[value_holder] = json.dumps(containment_obj)
+                    cond = f"data @> :{value_holder}"
+                else:
+                    holders[value_holder] = value
+                    is_json_sequence = f"jsonb_typeof({sql_field}) = 'array'"
+                    sql_operator = operators[filtr.operator]
+                    cond = f"{is_json_sequence} AND {sql_field} {sql_operator} :{value_holder}"
 
             elif filtr.operator == COMPARISON.CONTAINS_ANY:
                 value_holder = f"{prefix}_value_{i}"
@@ -949,10 +1043,27 @@ def _format_conditions(self, filters, id_field, modified_field, prefix="filters"
             elif value != MISSING:
                 # Safely escape value. MISSINGs get handled below.
                 value_holder = f"{prefix}_value_{i}"
-                holders[value_holder] = value
 
-                sql_operator = operators.setdefault(filtr.operator, filtr.operator.value)
-                cond = f"{sql_field} {sql_operator} :{value_holder}"
+                # Use JSONB containment (@>) for EQ on data fields with scalar
+                # values. This is semantically equivalent to the arrow extraction
+                # form (data->'field' = 'value'::jsonb) for scalars, but can be
+                # accelerated by a GIN index on the data column.
+                # We restrict this to scalars because @> uses superset semantics
+                # for arrays/objects (e.g. [1,2,3] @> [1] is true), which differs
+                # from the exact equality that EQ should provide.
+                is_data_field = filtr.field not in (id_field, modified_field)
+                is_scalar_value = isinstance(filtr.value, (str, int, float, bool, type(None)))
+                if is_data_field and filtr.operator == COMPARISON.EQ and is_scalar_value:
+                    subfields = filtr.field.split(".")
+                    containment_obj = filtr.value
+                    for subfield in reversed(subfields):
+                        containment_obj = {subfield: containment_obj}
+                    holders[value_holder] = json.dumps(containment_obj)
+                    cond = f"data @> :{value_holder}"
+                else:
+                    holders[value_holder] = value
+                    sql_operator = operators.setdefault(filtr.operator, filtr.operator.value)
+                    cond = f"{sql_field} {sql_operator} :{value_holder}"
 
             # If the field is missing, column_name will produce
             # NULL. NULL has strange properties with comparisons
@@ -1083,13 +1194,16 @@ def _format_sorting(self, sorting, id_field, modified_field):
                 sql_field = "objects.last_modified"
             else:
                 # Subfields: ``person.name`` becomes ``data->person->name``
+                # Use the same format as _format_conditions (without
+                # parentheses around placeholders) so that the expression
+                # text matches any expression indexes that may exist.
                 subfields = sort.field.split(".")
                 sql_field = "data"
                 for j, subfield in enumerate(subfields):
                     # Safely escape field name
                     field_holder = f"sort_field_{i}_{j}"
                     holders[field_holder] = subfield
-                    sql_field += f"->(:{field_holder})"
+                    sql_field += f"->:{field_holder}"
 
             sql_direction = "ASC" if sort.direction > 0 else "DESC"
             sql_sort = f"{sql_field} {sql_direction}"

diff --git a/kinto/core/storage/testing.py b/kinto/core/storage/testing.py
@@ -393,6 +393,29 @@ def test_list_all_can_filter_with_list_of_values(self):
         objects = self.storage.list_all(filters=filters, **self.storage_kw)
         self.assertEqual(len(objects), 2)
 
+    def test_list_all_can_filter_with_exact_array_equality(self):
+        """EQ on an array field must match exactly, not as a superset."""
+        self.create_object({"colors": ["red", "green", "blue"]})
+        self.create_object({"colors": ["red", "green"]})
+        self.create_object({"colors": ["red"]})
+
+        # Exact match: only the object with exactly ["red", "green"] should match
+        filters = [Filter("colors", ["red", "green"], utils.COMPARISON.EQ)]
+        objects = self.storage.list_all(filters=filters, **self.storage_kw)
+        self.assertEqual(len(objects), 1)
+        self.assertEqual(objects[0]["colors"], ["red", "green"])
+
+    def test_list_all_can_filter_with_exact_object_equality(self):
+        """EQ on an object field must match exactly, not as a superset."""
+        self.create_object({"meta": {"a": 1, "b": 2}})
+        self.create_object({"meta": {"a": 1}})
+        self.create_object({"meta": {"a": 1, "b": 2, "c": 3}})
+
+        filters = [Filter("meta", {"a": 1, "b": 2}, utils.COMPARISON.EQ)]
+        objects = self.storage.list_all(filters=filters, **self.storage_kw)
+        self.assertEqual(len(objects), 1)
+        self.assertEqual(objects[0]["meta"], {"a": 1, "b": 2})
+
     def test_list_all_can_filter_on_array_that_contains_values(self):
         self.create_object({"colors": ["red", "green", "blue"]})
         self.create_object({"colors": ["gray", "blue"]})