fix(identity-vault): pagination while scanning overrides in the API

bheesham · bheesham · commit 2ec9c8569aac · 2025-09-23T10:06:47.000-04:00
In `parallel_dynamo.py`, we were paginating over the results for our callers.
While this simplifies the callers' code, it also increases their run time,
which is an issue when paired with AWS API Gateway.

The fix here is to do what we say we're going to do: allow the users to
paginate if they wish. We provide a way for users to specify where their next
page should start, via `scan`'s `exclusive_start_key` argument. We hint as much
in the return, naming this key `nextPage`.

But, doing this requires fixing our segmentation logic! Each segment receives
its own `ExclusiveStartKey`, which needs to be used the next time we call the
scan operation for that segment.

Jira: IAM-1793
diff --git a/python-modules/cis_identity_vault/cis_identity_vault/models/user.py b/python-modules/cis_identity_vault/cis_identity_vault/models/user.py
@@ -317,15 +317,65 @@ def all(self):
             users.extend(response["Items"])
         return users
 
-    def _last_evaluated_to_friendly(self, last_evaluated_key):
-        if last_evaluated_key is None:
+    def _last_evaluated_to_friendly(self, last_evaluated_keys):
+        """
+        Received from Dynamo, and serialized into something our clients can
+        understand (or rather, use: this _should_ be an opaque token to
+        clients).
+
+        When we're paginating through Dynamo, each segment returns a
+        `LastEvaluatedKey`, which we need to specify as `ExclusiveStartKey` in
+        subsequent requests. These `ExclusiveStartKey` is segment-specific,
+        hence the care here to serialize these in the order returned.
+
+        * `None`, indicating that we've completely finished, there are no more
+          results any segment can return;
+        * `list[Optional[Any]]`, indicating that _some_ segments have results
+          left.
+
+        Our clients' pagination logic (at least as seen by various publishers),
+        will consider the query over once we return `None`.
+        """
+        if not last_evaluated_keys:
             return None
+        next_page = []
+        for last_evaluated_key in last_evaluated_keys:
+            # A signal that the segment is done scanning.
+            if last_evaluated_key is None:
+                id = ""
+            else:
+                id = last_evaluated_key["id"]["S"]
+            next_page.append(id)
+        # If there are at all any segments left with work, then continue.
+        if any(next_page):
+            return ",".join(next_page)
         else:
-            return last_evaluated_key["id"]["S"]
+            return None
 
     def _next_page_to_dynamodb(self, next_page):
-        if next_page is not None:
-            return {"id": {"S": next_page}}
+        """
+        Received from _clients_, and deserialized into something our parallel
+        Dynamo code understands.
+
+        A complication here is that we can't reuse `None`, since that would
+        cause a segment to start from the beginning. So, we use a sentinel
+        value of `"done"` to signal to the parallel Dynamo code that we should
+        skip this segment.
+
+        When Dynamo returns `None`, that means _all_ segments are done. If it
+        returns a `list[Optional[Any]]`, that means that we can still make
+        progress on some segments.
+        """
+        if not next_page:
+            return None
+        exclusive_start_keys = []
+        for last_evaluated_key in next_page.split(","):
+            if last_evaluated_key == "":
+                id = "done"
+            else:
+                id = {"id": {"S": last_evaluated_key}}
+            exclusive_start_keys.append(id)
+        return exclusive_start_keys
 
     def all_filtered(self, connection_method=None, active=None, next_page=None):
         """
@@ -352,7 +402,7 @@ def all_filtered(self, connection_method=None, active=None, next_page=None):
             filter_expression=filter_expression,
             expression_attr=expression_attr,
             projection_expression=projection_expression,
-            exclusive_start_key=next_page,
+            exclusive_start_keys=next_page,
         )
         return dict(users=response["users"], nextPage=self._last_evaluated_to_friendly(response.get("nextPage")))
 
diff --git a/python-modules/cis_identity_vault/cis_identity_vault/parallel_dynamo.py b/python-modules/cis_identity_vault/cis_identity_vault/parallel_dynamo.py
@@ -36,35 +36,55 @@ def get_segment(
 
     logger.debug("Running parallel scan with kwargs: {}".format(scan_kwargs))
     response = dynamodb_client.scan(**scan_kwargs)
-    users = response.get("Items", [])
+    # Return a dictionary of users, since sets can only contain hashable types
+    # (and lists and dicts are not).
+    users = {
+        user["id"]["S"]: user
+        for user in response.get("Items", [])
+    }
     last_evaluated_key = response.get("LastEvaluatedKey")
 
-    while last_evaluated_key is not None:
-        scan_kwargs["ExclusiveStartKey"] = last_evaluated_key
-        response = dynamodb_client.scan(**scan_kwargs)
-        users.extend(response.get("Items", []))
-        last_evaluated_key = response.get("LastEvaluatedKey")
-
-    logger.debug("Running thread_id: {}".format(thread_id))
+    logger.debug("Finished thread_id: {}, with nextPage: {}".format(thread_id, last_evaluated_key))
     return result_queue.put(dict(users=users, nextPage=last_evaluated_key, segment=thread_id))
 
 
 def scan(
-    dynamodb_client, table_name, filter_expression, expression_attr, projection_expression, exclusive_start_key=None
+    dynamodb_client, table_name, filter_expression, expression_attr, projection_expression, exclusive_start_keys=None
 ):
     logger.debug("Creating new threads and queue.")
     result_queue = queue.Queue()
 
-    # The worker pool size should be equal to the max_segments. Ideally we want one segment per worker.
-    pool_size = 128
-    max_segments = 128
+    # We use one worker per segment.
+    max_segments = 48
 
-    users = []
-    last_evaluated_key = None
+    users = dict()
+    last_evaluated_keys = [None] * max_segments
     threads = []
 
-    for thread_id in range(0, pool_size):
+    # If this is the first request, then we'll receive a None from our
+    # caller.
+    if exclusive_start_keys is None:
+        exclusive_start_keys = [None] * max_segments
+
+    # When we're continuing, we signal that a segment has no more work to
+    # complete if it's ESK is "done". If _all_ of the segments have that, then
+    # we're at the end of our result set.
+    elif all(map(lambda esk: esk == "done", exclusive_start_keys)):
+        return dict(users=[], nextPage=None)
+
+    for thread_id in range(0, max_segments):
         # What are we passing to each threaded function.
+        try:
+            exclusive_start_key = exclusive_start_keys[thread_id]
+        except IndexError:
+            logger.critical("Someone may be DOSing us or not doing pagination properly.")
+            raise
+
+        # If we explicitly read a "done", then this is a signal that the
+        # segment has no more records.
+        if exclusive_start_key == "done":
+            logger.debug(f"skipping thread {thread_id}")
+            continue
 
         thread_args = (
             result_queue,
@@ -102,13 +122,10 @@ def scan(
     while not result_queue.empty():
         logger.debug("Results queue is not empty.")
         result = result_queue.get()
-        users_additional = result.get("users")
-        users.extend(users_additional)
-        if result.get("segment") == max_segments - 1:
-            logger.debug("This is the last segment.")
-            last_evaluated_key = result.get("nextPage")
-            logger.debug("Last evaluated key in page was: {}".format(last_evaluated_key))
+        users.update(result.get("users", {}))
+        segment = result.get("segment")
+        last_evaluated_keys[segment] = result.get("nextPage")
         result_queue.task_done()
 
     logger.debug("Results queue is empty.")
-    return dict(users=users, nextPage=last_evaluated_key)
+    return dict(users=users.values(), nextPage=last_evaluated_keys)
diff --git a/python-modules/cis_profile_retrieval_service/cis_profile_retrieval_service/v2_api.py b/python-modules/cis_profile_retrieval_service/cis_profile_retrieval_service/v2_api.py
@@ -1,3 +1,4 @@
+import os
 import orjson
 
 from flask import Flask
@@ -370,7 +371,11 @@ def version():
 
 
 def main():
-    app.run(host="0.0.0.0", debug=True)
+    # DEBT: I think this has been fixed in a later version of Flask.
+    # We don't call this in production, but instead lean on Serverless' WSGI handler.
+    host, _, port = os.environ.get("SERVER_NAME", "127.0.0.1:5000").partition(":")
+    port = int(port)
+    app.run(host=host, port=port, debug=True)
 
 
 if __name__ == "__main__":
diff --git a/python-modules/cis_profile_retrieval_service/tests/test_e2e.py b/python-modules/cis_profile_retrieval_service/tests/test_e2e.py
@@ -0,0 +1,65 @@
+"""
+n.b. Do not run this as a part of your development cycle. Do not run this
+regularly. This can, and will, insert weird data into CIS, if misconfigured.
+
+A pseudo-e2e test, where we run the code locally but use dev/stage resources.
+This is the code equivalent of running with scissors.
+
+Requires the following environment variables:
+
+    CIS_ENVIRONMENT="testing"
+    CIS_SEED_API_DATA="false"
+    PERSON_API_ADVANCED_SEARCH="true"
+    PERSON_API_INITIALIZE_VAULT="false"
+    PERSON_API_JWT_VALIDATION="false"
+    SERVER_NAME="127.0.0.1:8000"
+
+Run the server with:
+
+    python cis_profile_retrieval_service/v2_api.py
+
+Run the test with:
+
+    PSEUDO_E2E=yes pytest --log-cli-level=DEBUG tests/test_e2e.py
+
+You'll also need an active AWS session, which is left as an exercise for the
+user.
+"""
+
+import logging
+import os
+import pytest
+import requests
+
+from tests.fake_auth0 import FakeBearer
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s:%(levelname)s:%(name)s:%(message)s")
+logging.getLogger("faker.factory").setLevel(logging.INFO)
+logging.getLogger("urllib3").setLevel(logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture
+def auth_headers():
+    bearer = FakeBearer()
+    token = bearer.generate_bearer_with_scope("display:all search:all")
+    headers = {"Authorization": f"Bearer {token}"}
+    return headers
+
+
+@pytest.mark.skipif(os.environ.get("PSEUDO_E2E") is None, reason="Not running in pseudo-E2E mode.")
+def test_retrieve_single_profile(auth_headers):
+    res = requests.get(
+        "http://localhost:8000/v2/users/id/all?connectionMethod=ad&active=True",
+        headers=auth_headers,
+    ).json()
+    next_page = res.get("nextPage")
+    pages = 1
+    while next_page:
+        res = requests.get(
+            f"http://localhost:8000/v2/users/id/all?connectionMethod=ad&active=True&nextPage={next_page}",
+            headers=auth_headers,
+        ).json()
+        next_page = res.get("nextPage")
+        pages += 1
+    assert pages >= 2, "Did not iterate through any pages."
diff --git a/python-modules/cis_profile_retrieval_service/tests/test_v2_api_pagination.py b/python-modules/cis_profile_retrieval_service/tests/test_v2_api_pagination.py
@@ -1,6 +1,7 @@
 """
-A simplification of the existing v2_api tests. We reuse Dynamo and the users we
-create across each test, since otherwise it'll take a while.
+A simplification of the existing v2_api tests. There's care to reuse Dynamo and
+the users we create across each test, since otherwise it'll take a while when
+adding additional tests.
 
 See `DEBT` notes for where I ran into weirdness.
 
@@ -63,31 +64,56 @@ def identity_vault(environment):
     vault_client.find_or_create()
     # DEBT: requires side-effects.
     from cis_profile_retrieval_service.common import seed
-    # DEBT?: doesn't seemingly generate only `number_of_fake_users` users.
+
     # DEBT: doesn't generate `ad|Mozilla-LDAP` users.
-    seed(number_of_fake_users=128)
+    seed(number_of_fake_users=256)
     return (boto3.client("dynamodb"), boto3.resource("dynamodb"))
 
 
 @pytest.fixture
 def app(environment, monkeypatch):
     bearer = FakeBearer()
     token = bearer.generate_bearer_with_scope("display:all search:all")
-    headers = {
-        "Authorization": f"Bearer {token}"
-    }
+    headers = {"Authorization": f"Bearer {token}"}
     monkeypatch.setattr("cis_profile_retrieval_service.idp.get_jwks", lambda: json_form_of_pk)
     # DEBT: requires side-effects.
     from cis_profile_retrieval_service import v2_api
+
     v2_api.app.testing = True
     return (headers, v2_api.app.test_client())
 
 
 def test_existing(identity_vault, app):
+    """
+    As it turns out, our pagination was broken.
+
+    The `v2/users/id/all` endpoint, as written, did the right thing: paginate
+    through pages, skipping empty pages.
+
+    The scan logic [0] has a slight bug in it, where it would continue
+    paginating. Since it was doing this pagination for us, at a lower level, we
+    simply weren't propagating any `nextPage` tokens forward.
+
+    Pagination mystery: solved.
+
+    [0]: python-modules/cis_identity_vault/cis_identity_vault/parallel_dynamo.py
+    """
     headers, client = app
     # DEBT: see note above, about not generating `ad|Mozilla-LDAP` users.
     results = client.get(
         "/v2/users/id/all?connectionMethod=github&active=True",
         headers=headers,
         follow_redirects=True,
-    )
+    ).json
+    next_page = results.get("nextPage")
+    pages = 1
+    # Pagination, as implemented elsewhere.
+    while next_page:
+        results = client.get(
+            f"/v2/users/id/all?connectionMethod=github&active=True&nextPage={next_page}",
+            headers=headers,
+            follow_redirects=True,
+        ).json
+        next_page = results.get("nextPage")
+        pages += 1
+    assert pages >= 2, "Did not paginate!"