Skip to content

Commit e531aca

Browse files
authored
Merge pull request #416 from alexhsamuel/feature/archive-chunks
Archive chunk size.
2 parents ed4ad21 + 655b854 commit e531aca

File tree

7 files changed

+159
-54
lines changed

7 files changed

+159
-54
lines changed

RELEASE.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# v0.30
22

33
- Config `database.timeout` sets the sqlite lock timeout.
4+
- The run archive program takes `chunk_size` and `chunk_sleep` parameters, to
5+
divide the archive operation into chunks with pauses in between.
46

57

68
# v0.29

docs/programs.rst

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,8 @@ The archive program retires a run from Apsis's memory before archiving it. The
205205
run is no longer visible through any UI. A run that is not completed cannot be
206206
archived.
207207

208-
This job archives up to 10,000 runs older than 14 days (1,209,600 seconds):
208+
This job archives up to 10,000 runs older than 14 days (1,209,600 seconds), in
209+
chunks of 1,000 runs at a time, with a 10 second pause between chunks:
209210

210211
.. code:: yaml
211212
@@ -218,11 +219,16 @@ This job archives up to 10,000 runs older than 14 days (1,209,600 seconds):
218219
type: apsis.program.internal.archive.ArchiveProgram
219220
age: 1209600
220221
count: 10000
222+
chunk_size: 1000
223+
chunk_sleep: 10
221224
path: '/path/to/apsis/archive.db'
222225
223-
The archive program blocks Apsis from performing other tasks. Adjust the
224-
`count` parameter so that the archiving process does not take more than a few
225-
seconds, to avoid long delays in starting scheduled runs.
226+
The archive program blocks Apsis from performing other tasks for each chunk of
227+
archived runs. Adjust the `chunk_size`, `chunk_sleep`, and `count` parameters so
228+
that the archiving process pauses every few seconds, to avoid long delays in
229+
starting scheduled runs. If the `chunk_size` parameter is omitted, all runs are
230+
archived in one chunk. If the `chunk_sleep` parameter is omitted, Apsis does
231+
not pause between chunks.
226232

227233
The archive file is also an SQLite3 database file, and contains the subset of
228234
columns from the main database file that contains run data. The archive file

python/apsis/lib/json.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ def expand_dotted_keys(mapping):
8585
return type(mapping)(result)
8686

8787

88+
def nkey(name, value):
89+
return {} if value is None else {name: value}
90+
91+
8892
#-------------------------------------------------------------------------------
8993

9094
class TypedJso:

python/apsis/program/internal/archive.py

Lines changed: 76 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import ora
44

55
from ..base import _InternalProgram, ProgramRunning, ProgramSuccess
6-
from apsis.lib.json import check_schema
6+
from apsis.lib.json import check_schema, nkey
77
from apsis.lib.parse import parse_duration
88
from apsis.runs import template_expand
99

@@ -23,7 +23,7 @@ class ArchiveProgram(_InternalProgram):
2323
skipped for archiving.
2424
"""
2525

26-
def __init__(self, *, age, path, count):
26+
def __init__(self, *, age, path, count, chunk_size=None, chunk_sleep=None):
2727
"""
2828
If this archive file doesn't exist, it is created automatically on
2929
first use; the containing directory must exist.
@@ -35,38 +35,59 @@ def __init__(self, *, age, path, count):
3535
Apsis database file.
3636
:param count:
3737
Maximum number of runs to archive per run of this program.
38+
:param chunk_size:
39+
Number of runs to archive in one chunk. Each chunk is blocking.
40+
:param chunk_sleep:
41+
Time in seconds to wait between chunks.
3842
"""
39-
self.__age = age
40-
self.__path = path
41-
self.__count = count
43+
self.__age = age
44+
self.__path = path
45+
self.__count = count
46+
self.__chunk_size = chunk_size
47+
self.__chunk_sleep = chunk_sleep
4248

4349

4450
def __str__(self):
4551
return f"archive age {self.__age}{self.__path}"
4652

4753

4854
def bind(self, args):
49-
age = parse_duration(template_expand(self.__age, args))
50-
path = template_expand(self.__path, args)
51-
count = int(template_expand(self.__count, args))
52-
return type(self)(age=age, path=path, count=count)
55+
return type(self)(
56+
age = parse_duration(template_expand(self.__age, args)),
57+
path = template_expand(self.__path, args),
58+
count = int(template_expand(self.__count, args)),
59+
chunk_size = None if self.__chunk_size is None
60+
else int(template_expand(self.__chunk_size, args)),
61+
chunk_sleep = None if self.__chunk_sleep is None
62+
else float(template_expand(self.__chunk_sleep, args)),
63+
)
5364

5465

5566
@classmethod
5667
def from_jso(cls, jso):
5768
with check_schema(jso) as pop:
58-
age = pop("age")
59-
path = pop("path", str)
60-
count = pop("count", int)
61-
return cls(age=age, path=path, count=count)
69+
age = pop("age")
70+
path = pop("path", str)
71+
count = pop("count", int)
72+
chunk_size = pop("chunk_size", int, None)
73+
chunk_sleep = pop("chunk_sleep", float, None)
74+
return cls(
75+
age =age,
76+
path =path,
77+
count =count,
78+
chunk_size =chunk_size,
79+
chunk_sleep =chunk_sleep,
80+
)
6281

6382

6483
def to_jso(self):
6584
return {
6685
**super().to_jso(),
67-
"age": self.__age,
68-
"path": self.__path,
69-
"count": self.__count,
86+
"age" : self.__age,
87+
"path" : self.__path,
88+
"count" : self.__count,
89+
**nkey("chunk_size", self.__chunk_size),
90+
**nkey("chunk_sleep", self.__chunk_sleep),
7091
}
7192

7293

@@ -78,28 +99,47 @@ async def wait(self, apsis):
7899
# FIXME: Private attributes.
79100
db = apsis._Apsis__db
80101

81-
run_ids = db.get_archive_run_ids(
82-
before =ora.now() - self.__age,
83-
count =self.__count,
84-
)
85-
86-
# Make sure all runs are retired; else skip them.
87-
run_ids = [ r for r in run_ids if apsis.run_store.retire(r) ]
88-
89-
if len(run_ids) > 0:
90-
# Archive these runs.
91-
row_counts = db.archive(self.__path, run_ids)
92-
# Also vacuum to free space.
93-
db.vacuum()
102+
if not (self.__chunk_size is None or 0 < self.__chunk_size):
103+
raise ValueError("nonpositive chunk size")
94104

95-
else:
96-
row_counts = {}
105+
row_counts = {}
106+
meta = {
107+
"run count" : 0,
108+
"run_ids" : [],
109+
"row counts": row_counts
110+
}
97111

98-
return ProgramSuccess(meta={
99-
"run count" : len(run_ids),
100-
"run_ids" : run_ids,
101-
"row counts": row_counts,
102-
})
112+
count = self.__count
113+
while count > 0:
114+
chunk = (
115+
count if self.__chunk_size is None
116+
else min(count, self.__chunk_size)
117+
)
118+
run_ids = db.get_archive_run_ids(
119+
before =ora.now() - self.__age,
120+
count =chunk,
121+
)
122+
count -= chunk
123+
124+
# Make sure all runs are retired; else skip them.
125+
run_ids = [ r for r in run_ids if apsis.run_store.retire(r) ]
126+
127+
if len(run_ids) > 0:
128+
# Archive these runs.
129+
chunk_row_counts = db.archive(self.__path, run_ids)
130+
# Accumulate metadata.
131+
meta["run count"] += len(run_ids)
132+
meta["run_ids"].append(run_ids)
133+
for key, value in chunk_row_counts.items():
134+
row_counts[key] = row_counts.get(key, 0) + value
135+
# Also vacuum to free space.
136+
db.vacuum()
137+
138+
if count > 0 and self.__chunk_sleep is not None:
139+
# Yield to the event loop.
140+
await asyncio.sleep(self.__chunk_sleep)
141+
142+
return ProgramSuccess(meta=meta)
103143

104144

105145
def reconnect(self, run_id, run_state, apsis):

python/apsis/service/client.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -332,23 +332,24 @@ def schedule(self, job_id, args, time="now", *, count=None):
332332
return next(iter(runs.values())) if count is None else runs.values()
333333

334334

335-
def __schedule(self, time, job):
335+
def __schedule(self, time, job, count):
336336
time = "now" if time == "now" else str(Time(time))
337337
data = {
338338
"job": job,
339339
"times": {
340340
"schedule": time,
341341
},
342342
}
343-
runs = self.__post("/api/v1/runs", data=data)["runs"]
344-
return next(iter(runs.values()))
343+
runs = self.__post("/api/v1/runs", data=data, count=count)["runs"]
344+
# FIXME: Hacky.
345+
return next(iter(runs.values())) if count is None else runs.values()
345346

346347

347-
def schedule_adhoc(self, time, job):
348-
return self.__schedule(time, job)
348+
def schedule_adhoc(self, time, job, *, count=None):
349+
return self.__schedule(time, job, count)
349350

350351

351-
def schedule_program(self, time, args):
352+
def schedule_program(self, time, args, *, count=None):
352353
"""
353354
:param time:
354355
The schedule time, or "now" for immediate.
@@ -357,17 +358,17 @@ def schedule_program(self, time, args):
357358
to run.
358359
"""
359360
args = [ str(a) for a in args ]
360-
return self.__schedule(time, {"program": args})
361+
return self.__schedule(time, {"program": args}, count)
361362

362363

363-
def schedule_shell_program(self, time, command):
364+
def schedule_shell_program(self, time, command, *, count=None):
364365
"""
365366
:param time:
366367
The schedule time, or "now" for immediate.
367368
:param command:
368369
The shell command to run.
369370
"""
370-
return self.__schedule(time, {"program": str(command)})
371+
return self.__schedule(time, {"program": str(command)}, count)
371372

372373

373374
def reload_jobs(self, *, dry_run=False):

python/apsis/sqlite.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,6 @@ def get_archive_run_ids(self, *, before, count):
761761
sa.select([TBL_RUNS.c.run_id])
762762
.where(TBL_RUNS.c.timestamp < dump_time(before))
763763
.where(TBL_RUNS.c.state.in_(FINISHED_STATES))
764-
.order_by(TBL_RUNS.c.timestamp)
765764
.limit(count)
766765
)
767766
]

test/int/test_archive.py

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def test_archive(tmp_path):
5656
res = inst.wait_run(res["run_id"])
5757
# The first run has been archived.
5858
assert res["meta"]["program"]["run count"] == 1
59-
assert res["meta"]["program"]["run_ids"] == [run_id0]
59+
assert res["meta"]["program"]["run_ids"] == [[run_id0]]
6060

6161
# The first run is no longer available; the other two are.
6262
with pytest.raises(APIError):
@@ -77,7 +77,7 @@ def test_archive(tmp_path):
7777
# The second run was archived, but the third isn't old enough yet.
7878
res = inst.wait_run(res["run_id"])
7979
assert res["meta"]["program"]["run count"] == 1
80-
assert res["meta"]["program"]["run_ids"] == [run_id1]
80+
assert res["meta"]["program"]["run_ids"] == [[run_id1]]
8181

8282
# The second run is no longer available.
8383
with pytest.raises(APIError):
@@ -103,6 +103,57 @@ def test_archive(tmp_path):
103103
assert rows[0] == (run_id1, "combined stdout & stderr", 14)
104104

105105

106+
def test_archive_chunks(tmp_path):
107+
path = tmp_path / "archive.db"
108+
job_dir = tmp_path / "jobs"
109+
job_dir.mkdir()
110+
111+
with closing(ApsisService(
112+
cfg={"schedule": {"horizon": 1}},
113+
job_dir=job_dir,
114+
)) as inst:
115+
inst.create_db()
116+
inst.write_cfg()
117+
inst.start_serve()
118+
inst.wait_for_serve()
119+
120+
client = inst.client
121+
122+
# Run 100 runs.
123+
res = client.schedule_adhoc(
124+
"now", {"program": {"type": "no-op"}}, count=100)
125+
run_ids = { r["run_id"] for r in res }
126+
for run_id in run_ids:
127+
inst.wait_run(run_id)
128+
129+
time.sleep(1)
130+
131+
# Archive, with a max age of 1 s and up to 80 runs, chunked by 10.
132+
res = client.schedule_adhoc("now", {
133+
"program": {
134+
"type": "apsis.program.internal.archive.ArchiveProgram",
135+
"age": 1,
136+
"count": 80,
137+
"chunk_size": 10,
138+
"chunk_sleep": 0.1,
139+
"path": str(path),
140+
},
141+
})
142+
res = inst.wait_run(res["run_id"])
143+
# Runs have been archived.
144+
meta = res["meta"]["program"]
145+
assert meta["run count"] == 80
146+
assert len(meta["run_ids"]) == 8
147+
assert all( len(c) == 10 for c in meta["run_ids"] )
148+
assert all( r in run_ids for c in meta["run_ids"] for r in c )
149+
150+
# Check the archive file.
151+
with closing(sqlite3.connect(path)) as db:
152+
rows = set(db.execute("SELECT run_id, state FROM runs"))
153+
assert len(rows) == 80
154+
assert all( r[0] in run_ids and r[1] == "success" for r in rows )
155+
156+
106157
def test_clean_up_jobs(tmp_path):
107158
path = tmp_path / "archive.db"
108159
job_dir = tmp_path / "jobs"
@@ -156,8 +207,10 @@ def test_clean_up_jobs(tmp_path):
156207
res = inst.wait_run(res["run_id"])
157208
archive_job_id = res["job_id"]
158209
# The first two runs have been archived.
159-
assert res["meta"]["program"]["run count"] == 2
160-
assert set(res["meta"]["program"]["run_ids"]) == {run_id0, run_id1}
210+
meta = res["meta"]["program"]
211+
assert meta["run count"] == 2
212+
assert len(meta["run_ids"]) == 1 # one chunk
213+
assert set(meta["run_ids"][0]) == {run_id0, run_id1}
161214

162215
# Check the DB. Only the third job ID should remain, plus the job ID from
163216
# the archive job.

0 commit comments

Comments
 (0)