Skip to content

Commit 454fdaf

Browse files
sokcevicGLUCI
authored andcommitted
sync: Always use WORKER_BATCH_SIZE
With 551285f, the comment about number of workers no longer stands - dict is shared among multiprocesses and real time information is available. Using 2.7k projects as the baseline, using chunk size of 4 takes close to 5 minutes. A chunk size of 32 takes this down to 40s - a reduction of rougly 8 times which matches the increase. [email protected] Bug: b/371638995 Change-Id: Ida5fd8f7abc44b3b82c02aa0f7f7ae01dff5eb07 Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/438523 Commit-Queue: Josip Sokcevic <[email protected]> Tested-by: Josip Sokcevic <[email protected]> Reviewed-by: Gavin Mak <[email protected]>
1 parent f7f9dd4 commit 454fdaf

File tree

3 files changed

+43
-27
lines changed

3 files changed

+43
-27
lines changed

project.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2396,26 +2396,25 @@ def _CheckForImmutableRevision(self):
23962396
try:
23972397
# if revision (sha or tag) is not present then following function
23982398
# throws an error.
2399+
revs = [f"{self.revisionExpr}^0"]
2400+
upstream_rev = None
2401+
if self.upstream:
2402+
upstream_rev = self.GetRemote().ToLocal(self.upstream)
2403+
revs.append(upstream_rev)
2404+
23992405
self.bare_git.rev_list(
24002406
"-1",
24012407
"--missing=allow-any",
2402-
"%s^0" % self.revisionExpr,
2408+
*revs,
24032409
"--",
24042410
log_as_error=False,
24052411
)
2412+
24062413
if self.upstream:
2407-
rev = self.GetRemote().ToLocal(self.upstream)
2408-
self.bare_git.rev_list(
2409-
"-1",
2410-
"--missing=allow-any",
2411-
"%s^0" % rev,
2412-
"--",
2413-
log_as_error=False,
2414-
)
24152414
self.bare_git.merge_base(
24162415
"--is-ancestor",
24172416
self.revisionExpr,
2418-
rev,
2417+
upstream_rev,
24192418
log_as_error=False,
24202419
)
24212420
return True

subcmds/sync.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ def _SafeCheckoutOrder(checkouts: List[Project]) -> List[List[Project]]:
131131
return res
132132

133133

134+
def _chunksize(projects: int, jobs: int) -> int:
135+
"""Calculate chunk size for the given number of projects and jobs."""
136+
return min(max(1, projects // jobs), WORKER_BATCH_SIZE)
137+
138+
134139
class _FetchOneResult(NamedTuple):
135140
"""_FetchOne return value.
136141
@@ -819,7 +824,6 @@ def _GetSyncProgressMessage(self):
819824
def _Fetch(self, projects, opt, err_event, ssh_proxy, errors):
820825
ret = True
821826

822-
jobs = opt.jobs_network
823827
fetched = set()
824828
remote_fetched = set()
825829
pm = Progress(
@@ -849,6 +853,8 @@ def _MonitorSyncLoop():
849853
objdir_project_map.setdefault(project.objdir, []).append(project)
850854
projects_list = list(objdir_project_map.values())
851855

856+
jobs = min(opt.jobs_network, len(projects_list))
857+
852858
def _ProcessResults(results_sets):
853859
ret = True
854860
for results in results_sets:
@@ -888,35 +894,22 @@ def _ProcessResults(results_sets):
888894
Sync.ssh_proxy = None
889895

890896
# NB: Multiprocessing is heavy, so don't spin it up for one job.
891-
if len(projects_list) == 1 or jobs == 1:
897+
if jobs == 1:
892898
self._FetchInitChild(ssh_proxy)
893899
if not _ProcessResults(
894900
self._FetchProjectList(opt, x) for x in projects_list
895901
):
896902
ret = False
897903
else:
898-
# Favor throughput over responsiveness when quiet. It seems that
899-
# imap() will yield results in batches relative to chunksize, so
900-
# even as the children finish a sync, we won't see the result until
901-
# one child finishes ~chunksize jobs. When using a large --jobs
902-
# with large chunksize, this can be jarring as there will be a large
903-
# initial delay where repo looks like it isn't doing anything and
904-
# sits at 0%, but then suddenly completes a lot of jobs all at once.
905-
# Since this code is more network bound, we can accept a bit more
906-
# CPU overhead with a smaller chunksize so that the user sees more
907-
# immediate & continuous feedback.
908-
if opt.quiet:
909-
chunksize = WORKER_BATCH_SIZE
910-
else:
904+
if not opt.quiet:
911905
pm.update(inc=0, msg="warming up")
912-
chunksize = 4
913906
with multiprocessing.Pool(
914907
jobs, initializer=self._FetchInitChild, initargs=(ssh_proxy,)
915908
) as pool:
916909
results = pool.imap_unordered(
917910
functools.partial(self._FetchProjectList, opt),
918911
projects_list,
919-
chunksize=chunksize,
912+
chunksize=_chunksize(len(projects_list), jobs),
920913
)
921914
if not _ProcessResults(results):
922915
ret = False

tests/test_subcmds_sync.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,30 @@ def test_complex_nested(self):
355355
)
356356

357357

358+
class Chunksize(unittest.TestCase):
359+
"""Tests for _chunksize."""
360+
361+
def test_single_project(self):
362+
"""Single project."""
363+
self.assertEqual(sync._chunksize(1, 1), 1)
364+
365+
def test_low_project_count(self):
366+
"""Multiple projects, low number of projects to sync."""
367+
self.assertEqual(sync._chunksize(10, 1), 10)
368+
self.assertEqual(sync._chunksize(10, 2), 5)
369+
self.assertEqual(sync._chunksize(10, 4), 2)
370+
self.assertEqual(sync._chunksize(10, 8), 1)
371+
self.assertEqual(sync._chunksize(10, 16), 1)
372+
373+
def test_high_project_count(self):
374+
"""Multiple projects, high number of projects to sync."""
375+
self.assertEqual(sync._chunksize(2800, 1), 32)
376+
self.assertEqual(sync._chunksize(2800, 16), 32)
377+
self.assertEqual(sync._chunksize(2800, 32), 32)
378+
self.assertEqual(sync._chunksize(2800, 64), 32)
379+
self.assertEqual(sync._chunksize(2800, 128), 21)
380+
381+
358382
class GetPreciousObjectsState(unittest.TestCase):
359383
"""Tests for _GetPreciousObjectsState."""
360384

0 commit comments

Comments
 (0)