Skip to content

Commit f1f27e3

Browse files
authored
Repo sync: don't keep all objects in memory (#12404)
We were keeping all remote repos and org objects into memory, but only using its remote_id, this should help to reduce the memory footprint when syncing users with lots of repos.
1 parent a034dcc commit f1f27e3

File tree

5 files changed

+53
-62
lines changed

5 files changed

+53
-62
lines changed

readthedocs/oauth/services/base.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -255,16 +255,15 @@ def sync(self):
255255
- deletes old RemoteRepository/Organization that are not present
256256
for this user in the current provider
257257
"""
258-
remote_repositories = self.sync_repositories()
258+
repository_remote_ids = self.sync_repositories()
259259
(
260-
remote_organizations,
261-
remote_repositories_organizations,
260+
organization_remote_ids,
261+
organization_repositories_remote_ids,
262262
) = self.sync_organizations()
263263

264264
# Delete RemoteRepository where the user doesn't have access anymore
265265
# (skip RemoteRepository tied to a Project on this user)
266-
all_remote_repositories = remote_repositories + remote_repositories_organizations
267-
repository_remote_ids = [r.remote_id for r in all_remote_repositories if r is not None]
266+
repository_remote_ids += organization_repositories_remote_ids
268267
(
269268
self.user.remote_repository_relations.filter(
270269
account=self.account,
@@ -277,7 +276,6 @@ def sync(self):
277276
)
278277

279278
# Delete RemoteOrganization where the user doesn't have access anymore
280-
organization_remote_ids = [o.remote_id for o in remote_organizations if o is not None]
281279
(
282280
self.user.remote_organization_relations.filter(
283281
account=self.account,

readthedocs/oauth/services/bitbucket.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class BitbucketService(UserService):
3434

3535
def sync_repositories(self):
3636
"""Sync repositories from Bitbucket API."""
37-
remote_repositories = []
37+
remote_ids = []
3838

3939
# Get user repos
4040
try:
@@ -44,7 +44,8 @@ def sync_repositories(self):
4444
)
4545
for repo in repos:
4646
remote_repository = self.create_repository(repo)
47-
remote_repositories.append(remote_repository)
47+
if remote_repository:
48+
remote_ids.append(remote_repository.remote_id)
4849

4950
except (TypeError, ValueError):
5051
log.warning("Error syncing Bitbucket repositories")
@@ -74,12 +75,12 @@ def sync_repositories(self):
7475
except (TypeError, ValueError):
7576
pass
7677

77-
return remote_repositories
78+
return remote_ids
7879

7980
def sync_organizations(self):
8081
"""Sync Bitbucket workspaces(our RemoteOrganization) and workspace repositories."""
81-
remote_organizations = []
82-
remote_repositories = []
82+
organization_remote_ids = []
83+
repository_remote_ids = []
8384

8485
try:
8586
workspaces = self.paginate(
@@ -90,14 +91,15 @@ def sync_organizations(self):
9091
remote_organization = self.create_organization(workspace)
9192
repos = self.paginate(workspace["links"]["repositories"]["href"])
9293

93-
remote_organizations.append(remote_organization)
94+
organization_remote_ids.append(remote_organization.remote_id)
9495

9596
for repo in repos:
9697
remote_repository = self.create_repository(
9798
repo,
9899
organization=remote_organization,
99100
)
100-
remote_repositories.append(remote_repository)
101+
if remote_repository:
102+
repository_remote_ids.append(remote_repository.remote_id)
101103

102104
except ValueError:
103105
log.warning("Error syncing Bitbucket organizations")
@@ -107,7 +109,7 @@ def sync_organizations(self):
107109
)
108110
)
109111

110-
return remote_organizations, remote_repositories
112+
return organization_remote_ids, repository_remote_ids
111113

112114
def create_repository(self, fields, privacy=None, organization=None):
113115
"""

readthedocs/oauth/services/github.py

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,26 +38,27 @@ class GitHubService(UserService):
3838

3939
def sync_repositories(self):
4040
"""Sync repositories from GitHub API."""
41-
remote_repositories = []
41+
remote_ids = []
4242

4343
try:
4444
repos = self.paginate(f"{self.base_api_url}/user/repos", per_page=100)
4545
for repo in repos:
4646
remote_repository = self.create_repository(repo)
47-
remote_repositories.append(remote_repository)
47+
if remote_repository:
48+
remote_ids.append(remote_repository.remote_id)
4849
except (TypeError, ValueError):
4950
log.warning("Error syncing GitHub repositories")
5051
raise SyncServiceError(
5152
SyncServiceError.INVALID_OR_REVOKED_ACCESS_TOKEN.format(
5253
provider=self.vcs_provider_slug
5354
)
5455
)
55-
return remote_repositories
56+
return remote_ids
5657

5758
def sync_organizations(self):
5859
"""Sync organizations from GitHub API."""
59-
remote_organizations = []
60-
remote_repositories = []
60+
organization_remote_ids = []
61+
repository_remote_ids = []
6162

6263
try:
6364
orgs = self.paginate(f"{self.base_api_url}/user/orgs", per_page=100)
@@ -67,7 +68,7 @@ def sync_organizations(self):
6768
org_details,
6869
create_user_relationship=True,
6970
)
70-
remote_organizations.append(remote_organization)
71+
organization_remote_ids.append(remote_organization.remote_id)
7172

7273
org_url = org["url"]
7374
org_repos = self.paginate(
@@ -76,7 +77,8 @@ def sync_organizations(self):
7677
)
7778
for repo in org_repos:
7879
remote_repository = self.create_repository(repo)
79-
remote_repositories.append(remote_repository)
80+
if remote_repository:
81+
repository_remote_ids.append(remote_repository.remote_id)
8082

8183
except (TypeError, ValueError):
8284
log.warning("Error syncing GitHub organizations")
@@ -86,7 +88,7 @@ def sync_organizations(self):
8688
)
8789
)
8890

89-
return remote_organizations, remote_repositories
91+
return organization_remote_ids, repository_remote_ids
9092

9193
def create_repository(self, fields, privacy=None):
9294
"""
@@ -105,24 +107,11 @@ def create_repository(self, fields, privacy=None):
105107
(fields["private"] is False and privacy == "public"),
106108
]
107109
):
108-
repo, created = RemoteRepository.objects.get_or_create(
110+
repo, _ = RemoteRepository.objects.get_or_create(
109111
remote_id=str(fields["id"]),
110112
vcs_provider=self.vcs_provider_slug,
111113
)
112114

113-
# TODO: For debugging: https://github.com/readthedocs/readthedocs.org/pull/9449.
114-
if created:
115-
_old_remote_repository = RemoteRepository.objects.filter(
116-
full_name=fields["full_name"], vcs_provider=self.vcs_provider_slug
117-
).first()
118-
if _old_remote_repository:
119-
log.warning(
120-
"GitHub repository created with different remote_id but exact full_name.",
121-
fields=fields,
122-
old_remote_repository=_old_remote_repository.__dict__,
123-
imported=_old_remote_repository.projects.exists(),
124-
)
125-
126115
owner_type = fields["owner"]["type"]
127116
organization = None
128117
if owner_type == "Organization":

readthedocs/oauth/services/gitlab.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def sync_repositories(self):
8282
8383
See https://docs.gitlab.com/api/projects/#list-a-users-projects.
8484
"""
85-
remote_repositories = []
85+
remote_ids = []
8686
try:
8787
repos = self.paginate(
8888
f"{self.base_api_url}/api/v4/users/{self.account.uid}/projects",
@@ -94,7 +94,8 @@ def sync_repositories(self):
9494

9595
for repo in repos:
9696
remote_repository = self.create_repository(repo)
97-
remote_repositories.append(remote_repository)
97+
if remote_repository:
98+
remote_ids.append(remote_repository.remote_id)
9899
except (TypeError, ValueError):
99100
log.warning("Error syncing GitLab repositories")
100101
raise SyncServiceError(
@@ -103,11 +104,11 @@ def sync_repositories(self):
103104
)
104105
)
105106

106-
return remote_repositories
107+
return remote_ids
107108

108109
def sync_organizations(self):
109-
remote_organizations = []
110-
remote_repositories = []
110+
organization_remote_ids = []
111+
repository_remote_ids = []
111112

112113
try:
113114
orgs = self.paginate(
@@ -130,7 +131,7 @@ def sync_organizations(self):
130131
sort="asc",
131132
)
132133

133-
remote_organizations.append(remote_organization)
134+
organization_remote_ids.append(remote_organization.remote_id)
134135

135136
for repo in org_repos:
136137
# TODO: Optimize this so that we don't re-fetch project data
@@ -151,7 +152,8 @@ def sync_organizations(self):
151152
remote_repository = self.create_repository(
152153
repo_details, organization=remote_organization
153154
)
154-
remote_repositories.append(remote_repository)
155+
if remote_repository:
156+
repository_remote_ids.append(remote_repository.remote_id)
155157
else:
156158
log.warning(
157159
"GitLab project does not exist or user does not have permissions.",
@@ -172,7 +174,7 @@ def sync_organizations(self):
172174
)
173175
)
174176

175-
return remote_organizations, remote_repositories
177+
return organization_remote_ids, repository_remote_ids
176178

177179
def create_repository(
178180
self, fields, privacy=None, organization: RemoteOrganization | None = None

readthedocs/rtd_tests/tests/test_oauth_sync.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -194,13 +194,14 @@ def test_sync_repositories(self, mock_request):
194194
self.assertEqual(RemoteRepository.objects.count(), 0)
195195
self.assertEqual(RemoteOrganization.objects.count(), 0)
196196

197-
remote_repositories = self.service.sync_repositories()
197+
repository_remote_ids = self.service.sync_repositories()
198198

199199
self.assertEqual(RemoteRepository.objects.count(), 1)
200200
self.assertEqual(RemoteOrganization.objects.count(), 0)
201-
self.assertEqual(len(remote_repositories), 1)
202-
remote_repository = remote_repositories[0]
203-
self.assertIsInstance(remote_repository, RemoteRepository)
201+
self.assertEqual(len(repository_remote_ids), 1)
202+
203+
remote_repository = RemoteRepository.objects.first()
204+
self.assertEqual(repository_remote_ids[0], remote_repository.remote_id)
204205
self.assertEqual(remote_repository.full_name, "organization/repository")
205206
self.assertEqual(remote_repository.name, "repository")
206207
self.assertFalse(remote_repository.remote_repository_relations.first().admin)
@@ -234,15 +235,15 @@ def test_sync_repositories_relation_with_organization(self, mock_request):
234235
organization=remote_organization,
235236
vcs_provider="github",
236237
)
237-
remote_repositories = self.service.sync_repositories()
238+
repository_remote_ids = self.service.sync_repositories()
238239

239240
self.assertEqual(RemoteRepository.objects.count(), 1)
240241
self.assertEqual(RemoteRepositoryRelation.objects.count(), 1)
241242
self.assertEqual(RemoteOrganization.objects.count(), 1)
242243

243-
self.assertEqual(len(remote_repositories), 1)
244-
remote_repository = remote_repositories[0]
245-
self.assertIsInstance(remote_repository, RemoteRepository)
244+
self.assertEqual(len(repository_remote_ids), 1)
245+
remote_repository = RemoteRepository.objects.first()
246+
self.assertEqual(repository_remote_ids[0], remote_repository.remote_id)
246247
self.assertEqual(remote_repository.full_name, "organization/repository")
247248
self.assertEqual(remote_repository.name, "repository")
248249
self.assertEqual(remote_repository.organization.slug, "organization")
@@ -276,15 +277,15 @@ def test_sync_repositories_moved_from_org_to_user(self, mock_request):
276277
organization=remote_organization,
277278
vcs_provider="github",
278279
)
279-
remote_repositories = self.service.sync_repositories()
280+
repository_remote_ids = self.service.sync_repositories()
280281

281282
self.assertEqual(RemoteRepository.objects.count(), 1)
282283
self.assertEqual(RemoteRepositoryRelation.objects.count(), 1)
283284
self.assertEqual(RemoteOrganization.objects.count(), 1)
284285

285-
self.assertEqual(len(remote_repositories), 1)
286-
remote_repository = remote_repositories[0]
287-
self.assertIsInstance(remote_repository, RemoteRepository)
286+
self.assertEqual(len(repository_remote_ids), 1)
287+
remote_repository = RemoteRepository.objects.first()
288+
self.assertEqual(repository_remote_ids[0], remote_repository.remote_id)
288289
self.assertEqual(remote_repository.full_name, "organization/repository")
289290
self.assertEqual(remote_repository.name, "repository")
290291
self.assertIsNone(remote_repository.organization)
@@ -407,14 +408,13 @@ def test_sync_organizations(self, mock_request):
407408
self.assertEqual(RemoteOrganization.objects.count(), 0)
408409
self.assertEqual(RemoteOrganizationRelation.objects.count(), 0)
409410

410-
remote_organizations, remote_repositories = self.service.sync_organizations()
411+
organization_remote_ids, repository_remote_ids = self.service.sync_organizations()
411412

412413
self.assertEqual(RemoteRepository.objects.count(), 1)
413414
self.assertEqual(RemoteRepositoryRelation.objects.count(), 1)
414415
self.assertEqual(RemoteOrganization.objects.count(), 1)
415416
self.assertEqual(RemoteOrganizationRelation.objects.count(), 1)
416-
self.assertEqual(len(remote_organizations), 1)
417-
self.assertEqual(len(remote_repositories), 1)
418-
remote_organization = remote_organizations[0]
419-
self.assertIsInstance(remote_organization, RemoteOrganization)
420-
self.assertEqual(remote_organization.name, "Organization")
417+
self.assertEqual(len(organization_remote_ids), 1)
418+
self.assertEqual(len(repository_remote_ids), 1)
419+
remote_organization = RemoteOrganization.objects.first()
420+
self.assertEqual(organization_remote_ids[0], remote_organization.remote_id)

0 commit comments

Comments
 (0)