Skip to content

Commit 278fb69

Browse files
committed
♻️(backend) refactor dedupe_accounts command
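Refactored the `dedupe_accounts` management command to improve readability and separation of concerns: the duplicate-handling logic was moved out of the command into the `marsha.account.utils.dedupe_accounts` utility, and the command now simply delegates to it.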
1 parent bdf747e commit 278fb69

File tree

11 files changed

+2219
-414
lines changed

11 files changed

+2219
-414
lines changed
Lines changed: 6 additions & 146 deletions
Original file line number | Diff line number | Diff line change
@@ -1,114 +1,13 @@
11
"""Command to dedupe accounts."""
22

3-
from difflib import SequenceMatcher
43
import logging
5-
from typing import Any
64

7-
from django.contrib.auth import get_user_model
85
from django.core.management.base import BaseCommand
9-
from django.db import transaction
10-
from django.db.models import Count
116

7+
from marsha.account.utils.dedupe_accounts import dedupe_accounts
128

13-
logger = logging.getLogger(__name__)
14-
15-
16-
# pylint: disable=too-many-locals
17-
def dedupe_accounts(
18-
options: dict[str, Any],
19-
) -> tuple[list[Any], dict[Any, Any], dict[Any, Any], list[Any], list[Any]]:
20-
"""Deduplicate accounts."""
21-
# pylint: disable=invalid-name
22-
User = get_user_model()
23-
24-
if options["email"]:
25-
duplicates = [{"email": options["email"]}]
26-
else:
27-
duplicates = (
28-
User.objects.values("email")
29-
.annotate(count=Count("id"))
30-
.filter(count__gt=1)
31-
.order_by("email")
32-
)
33-
34-
accounts_to_delete = []
35-
duped_users = {}
36-
organizations = {}
37-
skipped_accounts = []
38-
users_to_delete = []
39-
for dup in duplicates:
40-
email = dup["email"]
41-
if not email:
42-
continue
43-
44-
logger.info("Deduping %s", email)
45-
46-
users = list(User.objects.filter(email=email).order_by("date_joined"))
47-
original_user, *duplicate_users = users
48-
original_social = original_user.social_auth.first()
49-
50-
for duplicate_user in duplicate_users:
51-
new_social = duplicate_user.social_auth.first()
52-
if not new_social:
53-
continue
54-
55-
old_account_email = original_social.uid.split(":")[1]
56-
new_account_email = new_social.uid.split(":")[1]
57-
58-
old_organization_uid = original_social.uid.split(":")[0]
59-
new_organization_uid = new_social.uid.split(":")[0]
60-
61-
account_email_ratio = SequenceMatcher(
62-
None, old_account_email, new_account_email
63-
).ratio()
64-
organization_ratio = SequenceMatcher(
65-
None, old_organization_uid, new_organization_uid
66-
).ratio()
679

68-
if old_account_email != new_account_email:
69-
skipped_accounts.append(
70-
[
71-
email,
72-
[
73-
original_social.uid,
74-
new_social.uid,
75-
str(organization_ratio),
76-
str(account_email_ratio),
77-
],
78-
]
79-
)
80-
continue
81-
82-
if old_organization_uid not in organizations:
83-
organizations[old_organization_uid] = [new_organization_uid]
84-
else:
85-
if new_organization_uid not in organizations[old_organization_uid]:
86-
organizations[old_organization_uid].append(new_organization_uid)
87-
88-
if original_user.email not in duped_users:
89-
duped_users[original_user.email] = [original_social.uid, new_social.uid]
90-
else:
91-
duped_users[original_user.email].append(new_social.uid)
92-
users_to_delete.append(duplicate_user.email)
93-
accounts_to_delete.append(original_social.uid)
94-
95-
if not options["dry_run"]:
96-
with transaction.atomic():
97-
original_user.social_auth.first().delete()
98-
original_user.social_auth.set([new_social])
99-
for playlist in duplicate_user.playlists.exclude(
100-
id__in=original_user.playlists.values_list("id", flat=True)
101-
):
102-
original_user.playlists.add(playlist)
103-
duplicate_user.delete()
104-
105-
return (
106-
accounts_to_delete,
107-
duped_users,
108-
organizations,
109-
skipped_accounts,
110-
users_to_delete,
111-
)
10+
logger = logging.getLogger(__name__)
11211

11312

11413
class Command(BaseCommand):
@@ -118,52 +17,13 @@ class Command(BaseCommand):
11817

11918
def add_arguments(self, parser):
12019
"""Add arguments to the command."""
20+
parser.add_argument("--email", type=str, help="Email to dedupe")
12121
parser.add_argument("--dry-run", action="store_true")
122-
parser.add_argument(
123-
"--email", type=str, help="Email to dedupe (for testing purposes)"
124-
)
12522

12623
def handle(self, *args, **options):
12724
"""Handle command."""
128-
if options["dry_run"]:
25+
dry_run = options["dry_run"]
26+
if dry_run:
12927
logger.info("[DRY-RUN] No changes will be made.")
13028

131-
(
132-
accounts_to_delete,
133-
duped_users,
134-
organizations,
135-
skipped_accounts,
136-
users_to_delete,
137-
) = dedupe_accounts(options)
138-
139-
logger.info("-" * 80)
140-
logger.info(
141-
"Deduping complete. %d SSO accounts deleted, %d users deleted",
142-
len(accounts_to_delete),
143-
len(users_to_delete),
144-
)
145-
logger.info("- " * 40)
146-
147-
logger.info("%d accounts skipped:", len(skipped_accounts))
148-
for email, accounts in skipped_accounts:
149-
logger.info(" - %s | %s", email, " | ".join(accounts))
150-
logger.info("- " * 40)
151-
152-
logger.info("%d organizations impacted:", len(organizations))
153-
for org_id, new_orgs in organizations.items():
154-
logger.info(" - %s -> %s", org_id, " -> ".join(new_orgs))
155-
logger.info("- " * 40)
156-
157-
logger.info("%d users impacted:", len(duped_users))
158-
for email, accounts in duped_users.items():
159-
logger.info(" - %s -> %s", email, " -> ".join(accounts))
160-
logger.info("- " * 40)
161-
162-
logger.info("Summary:")
163-
logger.info(" %d organizations impacted", len(organizations))
164-
logger.info(" %d users processed", len(duped_users))
165-
logger.info(" %d users deleted", len(users_to_delete))
166-
logger.info(" %d SSO accounts deleted", len(accounts_to_delete))
167-
168-
if options["dry_run"]:
169-
logger.info("[DRY-RUN] No changes made.")
29+
dedupe_accounts(options["email"], dry_run)

src/backend/marsha/account/tests/dedupe_accounts/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)