Skip to content

Commit afd707f

Browse files
Create a management command to run edx courserun migration (#2922)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent ba5e6e2 commit afd707f

File tree

4 files changed

+592
-8
lines changed

4 files changed

+592
-8
lines changed
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
from django.conf import settings
2+
from django.core.management.base import BaseCommand
3+
from trino.auth import BasicAuthentication
4+
from trino.dbapi import connect
5+
6+
from cms.api import create_default_courseware_page
7+
from cms.models import CertificatePage, SignatoryPage
8+
from courses.models import Course, CourseRun, Department
9+
10+
11+
class Command(BaseCommand):
    """Migrate edX course runs read from Trino into MITx Online models and CMS pages."""

    help = (
        "Migrate the edX data via Trino from the data platform to the corresponding models in MITx Online "
        "e.g Course, CourseRun, CoursePage CertificatePage, etc."
    )

    def _connect_to_trino(self):
        """
        Build a Trino DB-API connection from the TRINO_* Django settings.

        Returns:
            The connection object returned by trino.dbapi.connect

        Raises:
            Exception: Any error raised while building the connection is reported
                on stdout and then re-raised unchanged
        """
        try:
            conn = connect(
                host=settings.TRINO_HOST,
                port=settings.TRINO_PORT,
                user=settings.TRINO_USER,
                auth=BasicAuthentication(settings.TRINO_USER, settings.TRINO_PASSWORD),
                catalog=settings.TRINO_CATALOG,
                schema="ol_warehouse_production_migration",
            )
            self.stdout.write(self.style.SUCCESS("Successfully connected to Trino"))
        except Exception as e:
            self.stdout.write(self.style.ERROR(f"Failed to connect to Trino: {e}"))
            raise
        else:
            return conn

    def _create_course(self, row):
        """
        Create a new Course instance or get an existing one using data from the row.

        Args:
            row (dict): Dictionary containing course data from Trino

        Returns:
            tuple: Tuple containing (Course instance, boolean indicating if created)
                - Course instance: The created or retrieved Course model instance
                - boolean: True if a new course was created, False if existing was retrieved
        """
        course, created = Course.objects.get_or_create(
            readable_id=row.get("course_readable_id"),
            defaults={
                "title": row.get("course_title"),
                "live": False,
            },
        )

        # Rows that carry a mitxonline_course_id pass the skip filter even when
        # department_name is empty, so only create/attach a department when a
        # name is actually present instead of looking one up with name=None.
        department_name = row.get("department_name")
        if department_name:
            department, _ = Department.objects.get_or_create(name=department_name)
            course.departments.add(department)
        course.save()

        return course, created

    def _create_course_certificate_page(self, course_page, course_title, signatories):
        """
        Create a certificate page for a course and associate it with its signatories.

        Args:
            course_page: The course page to create the certificate page under
            course_title: Title of the course to use in certificate
            signatories: The SignatoryPage instances to associate with the certificate

        Returns:
            The created CertificatePage instance
        """
        certificate_page = course_page.add_child(
            instance=CertificatePage(
                product_name=course_title,
                title=f"Certificate For {course_title}",
                live=True,
            )
        )

        if signatories:
            # Create the signatory blocks with only required fields
            certificate_page.signatories = [
                {"type": "signatory", "value": signatory.id}
                for signatory in signatories
            ]

        # Save the page first to create the initial content
        certificate_page.save()

        # Create a new revision and publish it
        revision = certificate_page.save_revision()
        revision.publish()

        return certificate_page

    def _create_course_run(self, course, row):
        """
        Create a new CourseRun instance for the given course using data from row.

        An existing run with the same courseware_id is returned untouched.

        Args:
            course (Course): Course model instance to create the run for
            row (dict): Dictionary containing course run data from Trino

        Returns:
            tuple: Tuple containing (CourseRun instance, boolean indicating if created)
        """
        course_run, created_run = CourseRun.objects.get_or_create(
            courseware_id=row.get("courseware_id"),
            defaults={
                "course": course,
                "run_tag": row.get("run_tag"),
                "start_date": row.get("start_date"),
                "end_date": row.get("end_date"),
                "enrollment_start": row.get("enrollment_start"),
                "enrollment_end": row.get("enrollment_end"),
                "title": row.get("courserun_title"),
                "live": True,
                "is_self_paced": row.get("is_self_paced"),
            },
        )
        return course_run, created_run

    def _get_signatories(self, signatory_list, use_default_signatory):
        """
        Get SignatoryPage based on the provided list of names or use the default signatory.

        Args:
            signatory_list (list): List of signatory names; None is treated as empty
            use_default_signatory: Boolean indicating whether to use the default signatory

        Returns:
            list: List of SignatoryPage

        Raises:
            SystemExit: With code -1 when the default signatory is requested but
                no SignatoryPage exists
        """
        if use_default_signatory:
            signatory_obj = SignatoryPage.objects.first()
            if not signatory_obj:
                self.stdout.write(
                    self.style.ERROR(
                        "No signatory found in the system. Please create at least one SignatoryPage instance."
                    )
                )
                # Same exception and exit code as the `exit(-1)` builtin, without
                # depending on the helper injected by the `site` module.
                raise SystemExit(-1)
            return [signatory_obj]

        # A NULL column arrives as None (the key exists, so a dict.get default
        # does not apply); treat it the same as "no signatories".
        signatory_names = [
            name.strip()
            for name in (signatory_list or [])
            if isinstance(name, str) and name.strip()
        ]
        return list(SignatoryPage.objects.filter(name__in=signatory_names))

    @staticmethod
    def _should_skip_row(row):
        """
        Decide whether a Trino row must be ignored by the migration.

        A row is skipped when it has neither a department name nor a
        mitxonline_course_id, or when its certificate_count is missing or
        lower than 1.
        """
        if not row.get("department_name") and row.get("mitxonline_course_id") is None:
            return True
        certificate_count = row.get("certificate_count")
        return certificate_count is None or int(certificate_count) < 1

    def _migrate_course_runs(self, conn, options):
        """
        Migrate course runs, their associated courses, course pages, and course certificate pages.

        Args:
            conn: Open DB-API connection to Trino
            options (dict): Parsed command options; reads "use_default_signatory",
                "limit" and "batch_size"
        """
        use_default_signatory = options["use_default_signatory"]
        limit = options.get("limit")
        # Fall back to the default for a missing/None/0 batch size: fetching in
        # batches of 0 would end the loop before any row is processed.
        batch_size = options.get("batch_size") or 1000

        query = "SELECT * FROM edxorg_to_mitxonline_course_runs"
        if limit is not None:
            query += f" LIMIT {int(limit)}"

        course_success_count = 0
        run_success_count = 0

        cur = conn.cursor()
        cur.execute(query)
        try:
            columns = [desc[0] for desc in cur.description]

            while True:
                results = cur.fetchmany(batch_size)
                if not results:
                    break

                for result in results:
                    row = dict(zip(columns, result))
                    # Skip rows with no department courses or any course with no certificates
                    if self._should_skip_row(row):
                        continue

                    signatories = self._get_signatories(
                        row.get("signatory_names", []), use_default_signatory
                    )

                    course, course_created = self._create_course(row)

                    if course_created:
                        # CMS page creation is best-effort: a failure is reported
                        # and the course run is still created below.
                        try:
                            course_page = create_default_courseware_page(
                                course, live=False
                            )
                            self._create_course_certificate_page(
                                course_page, row.get("course_title"), signatories
                            )
                            course_success_count += 1
                        except Exception as e:  # noqa: BLE001
                            self.stdout.write(
                                self.style.ERROR(
                                    f"Could not create CMS page {course.readable_id}, skipping it: {e}"
                                )
                            )

                    _, run_created = self._create_course_run(course, row)
                    if run_created:
                        run_success_count += 1
        finally:
            # Release the cursor even when a row fails part-way through
            cur.close()

        self.stdout.write(self.style.SUCCESS(f"{course_success_count} courses created"))
        self.stdout.write(
            self.style.SUCCESS(f"{run_success_count} course runs created")
        )

    def add_arguments(self, parser) -> None:
        """Register the --use-default-signatory, --limit and --batch-size options."""
        parser.add_argument(
            "--use-default-signatory",
            action="store_true",
            help="Use default signatory for certificate pages for testing purposes, which is the first signatory.",
        )
        parser.add_argument(
            "--limit",
            type=int,
            help="Limit the number of rows processed from Trino (for testing purposes)",
        )
        parser.add_argument(
            "--batch-size",
            type=int,
            default=1000,
            help="Number of rows to fetch per batch from Trino (default: 1000)",
        )

    def handle(self, *args, **options):  # pylint: disable=unused-argument  # noqa: ARG002
        """Connect to Trino, run the migration, and always close the connection."""
        conn = self._connect_to_trino()
        try:
            self._migrate_course_runs(conn, options)
        finally:
            conn.close()

main/settings.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1593,3 +1593,33 @@
15931593
default=False,
15941594
description="Disable the task so it no-ops",
15951595
)
1596+
1597+
# Trino connection settings: host, port, catalog and credentials.
TRINO_HOST = get_string(
    name="TRINO_HOST",
    default=None,
    description="Host URL for Trino server",
)

TRINO_PORT = get_int(
    name="TRINO_PORT",
    default=443,
    description="Port number for Trino server",
)

TRINO_CATALOG = get_string(
    name="TRINO_CATALOG",
    default=None,
    description="Catalog name for Trino queries",
)

TRINO_USER = get_string(
    name="TRINO_USER",
    default=None,
    description="Username for Trino authentication",
)

TRINO_PASSWORD = get_string(
    name="TRINO_PASSWORD",
    default=None,
    description="Password for Trino authentication",
)

0 commit comments

Comments
 (0)