From d5b706d5469423087f91816c9f0e32095d338efc Mon Sep 17 00:00:00 2001 From: mor3dr3ad Date: Wed, 6 Nov 2019 14:22:00 +0100 Subject: [PATCH 1/2] Update parsing.py to account for new edx structure Updated parsing on lines 385 and 397 to account for a slightly changed structure in edx's website. Sections can now have one of two classes: `outline-item section` or `outline-item section scored`. Changed code for subsections accordingly too. --- edx_dl/parsing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py index 5e50d354..4b0557b0 100644 --- a/edx_dl/parsing.py +++ b/edx_dl/parsing.py @@ -382,7 +382,7 @@ def _get_section_name(section_soup): # FIXME: Extract from here and test def _make_subsections(section_soup): try: - subsections_soup = section_soup.find_all('li', class_='vertical outline-item focusable') + subsections_soup = section_soup.find_all('li', class_=['vertical outline-item focusable', 'vertical outline-item focusable scored']) except AttributeError: return [] # FIXME correct extraction of subsection.name (unicode) @@ -394,7 +394,7 @@ def _make_subsections(section_soup): return subsections soup = BeautifulSoup(page) - sections_soup = soup.find_all('li', class_='outline-item section') + sections_soup = soup.find_all('li', class_=['outline-item section', 'outline-item section']) sections = [Section(position=i, name=_get_section_name(section_soup), From 006045f8ddf86c816d98a1a2a18ca5dc613e8f17 Mon Sep 17 00:00:00 2001 From: mor3dr3ad Date: Wed, 6 Nov 2019 14:58:20 +0100 Subject: [PATCH 2/2] update parsing.py to account for new edx structure recently edx slightly changed their website's structure for some courses: - sections now have one of two classes: `outline-item section` or `outline-item section scored` - subsections have one of two classes: `vertical outline-item focusable` or `vertical outline-item focusable scored`. this resulted in 0 downloadable sections being found for some courses, e.g. 
https://courses.edx.org/courses/course-v1:MITx+14.750x+3T2019/course/. A small change to the NewEdxPageExtractor class in edx_dl/parsing.py resolves the issue. The class identifier has been replaced with a list representing both possibilities. This should therefore work for old courses, new courses, and courses that mix both structures. --- edx_dl/parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py index 4b0557b0..68b12aad 100644 --- a/edx_dl/parsing.py +++ b/edx_dl/parsing.py @@ -394,7 +394,7 @@ def _make_subsections(section_soup): return subsections soup = BeautifulSoup(page) - sections_soup = soup.find_all('li', class_=['outline-item section', 'outline-item section']) + sections_soup = soup.find_all('li', class_=['outline-item section', 'outline-item section scored']) sections = [Section(position=i, name=_get_section_name(section_soup),