diff --git a/README.md b/README.md index 75ef31c1..4cabf138 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ These are the current supported sites: - [France Université Numérique](https://www.france-universite-numerique-mooc.fr/) - [GW Online SEAS](http://openedx.seas.gwu.edu/) - George Washington University - [GW Online Open](http://mooc.online.gwu.edu/) - George Washington University +- [Xuetangx (学堂在线)](http://www.xuetangx.com/) This is the full [list of sites powered by Open edX][sites]. Not all of them are supported at the moment, we welcome you to contribute support for them diff --git a/edx_dl/edx_dl.py b/edx_dl/edx_dl.py index 64486674..218a74ac 100644 --- a/edx_dl/edx_dl.py +++ b/edx_dl/edx_dl.py @@ -14,13 +14,14 @@ import pickle import re import sys +import math from functools import partial from multiprocessing.dummy import Pool as ThreadPool from six.moves.http_cookiejar import CookieJar from six.moves.urllib.error import HTTPError, URLError -from six.moves.urllib.parse import urlencode +from six.moves.urllib.parse import urlencode, quote from six.moves.urllib.request import ( urlopen, build_opener, @@ -93,19 +94,25 @@ 'bits':{ 'url':'http://any-learn.bits-pilani.ac.in', 'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}), + }, + 'xuetangx': { + 'url': 'http://www.xuetangx.com', + 'courseware-selector': None, } } -BASE_URL = OPENEDX_SITES['edx']['url'] +SITE_NAME = 'edx' +BASE_URL = OPENEDX_SITES[SITE_NAME]['url'] EDX_HOMEPAGE = BASE_URL + '/login_ajax' LOGIN_API = BASE_URL + '/login_ajax' DASHBOARD = BASE_URL + '/dashboard' -COURSEWARE_SEL = OPENEDX_SITES['edx']['courseware-selector'] +COURSEWARE_SEL = OPENEDX_SITES[SITE_NAME]['courseware-selector'] def change_openedx_site(site_name): """ Changes the openedx website for the given one via the key """ + global SITE_NAME global BASE_URL global EDX_HOMEPAGE global LOGIN_API @@ -117,11 +124,15 @@ def change_openedx_site(site_name): logging.error("OpenEdX platform should be one of: 
%s", ', '.join(sites)) sys.exit(ExitCode.UNKNOWN_PLATFORM) - BASE_URL = OPENEDX_SITES[site_name]['url'] + SITE_NAME = site_name + BASE_URL = OPENEDX_SITES[SITE_NAME]['url'] EDX_HOMEPAGE = BASE_URL + '/login_ajax' LOGIN_API = BASE_URL + '/login_ajax' - DASHBOARD = BASE_URL + '/dashboard' - COURSEWARE_SEL = OPENEDX_SITES[site_name]['courseware-selector'] + if site_name == 'xuetangx': + DASHBOARD = BASE_URL + '/api/web/courses/mycourses?format=json' + else: + DASHBOARD = BASE_URL + '/dashboard' + COURSEWARE_SEL = OPENEDX_SITES[SITE_NAME]['courseware-selector'] def _display_courses(courses): @@ -135,10 +146,67 @@ def _display_courses(courses): logging.info(' %s', course.url) +def get_courses_info_xuetangx(url, headers): + """ + Extracts the courses information from the dashboard. + + This function is re-implemented for http://www.xuetangx.com, because + Xuetangx uses a REST API, which is quite different from other OpenEdX sites. + """ + def fetch_and_parse(base_url, param): + """ + Fetches the JSON API, and returns the total count, and a list of dicts + for the results on the current page. + + :param base_url: the URL of the API. + :param param: query parameters, represented by a list of tuples. + :return: a (total, results) tuple; (0, []) on failure. + """ + url = base_url + '?' 
+ urlencode(param)
+        page = get_page_contents(url, headers)
+        try:
+            d = json.loads(page)
+            total = d['total']
+            results = d['results']
+        except (ValueError, KeyError):
+            total = 0
+            results = []
+        return total, results
+
+    logging.info('Extracting course information from JSON API.')
+
+    api_url = BASE_URL + '/api/web/courses/mycourses'
+    query_params = [
+        [('type', 'started'), ('format', 'json')],
+        [('type', 'ended'), ('format', 'json')]
+    ]
+    # use default page size, and fetch multiple times, in case there is a hard
+    # limit set by the API
+    page_size = 10
+
+    courses = []
+    page_extractor = get_page_extractor(url)
+
+    for param in query_params:
+        total, results = fetch_and_parse(api_url, param)
+        page_count = int(math.ceil(1.0 * total / page_size))
+        for i in range(page_count):
+            if i:
+                # page needs to be re-fetched unless it is the first one
+                new_param = param + [('offset', i * page_size)]
+                _, results = fetch_and_parse(api_url, new_param)
+            courses += page_extractor.extract_courses(results, BASE_URL)
+
+    return courses
+
+
 def get_courses_info(url, headers):
     """
     Extracts the courses information from the dashboard.
""" + if SITE_NAME == 'xuetangx': + return get_courses_info_xuetangx(url, headers) + logging.info('Extracting course information from dashboard.') page = get_page_contents(url, headers) @@ -310,6 +378,14 @@ def parse_args(): default=False, help='list available sections') + parser.add_argument('--quality', + dest='quality', + action='store', + choices={'high', 'standard'}, + default='high', + help='quality of video to download; works for xuetangx' + ' only') + parser.add_argument('--youtube-dl-options', dest='youtube_dl_options', action='store', @@ -437,6 +513,9 @@ def extract_units(url, headers, file_formats): page = get_page_contents(url, headers) page_extractor = get_page_extractor(url) + set_headers = getattr(page_extractor, 'set_headers', None) + if callable(set_headers): + set_headers(headers) units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats) return units @@ -666,27 +745,45 @@ def _build_subtitles_downloads(video, target_dir, filename_prefix, headers): return downloads -def _build_url_downloads(urls, target_dir, filename_prefix): +def _build_url_downloads(urls, target_dir, filename_prefix, args, + is_video=False): """ Builds a dict {url: filename} for the given urls If it is a youtube url it uses the valid template for youtube-dl otherwise just takes the name of the file from the url """ + if SITE_NAME == 'xuetangx' and is_video and urls: + # take advantage of the fact that the URL of HQ videos are + # lexicographically larger on Xuetangx ('quality20' > 'quality10') + urls = [max(urls)] if args.quality == 'high' else [min(urls)] downloads = {url: - _build_filename_from_url(url, target_dir, filename_prefix) + _build_filename_from_url(url, target_dir, filename_prefix, + is_video=is_video) for url in urls} return downloads -def _build_filename_from_url(url, target_dir, filename_prefix): +def _build_filename_from_url(url, target_dir, filename_prefix, is_video=False, + video_counter=[0]): """ Builds the appropriate filename for the given 
args
     """
+    # video file names in Xuetangx do not make sense;
+    # use a counter as a workaround
+    if is_video:
+        video_counter[0] += 1
+
     if is_youtube_url(url):
         filename_template = filename_prefix + "-%(title)s-%(id)s.%(ext)s"
         filename = os.path.join(target_dir, filename_template)
     else:
-        original_filename = url.rsplit('/', 1)[1]
+        if SITE_NAME == 'xuetangx' and is_video:
+            original_filename = 'video_%05d.mp4' % video_counter[0]
+        else:
+            original_filename = url.rsplit('/', 1)[1]
+            # remove special characters that may cause problems under Windows
+            original_filename = ''.join(list(filter(
+                lambda c: c not in ';/?:@&=+$,', original_filename)))
         filename = os.path.join(target_dir,
                                 filename_prefix + '-' + original_filename)
 
@@ -697,6 +794,8 @@ def download_url(url, filename, headers, args):
     """
     Downloads the given url in filename.
     """
+    # resolve unicode issue
+    url = quote(url, safe=';/?:@&=+$,')
 
     if is_youtube_url(url):
         download_youtube_url(url, filename, headers, args)
@@ -779,13 +878,15 @@ def skip_or_download(downloads, headers, args, f=download_url):
 
 def download_video(video, args, target_dir, filename_prefix, headers):
     if args.prefer_cdn_videos or video.video_youtube_url is None:
         mp4_downloads = _build_url_downloads(video.mp4_urls, target_dir,
-                                             filename_prefix)
+                                             filename_prefix, args,
+                                             is_video=True)
         skip_or_download(mp4_downloads, headers, args)
     else:
         if video.video_youtube_url is not None:
             youtube_downloads = _build_url_downloads([video.video_youtube_url],
                                                      target_dir,
-                                                     filename_prefix)
+                                                     filename_prefix, args,
+                                                     is_video=True)
             skip_or_download(youtube_downloads, headers, args)
 
     # the behavior with subtitles is different, since the subtitles don't know
@@ -813,7 +914,7 @@ def download_unit(unit, args, target_dir, filename_prefix, headers):
         download_video(video, args, target_dir, new_prefix, headers)
 
     res_downloads = _build_url_downloads(unit.resources_urls, target_dir,
-                                         filename_prefix)
+                                         filename_prefix, args)
     skip_or_download(res_downloads, headers, args)
 
 
@@ 
-827,13 +928,19 @@ def download(args, selections, all_units, headers): # notice that we could iterate over all_units, but we prefer to do it over # sections/subsections to add correct prefixes and show nicer information. + # courses on Xuetangx may contain chinese characters + preserve_non_ascii = (SITE_NAME == 'xuetangx') + for selected_course, selected_sections in selections.items(): - coursename = directory_name(selected_course.name) + coursename = directory_name(selected_course.name, + minimal_change=preserve_non_ascii) for selected_section in selected_sections: section_dirname = "%02d-%s" % (selected_section.position, selected_section.name) target_dir = os.path.join(args.output_dir, coursename, - clean_filename(section_dirname)) + clean_filename(section_dirname, + minimal_change= + preserve_non_ascii)) mkdir_p(target_dir) counter = 0 for subsection in selected_section.subsections: diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py index 5e50d354..e8980376 100644 --- a/edx_dl/parsing.py +++ b/edx_dl/parsing.py @@ -5,6 +5,7 @@ """ import re import json +import logging from datetime import timedelta, datetime @@ -12,6 +13,7 @@ from bs4 import BeautifulSoup as BeautifulSoup_ from .common import Course, Section, SubSection, Unit, Video +from .utils import get_page_contents, remove_blanks # Force use of bs4 with html.parser @@ -188,7 +190,9 @@ def extract_resources_urls(self, text, BASE_URL, file_formats): youtube_links = re_youtube_links.findall(text) resources_urls += youtube_links - return resources_urls + # there may be some surplus blank characters extracted from the HTML; + # remove them + return list(map(remove_blanks, resources_urls)) def extract_sections_from_html(self, page, BASE_URL): """ @@ -408,6 +412,75 @@ def _make_subsections(section_soup): return sections +class XuetangxPageExtractor(ClassicEdXPageExtractor): + + def __init__(self): + self.headers = None + + def set_headers(self, headers): + """Sets the headers necessary for accessing the video URL 
API""" + self.headers = headers + self.base_url = None + + def extract_courses(self, results, BASE_URL): + """ + Extract courses from a list of dicts. + """ + courses = [] + + for result in results: + try: + course_id = result['id'] + course_name = result['name'] + course_url = BASE_URL + result['info_link'] + # Xuetangx allows accessing materials for all archived courses, + # so it's safe to mark all courses as 'Started'. + course_state = 'Started' + except KeyError: + continue + courses.append(Course(id=course_id, + name=course_name, + url=course_url, + state=course_state)) + + return courses + + def extract_units_from_html(self, page, BASE_URL, file_formats): + self.base_url = BASE_URL + return ClassicEdXPageExtractor.extract_units_from_html(self, page, + BASE_URL, + file_formats) + + def extract_mp4_urls(self, text): + """ + Looks for available links to the mp4 version of the videos + """ + # Xuetangx does not provide the video URL directly in the page; + # instead, a video id can be found in the page and translated into + # actual URL through a "video2source" API. 
+        m = re.search(r"(?<=data-ccsource=').+(?=')", text)
+        if not m:
+            return []
+
+        video_id = m.group(0)
+        if not self.base_url:
+            logging.debug('Base URL unset; please set self.base_url before '
+                          'calling extract_mp4_urls')
+            return []
+        video_src_url = self.base_url + '/videoid2source/' + video_id
+        video_src_json = get_page_contents(video_src_url, self.headers)
+        try:
+            sources = json.loads(video_src_json)['sources']
+        except (ValueError, KeyError):
+            return []
+
+        mp4_urls = []
+        for quality in sources:
+            if sources[quality]:
+                mp4_urls.append(sources[quality][0])
+        return mp4_urls
+
+
 def get_page_extractor(url):
     """
     factory method for page extractors
@@ -423,6 +496,8 @@ def get_page_extractor(url):
            url.startswith('https://www.fun-mooc.fr')
            ):
         return CurrentEdXPageExtractor()
+    elif 'xuetangx.com' in url:
+        return XuetangxPageExtractor()
     else:
         return ClassicEdXPageExtractor()
diff --git a/edx_dl/utils.py b/edx_dl/utils.py
index 0ec44718..6a61f601 100644
--- a/edx_dl/utils.py
+++ b/edx_dl/utils.py
@@ -42,11 +42,11 @@ def execute_command(cmd, args):
         raise e
 
 
-def directory_name(initial_name):
+def directory_name(initial_name, minimal_change=False):
     """
     Transform the name of a directory into an ascii version
     """
-    result = clean_filename(initial_name)
+    result = clean_filename(initial_name, minimal_change=minimal_change)
     return result if result != "" else "course_folder"
 
 
@@ -139,3 +139,8 @@ def clean_filename(s, minimal_change=False):
     s = s.strip().replace(' ', '_')
     valid_chars = '-_.()%s%s' % (string.ascii_letters, string.digits)
     return ''.join(c for c in s if c in valid_chars)
+
+
+def remove_blanks(s):
+    """Remove all blank characters from a string."""
+    return ''.join(list(filter(lambda c: not c.isspace(), s)))
\ No newline at end of file