#!/usr/bin/env python
"""
For downloading lecture resources such as videos for Coursera classes. Given a class name and related cookie file, it scrapes the course listing page to get the section (week) and lecture names, and then downloads the related materials into appropriately named files and directories.
Examples:
coursera-dl -c cookies.txt saas
coursera-dl -c cookies.txt -l listing.html -o saas --skip-download
Author:
John Lehmann (first last at geemail dotcom or @jplehmann)
"""
import sys, os, re, string
import urllib2, cookielib
import tempfile
import subprocess
import argparse
import StringIO
from BeautifulSoup import BeautifulSoup
def get_syllabus_url(className):
    """Return the URL of the course's lecture index (syllabus) page."""
    url_template = "http://class.coursera.org/%s/lecture/index"
    return url_template % className
def load_cookies_file(cookies_file):
    """Load a cookies.txt file into an in-memory file object.

    The special Netscape magic header is prepended (followed by a
    newline so it occupies a line of its own) because the cookie loader
    is very particular about seeing that exact header on the first line.
    Without the newline the header and the file's own first line were
    fused together, which would corrupt a cookie entry appearing on the
    first line.

    Returns a StringIO positioned at the start of the combined content.
    """
    cookies = StringIO.StringIO()
    NETSCAPE_HEADER = "# Netscape HTTP Cookie File"
    cookies.write(NETSCAPE_HEADER + "\n")
    f = open(cookies_file, 'r')
    try:
        cookies.write(f.read())
    finally:
        f.close()  # don't leak the file handle
    cookies.flush()
    cookies.seek(0)
    return cookies
def get_opener(cookies_file):
    """Build a urllib2 opener whose cookie jar is seeded from cookies_file."""
    jar = cookielib.MozillaCookieJar()
    cookie_data = load_cookies_file(cookies_file)
    # nasty hack: cj.load() only accepts a filename, and a StringIO has
    # none, so feed the in-memory file straight to the private loader.
    # A NamedTemporaryFile would also work but misbehaved on Windows.
    jar._really_load(cookie_data, "StringIO.cookies", False, False)
    return urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
def get_page(url, cookies_file):
    """Fetch a URL as a string, authenticating via the cookie jar."""
    response = get_opener(cookies_file).open(url)
    return response.read()
def get_syllabus(class_name, cookies_file, local_page=False):
""" Get the course listing webpage."""
if (not (local_page and os.path.exists(local_page))):
url = get_syllabus_url(class_name)
page = get_page(url, cookies_file)
print "Downloaded %s (%d bytes)" % (url, len(page))
# cache the page if we're in 'local' mode
if (local_page):
open(local_page, 'w').write(page)
else:
page = open(local_page).read()
return page
def clean_filename(s):
    """Sanitize a string so it is safe to use as a file/directory name.

    Drops a trailing parenthesised portion (where the running time such
    as "(12:34)" appears), normalises separators, then keeps only a
    conservative whitelist of filename characters.
    """
    # strip paren portions which contain trailing time length (...);
    # raw string so the regex escapes survive string-literal parsing
    s = re.sub(r"\([^\(]*$", "", s)
    s = s.strip().replace(':', '-').replace(' ', '_')
    valid_chars = "-_.()%s%s" % (string.ascii_letters, string.digits)
    return ''.join(c for c in s if c in valid_chars)
def get_anchor_format(a):
    """Return the resource file-type extension found in anchor href `a`.

    Matches either a trailing ".ext" or a "format=ext" query parameter,
    optionally followed by a query string, e.g. "...format=txt" or
    "...download.mp4?...".  Returns None when nothing matches.
    """
    # (. or format=) then (file_extension) then (? or end-of-string);
    # raw string keeps the regex escapes intact, and `match` avoids
    # shadowing the builtin `format`
    match = re.search(r"(?:\.|format=)(\w+)(?:\?.*)?$", a)
    return match.group(1) if match else None
def parse_syllabus(page):
    """Parses a Coursera course listing/syllabus page.
    Each section is a week of classes.

    Returns a list of (section_name, lectures) pairs, where lectures is
    a list of (lecture_name, {format: url}) pairs.  Progress is printed
    to stdout while the page is walked.
    """
    sections = []
    # NOTE(review): uses the BeautifulSoup 3 API (findAll/nextSibling);
    # the DOM assumptions below match the Coursera markup this scraper
    # was written against -- verify against a current page before reuse.
    soup = BeautifulSoup(page)
    # traverse sections
    for stag in soup.findAll(attrs={'class':'list_header'}):
        assert stag.string != None, "couldn't find section"
        section_name = clean_filename(stag.string)
        print section_name
        lectures = [] # resources for 1 lecture
        # traverse resources (e.g., video, ppt, ..)
        # presumably the sibling of the header's parent holds this
        # section's <li> lecture items -- TODO confirm against markup
        for vtag in stag.parent.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            vname = clean_filename(vtag.a.contents[0])
            print " ", vname
            lecture = {}
            # each anchor is one downloadable resource for this lecture
            for a in vtag.findAll('a'):
                href = a['href']
                format = get_anchor_format(href)
                print " ", format, href
                if format: lecture[format] = href
            lectures.append((vname, lecture))
        sections.append((section_name, lectures))
    print "Found %d sections and %d lectures on this page" % \
        (len(sections), sum((len(s[1]) for s in sections)))
    # an empty result usually means authentication (or the name) failed
    if (not len(sections)):
        print "Probably bad cookies file (or wrong class name)"
    return sections
def download_lectures(
    wget_bin,
    cookies_file,
    class_name,
    sections,
    file_formats,
    overwrite=False,
    skip_download=False,
    section_filter=None,
    lecture_filter=None
    ):
    """Downloads lecture resources described by sections.

    Section/lecture regex filters narrow what is fetched; existing files
    are skipped unless overwrite is set; skip_download merely touches
    the target files (debugging aid).
    """
    prefix = class_name.upper()
    want_all = "all" in file_formats

    def section_dir(index, title):
        # e.g. "SAAS_01_Week_One"
        return "%s_%02d_%s" % (prefix, index, title)

    def resource_name(index, title, ext):
        # e.g. "01_Intro.mp4"
        return "%02d_%s.%s" % (index, title, ext)

    for idx, (title, lectures) in enumerate(sections):
        if section_filter and not re.search(section_filter, title):
            continue
        dirname = section_dir(idx + 1, title)
        for lecidx, (lecname, resources) in enumerate(lectures):
            if lecture_filter and not re.search(lecture_filter, lecname):
                continue
            # create the section directory lazily, only once a lecture
            # actually survives the filters
            if not os.path.exists(dirname):
                os.mkdir(dirname)
            # write out every resource whose format was requested
            for ext, url in resources.items():
                if not (want_all or ext in file_formats):
                    continue
                target = os.path.join(dirname, resource_name(lecidx + 1, lecname, ext))
                print(target)
                if overwrite or not os.path.exists(target):
                    if skip_download:
                        open(target, 'w').close() # touch
                    else:
                        download_file(url, target, cookies_file, wget_bin)
def download_file(url, fn, cookies_file, wget_bin):
"""Downloads file and removes current file if aborted by user."""
try:
if wget_bin:
download_file_wget(wget_bin, url, fn, cookies_file)
else:
download_file_nowget(url, fn, cookies_file)
except KeyboardInterrupt, e:
print "\nKeyboard Interrupt -- Removing partial file:", fn
os.remove(fn)
sys.exit()
def download_file_wget(wget_bin, url, fn, cookies_file):
"""Downloads a file using wget. Could possibly use python to stream files to
disk, but wget is robust and gives nice visual feedback."""
cmd = [wget_bin, url, "-O", fn, "--load-cookies", cookies_file, "--no-check-certificate"]
print "Executing wget:", cmd
retcode = subprocess.call(cmd)
def download_file_nowget(url, fn, cookies_file):
"""'Native' python downloader -- slower than wget."""
print "Downloading %s -> %s" % (url, fn)
urlfile = get_opener(cookies_file).open(url)
chunk_sz = 1048576
bytesread = 0
f = open(fn, "wb")
while True:
data = urlfile.read(chunk_sz)
if not data:
print "."
break
f.write(data)
bytesread += len(data)
print "\r%d bytes read" % bytesread,
sys.stdout.flush()
def parseArgs():
    """Parse and validate the command line.

    Splits the space-separated formats string into a list and verifies
    that the cookies file exists before any network work starts.
    """
    arg_parser = argparse.ArgumentParser(
        description='Download Coursera.org lecture material and resources.')
    # positional
    arg_parser.add_argument('class_name',
                            help='name of the class (e.g. "nlp")')
    # required
    arg_parser.add_argument('-c', '--cookies_file',
                            dest='cookies_file',
                            required=True,
                            help='full path to the cookies.txt file')
    # optional
    arg_parser.add_argument('-f', '--formats',
                            dest='file_formats',
                            default="all",
                            help='file format extensions to be downloaded in quotes space separated, e.g. "mp4 pdf" (default: special value "all")')
    arg_parser.add_argument('-sf', '--section_filter',
                            dest='section_filter',
                            default=None,
                            help='only download sections which contain this regex (default: disabled)')
    arg_parser.add_argument('-lf', '--lecture_filter',
                            dest='lecture_filter',
                            default=None,
                            help='only download lectures which contain this regex (default: disabled)')
    arg_parser.add_argument('-w', '--wget_bin',
                            dest='wget_bin',
                            default=None,
                            help='wget binary if it should be used for downloading')
    arg_parser.add_argument('-o', '--overwrite',
                            dest='overwrite',
                            action='store_true',
                            default=False,
                            help='whether existing files should be overwritten (default: False)')
    arg_parser.add_argument('-l', '--process_local_page',
                            dest='local_page',
                            help='for debugging: uses or creates local cached version of syllabus page')
    arg_parser.add_argument('--skip-download',
                            dest='skip_download',
                            action='store_true',
                            default=False,
                            help='for debugging: skip actual downloading of files')
    args = arg_parser.parse_args()
    # the formats option arrives as one space-separated string
    args.file_formats = args.file_formats.split()
    # fail fast on a bad cookies path rather than midway through a run
    if not os.path.exists(args.cookies_file):
        raise IOError("Cookies file not found: " + args.cookies_file)
    return args
def main():
    """Entry point: fetch the syllabus, parse it, download the resources."""
    args = parseArgs()
    syllabus_page = get_syllabus(args.class_name, args.cookies_file, args.local_page)
    course_sections = parse_syllabus(syllabus_page)
    download_lectures(args.wget_bin,
                      args.cookies_file,
                      args.class_name,
                      course_sections,
                      args.file_formats,
                      args.overwrite,
                      args.skip_download,
                      args.section_filter,
                      args.lecture_filter)
# run only when executed as a script, not when imported
if __name__ == "__main__":
    main()