#!/usr/bin/env python
"""
For downloading lecture resources such as videos for Coursera classes. Given a class name and related cookie file, it scrapes the course listing page to get the section (week) and lecture names, and then downloads the related materials into appropriately named files and directories.
Examples:
coursera-dl -c cookies.txt saas
coursera-dl -c cookies.txt -l listing.html -o saas --skip-download
Author:
John Lehmann (first last at geemail dotcom or @jplehmann)
"""
import sys, os, re, string
import urllib2, cookielib
import tempfile
import subprocess
import argparse
import StringIO
from BeautifulSoup import BeautifulSoup
def get_syllabus_url(className):
    """Return the URL of the course's lecture index (syllabus) page."""
    url_template = "http://class.coursera.org/%s/lecture/index"
    return url_template % className
def load_cookies_file(cookies_file):
    """Load a cookies.txt file into an in-memory file object.

    The special Netscape magic header is prepended (followed by a
    newline so it occupies a line of its own) because the cookie loader
    is very particular about seeing that exact header on the first line.
    Without the newline the header and the file's own first line were
    fused together, which would corrupt a cookie entry appearing on the
    first line.

    Returns a StringIO positioned at the start of the combined content.
    """
    cookies = StringIO.StringIO()
    NETSCAPE_HEADER = "# Netscape HTTP Cookie File"
    cookies.write(NETSCAPE_HEADER + "\n")
    f = open(cookies_file, 'r')
    try:
        cookies.write(f.read())
    finally:
        f.close()  # don't leak the file handle
    cookies.flush()
    cookies.seek(0)
    return cookies
def get_opener(cookies_file):
    """Build a urllib2 opener whose cookie jar is seeded from cookies_file."""
    jar = cookielib.MozillaCookieJar()
    cookie_data = load_cookies_file(cookies_file)
    # nasty hack: cj.load() only accepts a filename, and a StringIO has
    # none, so feed the in-memory file straight to the private loader.
    # A NamedTemporaryFile would also work but misbehaved on Windows.
    jar._really_load(cookie_data, "StringIO.cookies", False, False)
    return urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
def get_page(url, cookies_file):
    """Fetch a URL as a string, authenticating via the cookie jar."""
    response = get_opener(cookies_file).open(url)
    return response.read()
def get_syllabus(class_name, cookies_file, local_page=False):
""" Get the course listing webpage."""
if (not (local_page and os.path.exists(local_page))):
url = get_syllabus_url(class_name)
page = get_page(url, cookies_file)
print "Downloaded %s (%d bytes)" % (url, len(page))
# cache the page if we're in 'local' mode
if (local_page):
open(local_page, 'w').write(page)
else:
page = open(local_page).read()
return page
def clean_filename(s):
    """Sanitize a string so it is safe to use as a file/directory name.

    Drops a trailing parenthesised portion (where the running time such
    as "(12:34)" appears), normalises separators, then keeps only a
    conservative whitelist of filename characters.
    """
    # strip paren portions which contain trailing time length (...);
    # raw string so the regex escapes survive string-literal parsing
    s = re.sub(r"\([^\(]*$", "", s)
    s = s.strip().replace(':', '-').replace(' ', '_')
    valid_chars = "-_.()%s%s" % (string.ascii_letters, string.digits)
    return ''.join(c for c in s if c in valid_chars)
def get_anchor_format(a):
    """Return the resource file-type extension found in anchor href `a`.

    Matches either a trailing ".ext" or a "format=ext" query parameter,
    optionally followed by a query string, e.g. "...format=txt" or
    "...download.mp4?...".  Returns None when nothing matches.
    """
    # (. or format=) then (file_extension) then (? or end-of-string);
    # raw string keeps the regex escapes intact, and `match` avoids
    # shadowing the builtin `format`
    match = re.search(r"(?:\.|format=)(\w+)(?:\?.*)?$", a)
    return match.group(1) if match else None
def parse_syllabus(page):
    """Parses a Coursera course listing/syllabus page.
    Each section is a week of classes.

    Returns a list of (section_name, lectures) pairs, where lectures is
    a list of (lecture_name, {format: url}) pairs.  Progress is printed
    to stdout while the page is walked.
    """
    sections = []
    # NOTE(review): uses the BeautifulSoup 3 API (findAll/nextSibling);
    # the DOM assumptions below match the Coursera markup this scraper
    # was written against -- verify against a current page before reuse.
    soup = BeautifulSoup(page)
    # traverse sections
    for stag in soup.findAll(attrs={'class':'list_header'}):
        assert stag.string != None, "couldn't find section"
        section_name = clean_filename(stag.string)
        print section_name
        lectures = [] # resources for 1 lecture
        # traverse resources (e.g., video, ppt, ..)
        # presumably the sibling of the header's parent holds this
        # section's <li> lecture items -- TODO confirm against markup
        for vtag in stag.parent.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            vname = clean_filename(vtag.a.contents[0])
            print " ", vname
            lecture = {}
            # each anchor is one downloadable resource for this lecture
            for a in vtag.findAll('a'):
                href = a['href']
                format = get_anchor_format(href)
                print " ", format, href
                if format: lecture[format] = href
            lectures.append((vname, lecture))
        sections.append((section_name, lectures))
    print "Found %d sections and %d lectures on this page" % \
        (len(sections), sum((len(s[1]) for s in sections)))
    # an empty result usually means authentication (or the name) failed
    if (not len(sections)):
        print "Probably bad cookies file (or wrong class name)"
    return sections
def download_lectures(
    wget_bin,
    cookies_file,
    class_name,
    sections,
    file_formats,
    overwrite=False,
    skip_download=False,
    section_filter=None,
    lecture_filter=None
    ):
    """Downloads lecture resources described by sections.

    Section/lecture regex filters narrow what is fetched; existing files
    are skipped unless overwrite is set; skip_download merely touches
    the target files (debugging aid).
    """
    prefix = class_name.upper()
    want_all = "all" in file_formats

    def section_dir(index, title):
        # e.g. "SAAS_01_Week_One"
        return "%s_%02d_%s" % (prefix, index, title)

    def resource_name(index, title, ext):
        # e.g. "01_Intro.mp4"
        return "%02d_%s.%s" % (index, title, ext)

    for idx, (title, lectures) in enumerate(sections):
        if section_filter and not re.search(section_filter, title):
            continue
        dirname = section_dir(idx + 1, title)
        for lecidx, (lecname, resources) in enumerate(lectures):
            if lecture_filter and not re.search(lecture_filter, lecname):
                continue
            # create the section directory lazily, only once a lecture
            # actually survives the filters
            if not os.path.exists(dirname):
                os.mkdir(dirname)
            # write out every resource whose format was requested
            for ext, url in resources.items():
                if not (want_all or ext in file_formats):
                    continue
                target = os.path.join(dirname, resource_name(lecidx + 1, lecname, ext))
                print(target)
                if overwrite or not os.path.exists(target):
                    if skip_download:
                        open(target, 'w').close() # touch
                    else:
                        download_file(url, target, cookies_file, wget_bin)
def download_file(url, fn, cookies_file, wget_bin):
"""Downloads file and removes current file if aborted by user."""
try:
if wget_bin:
download_file_wget(wget_bin, url, fn, cookies_file)
else:
download_file_nowget(url, fn, cookies_file)
except KeyboardInterrupt, e:
print "\nKeyboard Interrupt -- Removing partial file:", fn
os.remove(fn)
sys.exit()
def download_file_wget(wget_bin, url, fn, cookies_file):
"""Downloads a file using wget. Could possibly use python to stream files to
disk, but wget is robust and gives nice visual feedback."""
cmd = [wget_bin, url, "-O", fn, "--load-cookies", cookies_file, "--no-check-certificate"]
print "Executing wget:", cmd
retcode = subprocess.call(cmd)
def download_file_nowget(url, fn, cookies_file):
"""'Native' python downloader -- slower than wget."""
print "Downloading %s -> %s" % (url, fn)
urlfile = get_opener(cookies_file).open(url)
chunk_sz = 1048576
bytesread = 0
f = open(fn, "wb")
while True:
data = urlfile.read(chunk_sz)
if not data:
print "."
break
f.write(data)
bytesread += len(data)
print "\r%d bytes read" % bytesread,
sys.stdout.flush()
def parseArgs():
    """Parse and validate the command line.

    Splits the space-separated formats string into a list and verifies
    that the cookies file exists before any network work starts.
    """
    arg_parser = argparse.ArgumentParser(
        description='Download Coursera.org lecture material and resources.')
    # positional
    arg_parser.add_argument('class_name',
                            help='name of the class (e.g. "nlp")')
    # required
    arg_parser.add_argument('-c', '--cookies_file',
                            dest='cookies_file',
                            required=True,
                            help='full path to the cookies.txt file')
    # optional
    arg_parser.add_argument('-f', '--formats',
                            dest='file_formats',
                            default="all",
                            help='file format extensions to be downloaded in quotes space separated, e.g. "mp4 pdf" (default: special value "all")')
    arg_parser.add_argument('-sf', '--section_filter',
                            dest='section_filter',
                            default=None,
                            help='only download sections which contain this regex (default: disabled)')
    arg_parser.add_argument('-lf', '--lecture_filter',
                            dest='lecture_filter',
                            default=None,
                            help='only download lectures which contain this regex (default: disabled)')
    arg_parser.add_argument('-w', '--wget_bin',
                            dest='wget_bin',
                            default=None,
                            help='wget binary if it should be used for downloading')
    arg_parser.add_argument('-o', '--overwrite',
                            dest='overwrite',
                            action='store_true',
                            default=False,
                            help='whether existing files should be overwritten (default: False)')
    arg_parser.add_argument('-l', '--process_local_page',
                            dest='local_page',
                            help='for debugging: uses or creates local cached version of syllabus page')
    arg_parser.add_argument('--skip-download',
                            dest='skip_download',
                            action='store_true',
                            default=False,
                            help='for debugging: skip actual downloading of files')
    args = arg_parser.parse_args()
    # the formats option arrives as one space-separated string
    args.file_formats = args.file_formats.split()
    # fail fast on a bad cookies path rather than midway through a run
    if not os.path.exists(args.cookies_file):
        raise IOError("Cookies file not found: " + args.cookies_file)
    return args
def main():
    """Entry point: fetch the syllabus, parse it, download the resources."""
    args = parseArgs()
    syllabus_page = get_syllabus(args.class_name, args.cookies_file, args.local_page)
    course_sections = parse_syllabus(syllabus_page)
    download_lectures(args.wget_bin,
                      args.cookies_file,
                      args.class_name,
                      course_sections,
                      args.file_formats,
                      args.overwrite,
                      args.skip_download,
                      args.section_filter,
                      args.lecture_filter)
# run only when executed as a script, not when imported
if __name__ == "__main__":
    main()