Skip to content

Commit 3489e16

Browse files
committed
Add a helper script to download data from the IMDb
1 parent 9686e10 commit 3489e16

File tree

1 file changed

+49
-0
lines changed

1 file changed

+49
-0
lines changed

binary-search/download_imdb.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
Fetch and parse people's names from the IMDb.
5+
6+
Usage:
7+
$ python download_imdb.py
8+
"""
9+
10+
import gzip
11+
import shutil
12+
import tempfile
13+
import urllib.request
14+
15+
16+
def main():
    """Script entry point.

    Download the names into ``names.txt`` and then write a sorted copy of
    that file to ``sorted_names.txt``. Both files are created in the current
    working directory, one name per line.
    """

    print('Fetching data from IMDb...')

    # The names contain non-ASCII characters, so the encoding is pinned to
    # UTF-8 instead of being left to the platform's locale default, which
    # may be unable to represent them.
    with open('names.txt', 'w', encoding='utf-8') as destination:
        destination.writelines(names())

    with open('names.txt', encoding='utf-8') as source, \
            open('sorted_names.txt', 'w', encoding='utf-8') as destination:
        # Iterating the file yields its lines (newline included), so there
        # is no need for an intermediate readlines() list before sorting.
        destination.writelines(sorted(source))

    print('Created "names.txt" and "sorted_names.txt"')
29+
30+
31+
IMDB_NAMES_URL = 'https://datasets.imdbws.com/name.basics.tsv.gz'


def names(url=IMDB_NAMES_URL):
    """Return a generator of names with a trailing newline.

    The gzip-compressed, tab-separated archive at ``url`` is downloaded into
    a temporary file, decompressed as UTF-8 text, and the second column of
    every row after the header is yielded as ``'<name>\\n'``.

    Args:
        url: Location of the gzip-compressed TSV archive. Defaults to the
            IMDb ``name.basics`` dataset.

    Yields:
        The value of the second column of each data row, followed by a
        newline. Rows with fewer than two columns are skipped.
    """
    with tempfile.NamedTemporaryFile(mode='w+b') as archive:
        # Close the connection as soon as the download completes rather
        # than keeping it open while the caller consumes the generator.
        with urllib.request.urlopen(url) as response:
            shutil.copyfileobj(response, archive)
        archive.seek(0)
        # Decode explicitly as UTF-8 instead of relying on the locale.
        with gzip.open(archive, mode='rt', encoding='utf-8') as source:
            # Skip the header. The default argument keeps an empty archive
            # from raising StopIteration, which would surface as a
            # RuntimeError inside a generator.
            next(source, None)
            for line in source:
                columns = line.rstrip('\n').split('\t')
                if len(columns) < 2:
                    continue  # Blank or malformed row without a name
                yield f'{columns[1]}\n'
43+
44+
45+
if __name__ == '__main__':
    # Run only when executed as a script, not when imported as a module.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: report the abort with a short message instead of letting
        # the interpreter print a traceback.
        print('Aborted')

0 commit comments

Comments
 (0)