forked from stanfordjournalism/search-script-scrape
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmore_scotus_laughs.py
More file actions
25 lines (20 loc) · 896 Bytes
/
more_scotus_laughs.py
File metadata and controls
25 lines (20 loc) · 896 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Modification of scripts/50.py to count all the laughs in the most recent term.
#
# Fetches the Supreme Court's argument-transcript index page, then for each
# linked transcript PDF: downloads it, converts it to text with the external
# `pdftotext` tool (http://www.foolabs.com/xpdf/download.html), and prints how
# many times "(Laughter.)" appears alongside the case title.
import tempfile
from lxml import html
from subprocess import check_output
from urllib.parse import urljoin
import requests

url = 'http://www.supremecourt.gov/oral_arguments/argument_transcript.aspx'
# timeout so a stalled server can't hang the script indefinitely
doc = html.fromstring(requests.get(url, timeout=30).text)
# get all the rulings
for link in doc.cssselect('table.datatables tr a'):
    href = link.attrib['href']
    # the case title lives in the sibling table cell next to the link
    casetitle = link.getnext().text_content()
    # resolve the (possibly relative) PDF link against the index page URL
    pdf_url = urljoin(url, href)
    # download the PDF into a temp file that is cleaned up automatically,
    # avoiding the fixed, race-prone /tmp/t.pdf path
    with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
        tmp.write(requests.get(pdf_url, timeout=60).content)
        tmp.flush()
        # run pdftotext with an argv list (shell=False) — no shell string
        # injection surface, and no quoting issues with the temp path
        txt = check_output(["pdftotext", "-layout", tmp.name, "-"]).decode()
    print(f'{txt.count("(Laughter.)")} laughs in: {casetitle}')