-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdecolonization.py
More file actions
108 lines (96 loc) · 4.09 KB
/
decolonization.py
File metadata and controls
108 lines (96 loc) · 4.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from table import Table
import pdb
import re
import pprint
import bs4
from csv_writer import CsvWriter
class Decolonization:
    """Scrape the Wikipedia decolonization tables and hand them to CsvWriter.

    Instantiating the class runs the whole pipeline: the page at ``URL`` is
    loaded through ``Table``, its rows are merged into one record per
    decolonized state (see ``to_dicts``), and the records are passed to
    ``CsvWriter`` under the name ``'decolonization'``.
    """

    URL = 'https://en.wikipedia.org/wiki/Decolonization'

    # Tag-name filter used to collect the cells (<td>/<th>) of a table row.
    _CELL_TAG_RE = re.compile(r"^t(?:d|h)")
    # Footnote markers such as "[12]", with an optional leading whitespace.
    _FOOTNOTE_RE = re.compile(r'\s?\[\d+\]')
    # Runs of characters other than word characters, commas, spaces, hyphens
    # and parentheses (with an optional leading whitespace).
    _STRAY_CHARS_RE = re.compile(r'\s?[^\w, \-\(\)]+')

    def __init__(self):
        """Scrape the tables, build ``self.data`` and pass it to CsvWriter."""
        # NOTE(review): start/end presumably select which tables on the page
        # are scraped -- confirm against Table.
        self.scraped_tables = Table(self.URL, start=1, end=4)
        self.data = self.to_dicts()
        # NOTE(review): CsvWriter presumably writes the file as a side effect
        # of construction; its return value is not kept.
        CsvWriter(
            'decolonization',
            self.data,
            headers=[
                "Decolonized state",
                "Colonizer",
                "Year",
                "Event",
            ],
        )

    def to_dicts(self) -> list:
        """Merge the scraped table rows into one dict per decolonized state.

        Rows that carry only three or two cells are treated as continuing the
        year (and colonizer) of the previous row, which is how cells spanning
        several rows show up once the HTML is flattened into ``<tr>`` rows.

        Returns:
            A list of dicts with the keys ``'Decolonized state'``,
            ``'Colonizer'``, ``'Year'`` and ``'Event'``. Years and colonizers
            of a state are joined with ``', '`` and its events with a newline,
            each de-duplicated in first-seen order.
        """
        merged = {}
        for table in self.scraped_tables.tables:
            # Column names come from the first row. Whitespace-only nodes
            # between the header cells are dropped.
            # NOTE(review): a header cell containing nested markup has
            # ``.string`` set to None and would raise here -- confirm the
            # scraped tables only have plain-text headers.
            headers = []
            for node in table.tr.contents:
                label = node.string.rstrip()
                if label:
                    headers.append(label)

            colonizer = None
            year = None
            for row in table.find_all('tr')[1:]:
                cells = list(row.find_all(self._CELL_TAG_RE))
                # Pad short rows with the values carried from earlier rows.
                if len(cells) == 3:
                    cells = [year] + cells
                elif len(cells) == 2:
                    cells = [year, colonizer] + cells
                if len(cells) < 4:
                    # Fewer than two real cells: there is no state/event pair
                    # to record, and indexing cells[3] would raise IndexError.
                    continue

                cells[3] = ' '.join(cells[3].stripped_strings)
                cells[:3] = [self._cell_text(cell) for cell in cells[:3]]
                data = dict(zip(headers, cells))

                # Remember the latest year/colonizer for following short rows.
                if data.get('Year') is None:
                    data['Year'] = year
                else:
                    year = data['Year']
                if data.get('Colonizer') is None:
                    data['Colonizer'] = colonizer
                else:
                    colonizer = data['Colonizer']

                state = self.format_state(data['Decolonized state'])
                if data['Colonizer'] is not None:
                    data['Colonizer'] = self.format_state(data['Colonizer'])
                # Format before the membership test so that the comparison is
                # made against what is actually stored in the list.
                event = Table.format_sentence(data['Event'])

                entry = merged.setdefault(
                    state, {'Year': [], 'Colonizer': [], 'Event': []}
                )
                # Avoid dupes but also retain insertion order. None (no value
                # carried over yet) is skipped so the joins below cannot fail.
                for key, value in (
                    ('Year', data['Year']),
                    ('Colonizer', data['Colonizer']),
                    ('Event', event),
                ):
                    if value is not None and value not in entry[key]:
                        entry[key].append(value)

        return [
            {
                'Decolonized state': state,
                'Colonizer': ', '.join(fields['Colonizer']),
                'Year': ', '.join(fields['Year']),
                'Event': '\n'.join(fields['Event']),
            }
            for state, fields in merged.items()
        ]

    @staticmethod
    def _cell_text(cell):
        """Return the text of a table cell, leaving non-cell values untouched.

        Carried-over values (plain strings or None) have neither a ``string``
        nor a ``stripped_strings`` attribute and are returned as they are.
        """
        if not cell:
            return cell
        if getattr(cell, 'string', None) is not None:
            return cell.string.rstrip()
        if getattr(cell, 'stripped_strings', None) is not None:
            return ' '.join(cell.stripped_strings)
        return cell

    @staticmethod
    def format_state(state: str) -> str:
        """Clean up a state or colonizer name taken from a table cell.

        Args:
            state: Raw cell text.

        Returns:
            The text with footnote markers and stray characters removed,
            passed through ``Table.format_sentence``.
        """
        # Order matters here! Need to delete the footnotes first, otherwise
        # the second pattern strips the brackets and leaves the digits behind.
        state = Decolonization._FOOTNOTE_RE.sub('', state)
        state = Decolonization._STRAY_CHARS_RE.sub('', state)
        return Table.format_sentence(state)
# Run the scrape-and-export pipeline: constructing the class loads the tables,
# builds the per-state records and passes them to CsvWriter. The instance
# itself is discarded. Being module-level, this also runs on import.
Decolonization()