white-house-salaries/wrangle/xls_to_pdf.py at master · helloworlddata/white-house-salaries · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# Command-line script to read XLS files
# and convert them to plain text CSV

from argparse import ArgumentParser
from csv import DictWriter, DictReader
from glob import glob
from os.path import isdir, join as joinpath
from sys import stdout
from xlrd import open_workbook


# Column names of the White House salary sheets, in sheet order.
# HEADERS is the list form used for row matching and as CSV fieldnames;
# HEADERS_TEXT is the same names joined into a single comma-separated line.
HEADERS = ['NAME', 'STATUS', 'SALARY', 'PAY BASIS', 'POSITION TITLE']
HEADERS_TEXT = ','.join(HEADERS)

def process_wh_salary_workbook(wbpath):
    """
    Yield one dict per salary record in the workbook at `wbpath`.

    Deliberately rigid: only the first sheet is read, and it must
    contain a row whose values are exactly HEADERS. Rows up to and
    including that header row are skipped. Each later row is yielded
    as a dict keyed by HEADERS, but only when its second cell contains
    'Employee' or its third cell contains '$'; all other rows are
    dropped. If no header row is found, nothing is yielded.
    """
    workbook = open_workbook(wbpath)
    first_sheet = workbook.sheets()[0]
    # Lazy, single-pass row iterator shared by both loops below.
    rows = (first_sheet.row_values(idx) for idx in range(first_sheet.nrows))

    # Phase 1: consume rows until the header row has been passed.
    for row in rows:
        if row == HEADERS:
            break

    # Phase 2: everything after the header; keep only rows that look
    # like actual salary records (status or salary cell is populated
    # with the expected kind of text).
    # NOTE(review): the `in` tests assume these cells are strings --
    # confirm the sheets never hold numeric values in these columns.
    for row in rows:
        if 'Employee' in row[1] or '$' in row[2]:
            yield dict(zip(HEADERS, row))


def _find_workbook_paths(inpath):
    """
    Return the list of workbook paths designated by `inpath`.

    If `inpath` is a directory, return the sorted paths of the files
    directly inside it whose names end in `.xls` or in `.xls` plus
    exactly one more character (e.g. `.xlsx`). Otherwise `inpath` is
    assumed to be a single workbook and is returned as a one-item list.
    """
    if not isdir(inpath):
        return [inpath]
    found = set()
    # In glob syntax '?' matches exactly one character, so '*.xls?'
    # alone would skip plain '.xls' files; match both forms explicitly.
    for pattern in ('*.xls', '*.xls?'):
        found.update(glob(joinpath(inpath, pattern)))
    # glob order is arbitrary; sort so the CSV output is reproducible.
    return sorted(found)


def main():
    """
    Parse the command line, then write every salary record found in
    the given workbook(s) to stdout as a single CSV with one header row.
    """
    # The text must be passed as `description`: the first positional
    # parameter of ArgumentParser is `prog` (the program name).
    parser = ArgumentParser(description="Convert WH salary XLS page(s) to CSV")
    parser.add_argument('inpath', type=str,
                        help="Path to an XLS/XLSX file, or directory of them")
    args = parser.parse_args()

    # set up the CSV
    csvout = DictWriter(stdout, fieldnames=HEADERS)
    csvout.writeheader()

    for fname in _find_workbook_paths(args.inpath):
        csvout.writerows(process_wh_salary_workbook(fname))


if __name__ == '__main__':
    main()