Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/beer_reviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,7 @@ def get_beer_reviews_df(save=True):
elt = elt.replace('\xa0', ' ')
arr.append(elt)
df[c] = pd.Series(arr, dtype=df[c].dtype, index=df.index)
df.rename(columns={col: col.lower() for
col in df.columns}, inplace=True)
write_df(save, df, data_dir[1], BEER_REVIEWS_CONFIG.main_file)
return df
27 changes: 27 additions & 0 deletions src/building_permits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd
import numpy as np
import kaggle


def get_building_permits_df():
    """Download the Chicago building-permits dataset from Kaggle and tidy it.

    The archive is fetched and unzipped into ``data/building_permits/raw``
    on every call, then ``building-permits.csv`` is loaded and cleaned:
    header whitespace is stripped, identifier-like columns are cast to
    ``str``, a log-scaled ``ESTIMATED_COST`` column is derived from
    ``REPORTED_COST``, and finally all column names are lower-cased.

    Returns
    -------
    pandas.DataFrame
        The cleaned permits table with lower-case column names.
    """
    # The upstream dataset is updated daily, hence the unconditional download.
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files(
        'chicago/chicago-building-permits',
        path='data/building_permits/raw',
        unzip=True,
    )

    permits = pd.read_csv('data/building_permits/raw/building-permits.csv')
    permits.columns = permits.columns.str.strip()

    # Identifiers are kept as text; zip codes may contain '-'.
    permits['PERMIT#'] = permits['PERMIT#'].astype(str)
    zip_columns = [name for name in permits.columns if 'ZIPCODE' in name]
    for name in zip_columns:
        permits[name] = permits[name].astype(str)

    # The tiny offset keeps the logarithm finite when the cost is zero.
    shifted_cost = permits['REPORTED_COST'].astype(float) + 1E-10
    permits['ESTIMATED_COST'] = shifted_cost.apply(np.log)

    permits.rename(columns=str.lower, inplace=True)

    # Contact columns are matched on the already lower-cased names.
    for name in permits.columns:
        if 'contact_' not in name:
            continue
        permits[name] = permits[name].astype(str)

    permits['work_description'] = (
        permits['work_description'].astype('category'))
    return permits
22 changes: 22 additions & 0 deletions src/cacao_flavor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import re
import pandas as pd


def get_cacao_flavor_df():
    """Load the "Flavors of Cacao" ratings CSV and normalise it.

    The raw file is expected at
    ``data/cacao_flavor/raw/flavors_of_cacao.csv``; it can be obtained with::

        kaggle datasets download rtatman/chocolate-bar-ratings/ -p data/cacao_flavor/raw --unzip

    Column names are normalised (non-breaking space -> space, newline and
    space -> ``_``, lower-case), ``&`` is removed from the free-text columns,
    and ``broad_bean_origin`` is converted to a categorical column.

    Returns
    -------
    pandas.DataFrame
        The cleaned table with normalised column names.
    """
    csv_path = 'data/cacao_flavor/raw/flavors_of_cacao.csv'
    df = pd.read_csv(csv_path)

    def _normalise(col):
        # Same three substitutions as before, applied in the same order.
        return (col.replace('\xa0', ' ')
                   .replace('\n', '_')
                   .replace(' ', '_')
                   .lower())

    df.rename(columns=_normalise, inplace=True)

    # The original loop here was left unfinished (no colon, no body), which
    # made the whole module a SyntaxError. Its comment said "remove '&'", so
    # that is what is implemented.
    # NOTE(review): confirm plain removal (not e.g. "and") is what was meant.
    for col in ['company__(maker-if_known)',
                'specific_bean_origin_or_bar_name',
                'broad_bean_origin']:
        df[col] = df[col].str.replace('&', '', regex=False)

    # Categorise after the text cleanup so the categories are the cleaned
    # values.
    df["broad_bean_origin"] = df["broad_bean_origin"].astype('category')

    return df
6 changes: 5 additions & 1 deletion src/colleges.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from collections import namedtuple
import re

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -64,6 +65,9 @@ def get_colleges_df(save=True):
cats = ['State', 'Predominant Degree', 'Highest Degree', 'Ownership', 'Region', 'ZIP']
for c in cats:
df[c] = df[c].astype('category')

df.rename(columns={col: col.lower() for
col in df.columns}, inplace=True)
write_df(save, df, data_dir[1], COLLEGES_CONFIG.main_file)
df.rename(columns={col: re.sub(' ', '_', col).lower() for
col in df.columns}, inplace=True)
return df
2 changes: 2 additions & 0 deletions src/crime_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@


def get_crime_df(save=True):
# FIXME dead link :s
data_dir = fetch(CRIME_DATA_CONFIG)
file = os.listdir(data_dir[0])[0]
csv_path = os.path.join(data_dir[0], file)
df = pd.read_csv(csv_path)

cols = ['Area Name', 'Victim Sex', 'Victim Descent', 'Premise Description', 'Weapon Description',
'Status Description', 'Crime Code Description']
print(df.columns)
df['Victim Age'] = float_to_int(df['Victim Age'], df.index)
df['Premise Code'] = float_to_int(df['Premise Code'], df.index)
df['Weapon Used Code'] = float_to_int(df['Weapon Used Code'], df.index)
Expand Down
35 changes: 35 additions & 0 deletions src/drug_discovery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import os
from collections import namedtuple

import pandas as pd

from common.file_management import fetch, write_df

# Plain records describing a dataset and each of the remote files behind it.
DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source'])
UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress'])

# Descriptor for the FDA National Drug Code directory download.
DRUG_DISCOVERY_CONFIG = DatasetInfo(
    name='drug_discovery',
    urlinfos=(
        UrlInfo(
            url="https://www.accessdata.fda.gov/cder/ndctext.zip",
            filenames=(
                "product.txt",
            ),
            # The URL points at a zip archive while the loader reads
            # product.txt from it, so the archive has to be extracted (the
            # other zip-based configs also set uncompress=True).
            # NOTE(review): confirm against fetch()'s handling of this flag.
            uncompress=True
        ),
    ),
    main_file="product.txt",
    source="https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory"
)


def get_drug_discovery_df(save=True):
    """Fetch the drug-discovery dataset and return it as a DataFrame.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged to ``write_df`` (presumably controls whether the
        frame is persisted; see ``common.file_management``).

    Returns
    -------
    pandas.DataFrame
        The tab-separated, latin-1 encoded main file parsed by pandas.
    """
    data_dir = fetch(DRUG_DISCOVERY_CONFIG)
    # Address the file by name: os.listdir() returns entries in arbitrary
    # order, so a positional index can select a different file from one
    # machine to the next.
    csv_path = os.path.join(data_dir[0], DRUG_DISCOVERY_CONFIG.main_file)
    df = pd.read_csv(csv_path, sep='\t', encoding='latin1')
    # NOTE(review): these names look copied from the medical-charge loader;
    # confirm which columns of this file should be categorical. Columns that
    # are absent are skipped instead of raising KeyError.
    cat_cols = ['DRG Definition', 'Provider State']
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].astype('category')
    write_df(save, df, data_dir[1], DRUG_DISCOVERY_CONFIG.main_file)
    return df
3 changes: 3 additions & 0 deletions src/employee_salaries.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime
import os
from collections import namedtuple
import re

import pandas as pd

Expand Down Expand Up @@ -35,4 +36,6 @@ def get_employee_salaries_df(save=True):
df['Department Name'] = df['Department Name'].astype('category')
df['Assignment Category'] = df['Assignment Category'].astype('category')
write_df(save, df, data_dir[1], EMPLOYEE_SALARIES_CONFIG.main_file)
df.rename(columns={col: re.sub(' ', '_', col).lower() for
col in df.columns}, inplace=True)
return df
67 changes: 67 additions & 0 deletions src/federal_election.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os
import re
from collections import namedtuple

import pandas as pd
import numpy as np

from common.file_management import fetch, write_df, float_to_int

# Record types used by the dataset descriptors in this module.
DatasetInfo = namedtuple('DatasetInfo', 'name urlinfos main_file source')
UrlInfo = namedtuple('UrlInfo', 'url filenames uncompress')

# Individual-contributions bulk download (zip archive, flagged for
# decompression); the data file inside it is itcont.txt.
FEDERAL_ELECTION_CONFIG = DatasetInfo(
    name='federal_election',
    main_file="itcont.txt",
    source="https://classic.fec.gov/finance/disclosure/ftpdet.shtml",
    urlinfos=(
        UrlInfo(
            url='https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/2012/indiv12.zip',
            filenames=("itcont.txt",),
            uncompress=True,
        ),
    ),
)

# Companion header file: a plain CSV, so it is not decompressed.
FEDERAL_ELECTION_HEADER_CONFIG = DatasetInfo(
    name='federal_election',
    main_file="indiv_header_file.csv",
    source="https://classic.fec.gov/finance/disclosure/metadata/DataDictionaryContributionsbyIndividuals.shtml",
    urlinfos=(
        UrlInfo(
            url='https://classic.fec.gov/finance/disclosure/metadata/indiv_header_file.csv',
            filenames=("indiv_header_file.csv",),
            uncompress=False,
        ),
    ),
)


def get_federal_election_df(save=True):
    """Fetch the federal-election contributions and return a cleaned frame.

    The data file has no header row; column names are read from a separate
    header CSV. ``TRANSACTION_AMT`` is replaced by the natural log of its
    absolute value and only rows with a strictly positive log are kept.
    Column names are then lower-cased.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The filtered table with lower-case column names.
    """
    # data
    data_dir = fetch(FEDERAL_ELECTION_CONFIG)
    file = "itcont.txt"
    csv_path = os.path.join(data_dir[0], file)
    # header
    data_dir_header = fetch(FEDERAL_ELECTION_HEADER_CONFIG)
    file_header = "indiv_header_file.csv"
    csv_path_header = os.path.join(data_dir_header[0], file_header)

    df_header = pd.read_csv(csv_path_header)
    df = pd.read_csv(csv_path, sep='|', encoding='latin1',
                     header=None, names=df_header.columns)
    # Some donations are negative
    df['TRANSACTION_AMT'] = df['TRANSACTION_AMT'].abs()
    # Predicting the log of the donation
    df['TRANSACTION_AMT'] = df['TRANSACTION_AMT'].apply(np.log)
    # Keeping log > 0 drops zero amounts (log = -inf), missing amounts and
    # amounts <= 1. The copy makes the later column assignments operate on an
    # independent frame rather than on a slice of the unfiltered one.
    df = df[df['TRANSACTION_AMT'] > 0].copy()
    df.rename(columns={col: col.lower() for
                       col in df.columns}, inplace=True)
    df['zip_code'] = df['zip_code'].astype(str)
    # Strip '{' from every city. The previous code patched one hard-coded row
    # through chained indexing, which is not guaranteed to write back to `df`
    # and raises if that row is missing or was filtered out above.
    df['city'] = df['city'].str.replace('{', '', regex=False)
    df['memo_text'] = df['memo_text'].astype('category')
    write_df(save, df, data_dir[1], FEDERAL_ELECTION_CONFIG.main_file)
    return df
19 changes: 19 additions & 0 deletions src/house_sales.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pandas as pd
import numpy as np
import kaggle


def get_house_sales_df():
    """Download the King County house-sales dataset from Kaggle and load it.

    The archive is fetched and unzipped into ``data/house_sales/raw`` on
    every call. The first CSV column is used as the index, column names are
    lower-cased and ``zipcode`` is turned into a categorical of strings.

    Returns
    -------
    pandas.DataFrame
        The house-sales table with lower-case column names.
    """
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files('harlfoxem/housesalesprediction',
                                      path='data/house_sales/raw', unzip=True)

    csv_path = 'data/house_sales/raw/kc_house_data.csv'
    # Read once: the earlier index-less read was discarded immediately.
    df = pd.read_csv(csv_path, index_col=0)
    df.rename(columns={col: col.lower() for
                       col in df.columns}, inplace=True)
    # Cast to str first so the categories are strings, not integers.
    df['zipcode'] = df['zipcode'].astype(str).astype('category')
    return df
20 changes: 20 additions & 0 deletions src/kickstarter_projects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import re

import pandas as pd
import numpy as np


def get_kickstarter_projects_df():
    """Load the Kickstarter projects CSV and prepare it for modelling.

    The raw file is expected at
    ``data/kickstarter_projects/raw/ks-projects-201801.csv``; it can be
    obtained with::

        kaggle datasets download kemical/kickstarter-projects -p data/kickstarter_projects/raw --unzip

    Only projects whose state is ``failed`` or ``successful`` are kept.
    ``state`` becomes a boolean (True for successful), ``usd pledged`` is
    log-transformed, curly braces in ``name`` are replaced by parentheses and
    ``category`` is made categorical.

    Returns
    -------
    pandas.DataFrame
        The filtered table, indexed by the first CSV column.
    """
    # The download contains two CSV files; this loader reads the 201801 one.
    # NOTE(review): the earlier comment said "pick the oldest one", which does
    # not match this filename; confirm which snapshot is intended.
    csv_path = 'data/kickstarter_projects/raw/ks-projects-201801.csv'
    df = pd.read_csv(csv_path, encoding='latin1', index_col=0)
    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of the unfiltered one.
    df = df[df['state'].isin(['failed', 'successful'])].copy()
    df['state'] = (df['state'] == 'successful')
    # The tiny offset keeps the logarithm finite when nothing was pledged.
    df['usd pledged'] = (
        df['usd pledged'].astype(float) + 1E-10).apply(np.log)
    # str() is kept so missing names become the text 'nan', as before; literal
    # replaces avoid treating the braces as regular-expression syntax.
    df['name'] = [str(name).replace('{', '(').replace('}', ')')
                  for name in df['name'].values]
    df['category'] = df['category'].astype('category')
    return df
10 changes: 6 additions & 4 deletions src/medical_charge.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from collections import namedtuple
import re

import pandas as pd

Expand All @@ -16,7 +17,7 @@
"Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/"
"Downloads/Inpatient_Data_2011_CSV.zip",
filenames=(
"MedicalProviderChargeInpatient.csv",
"Medicare_Provider_Charge_Inpatient_DRG100_FY2011.csv",
),
uncompress=True

Expand All @@ -31,12 +32,13 @@

def get_medical_charge_df(save=True):
    """Fetch the medical-charge dataset and return it as a DataFrame.

    ``DRG Definition`` and ``Provider State`` are converted to categoricals;
    column names are then lower-cased with spaces replaced by underscores
    before the frame is handed to ``write_df``.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The table with normalised column names.
    """
    data_dir = fetch(MEDICAL_CHARGE_CONFIG)
    # NOTE(review): picking the second directory entry depends on the order
    # os.listdir() happens to return; selecting the file by name would be
    # more robust once the expected filename is confirmed.
    file = os.listdir(data_dir[0])[1]
    csv_path = os.path.join(data_dir[0], file)
    # Single read: the earlier duplicate read of the same path was overwritten.
    df = pd.read_csv(csv_path, sep=',')
    cat_cols = ['DRG Definition', 'Provider State']
    for c in cat_cols:
        df[c] = df[c].astype('category')

    df.rename(columns={col: re.sub(' ', '_', col).lower() for
                       col in df.columns}, inplace=True)
    write_df(save, df, data_dir[1], MEDICAL_CHARGE_CONFIG.main_file)
    return df
4 changes: 3 additions & 1 deletion src/met_objects.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from collections import namedtuple
import re

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -64,6 +65,7 @@ def get_met_objects_df(save=True):

for c in cat_cols:
df[c] = df[c].astype('category')

df.rename(columns={col: re.sub(' ', '_', col).lower() for
col in df.columns}, inplace=True)
write_df(save, df, data_dir[1], MET_OBJECTS_CONFIG.main_file)
return df
2 changes: 2 additions & 0 deletions src/midwest_survey.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ def get_midwest_survey_df(save=True):
df = pd.read_csv(csv_path, index_col='RespondentID')
df = merge_columns(df)
write_df(save, df, data_dir[1], MIDWEST_SURVEY_CONFIG.main_file)
df.rename(columns={col: 'Location_Census_Region' for
col in ['Location (Census Region)']}, inplace=True)
return df


Expand Down
2 changes: 1 addition & 1 deletion src/openml_beer_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
'attributes': 'auto',
'data': df,
'ignore_attribute': None,
'default_target_attribute': 'Beer_Style',
'default_target_attribute': 'beer_style',
'row_id_attribute': df.index.name,
'citation': None,
'version_label': '0.1',
Expand Down
39 changes: 39 additions & 0 deletions src/openml_building_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import openml
from openml.datasets import create_dataset

from building_permits import get_building_permits_df

import os

# Upload script: publishes a sample of the building-permits table to OpenML.
#
# Credentials must not live in source control. The key is read from the
# OPENML_APIKEY environment variable; when it is unset, whatever key
# openml-python already has configured is left untouched.
# NOTE(review): the keys previously committed here should be treated as
# leaked and revoked.
api_key = os.environ.get('OPENML_APIKEY')
if api_key:
    openml.config.apikey = api_key
df = get_building_permits_df()

# Loud reminder that only the first rows are uploaded.
print('use head ! ' * 100)
df = df.head(n=1500)

desc = """This dataset includes information about currently-valid building permits issued by the City of Chicago from 2006 to the present. Building permits are issued subject to payment of applicable fees. If building or zoning permit fees show as unpaid, the permit is not valid. (A permit is valid if only 'other fees' are shown as unpaid.) This dataset does not include permits which have been issued and voided or revoked. This dataset also does not include permits for mechanical amusement riding devices and carnivals issued by the Department of Buildings.

Property Index Numbers (PINs) and geographic information (ward, community area and census tract) are provided for most permit types issued in 2008 or later.
"""

# Keyword arguments for openml.datasets.create_dataset.
params = {
    'name': 'building_permits',
    'description': desc,
    'creator': 'Chicago city',
    'contributor': 'https://www.kaggle.com/chicago',
    'language': 'English',
    'licence': 'CC0 Public Domain',
    'collection_date': '2019-08-13',
    'attributes': 'auto',
    'data': df,
    'ignore_attribute': None,
    # Log-scaled cost column produced by get_building_permits_df.
    'default_target_attribute': 'estimated_cost',
    'row_id_attribute': df.index.name,
    'citation': None,
    'version_label': '0.1',
    'original_data_url': 'https://www.kaggle.com/chicago/chicago-building-permits',
    'paper_url': None,
    'update_comment': None
}

dset = create_dataset(**params)
open_ml_id = dset.publish()
Loading