skrub-data · TwsThomas · Aug 14, 2019 · Aug 19, 2019 · Aug 19, 2019 · Sep 12, 2019
diff --git a/src/beer_reviews.py b/src/beer_reviews.py
@@ -35,5 +35,7 @@ def get_beer_reviews_df(save=True):
                 elt = elt.replace('\xa0', ' ')
             arr.append(elt)
         df[c] = pd.Series(arr, dtype=df[c].dtype, index=df.index)
+    df.rename(columns={col: col.lower() for
+              col in df.columns}, inplace=True)
     write_df(save, df, data_dir[1], BEER_REVIEWS_CONFIG.main_file)
     return df
diff --git a/src/building_permits.py b/src/building_permits.py
@@ -0,0 +1,45 @@
+import pandas as pd
+import numpy as np
+import kaggle
+
+
+def get_building_permits_df():
+    """Download the Chicago building-permits data and return it cleaned.
+
+    Authenticates with the Kaggle API, downloads and unzips the
+    ``chicago/chicago-building-permits`` dataset into
+    ``data/building_permits/raw`` and loads ``building-permits.csv``.
+
+    Column names are stripped and lower-cased.  ``PERMIT#``, every column
+    whose name contains ``ZIPCODE`` and every column whose name contains
+    ``contact_`` are cast to ``str``; ``work_description`` is made
+    categorical.  ``estimated_cost`` is ``log(reported_cost + 1E-10)``.
+
+    Returns:
+        pandas.DataFrame: the cleaned permits table.
+    """
+    kaggle.api.authenticate()
+    # The upstream dataset is updated daily.
+    kaggle.api.dataset_download_files(
+        'chicago/chicago-building-permits',
+        path='data/building_permits/raw',
+        unzip=True)
+
+    permits = pd.read_csv('data/building_permits/raw/building-permits.csv')
+    permits.columns = permits.columns.str.strip()
+
+    # Zip codes may contain '-', so they are kept as text like the permit id.
+    text_cols = ['PERMIT#']
+    text_cols += [name for name in permits.columns if 'ZIPCODE' in name]
+    for name in text_cols:
+        permits[name] = permits[name].astype(str)
+
+    reported_cost = permits['REPORTED_COST'].astype(float)
+    permits['ESTIMATED_COST'] = (reported_cost + 1E-10).apply(np.log)
+
+    permits.rename(columns=str.lower, inplace=True)
+    for name in [c for c in permits.columns if 'contact_' in c]:
+        permits[name] = permits[name].astype(str)
+    permits['work_description'] = (
+        permits['work_description'].astype('category'))
+    return permits
diff --git a/src/cacao_flavor.py b/src/cacao_flavor.py
@@ -0,0 +1,32 @@
+import re
+import pandas as pd
+
+
+def get_cacao_flavor_df():
+    """Load the flavors-of-cacao CSV and return it cleaned.
+
+    The CSV is read from a local path; nothing is downloaded here.  In
+    the column names, non-breaking spaces, newlines and spaces become
+    ``_`` and letters are lower-cased.  ``&`` is removed from three text
+    columns and ``broad_bean_origin`` is made categorical.
+
+    Returns:
+        pandas.DataFrame: the cleaned table.
+    """
+    # !kaggle datasets download rtatman/chocolate-bar-ratings/ -p data/cacao_flavor/raw --unzip
+    csv_path = 'data/cacao_flavor/raw/flavors_of_cacao.csv'
+    df = pd.read_csv(csv_path)
+    # '\xa0' -> ' ' -> '_', '\n' -> '_' and ' ' -> '_' amount to one
+    # substitution of all three characters by '_'.
+    df.rename(columns=lambda col: re.sub('[\xa0\n ]', '_', col).lower(),
+              inplace=True)
+
+    # remove '&' (missing values stay missing)
+    for col in ['company__(maker-if_known)',
+                'specific_bean_origin_or_bar_name',
+                'broad_bean_origin']:
+        df[col] = df[col].str.replace('&', '', regex=False)
+
+    # Cast last: .str.replace() on a categorical column returns objects.
+    df['broad_bean_origin'] = df['broad_bean_origin'].astype('category')
+    return df
diff --git a/src/colleges.py b/src/colleges.py
@@ -1,5 +1,6 @@
 import os
 from collections import namedtuple
+import re
 
 import numpy as np
 import pandas as pd
@@ -64,6 +65,9 @@ def get_colleges_df(save=True):
     cats = ['State', 'Predominant Degree', 'Highest Degree', 'Ownership', 'Region', 'ZIP']
     for c in cats:
         df[c] = df[c].astype('category')
-
+    df.rename(columns={col: col.lower() for
+              col in df.columns}, inplace=True)
     write_df(save, df, data_dir[1], COLLEGES_CONFIG.main_file)
+    df.rename(columns={col: re.sub(' ', '_', col).lower() for
+              col in df.columns}, inplace=True)
     return df
diff --git a/src/crime_data.py b/src/crime_data.py
@@ -24,13 +24,15 @@
 
 
 def get_crime_df(save=True):
+    # FIXME dead link :s
     data_dir = fetch(CRIME_DATA_CONFIG)
     file = os.listdir(data_dir[0])[0]
     csv_path = os.path.join(data_dir[0], file)
     df = pd.read_csv(csv_path)
 
     cols = ['Area Name', 'Victim Sex', 'Victim Descent', 'Premise Description', 'Weapon Description',
             'Status Description', 'Crime Code Description']
+    print(df.columns)
     df['Victim Age'] = float_to_int(df['Victim Age'], df.index)
     df['Premise Code'] = float_to_int(df['Premise Code'], df.index)
     df['Weapon Used Code'] = float_to_int(df['Weapon Used Code'], df.index)

diff --git a/src/drug_discovery.py b/src/drug_discovery.py
@@ -0,0 +1,47 @@
+import os
+from collections import namedtuple
+
+import pandas as pd
+
+from common.file_management import fetch, write_df
+
+DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source'])
+UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress'])
+
+DRUG_DISCOVERY_CONFIG = DatasetInfo(
+    name='drug_discovery',
+    urlinfos=(
+        UrlInfo(
+            url="https://www.accessdata.fda.gov/cder/ndctext.zip",
+            filenames=(
+                "product.txt",
+            ), uncompress=False
+        ),
+    ),
+    main_file="product.txt",
+    source="https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory"
+)
+
+
+def get_drug_discovery_df(save=True):
+    """Fetch the FDA product listing and return it as a DataFrame.
+
+    Args:
+        save: passed through unchanged to ``write_df``.
+
+    Returns:
+        pandas.DataFrame: ``product.txt`` read as tab-separated latin-1
+        text, with the ``cat_cols`` columns cast to categorical.
+    """
+    data_dir = fetch(DRUG_DISCOVERY_CONFIG)
+    # Address the file by name: os.listdir() returns entries in arbitrary
+    # order, so selecting it by position is not reliable.
+    csv_path = os.path.join(data_dir[0], DRUG_DISCOVERY_CONFIG.main_file)
+    df = pd.read_csv(csv_path, sep='\t', encoding='latin1')
+    # NOTE(review): these are the column names used for the medical-charge
+    # data; confirm that product.txt really has them.
+    cat_cols = ['DRG Definition', 'Provider State']
+    for c in cat_cols:
+        df[c] = df[c].astype('category')
+    write_df(save, df, data_dir[1], DRUG_DISCOVERY_CONFIG.main_file)
+    return df
diff --git a/src/employee_salaries.py b/src/employee_salaries.py
@@ -1,6 +1,7 @@
 import datetime
 import os
 from collections import namedtuple
+import re
 
 import pandas as pd
 
@@ -35,4 +36,6 @@ def get_employee_salaries_df(save=True):
     df['Department Name'] = df['Department Name'].astype('category')
     df['Assignment Category'] = df['Assignment Category'].astype('category')
     write_df(save, df, data_dir[1], EMPLOYEE_SALARIES_CONFIG.main_file)
+    df.rename(columns={col: re.sub(' ', '_', col).lower() for
+              col in df.columns}, inplace=True)
     return df
diff --git a/src/federal_election.py b/src/federal_election.py
@@ -0,0 +1,83 @@
+import os
+import re
+from collections import namedtuple
+
+import pandas as pd
+import numpy as np
+
+from common.file_management import fetch, write_df, float_to_int
+
+DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source'])
+UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress'])
+
+FEDERAL_ELECTION_CONFIG = DatasetInfo(
+    name='federal_election',
+    urlinfos=(
+        UrlInfo(
+            url='https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/2012/indiv12.zip',
+            filenames=(
+                "itcont.txt",
+            ), uncompress=True
+        ),
+    ),
+    main_file="itcont.txt",
+    source="https://classic.fec.gov/finance/disclosure/ftpdet.shtml"
+)
+
+FEDERAL_ELECTION_HEADER_CONFIG = DatasetInfo(
+    name='federal_election',
+    urlinfos=(
+        UrlInfo(
+            url='https://classic.fec.gov/finance/disclosure/metadata/indiv_header_file.csv',
+            filenames=(
+                "indiv_header_file.csv",
+            ), uncompress=False
+        ),
+    ),
+    main_file="indiv_header_file.csv",
+    source="https://classic.fec.gov/finance/disclosure/metadata/DataDictionaryContributionsbyIndividuals.shtml"
+)
+
+
+def get_federal_election_df(save=True):
+    """Fetch the 2012 individual-contributions file and return it cleaned.
+
+    The data file has no header row; its column names are read from the
+    separately fetched ``indiv_header_file.csv``.  ``transaction_amt`` is
+    replaced by the natural log of its absolute value and only rows with
+    a strictly positive log are kept.  Column names are lower-cased.
+
+    Args:
+        save: passed through unchanged to ``write_df``.
+
+    Returns:
+        pandas.DataFrame: the cleaned contributions table.
+    """
+    # data
+    data_dir = fetch(FEDERAL_ELECTION_CONFIG)
+    csv_path = os.path.join(data_dir[0], FEDERAL_ELECTION_CONFIG.main_file)
+    # header
+    data_dir_header = fetch(FEDERAL_ELECTION_HEADER_CONFIG)
+    csv_path_header = os.path.join(
+        data_dir_header[0], FEDERAL_ELECTION_HEADER_CONFIG.main_file)
+
+    df_header = pd.read_csv(csv_path_header)
+    df = pd.read_csv(csv_path, sep='|', encoding='latin1',
+                     header=None, names=df_header.columns)
+    # Some donations are negative; the target is the log of the amount.
+    df['TRANSACTION_AMT'] = df['TRANSACTION_AMT'].abs().apply(np.log)
+    # Filtering on the log drops zero amounts (log = -inf) and amounts
+    # <= 1.  copy() makes the result an independent frame, so the column
+    # assignments below do not act on a slice of the unfiltered data.
+    df = df[df['TRANSACTION_AMT'] > 0].copy()
+    df.rename(columns=str.lower, inplace=True)
+    df['zip_code'] = df['zip_code'].astype(str)
+    # Remove '{' from every city that contains it, through a single .loc
+    # assignment.  Chained indexing (df['city'].loc[i] = ...) on one fixed
+    # row label is not guaranteed to write back and fails if it is absent.
+    has_brace = df['city'].str.contains('{', regex=False, na=False)
+    df.loc[has_brace, 'city'] = (
+        df.loc[has_brace, 'city'].str.replace('{', '', regex=False))
+    df['memo_text'] = df['memo_text'].astype('category')
+    write_df(save, df, data_dir[1], FEDERAL_ELECTION_CONFIG.main_file)
+    return df
diff --git a/src/house_sales.py b/src/house_sales.py
@@ -0,0 +1,29 @@
+import pandas as pd
+import numpy as np
+import kaggle
+
+
+def get_house_sales_df():
+    """Download the house-sales dataset from Kaggle and return it cleaned.
+
+    Authenticates with the Kaggle API, downloads and unzips
+    ``harlfoxem/housesalesprediction`` into ``data/house_sales/raw`` and
+    loads ``kc_house_data.csv`` with its first column as the index.
+    Column names are lower-cased and ``zipcode`` becomes a categorical
+    of strings.
+
+    Returns:
+        pandas.DataFrame: the cleaned table.
+    """
+    kaggle.api.authenticate()
+    kaggle.api.dataset_download_files('harlfoxem/housesalesprediction',
+                                      path='data/house_sales/raw',
+                                      unzip=True)
+
+    csv_path = 'data/house_sales/raw/kc_house_data.csv'
+    # Read once, directly with the index column.
+    df = pd.read_csv(csv_path, index_col=0)
+    df.rename(columns=str.lower, inplace=True)
+    # str first, so the categories are text rather than numbers.
+    df['zipcode'] = df['zipcode'].astype(str).astype('category')
+    return df
diff --git a/src/kickstarter_projects.py b/src/kickstarter_projects.py
@@ -0,0 +1,35 @@
+import re
+
+import pandas as pd
+import numpy as np
+
+
+def get_kickstarter_projects_df():
+    """Load the Kickstarter projects CSV and return it cleaned.
+
+    The CSV is read from a local path; nothing is downloaded here.  Only
+    ``failed`` and ``successful`` projects are kept and ``state`` becomes
+    a boolean (True for successful).  ``usd pledged`` is replaced by
+    ``log(usd pledged + 1E-10)``, ``{``/``}`` in ``name`` become
+    ``(``/``)`` and ``category`` is made categorical.
+
+    Returns:
+        pandas.DataFrame: the cleaned table.
+    """
+    # !kaggle datasets download kemical/kickstarter-projects -p data/kickstarter_projects/raw --unzip
+
+    # there are two. Pick the oldest one
+    # NOTE(review): confirm that 201801 is the file meant by "oldest".
+    csv_path = 'data/kickstarter_projects/raw/ks-projects-201801.csv'
+    df = pd.read_csv(csv_path, encoding='latin1', index_col=0)
+    # copy() so the assignments below modify an independent frame instead
+    # of a slice of the unfiltered data (avoids SettingWithCopyWarning).
+    df = df[df['state'].isin(['failed', 'successful'])].copy()
+    df['state'] = (df['state'] == 'successful')
+    df['usd pledged'] = (
+        df['usd pledged'].astype(float) + 1E-10).apply(np.log)
+    # str() also turns a missing name into the text 'nan'.
+    df['name'] = [re.sub('}', ')', re.sub('{', '(', str(name)))
+                  for name in df['name'].values]
+    df['category'] = df['category'].astype('category')
+    return df
diff --git a/src/medical_charge.py b/src/medical_charge.py
@@ -1,5 +1,6 @@
 import os
 from collections import namedtuple
+import re
 
 import pandas as pd
 
@@ -16,7 +17,7 @@
                 "Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/"
                 "Downloads/Inpatient_Data_2011_CSV.zip",
             filenames=(
-                "MedicalProviderChargeInpatient.csv",
+                "Medicare_Provider_Charge_Inpatient_DRG100_FY2011.csv",
             ),
             uncompress=True
 
@@ -31,12 +32,13 @@
 
 def get_medical_charge_df(save=True):
     data_dir = fetch(MEDICAL_CHARGE_CONFIG)
-    file = os.listdir(data_dir[0])[0]
+    file = os.listdir(data_dir[0])[1]
     csv_path = os.path.join(data_dir[0], file)
-    df = pd.read_csv(csv_path)
+    df = pd.read_csv(csv_path, sep=',')
     cat_cols = ['DRG Definition', 'Provider State']
     for c in cat_cols:
         df[c] = df[c].astype('category')
-
+    df.rename(columns={col: re.sub(' ', '_', col).lower() for
+              col in df.columns}, inplace=True)
     write_df(save, df, data_dir[1], MEDICAL_CHARGE_CONFIG.main_file)
     return df
diff --git a/src/met_objects.py b/src/met_objects.py
@@ -1,5 +1,6 @@
 import os
 from collections import namedtuple
+import re
 
 import numpy as np
 import pandas as pd
@@ -64,6 +65,7 @@ def get_met_objects_df(save=True):
 
     for c in cat_cols:
         df[c] = df[c].astype('category')
-
+    df.rename(columns={col: re.sub(' ', '_', col).lower() for
+              col in df.columns}, inplace=True)
     write_df(save, df, data_dir[1], MET_OBJECTS_CONFIG.main_file)
     return df
diff --git a/src/midwest_survey.py b/src/midwest_survey.py
@@ -108,6 +108,8 @@ def get_midwest_survey_df(save=True):
     df = pd.read_csv(csv_path, index_col='RespondentID')
     df = merge_columns(df)
     write_df(save, df, data_dir[1], MIDWEST_SURVEY_CONFIG.main_file)
+    df.rename(columns={col: 'Location_Census_Region' for
+              col in ['Location (Census Region)']}, inplace=True)
     return df
 
 

diff --git a/src/openml_beer_upload.py b/src/openml_beer_upload.py
@@ -21,7 +21,7 @@
     'attributes': 'auto',
     'data': df,
     'ignore_attribute': None,
-    'default_target_attribute': 'Beer_Style',
+    'default_target_attribute': 'beer_style',
     'row_id_attribute': df.index.name,
     'citation': None,
     'version_label': '0.1',

diff --git a/src/openml_building_upload.py b/src/openml_building_upload.py
@@ -0,0 +1,46 @@
+import os
+
+import openml
+from openml.datasets import create_dataset
+
+from building_permits import get_building_permits_df
+
+# API keys must not live in source control.  The key is taken from the
+# OPENML_APIKEY environment variable; when that is unset, the value openml
+# already holds in its config is kept.  Any key that has been committed
+# to the repository should be revoked.
+openml.config.apikey = os.environ.get('OPENML_APIKEY', openml.config.apikey)
+df = get_building_permits_df()
+
+# NOTE(review): only the first 1500 rows are published; the banner reads
+# like a reminder that the truncation is temporary - confirm.
+print('use head  ! ' * 100)
+df = df.head(n=1500)
+
+desc = """This dataset includes information about currently-valid building permits issued by the City of Chicago from 2006 to the present. Building permits are issued subject to payment of applicable fees. If building or zoning permit fees show as unpaid, the permit is not valid. (A permit is valid if only 'other fees' are shown as unpaid.) This dataset does not include permits which have been issued and voided or revoked. This dataset also does not include permits for mechanical amusement riding devices and carnivals issued by the Department of Buildings.
+
+Property Index Numbers (PINs) and geographic information (ward, community area and census tract) are provided for most permit types issued in 2008 or later.
+"""
+
+params = {
+    'name': 'building_permits',
+    'description': desc,
+    'creator': 'Chicago city',
+    'contributor': 'https://www.kaggle.com/chicago',
+    'language': 'English',
+    'licence': 'CC0 Public Domain',
+    'collection_date': '2019-08-13',
+    'attributes': 'auto',
+    'data': df,
+    'ignore_attribute': None,
+    'default_target_attribute': 'estimated_cost',
+    'row_id_attribute': df.index.name,
+    'citation': None,
+    'version_label': '0.1',
+    'original_data_url': 'https://www.kaggle.com/chicago/chicago-building-permits',
+    'paper_url': None,
+    'update_comment': None
+}
+
+dset = create_dataset(**params)
+open_ml_id = dset.publish()