def get_building_permits_df(
        csv_path='data/building_permits/raw/building-permits.csv',
        download=True):
    """Load and clean the Chicago building-permits dataset.

    Parameters
    ----------
    csv_path : str
        CSV file to read. Defaults to the file the Kaggle archive unpacks to.
    download : bool
        When true (the default, matching the historical behaviour) the
        Kaggle API is authenticated and the archive is downloaded again
        before reading. Pass ``False`` to reuse a local copy.

    Returns
    -------
    pandas.DataFrame
        Frame with stripped, lower-cased column names, an added
        ``estimated_cost`` column holding ``log(reported_cost + 1e-10)``,
        string-typed permit/zipcode/contact columns and a categorical
        ``work_description`` column.
    """
    if download:
        # dataset update daily.
        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(
            'chicago/chicago-building-permits',
            path='data/building_permits/raw', unzip=True)

    df = pd.read_csv(csv_path)
    df.columns = df.columns.str.strip()
    df['PERMIT#'] = df['PERMIT#'].astype(str)
    for col in df.columns:
        if 'ZIPCODE' in col:  # zip code may contain '-'
            df[col] = df[col].astype(str)
    # The small epsilon keeps the log finite when the reported cost is 0.
    df['ESTIMATED_COST'] = np.log(df['REPORTED_COST'].astype(float) + 1E-10)
    df.rename(columns=str.lower, inplace=True)
    for col in df.columns:
        if 'contact_' in col:
            df[col] = df[col].astype(str)
    df['work_description'] = df['work_description'].astype('category')
    return df
def get_cacao_flavor_df(csv_path='data/cacao_flavor/raw/flavors_of_cacao.csv'):
    """Load the "Flavors of Cacao" chocolate-bar ratings.

    Column names are normalised (non-breaking spaces and newlines removed,
    spaces turned into underscores, lower-cased), ``&`` is stripped from the
    three free-text columns, and ``broad_bean_origin`` is returned as a
    categorical column.

    Parameters
    ----------
    csv_path : str
        CSV file to read; defaults to the location used by the Kaggle
        download command below.

    Returns
    -------
    pandas.DataFrame
    """
    # !kaggle datasets download rtatman/chocolate-bar-ratings/ -p data/cacao_flavor/raw --unzip
    df = pd.read_csv(csv_path)

    def _clean_name(col):
        # Same three substitutions the original applied in three passes.
        return (col.replace('\xa0', ' ')
                   .replace('\n', '_')
                   .replace(' ', '_')
                   .lower())

    df.rename(columns=_clean_name, inplace=True)

    # The original loop was left unfinished (no colon, no body) and made the
    # module un-importable; this implements its "remove '&'" comment.
    # NOTE(review): '&' is deleted outright (not replaced by "and") - confirm.
    for col in ['company__(maker-if_known)',
                'specific_bean_origin_or_bar_name',
                'broad_bean_origin']:
        df[col] = df[col].str.replace('&', '', regex=False)

    # Cast after cleaning so the categories are built from the cleaned values.
    df["broad_bean_origin"] = df["broad_bean_origin"].astype('category')
    return df
def get_drug_discovery_df(save=True):
    """Fetch the FDA National Drug Code product table and return it.

    Parameters
    ----------
    save : bool
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The tab-separated, latin-1 encoded product file as a data frame.
    """
    data_dir = fetch(DRUG_DISCOVERY_CONFIG)
    # Address the file by its configured name rather than by its position in
    # os.listdir(), whose ordering is arbitrary.
    # NOTE(review): assumes fetch() stores the file under main_file in
    # data_dir[0], as federal_election does with "itcont.txt" - confirm.
    csv_path = os.path.join(data_dir[0], DRUG_DISCOVERY_CONFIG.main_file)
    df = pd.read_csv(csv_path, sep='\t', encoding='latin1')
    # NOTE(review): this list is identical to the one in medical_charge and
    # looks copy-pasted; the columns may not exist in this dataset. Missing
    # columns are skipped instead of raising KeyError - replace the list with
    # the intended categorical columns once confirmed.
    cat_cols = ['DRG Definition', 'Provider State']
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].astype('category')
    write_df(save, df, data_dir[1], DRUG_DISCOVERY_CONFIG.main_file)
    return df
def get_federal_election_df(save=True):
    """Fetch the 2012 FEC individual-contributions file and prepare it.

    The column names come from the separately downloaded header file. The
    target ``transaction_amt`` is replaced by the log of its absolute value,
    and only rows whose log-amount is strictly positive are kept.

    Parameters
    ----------
    save : bool
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        Frame with lower-cased column names, string ``zip_code``,
        categorical ``memo_text`` and ``{`` removed from ``city``.
    """
    # data
    data_dir = fetch(FEDERAL_ELECTION_CONFIG)
    csv_path = os.path.join(data_dir[0], "itcont.txt")
    # header
    data_dir_header = fetch(FEDERAL_ELECTION_HEADER_CONFIG)
    csv_path_header = os.path.join(data_dir_header[0],
                                   "indiv_header_file.csv")

    df_header = pd.read_csv(csv_path_header)
    df = pd.read_csv(csv_path, sep='|', encoding='latin1',
                     header=None, names=df_header.columns)
    # Some donations are negative
    amount = df['TRANSACTION_AMT'].abs()
    # Predicting the log of the donation. log(amount) > 0 is the same as
    # amount > 1, so filter first: this avoids taking log(0) and drops
    # missing amounts exactly as the original post-log filter did.
    keep = amount > 1
    # Explicit copy: the columns below are assigned on the filtered frame.
    df = df[keep].copy()
    df['TRANSACTION_AMT'] = np.log(amount[keep])
    df.rename(columns=str.lower, inplace=True)
    df['zip_code'] = df['zip_code'].astype(str)
    # The original patched one hard-coded row label through chained
    # assignment (df['city'].loc[...] = ...), which fails if that row is
    # absent and does not write through under pandas copy-on-write.
    # Strip '{' from every string city instead; non-strings are untouched.
    df['city'] = df['city'].map(
        lambda v: v.replace('{', '') if isinstance(v, str) else v)
    df['memo_text'] = df['memo_text'].astype('category')
    write_df(save, df, data_dir[1], FEDERAL_ELECTION_CONFIG.main_file)
    return df
def get_house_sales_df(csv_path='data/house_sales/raw/kc_house_data.csv',
                       download=True):
    """Load the King County house-sales dataset.

    Parameters
    ----------
    csv_path : str
        CSV file to read. Defaults to the file the Kaggle archive unpacks to.
    download : bool
        When true (the default, matching the historical behaviour) the
        Kaggle API is authenticated and the archive is downloaded before
        reading. Pass ``False`` to reuse a local copy.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the first CSV column, with lower-cased column names
        and ``zipcode`` as a categorical column of strings.
    """
    if download:
        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(
            'harlfoxem/housesalesprediction',
            path='data/house_sales/raw', unzip=True)

    # Read once; the original parsed the file twice and discarded the first
    # result, and printed the columns as a debugging leftover.
    df = pd.read_csv(csv_path, index_col=0)
    df.rename(columns=str.lower, inplace=True)
    # Zip codes are labels, not quantities: string first, then category.
    df['zipcode'] = df['zipcode'].astype(str).astype('category')
    return df
def get_kickstarter_projects_df(
        csv_path='data/kickstarter_projects/raw/ks-projects-201801.csv'):
    """Load the Kickstarter projects dataset as a binary-outcome table.

    Only projects whose ``state`` is ``failed`` or ``successful`` are kept.
    ``state`` becomes a boolean (``True`` for successful), ``usd pledged`` is
    replaced by ``log(usd pledged + 1e-10)``, curly braces in ``name`` are
    turned into parentheses and ``category`` is made categorical.

    Parameters
    ----------
    csv_path : str
        CSV file to read; defaults to the location used by the Kaggle
        download command below.

    Returns
    -------
    pandas.DataFrame
    """
    # !kaggle datasets download kemical/kickstarter-projects -p data/kickstarter_projects/raw --unzip

    # The archive contains two CSV files.
    # NOTE(review): the original comment said "pick the oldest one", but the
    # default path is the 201801 export - confirm which file is intended.
    df = pd.read_csv(csv_path, encoding='latin1', index_col=0)
    # Explicit copy: the columns below are assigned on the filtered frame.
    df = df[df['state'].isin(['failed', 'successful'])].copy()
    df['state'] = (df['state'] == 'successful')
    # The small epsilon keeps the log finite when nothing was pledged.
    df['usd pledged'] = np.log(df['usd pledged'].astype(float) + 1E-10)
    # One pass replaces both braces; str() keeps the original handling of
    # non-string names (e.g. missing values become 'nan').
    braces = str.maketrans('{}', '()')
    df['name'] = [str(name).translate(braces) for name in df['name'].values]
    df['category'] = df['category'].astype('category')
    return df
col in df.columns}, inplace=True) write_df(save, df, data_dir[1], MET_OBJECTS_CONFIG.main_file) return df diff --git a/src/midwest_survey.py b/src/midwest_survey.py index fe11c8d..3f0abbc 100644 --- a/src/midwest_survey.py +++ b/src/midwest_survey.py @@ -108,6 +108,8 @@ def get_midwest_survey_df(save=True): df = pd.read_csv(csv_path, index_col='RespondentID') df = merge_columns(df) write_df(save, df, data_dir[1], MIDWEST_SURVEY_CONFIG.main_file) + df.rename(columns={col: 'Location_Census_Region' for + col in ['Location (Census Region)']}, inplace=True) return df diff --git a/src/openml_beer_upload.py b/src/openml_beer_upload.py index a6a3bfb..0219ece 100644 --- a/src/openml_beer_upload.py +++ b/src/openml_beer_upload.py @@ -21,7 +21,7 @@ 'attributes': 'auto', 'data': df, 'ignore_attribute': None, - 'default_target_attribute': 'Beer_Style', + 'default_target_attribute': 'beer_style', 'row_id_attribute': df.index.name, 'citation': None, 'version_label': '0.1', diff --git a/src/openml_building_upload.py b/src/openml_building_upload.py new file mode 100644 index 0000000..1506ed7 --- /dev/null +++ b/src/openml_building_upload.py @@ -0,0 +1,39 @@ +import openml +from openml.datasets import create_dataset + +from building_permits import get_building_permits_df + +openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas +df = get_building_permits_df() + +print('use head ! ' * 100) +df = df.head(n=1500) + +desc = """This dataset includes information about currently-valid building permits issued by the City of Chicago from 2006 to the present. Building permits are issued subject to payment of applicable fees. If building or zoning permit fees show as unpaid, the permit is not valid. (A permit is valid if only 'other fees' are shown as unpaid.) This dataset does not include permits which have been issued and voided or revoked. 
This dataset also does not include permits for mechanical amusement riding devices and carnivals issued by the Department of Buildings. + +Property Index Numbers (PINs) and geographic information (ward, community area and census tract) are provided for most permit types issued in 2008 or later. +""" + +params = { + 'name': 'building_permits', + 'description': desc, + 'creator': 'Chicago city', + 'contributor': 'https://www.kaggle.com/chicago', + 'language': 'English', + 'licence': 'CC0 Public Domain', + 'collection_date': '2019-08-13', + 'attributes': 'auto', + 'data': df, + 'ignore_attribute': None, + 'default_target_attribute': 'estimated_cost', + 'row_id_attribute': df.index.name, + 'citation': None, + 'version_label': '0.1', + 'original_data_url': 'https://www.kaggle.com/chicago/chicago-building-permits', + 'paper_url': None, + 'update_comment': None +} + +dset = create_dataset(**params) +open_ml_id = dset.publish() diff --git a/src/openml_cacao_upload.py b/src/openml_cacao_upload.py new file mode 100644 index 0000000..1e2e7fc --- /dev/null +++ b/src/openml_cacao_upload.py @@ -0,0 +1,54 @@ +import openml +from openml.datasets import create_dataset + +from cacao_flavor import get_cacao_flavor_df + +openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas +df = get_cacao_flavor_df() + +full_desc = """Chocolate Bar Ratings. + Expert ratings of over 1,700 chocolate bars. Each chocolate is evaluated from a combination of both objective qualities and subjective interpretation. A rating here only represents an experience with one bar from one batch. Batch numbers, vintages and review dates are included in the database when known. + + The database is narrowly focused on plain dark chocolate with an aim of appreciating the flavors of the cacao when made into chocolate. The ratings do not reflect health benefits, social missions, or organic status. 
+ + Flavor is the most important component of the Flavors of Cacao ratings. Diversity, balance, intensity and purity of flavors are all considered. It is possible for a straight forward single note chocolate to rate as high as a complex flavor profile that changes throughout. Genetics, terroir, post harvest techniques, processing and storage can all be discussed when considering the flavor component. + + Texture has a great impact on the overall experience and it is also possible for texture related issues to impact flavor. It is a good way to evaluate the makers vision, attention to detail and level of proficiency. + + Aftermelt is the experience after the chocolate has melted. Higher quality chocolate will linger and be long lasting and enjoyable. Since the aftermelt is the last impression you get from the chocolate, it receives equal importance in the overall rating. + + Overall Opinion is really where the ratings reflect a subjective opinion. Ideally it is my evaluation of whether or not the components above worked together and an opinion on the flavor development, character and style. It is also here where each chocolate can usually be summarized by the most prominent impressions that you would remember about each chocolate. + + Flavors of Cacao Rating System: + 5= Elite (Transcending beyond the ordinary limits) + 4= Premium (Superior flavor development, character and style) + 3= Satisfactory(3.0) to praiseworthy(3.75) (well made with special qualities) + 2= Disappointing (Passable but contains at least one significant flaw) + 1= Unpleasant (mostly unpalatable) + + Acknowledgements + These ratings were compiled by Brady Brelinski, Founding Member of the Manhattan Chocolate Society. 
For up-to-date information, as well as additional content (including interviews with craft chocolate makers), please see his website: http://flavorsofcacao.com/index.html""" + +params = { + 'name': 'cacao_flavor', + 'description': full_desc, + 'creator': 'http://flavorsofcacao.com/index.html', + 'contributor': 'https://www.kaggle.com/rtatman/', + 'language': 'English', + 'licence': 'CC0 Public Domaine', + 'collection_date': '2017-08-12', + 'attributes': 'auto', + 'data': df, + 'ignore_attribute': None, + 'default_target_attribute': 'bean_type', + 'row_id_attribute': df.index.name, + 'citation': None, + 'version_label': '0.1', + 'original_data_url': 'https://www.kaggle.com/rtatman/chocolate-bar-ratings/', + 'paper_url': None, + 'update_comment': None +} + +dset = create_dataset(**params) +open_ml_id = dset.publish() diff --git a/src/openml_colleges_upload.py b/src/openml_colleges_upload.py index 348c3ec..14ef415 100644 --- a/src/openml_colleges_upload.py +++ b/src/openml_colleges_upload.py @@ -4,6 +4,8 @@ from colleges import * openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas + df = get_colleges_df() params = { @@ -18,7 +20,7 @@ 'attributes': 'auto', 'data': df, 'ignore_attribute': None, - 'default_target_attribute': 'Faculty_Salary', + 'default_target_attribute': 'faculty_salary', 'row_id_attribute': df.index.name, 'citation': None, 'version_label': '0.1', diff --git a/src/openml_crime_upload.py b/src/openml_crime_upload.py index a635584..de2c084 100644 --- a/src/openml_crime_upload.py +++ b/src/openml_crime_upload.py @@ -21,7 +21,7 @@ 'attributes': 'auto', 'data': df, 'ignore_attribute': None, - 'default_target_attribute': 'Crime_Code_1', + 'default_target_attribute': 'Crime Code 1', 'row_id_attribute': df.index.name, 'citation': None, 'version_label': '0.1', diff --git a/src/openml_employee_upload.py b/src/openml_employee_upload.py index 23f522c..26b33a4 100644 --- 
a/src/openml_employee_upload.py +++ b/src/openml_employee_upload.py @@ -19,7 +19,7 @@ 'attributes': 'auto', 'data': df, 'ignore_attribute': None, - 'default_target_attribute': 'Current_Annual_Salary', + 'default_target_attribute': 'current_annual_salary', 'row_id_attribute': df.index.name, 'citation': None, 'version_label': '0.1', diff --git a/src/openml_federal_upload.py b/src/openml_federal_upload.py new file mode 100644 index 0000000..da88bd0 --- /dev/null +++ b/src/openml_federal_upload.py @@ -0,0 +1,39 @@ +import openml +from openml.datasets import create_dataset + +from federal_election import * + +openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas +df = get_federal_election_df() + +params = { + 'name': 'federal_election', + 'description': """General Description +2015-current: greater than $200.00. The Commission categorizes contributions from individuals using the calendar year-to-date amount for political action committee (PAC) and party committee receipts and the election-cycle-to-date amount for candidate receipts to determine whether the contribution meets the categorization threshold of greater than $200.00. + +1989-2014: $200 and above. The Commission categorized contributions from individuals using the reporting period amount to determine whether a contribution met the categorization threshold of $200.00 or more. + +1975-1988: $500 and above. The Commission categorized contributions from individuals using the reporting period amount to determine whether a contribution met the categorization threshold of $500.00 or more. 
+ +header description can be found here : https://classic.fec.gov/finance/disclosure/metadata/DataDictionaryContributionsbyIndividuals.shtml +""", + 'creator': 'Federal Election Commission', + 'contributor': None, + 'language': 'English', + 'licence': 'NA', + 'collection_date': '2015-08-13', + 'attributes': 'auto', + 'data': df, + 'ignore_attribute': None, + 'default_target_attribute': 'transaction_amt', + 'row_id_attribute': df.index.name, + 'citation': None, + 'version_label': '0.1', + 'original_data_url': FEDERAL_ELECTION_CONFIG.urlinfos[0].url, + 'paper_url': FEDERAL_ELECTION_CONFIG.source, + 'update_comment': None +} + +dset = create_dataset(**params) +open_ml_id = dset.publish() diff --git a/src/openml_housesales_upload.py b/src/openml_housesales_upload.py new file mode 100644 index 0000000..6a9adae --- /dev/null +++ b/src/openml_housesales_upload.py @@ -0,0 +1,35 @@ +import openml +from openml.datasets import create_dataset + +from house_sales import get_house_sales_df + +openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas +df = get_house_sales_df() + +params = { + 'name': 'house_sales', + 'description': """This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015. + +It contains 19 house features plus the price and the id columns, along with 21613 observations. +It's a great dataset for evaluating simple regression models. 
+""", + 'creator': 'https://www.kaggle.com/harlfoxem/', + 'contributor': 'https://www.kaggle.com/harlfoxem/', + 'language': 'English', + 'licence': 'CC0 Public Domain', + 'collection_date': '2016-08-25', + 'attributes': 'auto', + 'data': df, + 'ignore_attribute': None, + 'default_target_attribute': 'price', + 'row_id_attribute': df.index.name, + 'citation': None, + 'version_label': '0.1', + 'original_data_url': 'https://www.kaggle.com/harlfoxem/housesalesprediction', + 'paper_url': None, + 'update_comment': None +} + +dset = create_dataset(**params) +open_ml_id = dset.publish() diff --git a/src/openml_journal_upload.py b/src/openml_journal_upload.py index 46739ee..82fd79f 100644 --- a/src/openml_journal_upload.py +++ b/src/openml_journal_upload.py @@ -4,6 +4,7 @@ from journal_influence import * openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas df = get_journal_influence_df() params = { diff --git a/src/openml_kickstarter_upload.py b/src/openml_kickstarter_upload.py new file mode 100644 index 0000000..87032d9 --- /dev/null +++ b/src/openml_kickstarter_upload.py @@ -0,0 +1,43 @@ +import openml +from openml.datasets import create_dataset + +from kickstarter_projects import get_kickstarter_projects_df + +openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas +df = get_kickstarter_projects_df() + +params = { + 'name': 'kickstarter_projects', + 'description': """ + Data are collected from Kickstarter Platform + +You'll find most useful data for project analysis. Columns are self explanatory except: + +usd_pledged: conversion in US dollars of the pledged column (conversion done by kickstarter). + +usd pledge real: conversion in US dollars of the pledged column (conversion from Fixer.io API). + +usd goal real: conversion in US dollars of the goal column (conversion from Fixer.io API). 
+ + +""", + 'creator': 'https://www.kickstarter.com/', + 'contributor': 'https://www.kaggle.com/kemical', + 'language': 'English', + 'licence': 'CC BY-NC-SA 4.0', + 'collection_date': '2018-01-15', + 'attributes': 'auto', + 'data': df, + 'ignore_attribute': None, + 'default_target_attribute': 'state', + 'row_id_attribute': df.index.name, + 'citation': None, + 'version_label': '0.1', + 'original_data_url': 'https://www.kaggle.com/kemical/kickstarter-projects', + 'paper_url': None, + 'update_comment': None +} + +dset = create_dataset(**params) +open_ml_id = dset.publish() diff --git a/src/openml_medical_upload.py b/src/openml_medical_upload.py index 1d1d7fa..f8ff92c 100644 --- a/src/openml_medical_upload.py +++ b/src/openml_medical_upload.py @@ -4,6 +4,7 @@ from medical_charge import * openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas df = get_medical_charge_df() params = { @@ -22,7 +23,7 @@ 'attributes': 'auto', 'data': df, 'ignore_attribute': None, - 'default_target_attribute': 'Average_total_payments', + 'default_target_attribute': 'average_total_payments', 'row_id_attribute': df.index.name, 'citation': None, 'version_label': '0.1', diff --git a/src/openml_met_upload.py b/src/openml_met_upload.py index 5df318a..815534f 100644 --- a/src/openml_met_upload.py +++ b/src/openml_met_upload.py @@ -20,7 +20,7 @@ 'attributes': 'auto', 'data': df, 'ignore_attribute': None, - 'default_target_attribute': 'Is_Public_Domain', + 'default_target_attribute': 'is_public_domain', 'row_id_attribute': df.index.name, 'citation': None, 'version_label': '0.1', diff --git a/src/openml_public_upload.py b/src/openml_public_upload.py new file mode 100644 index 0000000..226c6a2 --- /dev/null +++ b/src/openml_public_upload.py @@ -0,0 +1,34 @@ + +import openml +from openml.datasets import create_dataset + +from public_procurement import * + +openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey 
= 'ca1d24f37f00a1517a1638d5acc24321' # Thomas +df = get_public_procurement_df() + +params = { + 'name': 'public_procurement', + 'description': """Public procurement data +for the European Economic Area, Switzerland, and the +Macedonia. 2015""", + 'creator': 'European Union open data', + 'contributor': None, + 'language': 'English', + 'licence': 'Public Domain (CC0)', + 'collection_date': '2016-04-18', + 'attributes': 'auto', + 'data': df, + 'ignore_attribute': None, + 'default_target_attribute': 'award_value_euro', + 'row_id_attribute': df.index.name, + 'citation': None, + 'version_label': '0.1', + 'original_data_url': PUBLIC_PROCUREMENT_CONFIG.urlinfos[0].url, + 'paper_url': PUBLIC_PROCUREMENT_CONFIG.source, + 'update_comment': None +} + +dset = create_dataset(**params) +open_ml_id = dset.publish() diff --git a/src/openml_traffic_upload.py b/src/openml_traffic_upload.py index 90c16c4..5093f15 100644 --- a/src/openml_traffic_upload.py +++ b/src/openml_traffic_upload.py @@ -19,7 +19,7 @@ 'attributes': 'auto', 'data': df, 'ignore_attribute': None, - 'default_target_attribute': 'Violation_type', + 'default_target_attribute': 'violation_type', 'row_id_attribute': df.index.name, 'citation': None, 'version_label': '0.1', diff --git a/src/openml_vancouver_upload.py b/src/openml_vancouver_upload.py new file mode 100644 index 0000000..bfb2d0b --- /dev/null +++ b/src/openml_vancouver_upload.py @@ -0,0 +1,42 @@ +import openml +from openml.datasets import create_dataset + +from vancouver_employee import * + +openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas +df = get_vancouver_employee_df() + +desc = """Employee remuneration and expenses (earning over 75,000CAD per year). This data set includes remuneration and expenses from employees earning over 75,000CAD per year. 
+ + Attributes: + NAME: Name of employee listed by last name, followed by initials of first name and middle name (if applicable) + DEPARTMENT: Name of an organization unit at the City of Vancouver where specified Title belongs + TITLE: Name of position + REMUNERATION: Includes salary, overtime, gratuity and vacation payouts. Excludes severance payment. + EXPENSES: Includes charges such as training, tuition, conferences and travel and professional dues""" +# desc = 'Employee of Vancouver remenuration' +params = { + 'name': 'vancouver_employee', + 'description': desc, + 'creator': 'City of Vancouver', + 'contributor': None, + 'language': 'English', + 'licence': 'Open Government Licence Vancouver', + 'collection_date': '2017-08-13', + 'attributes': 'auto', + 'data': df, + 'ignore_attribute': None, + 'default_target_attribute': 'remuneration', + 'row_id_attribute': df.index.name, + 'citation': None, + 'version_label': '0.1', + 'original_data_url': 'https://data.vancouver.ca/datacatalogue/employeeRemunerationExpensesOver75k.htm', #VANCOUVER_EMPLOYEE_CONFIG.urlinfos[0].url, + 'paper_url': VANCOUVER_EMPLOYEE_CONFIG.source, + 'update_comment': None +} + +# from openml_beer_upload import params +# params['name'] = 'vancouver_employee' +dset = create_dataset(**params) +open_ml_id = dset.publish() diff --git a/src/openml_wine_upload.py b/src/openml_wine_upload.py new file mode 100644 index 0000000..a6fe1fc --- /dev/null +++ b/src/openml_wine_upload.py @@ -0,0 +1,32 @@ +import openml +from openml.datasets import create_dataset + +from wine_reviews import get_wine_reviews_df + +openml.config.apikey = '58012f5a6cbba5dcd3ddefbf852c1e99' +openml.config.apikey = 'ca1d24f37f00a1517a1638d5acc24321' # Thomas +df = get_wine_reviews_df() + +params = { + 'name': 'wine_reviews', + 'description': 'Wine data gathered by https://www.kaggle.com/zynicide' + 'The data was scraped from WineEnthusiast during the week of June 15th, 2017. 
def get_public_procurement_df(save=True):
    """Fetch the 2015 TED contract-award notices and clean them.

    Known non-numeric entries in ``ID_LOT``, ``CRIT_PRICE_WEIGHT`` and
    ``ID_LOT_AWARDED`` are blanked so the columns can be stored as floats,
    two ``CONTRACT_NUMBER`` cells are blanked, column names are lower-cased
    and ``cae_name`` is stored as strings truncated to 1023 characters.

    Parameters
    ----------
    save : bool
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
    """
    # FIXME df.shape = (565163, 75) != from paper
    # FIXME nb category cae_name = 39623 != from paper
    # FIXME cae_name become str rather than category
    # (openml requirments)
    data_dir = fetch(PUBLIC_PROCUREMENT_CONFIG)
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path)

    df.loc[df.ID_LOT == 'Zp 2130-64/15', 'ID_LOT'] = np.nan
    df.ID_LOT = df.ID_LOT.astype(float)
    df.loc[df.CRIT_PRICE_WEIGHT == '50 points',
           'CRIT_PRICE_WEIGHT'] = np.nan
    has_percent = [("%" in str(price)) for price in
                   df.CRIT_PRICE_WEIGHT.values]
    df.loc[has_percent, 'CRIT_PRICE_WEIGHT'] = np.nan
    df.CRIT_PRICE_WEIGHT = df.CRIT_PRICE_WEIGHT.astype(float)
    # Anything that does not parse as a number becomes NaN (345 over 565163
    # in the original data). This replaces a Python loop that used a bare
    # ``except:`` and fed positional row numbers to label-based ``.loc``.
    df['ID_LOT_AWARDED'] = pd.to_numeric(
        df['ID_LOT_AWARDED'], errors='coerce').astype(float)
    # NOTE(review): hard-coded row labels tied to one snapshot of the file.
    df.loc[[39165, 39164], 'CONTRACT_NUMBER'] = np.nan
    df.rename(columns=str.lower, inplace=True)
    # df['cae_name'] = df['cae_name'].astype('category')
    df['cae_name'] = df['cae_name'].astype(str)
    tronq_cae = [str(x)[:1023] for x in df['cae_name']]
    df['cae_name'] = pd.Series(tronq_cae, dtype=df['cae_name'].dtype,
                               index=df.index)
    write_df(save, df, data_dir[1], PUBLIC_PROCUREMENT_CONFIG.main_file)
    return df
def get_vancouver_employee_df(
        save=True,
        csv_path='data/vancouver_employee/raw/'
                 '2017StaffRemunerationOver75KWithExpenses.csv'):
    """Load the City of Vancouver staff-remuneration table.

    The file is read from a local path because ``fetch`` could not handle
    the ``ftp://`` URL (InvalidSchema: no connection adapters were found).

    Parameters
    ----------
    save : bool
        Currently unused: nothing is written because the ``fetch`` /
        ``write_df`` path is disabled. Kept for interface compatibility.
    csv_path : str
        CSV file to read; the header is on the fourth row of the file.

    Returns
    -------
    pandas.DataFrame
        Frame with lower-cased column names, ``remuneration`` replaced by
        the log of its value (thousands separators removed) and a
        categorical ``title`` column.
    """
    df = pd.read_csv(csv_path, header=3)
    # Amounts may be written like "100,000": drop the separators before
    # converting, then take the log of the whole column at once.
    amounts = (df['Remuneration'].astype(str)
               .str.replace(',', '', regex=False)
               .astype(float))
    df['Remuneration'] = np.log(amounts)
    df.rename(columns=str.lower, inplace=True)
    df['title'] = df['title'].astype('category')
    return df


def get_wine_reviews_df(
        csv_path='data/wine_reviews/raw/winemag-data_first150k.csv',
        *, categorize=False):
    """Load the WineEnthusiast reviews table.

    Parameters
    ----------
    csv_path : str
        CSV file to read; the first column is used as the index.
    categorize : bool
        When true, cast ``country``, ``points``, ``province``, ``region_1``,
        ``region_2`` and ``variety`` to categorical columns. The original
        code placed this conversion after ``return`` so it never ran; the
        default ``False`` keeps that effective behaviour.
        NOTE(review): confirm whether the conversion was disabled on purpose.

    Returns
    -------
    pandas.DataFrame
    """
    # !kaggle datasets download zynicide/wine-reviews/ -p data/wine_reviews/raw --unzip
    df = pd.read_csv(csv_path, index_col=0)
    if categorize:
        cat_cols = ['country', 'points', 'province', 'region_1',
                    'region_2', 'variety']
        for c in cat_cols:
            df[c] = df[c].astype('category')
    return df