Skip to content

Commit 6dcc052

Browse files
committed
init repo
0 parents  commit 6dcc052

File tree

6 files changed

+1929
-0
lines changed

6 files changed

+1929
-0
lines changed

.gitignore

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
.datatoys
2+
3+
# Byte-compiled / optimized / DLL files
4+
__pycache__/
5+
*.py[cod]
6+
*$py.class
7+
8+
# C extensions
9+
*.so
10+
11+
# Distribution / packaging
12+
.Python
13+
build/
14+
develop-eggs/
15+
dist/
16+
downloads/
17+
eggs/
18+
.eggs/
19+
lib/
20+
lib64/
21+
parts/
22+
sdist/
23+
var/
24+
wheels/
25+
share/python-wheels/
26+
*.egg-info/
27+
.installed.cfg
28+
*.egg
29+
MANIFEST
30+
31+
# PyInstaller
32+
# Usually these files are written by a python script from a template
33+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
34+
*.manifest
35+
*.spec
36+
37+
# Installer logs
38+
pip-log.txt
39+
pip-delete-this-directory.txt
40+
41+
# Unit test / coverage reports
42+
htmlcov/
43+
.tox/
44+
.nox/
45+
.coverage
46+
.coverage.*
47+
.cache
48+
nosetests.xml
49+
coverage.xml
50+
*.cover
51+
*.py,cover
52+
.hypothesis/
53+
.pytest_cache/
54+
cover/
55+
56+
# Translations
57+
*.mo
58+
*.pot
59+
60+
# Django stuff:
61+
*.log
62+
local_settings.py
63+
db.sqlite3
64+
db.sqlite3-journal
65+
66+
# Flask stuff:
67+
instance/
68+
.webassets-cache
69+
70+
# Scrapy stuff:
71+
.scrapy
72+
73+
# Sphinx documentation
74+
docs/_build/
75+
76+
# PyBuilder
77+
.pybuilder/
78+
target/
79+
80+
# Jupyter Notebook
81+
.ipynb_checkpoints
82+
83+
# IPython
84+
profile_default/
85+
ipython_config.py
86+
87+
# pyenv
88+
# For a library or package, you might want to ignore these files since the code is
89+
# intended to run in multiple environments; otherwise, check them in:
90+
# .python-version
91+
92+
# pipenv
93+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
95+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
96+
# install all needed dependencies.
97+
#Pipfile.lock
98+
99+
# poetry
100+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101+
# This is especially recommended for binary packages to ensure reproducibility, and is more
102+
# commonly ignored for libraries.
103+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104+
#poetry.lock
105+
106+
# pdm
107+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108+
#pdm.lock
109+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110+
# in version control.
111+
# https://pdm.fming.dev/#use-with-ide
112+
.pdm.toml
113+
114+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115+
__pypackages__/
116+
117+
# Celery stuff
118+
celerybeat-schedule
119+
celerybeat.pid
120+
121+
# SageMath parsed files
122+
*.sage.py
123+
124+
# Environments
125+
.env
126+
.venv
127+
env/
128+
venv/
129+
ENV/
130+
env.bak/
131+
venv.bak/
132+
133+
# Spyder project settings
134+
.spyderproject
135+
.spyproject
136+
137+
# Rope project settings
138+
.ropeproject
139+
140+
# mkdocs documentation
141+
/site
142+
143+
# mypy
144+
.mypy_cache/
145+
.dmypy.json
146+
dmypy.json
147+
148+
# Pyre type checker
149+
.pyre/
150+
151+
# pytype static type analyzer
152+
.pytype/
153+
154+
# Cython debug symbols
155+
cython_debug/
156+
157+
# PyCharm
158+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160+
# and can be added to the global gitignore or merged into this file. For a more nuclear
161+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
162+
#.idea/

.pre-commit-config.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
repos:
2+
- repo: https://github.com/pre-commit/pre-commit-hooks
3+
rev: v2.3.0
4+
hooks:
5+
- id: check-yaml
6+
- id: end-of-file-fixer
7+
- id: trailing-whitespace
8+
- repo: https://github.com/psf/black
9+
rev: "22.6.0"
10+
hooks:
11+
- id: black

datatoys/__init__.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import os
2+
3+
import pandas as pd
4+
import pyreadr
5+
6+
7+
class Datatoy:
    """Easily install & load curated Republic of Korea public datasets from https://github.com/statgarten/datatoys"""

    DATATOYS_URL = "https://github.com/statgarten/datatoys"
    README_POSTFIX = "/blob/main/README.md"
    # Datasets are cached under `.datatoys` in the current working directory.
    DOWNLOAD_DIR = os.path.join(os.getcwd(), ".datatoys")
    # Header (Korean for "dataset") of the name column in the README manifest table.
    DATASET_HEADER_KO = "데이터셋"

    def __init__(self):
        self.__create_download_directory()
        # Fetched once at construction; requires network access to GitHub.
        self.manifest = self.get_manifest()

    def __create_download_directory(self):
        """Create the local download directory if it does not already exist."""
        os.makedirs(self.DOWNLOAD_DIR, exist_ok=True)

    def _dataset_downloaded(self, dataset_nm: str) -> bool:
        """Return True if `<dataset_nm>.rda` already exists in the download directory."""
        return os.path.exists(f"{self.DOWNLOAD_DIR}/{dataset_nm}.rda")

    def _dataset_in_manifest(self, dataset_nm: str) -> bool:
        """Return True if the dataset name appears in the manifest table."""
        return dataset_nm in self.get_manifest_dataset_names()

    def get_manifest(self) -> pd.DataFrame:
        """Fetch and return the dataset manifest table from the project README.

        Returns:
            pd.DataFrame: The single HTML table found in the README.

        Raises:
            ValueError: If the README does not contain exactly one table.
        """
        url = self.DATATOYS_URL + self.README_POSTFIX
        response = pd.read_html(url)
        # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
        if len(response) != 1 or not isinstance(response[0], pd.DataFrame):
            raise ValueError(f"Expected exactly one manifest table at {url}.")
        return response.pop()

    def get_manifest_dataset_names(self) -> list:
        """Return the dataset names listed in the manifest."""
        return self.manifest.loc[:, self.DATASET_HEADER_KO].tolist()

    def show_manifest(self):
        """Print the manifest table to stdout."""
        print(self.get_manifest())

    def install(self, dataset_nm: str) -> bool:
        """Install the dataset to the download directory.

        Args:
            dataset_nm (str): The name of the dataset to be installed.

        Returns:
            bool: True if successfully installed, otherwise False.

        Raises:
            ValueError: If the dataset is not in the manifest.
        """

        # GitHub serves the raw .rda blob when `?raw=true` is appended.
        remote_url = f"{self.DATATOYS_URL}/blob/main/data/{dataset_nm}.rda?raw=true"
        dst_path = f"{self.DOWNLOAD_DIR}/{dataset_nm}.rda"
        if not self._dataset_in_manifest(dataset_nm):
            raise ValueError(
                f"Dataset `{dataset_nm}` is not in the manifest. Check the manifest with `Datatoy().show_manifest()`."
            )
        try:
            pyreadr.download_file(remote_url, dst_path)
        except Exception as e:
            # Best-effort download: report the failure and signal it via the
            # return value, matching the documented contract.
            print(f"Exception occured while downloading {remote_url}", e)
            return False
        # Report actual on-disk success instead of asserting (asserts vanish under -O).
        return self._dataset_downloaded(dataset_nm)

    def load(self, dataset_nm: str) -> pd.DataFrame:
        """Load the dataset from the download directory.

        Calls `Datatoy().install()` if the dataset is not downloaded.

        Args:
            dataset_nm (str): The name of the dataset to be loaded.

        Returns:
            pd.DataFrame: The dataset contents.

        Raises:
            FileNotFoundError: If the dataset could not be installed.
            TypeError: If the .rda file does not hold a DataFrame under `dataset_nm`.
        """

        dst_path = f"{self.DOWNLOAD_DIR}/{dataset_nm}.rda"
        if not self._dataset_downloaded(dataset_nm):
            print(f"Dataset `{dataset_nm}` is not installed. Installing it first.")
            # Previously the install result was ignored, so a failed download
            # fell through to read_r on a missing file with a confusing error.
            if not self.install(dataset_nm):
                raise FileNotFoundError(f"Failed to install dataset `{dataset_nm}`.")
        res = pyreadr.read_r(dst_path)
        data = res.get(dataset_nm)
        if not isinstance(data, pd.DataFrame):
            raise TypeError(f"Dataset `{dataset_nm}` did not load as a DataFrame.")
        return data

    def clean(self, dataset_nm: str) -> bool:
        """Delete the dataset from the download directory.

        Args:
            dataset_nm (str): The name of the dataset to be deleted.

        Returns:
            bool: True if the dataset is deleted successfully, otherwise False.
        """

        dst_path = f"{self.DOWNLOAD_DIR}/{dataset_nm}.rda"
        if self._dataset_downloaded(dataset_nm):
            os.remove(dst_path)
            return True
        return False

    def clean_all(self):
        """Cleanup all datasets within the download directory."""

        for dataset_nm in self.get_manifest_dataset_names():
            self.clean(dataset_nm)
112+
if __name__ == "__main__":
    # Smoke test: show the manifest, load one dataset, then clean it up.
    dt = Datatoy()
    dt.show_manifest()
    dataset_nm = "karaoke"
    # Bug fix: was `dt.load("karaoke")`, bypassing the `dataset_nm` variable.
    df = dt.load(dataset_nm)
    print(df.head())
    dt.clean(dataset_nm)

0 commit comments

Comments
 (0)