Commit 534c38e

fix(sinan): split sinan continue (#213)
* PySUS refactoring
* Minor fixes
* Linter
* Fix SINAN tests
* Skip tests where Google returned a response with code 429
1 parent 3516670 commit 534c38e

File tree: 6 files changed, +55 -59 lines


epigraphhub/data/brasil/sinan/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -1,8 +1,8 @@
 import unicodedata
 
-from pysus.online_data import SINAN
+from pysus.online_data import FTP_SINAN
 
-DISEASES = SINAN.agravos
+DISEASES = FTP_SINAN.diseases
 
 
 def normalize_str(disease: str) -> str:
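
A minimal usage sketch of the renamed mapping (not part of the commit). It assumes `FTP_SINAN.diseases` plays the same role as the old `SINAN.agravos` collection of disease names, and that `normalize_str` produces the lowercase ASCII name used for the `sinan_<disease>_m` tables built in `upload()` below:

from epigraphhub.data.brasil.sinan import DISEASES, normalize_str

disease = "Animais Peçonhentos"
# Assumption: DISEASES lists the human-readable disease names accepted downstream.
print(disease in DISEASES)
# Table-naming convention used by upload(): "sinan_" + normalize_str(disease) + "_m"
print("sinan_" + normalize_str(disease) + "_m")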

epigraphhub/data/brasil/sinan/extract.py

Lines changed: 11 additions & 7 deletions

@@ -1,27 +1,31 @@
-import os
-from pathlib import Path
-
 import pandas as pd
 from loguru import logger
-from pysus import SINAN
+from pysus.online_data import SINAN
 
 from epigraphhub.data._config import PYSUS_DATA_PATH, SINAN_LOG_PATH
 
 logger.add(SINAN_LOG_PATH, retention="7 days")
 
 
-def download(disease: str, years: list = None) -> None:
+def download(disease: str, years: list) -> list:
     """
     Download all parquets available for a disease,
     according to `SINAN.agravos`.
 
     Attrs:
         disease (str): The disease to be downloaded.
+        years (list): The years to be downloaded.
+    Returns:
+        A list with full paths of parquet dirs to upload into db
     """
 
-    SINAN.download_parquets(disease, years, data_path=PYSUS_DATA_PATH)
+    parquets_dirs = SINAN.download(
+        disease=disease, years=years, data_path=PYSUS_DATA_PATH
+    )
+
+    logger.info(f"Disease {disease} for years {years} downloaded at {PYSUS_DATA_PATH}")
 
-    logger.info(f"All years for {disease} downloaded at {PYSUS_DATA_PATH}")
+    return parquets_dirs
 
 
 def metadata_df(disease: str) -> pd.DataFrame:
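
A short sketch of the new `download` signature (illustrative, not part of the commit), mirroring the updated call in `tests/test_data/test_sinan_fetch.py`; the disease name and year are example values:

from epigraphhub.data.brasil.sinan import extract

# Years are now passed explicitly and the parquet directories are returned,
# so the caller can hand each one to upload() instead of rescanning PYSUS_DATA_PATH.
parquet_dirs = extract.download(disease="Zika", years=[2019])
for parquet_dir in parquet_dirs:
    print(parquet_dir)  # e.g. a path such as ~/pysus/ZIKABR19.parquet (illustrative)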
Lines changed: 34 additions & 44 deletions

@@ -1,14 +1,11 @@
 import os
-from pathlib import Path
 
-import pandas as pd
-from pysus import SINAN
 from loguru import logger
 from pangres import upsert
-from pysus.classes.sinan import Disease
+from pysus.online_data import parquets_to_dataframe
 
 from epigraphhub.connection import get_engine
-from epigraphhub.data._config import SINAN_LOG_PATH, PYSUS_DATA_PATH
+from epigraphhub.data._config import SINAN_LOG_PATH
 from epigraphhub.settings import env
 
 from . import normalize_str
@@ -17,46 +14,39 @@
 engine = get_engine(credential_name=env.db.default_credential)
 
 
-def upload(disease: str, data_path: str = PYSUS_DATA_PATH):
+def upload(disease: str, parquet_dir: str) -> None:
     """
     Connects to the EpiGraphHub SQL server and load parquet chunks within
-    directories, extracted using `extract.download`, into database. Receives
-    a disease and look for local parquets paths in PYSUS_DATA_PATH, extract theirs
-    DataFrames and upsert rows to Postgres connection following EGH table
-    convention, see more in EGH's documentation:
+    directories, extracted using `extract.download`, into database. a local
+    parquet dir (eg. ~/pysus/ZIKABR19.parquet), extract theirs DataFrames
+    and upsert rows to Postgres connection following EGH table convention,
+    see more in EGH's documentation:
     https://epigraphhub.readthedocs.io/en/latest/instruction_name_tables.html#about-metadata-tables
     """
-    disease_years = Disease(disease).get_years(stage='all')
-
-    for year in disease_years:
-        df = SINAN.parquets_to_df(disease, year, data_path)
-        if not df.empty:
-            df.columns = df.columns.str.lower()
-            df.index.name = "index"
-
-            tablename = "sinan_" + normalize_str(disease) + "_m"
-            schema = "brasil"
-
-            print(f"Inserting {disease}-{year} on {schema}.{tablename}")
-
-            with engine.connect() as conn:
-                try:
-                    upsert(
-                        con=conn,
-                        df=df,
-                        table_name=tablename,
-                        schema=schema,
-                        if_row_exists="update",
-                        chunksize=1000,
-                        add_new_columns=True,
-                        create_table=True,
-                    )
-
-                    print(f"Table {tablename} updated")
-
-                except Exception as e:
-                    logger.error(f"Not able to upsert {tablename} \n{e}")
-                    raise e
-        else:
-            print(f'[WARNING] No data for {disease} and year {year}. Skipping')
-            continue
+    if any(os.listdir(parquet_dir)):
+        df = parquets_to_dataframe(parquet_dir=parquet_dir)
+        df.columns = df.columns.str.lower()
+        df.index.name = "index"
+
+        tablename = "sinan_" + normalize_str(disease) + "_m"
+        schema = "brasil"
+        print(f"Inserting {parquet_dir} on {schema}.{tablename}")
+
+        with engine.connect() as conn:
+            try:
+                upsert(
+                    con=conn,
+                    df=df,
+                    table_name=tablename,
+                    schema=schema,
+                    if_row_exists="update",
+                    chunksize=1000,
+                    add_new_columns=True,
+                    create_table=True,
+                )
+
+                print(f"Table {tablename} updated")
+
+            except Exception as e:
+                logger.error(f"Not able to upsert {tablename} \n{e}")
+                raise e
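
Taken together with `extract.download`, the refactored `upload` now processes one parquet directory per call instead of looping over all years itself. A minimal sketch of the intended flow; the module that defines `upload()` is not named in this view, so the `load` import path below is a hypothetical placeholder:

from epigraphhub.data.brasil.sinan import extract
from epigraphhub.data.brasil.sinan import load  # hypothetical module name for upload()

disease = "Zika"
for parquet_dir in extract.download(disease=disease, years=[2019]):
    # Each non-empty parquet dir is upserted into brasil.sinan_zika_m
    # (table name built with normalize_str, as in upload() above).
    load.upload(disease=disease, parquet_dir=parquet_dir)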

epigraphhub/data/brasil/sinan/viz.py

Lines changed: 3 additions & 3 deletions

@@ -1,6 +1,6 @@
 import pandas as pd
 from loguru import logger
-from pysus import SINAN
+from pysus.online_data import parquets_to_dataframe
 
 from epigraphhub.connection import get_engine
 from epigraphhub.data._config import SINAN_LOG_PATH
@@ -13,7 +13,7 @@
 engine = get_engine(credential_name=env.db.default_credential)
 
 
-def parquet(disease: str, year: str|int) -> pd.DataFrame:
+def parquet(parquets_dir: str) -> pd.DataFrame:
     """
     Convert the parquet files into a pandas DataFrame.
 
@@ -27,7 +27,7 @@ def parquet(disease: str, year: str|int) -> pd.DataFrame:
         df (DataFrame) : A Pandas DataFrame.
     """
 
-    df = SINAN.parquet_to_df(disease, year)
+    df = parquets_to_dataframe(parquet_dir=parquets_dir)
     df.columns = df.columns.str.lower()
 
     return df
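
`viz.parquet` now takes a parquet directory directly instead of a disease/year pair. A small sketch mirroring the updated test; the path is illustrative, borrowed from the example in the `upload()` docstring:

from pathlib import Path
from epigraphhub.data.brasil.sinan import viz

fpath = Path.home() / "pysus" / "ZIKABR19.parquet"  # illustrative parquet dir
df = viz.parquet(fpath)
print(df.shape)  # columns come back lowercased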

tests/test_data/test_ggtrends.py

Lines changed: 2 additions & 0 deletions

@@ -20,6 +20,7 @@ def test_historical_interest():
     assert not df.empty
 
 
+@pytest.mark.skip(reason="Google returned a response with code 429.")
 def test_interest_over_time():
     keywords = ["coronavirus", "covid"]
     iot_df = ggtrends.interest_over_time(keywords)
@@ -34,6 +35,7 @@ def test_interest_region():
     assert df.index.name == "geoName"
 
 
+@pytest.mark.skip(reason="Google returned a response with code 429.")
 def test_related_topics():
     keywords = ["coronavirus", "covid"]
     d = ggtrends.related_topics(keywords)

tests/test_data/test_sinan_fetch.py

Lines changed: 3 additions & 3 deletions

@@ -22,20 +22,20 @@ def setUp(self):
         self.schema = "brasil"
 
     def test_download_data_zika(self):
-        extract.download(self.disease)
+        extract.download(disease=self.disease, years=self.year)
         self.assertTrue(any(os.listdir(self.data_dir)))
         self.assertTrue(self.file[0] in os.listdir(self.data_dir))
 
     def test_parquet_visualization(self):
         fpath = Path(self.data_dir) / self.file[0]
-        df = viz.parquet(fpath, clean_after_read=False)
+        df = viz.parquet(fpath)
         self.assertIsInstance(df, pd.DataFrame)
         self.assertEqual(df.shape, (32684, 38))
 
     def test_metadata_extraction(self):
         anim_metadata = extract.metadata_df("Animais Peçonhentos")
         self.assertTrue(isinstance(anim_metadata, pd.DataFrame))
-        self.assertEqual(anim_metadata.shape, (58, 7))
+        self.assertEqual(anim_metadata.shape, (59, 7))
         self.assertEqual(
             list(anim_metadata.columns),
             [
