Skip to content

Commit 4d23895

Browse files
committed
Refactor download pipelines anatel
1 parent 04c17bc commit 4d23895

File tree

4 files changed

+120
-115
lines changed

4 files changed

+120
-115
lines changed

pipelines/utils/crawler_anatel/banda_larga_fixa/constants.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,35 @@ class constants(Enum): # pylint: disable=c0103
1313
"https://dados.gov.br/dados/conjuntos-dados/acessos---banda-larga-fixa"
1414
)
1515

16+
COOKIES = {
17+
"_ga": "GA1.1.1373815678.1744670764",
18+
"SLG_G_WPT_TO": "pt",
19+
"SLG_GWPT_Show_Hide_tmp": "1",
20+
"SLG_wptGlobTipTmp": "1",
21+
"AWSALB": "VwPXBjkh3JMyPZnflBxvTLuhZrtgjJkUOAF2o3DLzPw91FLKfa46btmQBRooIJLmWoHj9ZgmprqHSkzmemH3wx1m9IXbzyemVzcLKYb9AioQ8F7vXVf/VtIf8Chu",
22+
"AWSALBCORS": "VwPXBjkh3JMyPZnflBxvTLuhZrtgjJkUOAF2o3DLzPw91FLKfa46btmQBRooIJLmWoHj9ZgmprqHSkzmemH3wx1m9IXbzyemVzcLKYb9AioQ8F7vXVf/VtIf8Chu",
23+
"_ga_HVQVE1EE4Y": "GS1.1.1744751022.3.1.1744751036.46.0.0",
24+
"_ga_YEVH28106Q": "GS1.1.1744751022.3.1.1744751036.0.0.0",
25+
"_ga_Q5P3VN4T0E": "GS1.1.1744751021.3.1.1744751036.0.0.0",
26+
"_ga_PZGRWZP59S": "GS1.1.1744751021.3.1.1744751037.0.0.0",
27+
"_ga_3TJ75C1VW5": "GS1.1.1744751022.3.1.1744751037.0.0.0",
28+
}
29+
30+
HEADERS = {
31+
"accept": "application/json, text/plain, */*",
32+
"accept-language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
33+
"cookie": "_ga=GA1.1.1373815678.1744670764; SLG_G_WPT_TO=pt; SLG_GWPT_Show_Hide_tmp=1; SLG_wptGlobTipTmp=1; AWSALB=VwPXBjkh3JMyPZnflBxvTLuhZrtgjJkUOAF2o3DLzPw91FLKfa46btmQBRooIJLmWoHj9ZgmprqHSkzmemH3wx1m9IXbzyemVzcLKYb9AioQ8F7vXVf/VtIf8Chu; AWSALBCORS=VwPXBjkh3JMyPZnflBxvTLuhZrtgjJkUOAF2o3DLzPw91FLKfa46btmQBRooIJLmWoHj9ZgmprqHSkzmemH3wx1m9IXbzyemVzcLKYb9AioQ8F7vXVf/VtIf8Chu; _ga_HVQVE1EE4Y=GS1.1.1744751022.3.1.1744751036.46.0.0; _ga_YEVH28106Q=GS1.1.1744751022.3.1.1744751036.0.0.0; _ga_Q5P3VN4T0E=GS1.1.1744751021.3.1.1744751036.0.0.0; _ga_PZGRWZP59S=GS1.1.1744751021.3.1.1744751037.0.0.0; _ga_3TJ75C1VW5=GS1.1.1744751022.3.1.1744751037.0.0.0",
34+
"priority": "u=1, i",
35+
"referer": "https://dados.gov.br/dados/conjuntos-dados/acessos---banda-larga-fixa",
36+
"sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Opera GX";v="117"',
37+
"sec-ch-ua-mobile": "?0",
38+
"sec-ch-ua-platform": '"Windows"',
39+
"sec-fetch-dest": "empty",
40+
"sec-fetch-mode": "cors",
41+
"sec-fetch-site": "same-origin",
42+
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 OPR/117.0.0.0",
43+
}
44+
1645
INPUT_PATH = "/tmp/data/input/"
1746

1847
TABLES_OUTPUT_PATH = {

pipelines/utils/crawler_anatel/banda_larga_fixa/utils.py

Lines changed: 30 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,84 +1,50 @@
11
# -*- coding: utf-8 -*-
2+
import gc
23
import os
3-
import time
44
from zipfile import ZipFile
55

66
import numpy as np
77
import pandas as pd
8-
from selenium import webdriver
9-
from selenium.webdriver.common.by import By
10-
from selenium.webdriver.support import expected_conditions as EC
11-
from selenium.webdriver.support.ui import WebDriverWait
8+
import requests
129

1310
from pipelines.utils.crawler_anatel.banda_larga_fixa.constants import (
1411
constants as anatel_constants,
1512
)
1613
from pipelines.utils.utils import log, to_partitions
1714

1815

19-
def download_zip_file(path=None):
    """
    Download the "acessos banda larga fixa" archive from dados.gov.br.

    The dataset metadata is requested from the public dados.gov.br API, the
    resource whose ``format`` is ``"ZIP"`` is selected (the last one when
    several match) and its content is streamed to
    ``acessos_banda_larga_fixa.zip`` inside the target directory.

    Args:
        path (str | None): Directory where the ZIP file is saved. Defaults to
            ``anatel_constants.INPUT_PATH.value``. The directory is created
            when it does not exist.

    Raises:
        requests.HTTPError: If the metadata request or the file download
            answers with an HTTP error status.
        ValueError: If the dataset metadata lists no ZIP resource.
    """
    target_dir = anatel_constants.INPUT_PATH.value if path is None else path
    os.makedirs(target_dir, exist_ok=True)

    # requests never times out by default; bound every network wait so a
    # stalled connection cannot hang the pipeline forever.
    request_timeout = 300

    response = requests.get(
        "https://dados.gov.br/api/publico/conjuntos-dados/acessos---banda-larga-fixa",
        cookies=anatel_constants.COOKIES.value,
        headers=anatel_constants.HEADERS.value,
        timeout=request_timeout,
    )
    # Fail here instead of trying to parse an error page as JSON.
    response.raise_for_status()

    download_url = None
    for recurso in response.json()["resources"]:
        if recurso["format"] == "ZIP":
            # Keep overwriting so the last matching resource wins.
            download_url = recurso["url"]

    if download_url is None:
        raise ValueError(
            "No ZIP resource found in dataset 'acessos---banda-larga-fixa'"
        )

    print(f"Baixando {download_url} em {target_dir}")

    zip_file_path = os.path.join(target_dir, "acessos_banda_larga_fixa.zip")
    with requests.get(
        download_url,
        cookies=anatel_constants.COOKIES.value,
        headers=anatel_constants.HEADERS.value,
        stream=True,
        timeout=request_timeout,
    ) as download:
        # Check the status before touching the disk so an error body is never
        # saved as if it were the archive.
        download.raise_for_status()
        with open(zip_file_path, "wb") as file:
            # Stream in 1 MiB chunks rather than holding the archive in memory.
            for chunk in download.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
7843

7944

8045
def unzip_file():
81-
download_zip_file(path=anatel_constants.INPUT_PATH.value)
46+
os.makedirs(anatel_constants.INPUT_PATH.value, exist_ok=True)
47+
download_zip_file()
8248
zip_file_path = os.path.join(
8349
anatel_constants.INPUT_PATH.value, "acessos_banda_larga_fixa.zip"
8450
)
@@ -89,6 +55,9 @@ def unzip_file():
8955
except Exception as e:
9056
print(f"Erro ao baixar ou extrair o arquivo ZIP: {str(e)}")
9157

58+
os.remove(zip_file_path)
59+
gc.collect()
60+
9261

9362
def check_and_create_column(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
9463
"""

pipelines/utils/crawler_anatel/telefonia_movel/constants.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,35 @@ class constants(Enum): # pylint: disable=c0103
1111
Constant values for the br_anatel_telefonia_movel project
1212
"""
1313

14+
COOEKIES = {
15+
"_ga": "GA1.1.1373815678.1744670764",
16+
"SLG_G_WPT_TO": "pt",
17+
"SLG_GWPT_Show_Hide_tmp": "1",
18+
"SLG_wptGlobTipTmp": "1",
19+
"AWSALB": "uNE7TsXIJeZE89WxJSn4deQ0LzA78gEi6OQY5DsOky3dxNNqzXdeZvhxAyskH0YwYUYdc1oJIDJKMIIoFm7cv0Q4Ox8l/wwPrnLuh9aeNQG5DV2hgLSpqIikwsWf",
20+
"AWSALBCORS": "uNE7TsXIJeZE89WxJSn4deQ0LzA78gEi6OQY5DsOky3dxNNqzXdeZvhxAyskH0YwYUYdc1oJIDJKMIIoFm7cv0Q4Ox8l/wwPrnLuh9aeNQG5DV2hgLSpqIikwsWf",
21+
"_ga_HVQVE1EE4Y": "GS1.1.1744841164.6.1.1744841288.59.0.0",
22+
"_ga_YEVH28106Q": "GS1.1.1744841164.6.1.1744841288.0.0.0",
23+
"_ga_Q5P3VN4T0E": "GS1.1.1744841164.6.1.1744841288.0.0.0",
24+
"_ga_PZGRWZP59S": "GS1.1.1744841164.6.1.1744841288.0.0.0",
25+
"_ga_3TJ75C1VW5": "GS1.1.1744841164.6.1.1744841288.0.0.0",
26+
}
27+
28+
HEADERS = {
29+
"accept": "application/json, text/plain, */*",
30+
"accept-language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
31+
# 'cookie': '_ga=GA1.1.1373815678.1744670764; SLG_G_WPT_TO=pt; SLG_GWPT_Show_Hide_tmp=1; SLG_wptGlobTipTmp=1; AWSALB=uNE7TsXIJeZE89WxJSn4deQ0LzA78gEi6OQY5DsOky3dxNNqzXdeZvhxAyskH0YwYUYdc1oJIDJKMIIoFm7cv0Q4Ox8l/wwPrnLuh9aeNQG5DV2hgLSpqIikwsWf; AWSALBCORS=uNE7TsXIJeZE89WxJSn4deQ0LzA78gEi6OQY5DsOky3dxNNqzXdeZvhxAyskH0YwYUYdc1oJIDJKMIIoFm7cv0Q4Ox8l/wwPrnLuh9aeNQG5DV2hgLSpqIikwsWf; _ga_HVQVE1EE4Y=GS1.1.1744841164.6.1.1744841288.59.0.0; _ga_YEVH28106Q=GS1.1.1744841164.6.1.1744841288.0.0.0; _ga_Q5P3VN4T0E=GS1.1.1744841164.6.1.1744841288.0.0.0; _ga_PZGRWZP59S=GS1.1.1744841164.6.1.1744841288.0.0.0; _ga_3TJ75C1VW5=GS1.1.1744841164.6.1.1744841288.0.0.0',
32+
"priority": "u=1, i",
33+
"referer": "https://dados.gov.br/dados/conjuntos-dados/acessos-autorizadas-smp",
34+
"sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Opera GX";v="117"',
35+
"sec-ch-ua-mobile": "?0",
36+
"sec-ch-ua-platform": '"Windows"',
37+
"sec-fetch-dest": "empty",
38+
"sec-fetch-mode": "cors",
39+
"sec-fetch-site": "same-origin",
40+
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 OPR/117.0.0.0",
41+
}
42+
1443
URL = "https://dados.gov.br/dados/conjuntos-dados/acessos-autorizadas-smp"
1544

1645
INPUT_PATH = "/tmp/data/input/"

pipelines/utils/crawler_anatel/telefonia_movel/utils.py

Lines changed: 32 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -4,85 +4,63 @@
44
"""
55
# pylint: disable=too-few-public-methods,invalid-name
66

7+
import gc
78
import os
8-
import time
99
from zipfile import ZipFile
1010

1111
import pandas as pd
12-
from selenium import webdriver
13-
from selenium.webdriver.common.by import By
14-
from selenium.webdriver.support import expected_conditions as EC
15-
from selenium.webdriver.support.ui import WebDriverWait
12+
import requests
1613

1714
from pipelines.utils.crawler_anatel.telefonia_movel.constants import (
1815
constants as anatel_constants,
1916
)
2017
from pipelines.utils.utils import log, to_partitions
2118

2219

23-
def download_zip_file(path=None):
    """
    Download the "acessos autorizadas SMP" archive from dados.gov.br.

    The dataset metadata is requested from the public dados.gov.br API, the
    resource whose ``format`` is ``"CSV"`` is selected (the last one when
    several match) and its content is streamed to
    ``acessos_telefonia_movel.zip`` inside the target directory.

    Args:
        path (str | None): Directory where the file is saved. Defaults to
            ``anatel_constants.INPUT_PATH.value``. The directory is created
            when it does not exist.

    Raises:
        requests.HTTPError: If the metadata request or the file download
            answers with an HTTP error status.
        ValueError: If the dataset metadata lists no CSV resource.
    """
    target_dir = anatel_constants.INPUT_PATH.value if path is None else path
    os.makedirs(target_dir, exist_ok=True)

    # requests never times out by default; bound every network wait so a
    # stalled connection cannot hang the pipeline forever.
    request_timeout = 300

    # NOTE(review): "COOEKIES" looks like a typo of "COOKIES", but it is the
    # attribute name this function has always read from the constants enum;
    # rename both sides together if it gets fixed.
    cookies = anatel_constants.COOEKIES.value
    headers = anatel_constants.HEADERS.value

    response = requests.get(
        "https://dados.gov.br/api/publico/conjuntos-dados/acessos-autorizadas-smp",
        cookies=cookies,
        headers=headers,
        timeout=request_timeout,
    )
    # Fail here instead of trying to parse an error page as JSON.
    response.raise_for_status()

    # NOTE(review): the resource is picked by format "CSV" although the
    # payload is stored with a .zip name - confirm against the API metadata.
    download_url = None
    for recurso in response.json()["resources"]:
        if recurso["format"] == "CSV":
            # Keep overwriting so the last matching resource wins.
            download_url = recurso["url"]

    if download_url is None:
        raise ValueError(
            "No CSV resource found in dataset 'acessos-autorizadas-smp'"
        )

    print(f"Baixando {download_url} em {target_dir}")

    zip_file_path = os.path.join(target_dir, "acessos_telefonia_movel.zip")
    with requests.get(
        download_url,
        cookies=cookies,
        headers=headers,
        stream=True,
        timeout=request_timeout,
    ) as download:
        # Check the status before touching the disk so an error body is never
        # saved as if it were the archive.
        download.raise_for_status()
        with open(zip_file_path, "wb") as file:
            # Stream in 1 MiB chunks rather than holding the archive in memory.
            for chunk in download.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
7146

7247

7348
def unzip_file():
    """
    Download the telefonia movel archive and extract it into the input folder.

    The input directory is created when missing, the archive is downloaded
    with ``download_zip_file`` and every member is extracted next to it.
    Extraction errors are printed and not re-raised. The ZIP file is then
    removed and a garbage collection pass is triggered.
    """
    input_dir = anatel_constants.INPUT_PATH.value
    os.makedirs(input_dir, exist_ok=True)
    download_zip_file()

    zip_file_path = os.path.join(input_dir, "acessos_telefonia_movel.zip")

    try:
        with ZipFile(zip_file_path, "r") as archive:
            archive.extractall(input_dir)
    except Exception as err:
        # Best effort: report the failure and carry on to the cleanup below.
        print(f"Erro ao baixar ou extrair o arquivo ZIP: {err!s}")

    # The archive is no longer needed once extraction has been attempted.
    os.remove(zip_file_path)
    gc.collect()
63+
8664

8765
# ! TASK MICRODADOS
8866
def clean_csv_microdados(ano, semestre, table_id):

0 commit comments

Comments
 (0)