Skip to content

Commit dc2d6bd

Browse files
committed
parse de TPU classes
1 parent 740074b commit dc2d6bd

31 files changed

+158261
-5554
lines changed

R/tpu.R renamed to R/tpu-assunto.R

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1-
tpu_download <- function(sig, path = "data-raw/tpu") {
1+
tpu_assunto_download <- function(sig, path = "data-raw/tpu/A") {
22
load("data/sgt_atual.rda")
33

44
u_tpu <- sgt_atual |>
55
dplyr::filter(sigla == sig) |>
66
dplyr::pull(link)
77

8-
file <- glue::glue("{path}/tpu_{sig}.html")
8+
file <- glue::glue("{path}/{sig}.html")
99

1010
r_tpu <- httr::GET(u_tpu, httr::write_disk(file, overwrite = TRUE))
1111
}
1212

13-
tpu_parse <- function(file) {
13+
tpu_assunto_parse <- function(file) {
1414
# pega todos os assuntos de nível 2 a 6
1515
html <- xml2::read_html(file) |>
1616
xml2::xml_find_first("//table") |>
@@ -90,10 +90,14 @@ tpu_parse <- function(file) {
9090
dplyr::select(!dplyr::contains("assunto")) |>
9191
ncol()
9292

93-
da_assunto1 <- assunto1_sem_id |>
93+
assunto1_ids <- assunto1_sem_id |>
94+
dplyr::filter(classe1) |>
9495
dplyr::mutate(
95-
id = ifelse(assunto1, 1:n_id, NA_integer_)
96-
) |>
96+
id = ifelse(classe1, 1:n_id, NA_integer_)
97+
)
98+
99+
da_assunto1 <- assunto1_sem_id |>
100+
dplyr::left_join(assunto1_ids) |>
97101
tidyr::fill(id, .direction="down") |>
98102
dplyr::group_by(id) |>
99103
dplyr::mutate(
@@ -157,7 +161,7 @@ tpu_parse <- function(file) {
157161
return(da)
158162
}
159163

160-
tpu_tidy <- function(da) {
164+
tpu_assunto_tidy <- function(da) {
161165
da |>
162166
dplyr::mutate(
163167
dplyr::across(

R/tpu-classe.R

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
# Download the TPU "classes" page for one court and save it to disk.
#
# Arguments:
#   sig  - value matched against the `sigla` column of `sgt_atual`.
#   path - directory that receives the file `<sig>.html`; created if missing.
#
# Returns (invisibly) the httr response object. Stops with an error when
# `sig` does not resolve to exactly one link.
tpu_classe_download <- function(sig, path = "data-raw/tpu/C") {
  # Brings `sgt_atual` into this function's environment (path is relative to
  # the working directory).
  load("data/sgt_atual.rda")

  u_tpu <- sgt_atual |>
    dplyr::filter(sigla == sig) |>
    dplyr::pull(link)

  # GET needs a single URL; fail early with a readable message otherwise.
  if (length(u_tpu) != 1L) {
    stop(
      "Expected exactly one link for sigla '", sig, "', found ",
      length(u_tpu), ".",
      call. = FALSE
    )
  }

  # write_disk() does not create missing directories.
  dir.create(path, recursive = TRUE, showWarnings = FALSE)

  file <- glue::glue("{path}/{sig}.html")

  r_tpu <- httr::GET(u_tpu, httr::write_disk(file, overwrite = TRUE))
  invisible(r_tpu)
}
12+
13+
# Parse a downloaded TPU "classes" HTML page into one flat tibble.
#
# Arguments:
#   file - path to the HTML file written by the download step.
#
# Returns a tibble with the columns classe1..classe6, codigo, codigo_pai,
# dispositivo_legal, artigo, sigla, alteracoes, glossario, dt_publicacao,
# dt_alteracao, dt_inativacao and dt_reativacao, all as text. Level-1 rows
# are placed immediately before their first child row.
tpu_classe_parse <- function(file) {
  # Parse the document once; both passes below read from its first table.
  tabela <- xml2::read_html(file) |>
    xml2::xml_find_first("//table")

  # Names given to <td> 1..16 of every level 2-6 row, in document order.
  campos <- c(
    "classe2", "classe3", "classe4", "classe5", "classe6",
    "codigo", "codigo_pai", "dispositivo_legal", "artigo", "sigla",
    "alteracoes", "glossario", "dt_publicacao", "dt_alteracao",
    "dt_inativacao", "dt_reativacao"
  )

  # Levels 2 to 6: every <tr> whose style attribute sets a background colour.
  linhas <- xml2::xml_find_all(
    tabela,
    ".//tr[contains(@style, 'background-color')]"
  )

  da_sem_classe1 <- purrr::map(linhas, function(linha) {
    valores <- purrr::map(seq_along(campos), function(k) {
      linha |>
        xml2::xml_find_all(sprintf("./td[%d]", k)) |>
        xml2::xml_text(trim = TRUE)
    })
    names(valores) <- campos
    tibble::tibble(classe1 = NA_character_, !!!valores)
  }) |>
    dplyr::bind_rows() |>
    # Even ids leave a free odd slot before each row for a level-1 row.
    dplyr::mutate(id = dplyr::row_number() * 2)

  # Level 1: highlighted <td> nodes that are direct children of the table.
  # A cell counts as a level-1 name when it has no lowercase letter or digit.
  classe1_sem_id <- tibble::tibble(
    txt = tabela |>
      xml2::xml_find_all(
        "./td[contains(@style, 'background-color:#4c83c8')]"
      ) |>
      xml2::xml_text(trim = TRUE)
  ) |>
    dplyr::mutate(
      txt = ifelse(txt == "", NA_character_, txt),
      classe1 = !stringr::str_detect(txt, "[a-z]|[0-9]")
    )

  # NA (empty cell) must not count as a level-1 marker.
  eh_nivel1 <- classe1_sem_id$classe1 %in% TRUE

  # Nothing to merge when the page has no level-1 cells.
  if (!any(eh_nivel1)) {
    return(dplyr::select(da_sem_classe1, -id))
  }

  # Each level-1 record is a run of 12 cells, in this order.
  nomes_nivel1 <- c(
    "classe1", "codigo", "codigo_pai", "dispositivo_legal", "artigo",
    "sigla", "alteracoes", "glossario", "dt_publicacao", "dt_alteracao",
    "dt_inativacao", "dt_reativacao"
  )

  da_classe1 <- classe1_sem_id |>
    # Running count of level-1 names; the cells after a name inherit its id.
    dplyr::mutate(
      id = ifelse(eh_nivel1, cumsum(eh_nivel1), NA_integer_)
    ) |>
    tidyr::fill(id, .direction = "down") |>
    dplyr::group_by(id) |>
    # Errors if a record does not have exactly 12 cells.
    dplyr::mutate(colname = nomes_nivel1) |>
    dplyr::ungroup() |>
    dplyr::select(txt, id, colname) |>
    tidyr::pivot_wider(values_from = txt, names_from = colname) |>
    dplyr::transmute(
      classe1,
      classe2 = "",
      classe3 = "",
      classe4 = "",
      classe5 = "",
      classe6 = "",
      codigo,
      codigo_pai,
      dispositivo_legal,
      artigo,
      sigla,
      alteracoes,
      glossario,
      dt_publicacao,
      dt_alteracao,
      dt_inativacao,
      dt_reativacao
    )

  # Insert each level-1 row just above its first child (odd id = child - 1).
  da <- da_sem_classe1
  for (cod in da_classe1$codigo) {
    ids_filhos <- da |>
      dplyr::filter(codigo_pai == cod) |>
      dplyr::pull(id)

    # NOTE(review): with no children, min() warns and yields Inf, which
    # sends the row to the end of the table - confirm this is acceptable.
    posicao_classe1 <- min(ids_filhos) - 1L

    row <- da_classe1 |>
      dplyr::filter(codigo == cod) |>
      dplyr::mutate(id = posicao_classe1)

    da <- da |>
      dplyr::bind_rows(row) |>
      dplyr::arrange(id)
  }

  dplyr::select(da, -id)
}
168+
169+
# Clean the text table produced by the parse step.
#
# Arguments:
#   da - data frame whose columns are all character.
#
# Returns `da` with: empty strings turned into NA; columns whose name
# contains "codigo" converted to integer; "dt_*" columns parsed as
# date-times (the placeholder "0000-00-00 00:00:00" becomes NA); and the
# "classe*" columns filled downwards, with any remaining NA shown as "-".
tpu_classe_tidy <- function(da) {
  da |>
    dplyr::mutate(
      # Empty cells are missing values, not empty strings.
      dplyr::across(
        dplyr::everything(),
        ~ifelse(.x == "", NA_character_, .x)
      ),
      dplyr::across(
        dplyr::contains("codigo"),
        ~as.integer(.x)
      ),
      # The all-zero timestamp is a placeholder and would not parse.
      dplyr::across(
        dplyr::starts_with("dt_"),
        ~ifelse(.x == "0000-00-00 00:00:00", NA_character_, .x)
      ),
      dplyr::across(
        dplyr::starts_with("dt_"),
        ~lubridate::ymd_hms(.x)
      )
    ) |>
    # NOTE(review): every classe* column is filled independently, so a row
    # at a shallow level keeps the deeper-level names of the rows above it;
    # confirm this is intended before relying on classe3..classe6 as a path.
    tidyr::fill(dplyr::contains("classe"), .direction = "down") |>
    dplyr::mutate(
      dplyr::across(
        dplyr::contains("classe"),
        ~tidyr::replace_na(.x, "-")
      )
    )
}

0 commit comments

Comments
 (0)