@@ -12,7 +12,7 @@ https://creativecommons.org/licenses/by-nc/4.0/
1212
1313## Load library packages
1414
15- ``` {r message=FALSE, warning=FALSE}
15+ ``` {r message=FALSE, warning=FALSE, include=FALSE }
1616library(tidyverse)
1717library(rvest)
1818library(htmltools)
@@ -103,15 +103,17 @@ in an HTML document that has tagging such as this:
103103<li ><a href =" /ecartico/persons/17296" >Anna Aaltse (1715 - 1738)</a ></li >
104104```
105105
106- I want to gather the text within the ` <li> ` tag: ** Anna Aaltse (1715 - 1738)**
106+ I want to gather the text within the `<li>` tag, e.g. **Anna Aaltse (1715 - 1738)**
107+
108+ You can use the [Selector Gadget](https://selectorgadget.com/) to help you identify the HTML tags and CSS selectors.
107109
108110#### CODE
109111
110112Using the ` html_nodes() ` and ` html_text() ` functions, I can retrieve all the text within ` <li></li> ` tags.
111113
112114``` {r}
113115names <- results %>%
114- html_nodes("li ") %>%
116+ html_nodes("#setwidth li a ") %>%
115117 html_text()
116118
117119names
@@ -137,7 +139,7 @@ Using the `html_nodes()` and `html_attr()` functions, I can retrieve all the att
137139
138140``` {r}
139141url <- results %>%
140- html_nodes("li a") %>%
142+ html_nodes("#setwidth li a") %>%
141143 html_attr("href")
142144
143145url
@@ -172,7 +174,7 @@ Create some new variables with `mutate`. Build a **full** URL from the _relativ
172174``` {r}
173175urls_to_crawl_df <- results_df %>%
174176 mutate(full_url = glue::glue("http://www.vondel.humanities.uva.nl{url}")) %>%
175- mutate(full_url = str_replace_all(full_url, "\\.\\.", "")) %>%
177+ # mutate(full_url = str_replace_all(full_url, "\\.\\.", "")) %>%
176178 select(full_url)
177179
178180urls_to_crawl_df
@@ -226,7 +228,7 @@ Parse the **text** of the navigation bar.
226228
227229``` {r}
228230results %>% #html_nodes("div.subnav")
229- html_nodes("div .subnav a ") %>%
231+ html_nodes("form+ .subnav") %>%
230232 html_text()
231233
232234```
@@ -235,7 +237,7 @@ Parse the HTML _href_ **attribute** to get the URL.
235237
236238``` {r loop}
237239navigation <- results %>%
238- html_nodes("div .subnav a") %>%
240+ html_nodes("form+ .subnav a") %>%
239241 html_attr("href")
240242
241243navigation
@@ -272,7 +274,6 @@ Below, use `dplyr::distinct()` and `stringr::string_extract()` among other tidyv
272274``` {r}
273275nav_df <- nav_df %>%
274276 filter(str_detect(navigation, "&page=")) %>%
275- distinct(navigation) %>%
276277 mutate(page_no = str_extract(navigation, "\\d+$")) %>%
277278 mutate(page_no = as.numeric(page_no))
278279
@@ -369,6 +370,7 @@ nav_results_list <- tibble(
369370
370371nav_results_list
371372```
373+
372374Above, I have three rows of _lists_; each list is the `read_html()` result of a summary results page, i.e. each list has 50 URLs and the text of my eventual targets.
373375
374376- ` nav_results_list$summary_url ` is the URL for each summary page.
@@ -383,12 +385,12 @@ results_by_page <- tibble(summary_url = nav_results_list$summary_url,
383385 url =
384386 map(nav_results_list$html_results,
385387 ~ .x %>%
386- html_nodes("ul li a") %>%
388+ html_nodes("#setwidth li a") %>%
387389 html_attr("href")),
388390 name =
389391 map(nav_results_list$html_results,
390392 ~ .x %>%
391- html_nodes("ul li a") %>%
393+ html_nodes("#setwidth li a") %>%
392394 html_text()
393395 )
394396 )
@@ -403,9 +405,7 @@ When I unnest the nested _list_, I then have a single tibble with 150 URLs and 1
403405
404406``` {r}
405407results_by_page %>%
406- unnest(cols = c(url, name)) %>%
407- filter(!str_detect(name, "ECARTICO")) %>%
408- filter(!str_detect(name, "^\\+"))
408+ unnest(cols = c(url, name))
409409
410410```
411411
@@ -447,26 +447,10 @@ The information gathered is information from the detailed names page about the c
447447
448448emanuel <- read_html("http://www.vondel.humanities.uva.nl/ecartico/persons/10579")
449449
450- child_filter <- emanuel %>%
451- html_nodes("ul li a") %>%
452- html_attr("rel") %>%
453- as_tibble() %>%
454- mutate(id = row_number()) %>%
455- filter(str_detect(value, "children"))
456- child_filter
457-
458- child_text <- emanuel %>%
459- html_nodes("ul li a") %>%
460- html_text() %>%
461- as_tibble() %>%
462- rename(text = value) %>%
463- mutate(id = row_number()) %>%
464- inner_join(child_filter)
465-
466- child_text %>%
467- pull(text)
468-
469-
450+ children_name <- emanuel %>%
451+ html_nodes("ul~ h2+ ul li > a") %>%
452+ html_text()
453+ children_name
470454```
471455#### Iterate
472456
0 commit comments