-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path02_scrape_base_sitemaps_newsfront.R
More file actions
59 lines (39 loc) · 1.99 KB
/
02_scrape_base_sitemaps_newsfront.R
File metadata and controls
59 lines (39 loc) · 1.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# 02: crawl every entry of sitemap_index to collect its base sitemaps
# Should be run directly on the server
# connect DB & sitemap index if necessary ####
source("01_create_sitemap_index_newsfront.R")
#' Crawl every sitemap index and append new or updated base sitemaps to the DB
#'
#' Reads each URL in `sitemap_index$loc`, extracts the `<sitemap>` entries
#' (`loc` and `lastmod`), keeps the ones modified after the most recent
#' `last_crawl` stored in the `base_sitemaps` table, and appends them to that
#' table.
#'
#' `sitemap_index` is not a parameter: it is resolved from the enclosing
#' environment and must provide the columns `loc` and `version_nf`
#' (presumably created by 01_create_sitemap_index_newsfront.R -- confirm).
#'
#' @param conn A DBI connection. When `NULL` (the default), the object named
#'   `conn` visible from the calling environment is used.
#' @param is_initial_scrape If `TRUE`, ignore any stored `last_crawl` and
#'   treat every crawled sitemap as new.
#' @return A tibble with the rows that were appended (`loc`, `lastmod`,
#'   `index_sitemap`, `version_nf`, `last_crawl`), or an empty tibble when
#'   `sitemap_index` has no rows (nothing is written in that case).
scrape_base_sitemaps_nf <- function(conn = NULL, is_initial_scrape = FALSE) {
  library(DBI)
  library(rvest)
  library(lubridate)

  # A default of `conn = conn` is a recursive default argument and errors as
  # soon as it is forced, so the fallback lookup is done explicitly here.
  if (is.null(conn)) {
    conn <- get0("conn", envir = parent.frame(), inherits = TRUE)
    if (is.null(conn)) {
      stop(
        "No database connection: pass `conn` or define `conn` before calling.",
        call. = FALSE
      )
    }
  }

  # determine the time of the last crawl: ####
  # Falls back to a date far in the past so that every sitemap counts as new
  # on an initial scrape, when the table is missing, or when it is empty.
  lastcrawl <- lubridate::ymd_hms("1900-01-01 00:00:01", tz = "UTC")
  if (!is_initial_scrape && "base_sitemaps" %in% DBI::dbListTables(conn)) {
    newest_row <- dplyr::tbl(conn, "base_sitemaps") %>%
      dplyr::arrange(dplyr::desc(last_crawl)) %>%
      head(1) %>%
      dplyr::collect()
    stored_crawl <- lubridate::ymd_hms(newest_row$last_crawl, tz = "UTC")
    if (length(stored_crawl) == 1 && !is.na(stored_crawl)) {
      lastcrawl <- stored_crawl
    }
  }

  # loop over sitemaps: ####
  n_index <- nrow(sitemap_index)
  if (n_index == 0) {
    print("sitemap_index is empty - nothing to crawl")
    return(dplyr::tibble())
  }

  # Collect one tibble per index and bind once instead of growing in the loop.
  crawled <- vector("list", n_index)
  for (index in seq_len(n_index)) {
    index_url <- sitemap_index$loc[index]
    pointer <- rvest::read_html(index_url)

    # Crawl each sitemap_index
    crawled[[index]] <- dplyr::tibble(
      loc = pointer %>%
        rvest::html_elements("sitemap loc") %>%
        rvest::html_text2(),
      lastmod = pointer %>%
        rvest::html_elements("sitemap lastmod") %>%
        rvest::html_text2() %>%
        lubridate::ymd_hms(., tz = "UTC"),
      index_sitemap = index_url,
      version_nf = sitemap_index$version_nf[index],
      # now() in UTC; re-parsing Sys.time() with ymd_hms() would relabel the
      # host's local wall-clock time as UTC.
      last_crawl = lubridate::now(tzone = "UTC")
    )

    print(paste(index, "/", n_index, "-", index_url, "saved."))
  }
  # add new rows for each newsfront version
  base_sitemaps <- dplyr::bind_rows(crawled)

  # filter non-updated sitemaps: ####
  new_sitemaps <- dplyr::filter(base_sitemaps, lastmod > lastcrawl)

  # append table in db: ####
  # ToDo: change to update lastmod, last_crawl where loc %in% new_sitemaps!
  # Non-character columns are stored as text.
  DBI::dbWriteTable(
    conn = conn,
    name = "base_sitemaps",
    value = dplyr::mutate(
      new_sitemaps,
      dplyr::across(.cols = !where(is.character), as.character)
    ),
    append = TRUE
  )
  print("Succesfully pushed new sitemaps to DB")

  return(new_sitemaps)
}
# Run the scrape with default arguments when this file is sourced and keep
# the function's return value in `new_sitemaps`.
new_sitemaps <- scrape_base_sitemaps_nf()