-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path03_scrape_sitemaps_newsfront.R
More file actions
59 lines (43 loc) · 2.65 KB
/
03_scrape_sitemaps_newsfront.R
File metadata and controls
59 lines (43 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# 03 Crawl base_sitemaps for url_list
# connect DB as conn ####
source("00_connect_DB_newsfront.R")
library(DBI)
library(rvest)
library(lubridate)
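# Prerequisite: `new_sitemaps` must already exist in the session before the
# crawl below runs. Uncomment the next line to get new_sitemaps and update
# base_sitemaps first:
# source("02_scrape_base_sitemaps_newsfront.R")
# old: new_sitemaps <- DBI::dbReadTable(conn, "base_sitemaps")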
# get new pages: #####
# Crawl every sitemap listed in `new_sitemaps` and append the URLs found in it
# to the `url_list` table of the database behind `conn`.
# Reads `new_sitemaps$loc` and `new_sitemaps$version_nf`; both `conn` and
# `new_sitemaps` are expected to be defined before this point.

# XPath prefix shared by the <loc> and <lastmod> lookups
url_nodes_xpath <- "//url[not(image)]"

# randomly mix to avoid DDoS-Guard
new_sitemaps <- new_sitemaps %>% slice_sample(., prop = 1)
n_sitemaps <- nrow(new_sitemaps)

# seq_len() instead of 1:n, so an empty `new_sitemaps` skips the loop entirely
for (index in seq_len(n_sitemaps)) {
  sitemap_url <- new_sitemaps$loc[index]

  # Crawl each sitemap_index.
  # The download happens inside the handler as well: a single unreachable or
  # blocked sitemap must be reported and skipped, not abort the whole crawl.
  sitemaps_crawl <- tryCatch(
    {
      pointer <- rvest::read_html(sitemap_url)

      lastmod_parsed <- pointer %>%
        html_elements(xpath = paste0(url_nodes_xpath, "/lastmod")) %>%
        html_text2() %>%
        lubridate::ymd_hms(., tz = "UTC")

      if (stringr::str_detect(sitemap_url, "page-sitemap")) {
        # some sitemaps need modification: add first-row entry for these
        lastmod_parsed <- c(
          Sys.time() %>% lubridate::ymd_hms(., tz = "UTC"),
          lastmod_parsed
        )
      }

      # tibble() raises an error when `loc` and `lastmod` differ in length;
      # that case is handled below like any other crawl failure.
      tibble(
        loc = pointer %>%
          html_elements(xpath = paste0(url_nodes_xpath, "/loc")) %>%
          html_text2(),
        lastmod = lastmod_parsed,
        base_sitemap = sitemap_url,
        version_nf = new_sitemaps$version_nf[index],
        # NOTE(review): Sys.time() is re-parsed with tz = "UTC", which looks
        # like it labels the local clock time as UTC on machines that are not
        # running in UTC - confirm before relying on `last_crawl`.
        last_crawl = Sys.time() %>% lubridate::ymd_hms(., tz = "UTC")
      )
    },
    error = function(e) {
      # keep the underlying reason visible, as try() used to print it
      message(conditionMessage(e))
      NULL
    }
  )

  if (is.null(sitemaps_crawl)) {
    print(paste("Error saving", sitemap_url))
    next
  }

  # Later: filter for new entries: #####
  #lastcrawl = ifelse("url_list" %in% DBI::dbListTables(conn),
  # (tbl(conn, "new_sitemaps") %>% dplyr::arrange(desc(last_crawl)) %>% head(1) %>% collect())$last_crawl %>% lubridate::ymd_hms(., tz = "UTC"),
  # lubridate::ymd_hms("1900-01-01 00:00:01", tz = "UTC")
  #) %>% as_datetime()
  # new_pages <- dplyr::filter(sitemaps, lastmod > lastcrawl)

  # append table in db: #####
  # Every non-character column is converted to character before writing;
  # where() is required because bare predicates are deprecated in tidyselect.
  DBI::dbWriteTable(
    conn = conn,
    name = "url_list",
    value = sitemaps_crawl %>%
      dplyr::mutate(across(.cols = !where(is.character), .fns = as.character)),
    append = TRUE
  )
  print(paste(index, "/", n_sitemaps, "-", sitemap_url, "saved."))
} # end of for-loop