-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path01_scraping_RT.R
More file actions
102 lines (64 loc) · 3.18 KB
/
01_scraping_RT.R
File metadata and controls
102 lines (64 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# 01 Scraping RT: collect article links from de.rt.com.
# Relies on scrapeRT() (and the packages it loads) from 00_RT_Scraper.R.
source("00_RT_Scraper.R")
### apply function ####
# Set scraping parameters.
# Start of the scraping window (format DD-MM-YYYY; here: 01 Jan 2021).
startdate <- "01-01-2021"
# End of the scraping window (here: one year later, 01 Jan 2022).
period_end <- "01-01-2022"
# Keyword and interval left at their defaults:
# links_RT_search <- scrapeRT(startdate, period_end, interval_length = 2)
links_RT_search_finer <- scrapeRT(startdate, period_end, interval_length = 1)
# Per-keyword searches for the 2021 German chancellor candidates.
# sleepmin/sleepmax presumably bound the random pause between requests —
# confirm against scrapeRT() in 00_RT_Scraper.R.
links_RT_search_Baerbock_new <- scrapeRT(startdate, period_end, keyword = "Baerbock", interval_length = 25, sleepmin = 5, sleepmax = 7)
# got 419/422 (expected 408 in 2021)
links_RT_search_Laschet_new <- scrapeRT(startdate, period_end, keyword = "Laschet", interval_length = 25, sleepmin = 5, sleepmax = 7)
# got 390/413 (expected 409 in 2021)
links_RT_search_Scholz_new <- scrapeRT(startdate, period_end, keyword = "Scholz", interval_length = 25, sleepmin = 5, sleepmax = 7)
# got 439/471 (expected 447 in 2021)
# Finest granularity (interval_length = 0); results are also written to the
# given folder. NOTE(review): exact meaning of interval_length = 0 is defined
# in 00_RT_Scraper.R — verify there.
links_RT_search_finest_Baerbock <- scrapeRT(startdate, period_end, keyword = "Baerbock", interval_length = 0, sleepmin = 2, folder = "RT_searches_candidates_finest")
links_RT_search_finest_Laschet <- scrapeRT(startdate, period_end, keyword = "Laschet", interval_length = 0, sleepmin = 2, folder = "RT_searches_candidates_finest")
links_RT_search_finest_Scholz <- scrapeRT(startdate, period_end, keyword = "Scholz", interval_length = 0, sleepmin = 2, folder = "RT_searches_candidates_finest")
# RT_searches_candidates_finest <- map_dfr(.x = candidate, cbind(scrapeRT(startdate, period_end, .x, 0, folder = "RT_searches_candidates_finest"), candidate = .x))
# check on 12-03/12-04
### ToDo: FIND OUT WHICH ARE MISSING!!! ####
# Plan: search all saved search-result files for the "Weiter" (next page)
# button to spot truncated result pages, fetch the ~68 missing articles,
# then convert everything to a data frame.
# Build the full article URLs from all candidate search results ------------
# Search results are site-relative paths; prefix them with the site root.
base_url_rt <- "https://de.rt.com"

# Original code piped c(...) into tibble(), producing a column named `.`,
# and then pasted base_url_rt onto the magrittr dot (the whole tibble)
# instead of the link column. Build a named column and paste onto it.
candidate_links <- tibble(
  path = c(
    links_RT_search_Baerbock_new,
    links_RT_search_Laschet_new,
    links_RT_search_Scholz_new
  )
) %>%
  distinct() %>%
  mutate(links = paste0(base_url_rt, path))

# write_xlsx() without a path writes to a session tempfile that is lost —
# give an explicit output file.
write_xlsx(candidate_links, "candidate_links.xlsx")
# Note: bind_rows(tibble(), tibble()) fills non-shared columns with NA for
# all rows of the other frame... could be useful for merging candidates!
# TODO: look for remaining "Weiter" (next page) buttons in saved results.
# TODO: filter results down to the target year (2021).
# Snapshot the whole workspace so the scraped link lists survive a restart.
save.image("all_data.RData")
# Alternative: persist just the link list instead of the full workspace:
# library(rlist)
# rlist::list.save(links_RT_search, 'Articles/rt_links.rds')
# identical(links_RT_search, 'Articles/rt_links.rds')
## get page source ####
# Earlier approach (kept for reference): build one data frame of page
# sources via map_dfr, join the link metadata, and export as csv/rds.
# data_RT <- map_dfr(links_RT$links_full, get_pages_RT)
# remDr$close()
# data_RT %<>% left_join(., links_RT, by = c("link" = "links_full"))
# # save dataset
# write_csv(data_RT, 'Articles/data_rt.csv')
# rio::export(data_RT, "Articles/data_rt.rds")
#
# # import again:
# data_RT1 <- rio::import("articles/data_rt.rds")
# data_RT2 <- rio::import("articles/data_rt.csv") # seems preferable, but larger
# Current approach instead: loop below saves each page as its own txt file.
## Download every candidate article page as raw HTML ------------------------
## Requires a running RSelenium remote driver bound to `remDr`
## (start the server before running this block).
{
  # Pause bounds in seconds between requests. These were previously
  # undefined at script scope (they only appeared as scrapeRT() arguments),
  # so the loop errored at runtime.
  sleepmin <- 2
  sleepmax <- 4
  # Bound the retry loop so one dead/redirecting link cannot hang forever.
  max_tries <- 5

  for (i in seq_along(candidate_links$links)) {
    target <- candidate_links$links[i]
    tries <- 0
    # RSelenium's getCurrentUrl() returns a one-element list, hence [[1]];
    # comparing the bare list to a string is not a valid equality check.
    while (remDr$getCurrentUrl()[[1]] != target && tries < max_tries) {
      tries <- tries + 1
      remDr$navigate(target)
      # Random delay to mimic human browsing and avoid rate limiting.
      randsleep <- sample(seq(sleepmin, sleepmax, by = 0.001), 1)
      Sys.sleep(randsleep)
      # Save the rendered page source as one txt file per article.
      page <- remDr$getPageSource()[[1]]
      writeLines(page, paste0("articles/RThtml/RTarticle", i, ".txt"), useBytes = TRUE)
    }
  }
}