Skip to content

Commit b075930

Browse files
SarahWagnerarunge
andauthored
Feature/9 source lifecycle (#18)
* #9 source lifecycle * #9 version bump * #9 fix * Update R/01_sync_sources.R Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de> * Update R/01_sync_sources.R Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de> * Update R/01_sync_sources.R Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de> * Update R/01_sync_sources.R Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de> * Update R/01_sync_sources.R Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de> * Update R/01_sync_sources.R Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de> * Update R/01_sync_sources.R Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de> * Update R/00_namespace.R Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de> * #9 test for collections * #9 test required collections are not empty * #9 don't allow updates on same id * #9 fix * r cmd check * Update R/01_sync_sources.R Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de> * #18 rm repeated code --------- Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de>
1 parent db479cb commit b075930

File tree

12 files changed

+173
-9
lines changed

12 files changed

+173
-9
lines changed

.Rbuildignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ Jenkinsfile
66
.Renviron_template
77
.aws
88
^License.md$
9+
^\.github$

DESCRIPTION

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: rssData
22
Type: Package
33
Title: Saving RSS fetched text in a central database
4-
Version: 25.10.0
4+
Version: 25.11.0
55
Authors@R: person("Sarah", "Wagner", email = "sarah.wagner@inwt-statistics.de", role = c("aut", "cre"))
66
Maintainer: INWT Statistics <sarah.wagner@inwt-statistics.de>
77
Description: The rssData package is designed to manage and store daily RSS feed entries into a
@@ -17,13 +17,14 @@ Imports:
1717
stringr,
1818
dplyr,
1919
rlang,
20-
mongolite
20+
mongolite,
21+
purrr,
22+
glue
2123
Suggests:
2224
lintr,
2325
testthat (>= 3.0.0),
24-
purrr,
2526
tibble
2627
Encoding: UTF-8
2728
LazyData: true
28-
RoxygenNote: 7.3.2
29+
RoxygenNote: 7.3.3
2930
Config/testthat/edition: 3

NAMESPACE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,23 @@ export(read_text_via_url)
1616
export(safe_GET)
1717
export(safe_HEAD)
1818
export(safely)
19+
export(sync_sources_with_mongo)
1920
export(update_database)
2021
export(url_exists)
2122
import(dplyr)
2223
importFrom(futile.logger,flog.debug)
24+
importFrom(futile.logger,flog.error)
2325
importFrom(futile.logger,flog.info)
2426
importFrom(futile.logger,flog.warn)
27+
importFrom(glue,glue)
2528
importFrom(httr,GET)
2629
importFrom(httr,HEAD)
2730
importFrom(httr,content)
2831
importFrom(httr,timeout)
2932
importFrom(mongolite,mongo)
33+
importFrom(purrr,map_df)
3034
importFrom(rlang,":=")
35+
importFrom(rlang,.data)
3136
importFrom(rvest,html_nodes)
3237
importFrom(rvest,html_text)
3338
importFrom(rvest,read_html)

NEWS.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
# rssData 25.11.0
2+
3+
## New Features
4+
- **source lifecycle management**: the source meta data is managed via a document in MongoDB (soures collection)
5+
- any manipulation of sources will be documented there (adding, changing or deleting)
6+
- the goal is to be able to track which sources are or were active in which exact time periods
7+
8+
## Updates
9+
- **config**: addition of two new sources
10+
111
# rssData 25.10.0
212

313
## Updates

R/00_namespace.R

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
#' @importFrom futile.logger flog.debug flog.info flog.warn
1+
#' @importFrom futile.logger flog.debug flog.info flog.warn flog.error
22
#' @importFrom yaml yaml.load_file
33
#' @importFrom httr GET HEAD content timeout
44
#' @importFrom rvest read_html html_nodes html_text
55
#' @importFrom stringr str_squish regex str_detect
66
#' @importFrom tidyRSS tidyfeed
77
#' @importFrom stats setNames
88
#' @importFrom mongolite mongo
9+
#' @importFrom purrr map_df
10+
#' @importFrom glue glue
911
#' @import dplyr
10-
#' @importFrom rlang :=
12+
#' @importFrom rlang := .data
1113
NULL

R/01_sync_sources.R

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#' Sync config sources with MongoDB
2+
#'
3+
#' Compares the sources listed in the config object
4+
#' with the MongoDB "sources" collection.
5+
#' Adds new sources, marks removed ones, and if a source with an existing
6+
#' ID has changed (name or URL), it raises a warning, marks the old entry
7+
#' as removed, and inserts the updated version.
8+
#'
9+
#' @param config The loaded configuration list (as from config.yaml)
10+
#' @export
11+
sync_sources_with_mongo <- function(config) {
12+
13+
con <- credentials(collection="sources")
14+
15+
config_sources <- map_df(config$sources, function(src) {
16+
tibble(
17+
source_id = as.character(src$id),
18+
source_name = as.character(src$name),
19+
source_url = as.character(src$url)
20+
)
21+
})
22+
23+
# read current MongoDB sources
24+
mongo_sources <- tryCatch({
25+
con$find('{}', fields = '{"_id":0}')
26+
}, error = function(e) {
27+
stop("Failed to fetch data from MongoDB 'sources' collection: ", e$message, call. = FALSE)
28+
})
29+
30+
now <- Sys.time()
31+
32+
# detect updated sources (same id, changed name or url)
33+
updated_sources <- inner_join(config_sources, mongo_sources, by = "source_id") %>%
34+
filter(
35+
.data$source_name.x != .data$source_name.y |
36+
.data$source_url.x != .data$source_url.y
37+
)
38+
39+
40+
if (nrow(updated_sources) > 0) {
41+
42+
msg <- glue(
43+
"Source update detected for {nrow(updated_sources)} source(s).\n",
44+
"The following sources have the same source_id but different name or URL:\n\n",
45+
paste0(
46+
"- source_id: ", updated_sources$source_id, "\n",
47+
" old name: ", updated_sources$source_name.y, "\n",
48+
" new name: ", updated_sources$source_name.x, "\n",
49+
" old url: ", updated_sources$source_url.y, "\n",
50+
" new url: ", updated_sources$source_url.x,
51+
collapse = "\n\n"
52+
),
53+
"\n\nPlease update your config file. A changed source should use a NEW source_id."
54+
)
55+
56+
flog.error(msg)
57+
stop(msg, call. = FALSE)
58+
} else {
59+
flog.debug("No updated sources found.")
60+
}
61+
62+
# detect new sources
63+
new_sources <- anti_join(config_sources, mongo_sources, by = "source_id")
64+
65+
if (nrow(new_sources) > 0) {
66+
new_sources <- mutate(
67+
new_sources,
68+
date_added = now,
69+
date_removed = as.POSIXct(NA, tz = "UTC")
70+
)
71+
con$insert(new_sources)
72+
flog.info(
73+
glue("Inserted {nrow(new_sources)} new source(s): {paste(new_sources$source_name, collapse = ', ')}")
74+
)
75+
} else {
76+
flog.debug("No new sources found.")
77+
}
78+
79+
# detect removed sources
80+
removed_sources <- anti_join(mongo_sources, config_sources, by = "source_id") %>%
81+
filter(is.na(.data$date_removed))
82+
83+
if (nrow(removed_sources) > 0) {
84+
for (sid in removed_sources$source_id) {
85+
con$update(
86+
query = paste0('{"source_id": "', sid, '"}'),
87+
update = paste0('{"$set": {"date_removed": {"$date": "', format(now, "%Y-%m-%dT%H:%M:%S%z"), '"}}}')
88+
)
89+
}
90+
flog.info(
91+
glue("Marked {nrow(removed_sources)} source(s) as removed: {paste(removed_sources$source_id, collapse = ', ')}")
92+
)
93+
} else {
94+
flog.debug("No removed sources found.")
95+
}
96+
97+
flog.info("MongoDB sync complete.")
98+
}

R/04_db_creds.R

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@
1313
#' \item \code{DB_PORT}: The port number of the MongoDB server.
1414
#' }
1515
#'
16+
#' @param collection character, name of MongoDB collection
1617
#' @return A mongo connection object that can be used to interact with the MongoDB database.
1718
#'
1819
#' @export
19-
credentials <- function() {
20+
credentials <- function(collection = Sys.getenv("DB_COLLECTION")) {
2021
URI <- sprintf(
2122
"mongodb://%s:%s@%s:%s/%s",
2223
Sys.getenv("DB_USER"),
@@ -26,7 +27,7 @@ credentials <- function() {
2627
Sys.getenv("DB_NAME")
2728
)
2829

29-
return(mongo(collection = Sys.getenv("DB_COLLECTION"), db = Sys.getenv("DB_NAME"), url = URI))
30+
return(mongo(collection = collection, db = Sys.getenv("DB_NAME"), url = URI))
3031
}
3132

3233

R/05_main.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#' @export
66
main <- function(config_file = "config.yaml") {
77
config <- load_yaml_config(config_file)
8+
sync_sources_with_mongo(config)
89
# read entries from each rss source
910
feeds <- read_rss_sources(config)
1011
# read text from each entry for each rss source

config.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,20 @@ sources:
9595
category: "International"
9696
id: 12
9797

98+
Source_13:
99+
url: "https://www.japantimes.co.jp/feed/"
100+
name: "the japan times"
101+
description: "World News"
102+
category: "International"
103+
id: 13
104+
105+
Source_14:
106+
url: "https://mg.co.za/feed/"
107+
name: "Mail & Guardian"
108+
description: "World News"
109+
category: "International"
110+
id: 14
111+
98112
# Mapping for all columns of the target table on the database
99113
# Add columns as needed
100114
# Only change the values (right from the colon), not the keys!

man/credentials.Rd

Lines changed: 4 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)