Merged
36 commits
25f0193  added support for netzschleuder (schochastics, Apr 5, 2025)
67ec488  chore: Auto-update from GitHub Actions (schochastics, Apr 5, 2025)
5c5895d  better source/target handling (schochastics, Apr 5, 2025)
233b9e0  added token for restricted networks (schochastics, Apr 6, 2025)
e521a71  added minty for type checking (schochastics, Apr 7, 2025)
4b07fc2  Apply suggestions from code review (schochastics, Apr 10, 2025)
33ae5ed  chore: Auto-update from GitHub Actions (schochastics, Apr 10, 2025)
db8b0db  changes of review (schochastics, Apr 10, 2025)
51046b9  added missing quotes (schochastics, Apr 10, 2025)
d8b1f77  added proper httr2 requests with custom ratelimits (schochastics, Apr 10, 2025)
bc6a34a  added slash notation and directed/bipartite checking via metadata (schochastics, Apr 10, 2025)
d7a0b06  suppress minty warnings (schochastics, Apr 10, 2025)
ca89264  docs update (schochastics, Apr 10, 2025)
a7c2ef1  correct collection url (schochastics, Apr 10, 2025)
4b473a1  new netzschleuder namespace (schochastics, Apr 14, 2025)
e60885d  removed call argument (schochastics, Apr 14, 2025)
6e45a5d  removed rlang (schochastics, Apr 14, 2025)
dd495a4  added token to ns_graph and error handling for gprops file (schochastics, Apr 14, 2025)
7a8004c  docs (schochastics, Apr 14, 2025)
e29f90f  rate limit works (schochastics, Apr 14, 2025)
bcd1b1a  put base_req into onload (schochastics, Apr 15, 2025)
4431e4b  added more comments for clarity (schochastics, Apr 16, 2025)
a52dec8  added David as ctb and a sentence about netzschleuder (schochastics, Apr 16, 2025)
10b25c4  chore: Auto-update from GitHub Actions (schochastics, Apr 16, 2025)
4485280  review changes (schochastics, Apr 17, 2025)
02fb9db  chore: Auto-update from GitHub Actions (schochastics, Apr 17, 2025)
0c717fc  moved minty check to ns_df (schochastics, Apr 17, 2025)
1974d6a  better name error handling (schochastics, Apr 17, 2025)
29412cf  moved all pkgs to suggest and removed onLoad (schochastics, Apr 18, 2025)
0ad558d  earlier renaming and better regex (schochastics, Apr 18, 2025)
3cd0496  double parenthesis (schochastics, Apr 18, 2025)
eb12a3a  allow metadata as input to ns_graph (schochastics, Apr 23, 2025)
c9ef882  chore: Auto-update from GitHub Actions (schochastics, Apr 23, 2025)
94bab69  implement file size limit (schochastics, Apr 23, 2025)
18f87d1  docs (schochastics, Apr 23, 2025)
2a6d9f1  added meta data print (schochastics, Apr 23, 2025)
Empty file removed .github/dep-suggests-matrix.json
1 change: 0 additions & 1 deletion .github/versions-matrix.json

This file was deleted.

15 changes: 11 additions & 4 deletions DESCRIPTION
@@ -9,21 +9,28 @@ Authors@R: c(
     person("Kirill", "Müller", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-1416-3412")),
     person("Maëlle", "Salmon", role = "ctb"),
+    person("David","Schoch", role = "aut",
+           comment = c(ORCID="0000-0003-2952-4812")),
     person("Chan Zuckerberg Initiative", role = "fnd")
   )
 Description: A small collection of various network data sets, to use with
     the 'igraph' package: the Enron email network, various food webs,
     interactions in the immunoglobulin protein, the karate club network,
     Koenigsberg's bridges, visuotactile brain areas of the macaque monkey,
-    UK faculty friendship network, domestic US flights network, etc.
+    UK faculty friendship network, domestic US flights network, etc. Also provides
+    access to the API of <https://networks.skewed.de/>.
 License: CC BY-SA 4.0 + file LICENSE
 URL: http://igraph.org
 BugReports: https://github.com/igraph/igraphdata/issues
 Depends:
-    R (>= 2.10)
+    R (>= 4.0)
 Imports:
-    igraph (>= 1.5.0)
-Suggests:
+    igraph (>= 2.0.0),
+    rlang
+Suggests:
+    cli,
+    minty,
+    httr2,
     testthat (>= 3.0.0)
 Encoding: UTF-8
 LazyData: true
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -1,6 +1,10 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(print,ns_meta)
 export(lesmis_gml)
 export(lesmis_graphml)
 export(lesmis_pajek)
+export(ns_df)
+export(ns_graph)
+export(ns_metadata)
 importFrom(igraph,vcount)
264 changes: 264 additions & 0 deletions R/netzschleuder.R
@@ -0,0 +1,264 @@
#' @keywords internal
.pkg_env <- new.env(parent = emptyenv())

get_base_req <- function() {
  if (!exists("base_req", envir = .pkg_env, inherits = FALSE)) {
    base_req <- httr2::request("https://networks.skewed.de") |>
      # token bucket: bursts of up to 20 requests, refilled over 60 seconds
      httr2::req_throttle(capacity = 20, fill_time_s = 60) |>
      httr2::req_user_agent(
        "R package igraphdata (github.com/igraph/igraphdata)"
      )
    .pkg_env$base_req <- base_req
  }
  .pkg_env$base_req
}

make_request <- function(path, token = NULL, method = "GET") {
  rlang::check_installed("httr2")
  req <- httr2::req_url_path(get_base_req(), path)
  req <- httr2::req_method(req, method)
  if (method == "HEAD") {
    # ask for an uncompressed response so content-length reflects the raw size
    req <- httr2::req_headers(req, `Accept-Encoding` = "identity")
  }
  if (!is.null(token)) {
    req <- httr2::req_headers(req, `WWW-Authenticate` = token)
  }

  resp <- httr2::req_perform(req)

  if (httr2::resp_status(resp) != 200) {
    stop("Failed to download file. Status: ", httr2::resp_status(resp))
  }

  resp
}

resolve_name <- function(x) {
  # remove a trailing /
  x <- sub("/$", "", x)
  # collapse a doubled slash
  x <- sub("//", "/", x)

[Review thread]

Contributor:
Why is this important?

Contributor Author:
Because @szhorvat tried to break the name parameter; these are just some cleanup steps for common user input errors that could occur.

if (grepl("/", x)) {
x_split <- strsplit(x, "/", fixed = TRUE)[[1]]
if (length(x_split) > 2) {
cli::cli_abort(
"{.arg name} has {length(x_split)} components instead of 2."
)
}
return(x_split)
} else {
c(x, x)

[Review thread]

Contributor:
This looks interesting to me. I wonder if we can make the intent here clearer.

Contributor Author:
This mechanism is needed for the API. If a collection has only one network, the URL is https://networks.skewed.de/net/7th_graders/files/7th_graders.csv.zip, so the name is duplicated. For a network in a collection, the zip file has a different name.

Contributor:
Got it. Should we then return a named list instead, to clarify the intent?

Contributor Author:
Sounds good.

  }
}
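
For illustration (editor's sketch, not part of the diff), the cleanup at the top of resolve_name() handles inputs such as:

resolve_name("copenhagen/calls/")  # trailing slash  -> c("copenhagen", "calls")
resolve_name("copenhagen//calls")  # doubled slash   -> c("copenhagen", "calls")
resolve_name("7th_graders")        # single network  -> c("7th_graders", "7th_graders")

And a minimal sketch of the named-list variant discussed in the thread above (hypothetical; the merged code returns a plain character vector):

resolve_name_list <- function(x) {
  parts <- resolve_name(x)
  # name the two components to make the collection/network split explicit
  list(collection = parts[[1]], network = parts[[2]])
}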

download_file <- function(zip_url, token = NULL, file, size_limit) {
  # HEAD request first: check the file size before committing to a download
  resp <- make_request(zip_url, token, method = "HEAD")
  byte_size <- as.numeric(httr2::resp_headers(resp)[["content-length"]])
  gb_size <- round(byte_size / 1024^3, 4)
  if (gb_size > size_limit) {
    cli::cli_abort(c(
      "{zip_url} has a size of {gb_size} GB and exceeds the size limit of {size_limit} GB.",
      "i" = "To download the file, set {.arg size_limit} to a value greater than {gb_size}."
    ))
  }
  resp <- make_request(zip_url, token, method = "GET")
  writeBin(httr2::resp_body_raw(resp), file)

[Review thread]

Contributor:
Are the files always small enough to fit into memory?

Contributor Author:
There are some networks that go up to 10 GB. Should we implement something that stops users from downloading files over a certain size?

Member:
IMO it's not a bad idea. Many of our users are inexperienced, and the risk of accidentally trying to download something large is too high. But is it possible to determine the size in advance? Or would you use the edge and vertex count as a proxy for the size?

Do I understand correctly that httr2 automatically caches requests, preventing the re-downloading of large data?

If you do this, I'd suggest having an option to override and download anyway.

Member:
Something to consider is whether R has limits on the number of elements in data frames, or the number of rows it can read from CSVs. R still has this silly limitation of 32-bit integers, and while it can index with doubles now (weird!), not every object supports such indexing. E.g. matrices don't.

Contributor:
This is about raw data loaded into memory, which is what it looks like this code is doing. Can we let curl handle the downloading in this case, to write directly to disk in chunks? There might be a way to show a progress bar too, and the user can cancel if it's too large.

Contributor Author:
I will dig into httr2 a bit on this. I would like to keep using it here too for the rate limiting; I am sure there is something that can help us here. Otherwise my idea was to create an option that specifies a hard upper limit on the number of edges/nodes permitted.

Contributor:
Is the rate limiting important for downloads, or just for API calls? We can probably get the body in chunks and write piecemeal if really necessary.

Contributor Author:
The rate limiting is most important for files, because that is the heaviest load on the server.

  invisible(NULL)
}
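
Editor's note: a minimal sketch of the stream-to-disk alternative discussed in the thread above, assuming httr2's path argument to req_perform() (which writes the body to disk instead of holding it in memory) and req_progress() for a progress bar; this is not the merged implementation:

download_file_streaming <- function(zip_url, token = NULL, file) {
  req <- httr2::req_url_path(get_base_req(), zip_url)
  req <- httr2::req_progress(req)
  if (!is.null(token)) {
    req <- httr2::req_headers(req, `WWW-Authenticate` = token)
  }
  # body is streamed to `file` rather than loaded into RAM
  httr2::req_perform(req, path = file)
  invisible(file)
}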

#' Download and Convert Graph Data from Netzschleuder
#'
#' These functions provide tools to interact with the Netzschleuder network dataset archive.
#' Netzschleuder (<https://networks.skewed.de/>) is a large online repository for network datasets,
#' aimed at aiding scientific research.
#' \describe{
#'   \item{`ns_metadata()`}{retrieves metadata about a network or network collection.}
#'   \item{`ns_df()`}{downloads the graph data as data frames (nodes, edges, and graph properties).}
#'   \item{`ns_graph()`}{creates an `igraph` object directly from Netzschleuder.}
#' }
#'
#' @param name Character. The name of the network dataset. To get a network from a collection,
#'   use the format `<collection_name>/<network_name>`.
#' @param collection Logical. If `TRUE`, get the metadata of a whole collection of networks.
#' @param token Character. Some networks have restricted access and require a token.
#'   See <https://networks.skewed.de/restricted>.
#' @param size_limit Numeric. Maximum allowed file size in GB. Larger files will not be downloaded.
#'
#' @return
#' \describe{
#'   \item{`ns_metadata()`}{A list containing metadata for the dataset.}
#'   \item{`ns_df()`}{A named list with `nodes`, `edges`, `gprops`, and `meta`.}
#'   \item{`ns_graph()`}{An `igraph` object.}
#' }
#' @examples
#' \dontrun{
#' # Get metadata
#' ns_metadata("copenhagen/calls")
#'
#' # Download network as data frames
#' graph_data <- ns_df("copenhagen/calls")
#'
#' # Create an igraph object
#' g <- ns_graph("copenhagen/calls")
#' }
#'
#' @seealso <https://networks.skewed.de/>
#' @rdname netzschleuder
#' @export
ns_metadata <- function(name, collection = FALSE) {
  rlang::check_installed("cli")
  # net_ident holds c(collection_name, network_name); the two are identical
  # for single-network collections
  net_ident <- resolve_name(name)
  path <- sprintf("api/net/%s", net_ident[[1]])
  collection_url <- sprintf("https://networks.skewed.de/net/%s", net_ident[[1]])
  resp <- make_request(path)
  raw <- httr2::resp_body_json(resp)
  class(raw) <- c("ns_meta", class(raw))
  raw[["is_collection"]] <- collection
  raw[["collection_name"]] <- net_ident[[1]]
  if (collection) {
    return(raw)
  } else if (
    net_ident[[1]] == net_ident[[2]] &&
      length(unlist(raw$nets)) > 1 &&
      !collection
  ) {
    cli::cli_abort(
      c(
        "{net_ident[[1]]} is a collection, and downloading a whole collection is not permitted.",
        "i" = "see {.url {collection_url}}"
      )
    )
  } else if (net_ident[[1]] == net_ident[[2]]) {
    return(raw)
  } else {
    idx <- which(unlist(raw[["nets"]]) == net_ident[[2]])
    if (length(idx) == 0) {
      cli::cli_abort(
        c(
          "{net_ident[[2]]} is not part of the collection {net_ident[[1]]}.",
          "i" = "see {.url {collection_url}}"
        )
      )
    }
    # keep only the metadata of the requested network
    raw[["analyses"]] <- raw[["analyses"]][[net_ident[[2]]]]
    raw[["nets"]] <- raw[["nets"]][idx]
    raw
  }
}

#' @rdname netzschleuder
#' @export
ns_df <- function(name, token = NULL, size_limit = 1) {
  rlang::check_installed("minty")
  if (is.character(name)) {
    meta <- ns_metadata(name, collection = FALSE)
    net_ident <- resolve_name(name)
  } else if (inherits(name, "ns_meta")) {
    if (name[["is_collection"]]) {
      cli::cli_abort(c(
        "{.arg name} contains the metadata of a whole collection, and downloading a whole collection is not permitted.",
        "i" = "set collection = FALSE in `ns_metadata()`"
      ))
    }
    meta <- name
    net_ident <- c(meta[["collection_name"]], meta[["nets"]])
  } else {
    cli::cli_abort("{.arg name} must be a string or an `ns_meta` object.")
  }

  zip_url <- sprintf(
    "net/%s/files/%s.csv.zip",
    net_ident[[1]],
    net_ident[[2]]
  )

  temp <- tempfile(fileext = ".zip")

[Review thread]

Contributor:
There might be a withr::local_*() function that takes care of deleting the files when done.
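
Editor's note: a sketch of the withr idea from the comment above; withr::local_tempfile() registers cleanup automatically when the calling function exits (assumes withr as an added dependency, which the merged code does not use):

  temp <- withr::local_tempfile(fileext = ".zip")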

  download_file(zip_url, token = token, file = temp, size_limit = size_limit)

  zip_contents <- utils::unzip(temp, list = TRUE)

  edge_file_name <- grep("edge", zip_contents$Name, value = TRUE)
  node_file_name <- grep("node", zip_contents$Name, value = TRUE)
  gprops_file_name <- grep("gprops", zip_contents$Name, value = TRUE)

  edges_df_raw <- utils::read.csv(unz(temp, edge_file_name))
  # minty warns if there are no character columns to convert; suppress that
  edges_df <- suppressWarnings(minty::type_convert(edges_df_raw))
  source_loc <- grep("source", names(edges_df))
  target_loc <- grep("target", names(edges_df))

[Review thread on lines +187 to +188]

Contributor:
What if more than one column has "source" or "target" in them? Is there a more robust way to find out?

Member:
Wouldn't it be better to stick to numerical indices here? I'm sure @count0 can notify us if he decides to update the format. There could in principle be an edge attribute called "source".

Contributor Author:
My random sampling showed that the first two columns are always source and target, so we could make this more specific, and then add an error message along the lines of "The file does not seem to follow the standard naming convention. Download the file manually."
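
Editor's note: a minimal sketch of the positional-index alternative discussed above, under the author's assumption that the first two columns are always source and target (not the merged approach):

  # rename by position rather than by pattern matching
  names(edges_df)[1:2] <- c("from", "to")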

  names(edges_df)[c(source_loc, target_loc)] <- c("from", "to")

  # netzschleuder uses 0-indexing, igraph uses 1-indexing
  edges_df[["from"]] <- edges_df[["from"]] + 1L
  edges_df[["to"]] <- edges_df[["to"]] + 1L

  nodes_df_raw <- utils::read.csv(unz(temp, node_file_name))
  # suppress warning if no character columns are found
  nodes_df <- suppressWarnings(minty::type_convert(nodes_df_raw))
  names(nodes_df)[1] <- "id"

[Review thread]

Contributor:
I'm leaning towards implementing lintr just for this, here and elsewhere.

Suggested change:
-names(nodes_df)[1] <- "id"
+names(nodes_df)[[1]] <- "id"


  # netzschleuder uses 0-indexing, igraph uses 1-indexing
  nodes_df[["id"]] <- nodes_df[["id"]] + 1L
  if ("X_pos" %in% names(nodes_df)) {
    # X_pos is a string holding the two layout coordinates; extract the floats
    regex <- gregexpr("-?\\d+\\.\\d+", nodes_df[["X_pos"]])
    matches <- regmatches(nodes_df[["X_pos"]], regex)

    mat <- vapply(matches, as.numeric, numeric(2))

    nodes_df[["X_pos"]] <- NULL
    nodes_df[["x"]] <- mat[1, ]
    nodes_df[["y"]] <- mat[2, ]
  }

  gprops_df <- readLines(unz(temp, gprops_file_name))

  on.exit(unlink(temp))

  list(nodes = nodes_df, edges = edges_df, gprops = gprops_df, meta = meta)
}

#' @rdname netzschleuder
#' @export
ns_graph <- function(name, token = NULL, size_limit = 1) {
  graph_data <- ns_df(name, token = token, size_limit = size_limit)
  directed <- graph_data$meta[["analyses"]][["is_directed"]]
  bipartite <- graph_data$meta[["analyses"]][["is_bipartite"]]

  g <- igraph::graph_from_data_frame(
    graph_data$edges,
    directed = directed,
    vertices = graph_data$nodes
  )

  if (bipartite) {
    # vertices that appear as edge sources form one side of the bipartition
    types <- rep(FALSE, igraph::vcount(g))
    types[graph_data$nodes$id %in% graph_data$edges[[1]]] <- TRUE
    g <- igraph::set_vertex_attr(g, "type", value = types)
  }

  g
}

#' @export
print.ns_meta <- function(x, ...) {
  if (x[["is_collection"]]) {
    cat("Netzschleuder Metadata for the collection:", x[["collection_name"]])
    cat("\n")
    cat("Number of networks:", length(x[["nets"]]))
    cat("\n")
  } else {
    cat(
      "Netzschleuder Metadata for: ",
      x[["collection_name"]],
      "/",
      x[["nets"]][[1]],
      sep = ""
    )
    cat("\n")
    cat("Number of vertices:", x$analyses$num_vertices)
    cat("\n")
    cat("Number of edges:", x$analyses$num_edges)
    cat("\n")
    cat("Directed:", x$analyses$is_directed)
    cat("\n")
    cat("Bipartite:", x$analyses$is_bipartite)
    cat("\n")
  }
  invisible(x)
}
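
Editor's note: the print method above produces output of this shape (values are placeholders, not real data):

  # Netzschleuder Metadata for: copenhagen/calls
  # Number of vertices: <num_vertices>
  # Number of edges: <num_edges>
  # Directed: FALSE
  # Bipartite: FALSE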
3 changes: 2 additions & 1 deletion man/igraphdata-package.Rd

Some generated files are not rendered by default.

58 changes: 58 additions & 0 deletions man/netzschleuder.Rd

Some generated files are not rendered by default.