-
Notifications
You must be signed in to change notification settings - Fork 18
feat: New ns_graph() and lower-level ns_df() and ns_metadata() to download from [netzschleuder](https://networks.skewed.de)
#23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
25f0193
67ec488
5c5895d
233b9e0
e521a71
4b07fc2
33ae5ed
db8b0db
51046b9
d8b1f77
bc6a34a
d7a0b06
ca89264
a7c2ef1
4b473a1
e60885d
6e45a5d
dd495a4
7a8004c
e29f90f
bcd1b1a
4431e4b
a52dec8
10b25c4
4485280
02fb9db
0c717fc
1974d6a
29412cf
0ad558d
3cd0496
eb12a3a
c9ef882
94bab69
18f87d1
2a6d9f1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,21 +9,28 @@ Authors@R: c( | |
| person("Kirill", "Müller", , "[email protected]", role = c("aut", "cre"), | ||
| comment = c(ORCID = "0000-0002-1416-3412")), | ||
| person("Maëlle", "Salmon", role = "ctb"), | ||
| person("David","Schoch", role = "aut", | ||
| comment = c(ORCID="0000-0003-2952-4812")), | ||
| person("Chan Zuckerberg Initiative", role = "fnd") | ||
| ) | ||
| Description: A small collection of various network data sets, to use with | ||
| the 'igraph' package: the Enron email network, various food webs, | ||
| interactions in the immunoglobulin protein, the karate club network, | ||
| Koenigsberg's bridges, visuotactile brain areas of the macaque monkey, | ||
| UK faculty friendship network, domestic US flights network, etc. | ||
| UK faculty friendship network, domestic US flights network, etc. Also provides | ||
| access to the API of <https://networks.skewed.de/>. | ||
| License: CC BY-SA 4.0 + file LICENSE | ||
| URL: http://igraph.org | ||
| BugReports: https://github.com/igraph/igraphdata/issues | ||
| Depends: | ||
| R (>= 2.10) | ||
| R (>= 4.0) | ||
| Imports: | ||
| igraph (>= 1.5.0) | ||
| Suggests: | ||
| igraph (>= 2.0.0), | ||
szhorvat marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| rlang | ||
| Suggests: | ||
| cli, | ||
| minty, | ||
| httr2, | ||
| testthat (>= 3.0.0) | ||
| Encoding: UTF-8 | ||
| LazyData: true | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,10 @@ | ||
| # Generated by roxygen2: do not edit by hand | ||
|
|
||
| S3method(print,ns_meta) | ||
| export(lesmis_gml) | ||
| export(lesmis_graphml) | ||
| export(lesmis_pajek) | ||
| export(ns_df) | ||
| export(ns_graph) | ||
| export(ns_metadata) | ||
| importFrom(igraph,vcount) |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,264 @@ | ||||||
#' @keywords internal
.pkg_env <- new.env(parent = emptyenv())

# Lazily build and memoise the base request object for the netzschleuder
# API. The request is throttled to at most 20 calls per minute and carries
# an identifying user agent, as a courtesy to the server operators.
get_base_req <- function() {
  if (exists("base_req", envir = .pkg_env, inherits = FALSE)) {
    return(.pkg_env$base_req)
  }
  req <- httr2::request("https://networks.skewed.de")
  req <- httr2::req_throttle(req, capacity = 20, fill_time_s = 60)
  req <- httr2::req_user_agent(
    req,
    "R package igraphdata (github.com/igraph/igraphdata)"
  )
  .pkg_env$base_req <- req
  .pkg_env$base_req
}
|
|
||||||
# Perform a single request against the netzschleuder API.
#
# @param path Character. URL path relative to the API root.
# @param token Character or NULL. Access token for restricted datasets,
#   sent in the `WWW-Authenticate` header (the scheme the server expects).
# @param method Character. HTTP method, e.g. "GET" or "HEAD".
# @return The `httr2` response object.
make_request <- function(path, token = NULL, method = "GET") {
  rlang::check_installed("httr2")

  req <- httr2::req_method(
    httr2::req_url_path(get_base_req(), path),
    method
  )

  # HEAD probes are used to read Content-Length, so request an
  # uncompressed transfer to make that header meaningful.
  if (method == "HEAD") {
    req <- httr2::req_headers(req, `Accept-Encoding` = "identity")
  }
  if (!is.null(token)) {
    req <- httr2::req_headers(req, `WWW-Authenticate` = token)
  }

  resp <- httr2::req_perform(req)

  status <- httr2::resp_status(resp)
  if (status != 200) {
    stop("Failed to download file. Status: ", status)
  }

  resp
}
|
|
||||||
# Split a netzschleuder dataset name into collection and network parts.
#
# Cleans up common user-input slips (leading/trailing slashes, doubled
# slashes) and returns a length-2 character vector:
# element 1 = collection name, element 2 = network name. A bare name such
# as "karate" denotes a single-network collection, for which the API uses
# the same identifier twice — hence the c(x, x) return.
#
# @param x Character scalar, e.g. "karate" or "copenhagen/calls".
# @return Character vector of length 2: c(collection, network).
resolve_name <- function(x) {
  # Fix: the original removed only ONE trailing slash and only the FIRST
  # doubled slash (sub() vs gsub()), and ignored leading slashes entirely.
  x <- sub("^/+", "", x)
  x <- sub("/+$", "", x)
  # collapse any run of slashes into a single separator
  x <- gsub("/+", "/", x)

  if (!grepl("/", x, fixed = TRUE)) {
    # single-network collection: API addresses it as <name>/<name>
    return(c(x, x))
  }

  x_split <- strsplit(x, "/", fixed = TRUE)[[1]]
  if (length(x_split) > 2) {
    cli::cli_abort(
      "{.arg name} has {length(x_split)} components instead of 2."
    )
  }
  x_split
}
|
|
||||||
# Download a dataset archive, enforcing a size limit.
#
# Probes the file size with a HEAD request first and aborts if it exceeds
# `size_limit` (in GB), then performs the actual GET and writes the body
# to `file`.
#
# @param zip_url Character. API path of the zip archive.
# @param token Character or NULL. Access token for restricted datasets.
# @param file Character. Destination path on disk.
# @param size_limit Numeric. Maximum allowed download size in GB.
# @return `invisible(NULL)`; called for its side effect.
download_file <- function(zip_url, token = NULL, file, size_limit) {
  resp <- make_request(zip_url, token, method = "HEAD")
  byte_size <- as.numeric(httr2::resp_headers(resp)[["content-length"]])
  gb_size <- round(byte_size / 1024^3, 4)
  # Fix: if the server omits Content-Length, byte_size is numeric(0)/NA and
  # the original `if (gb_size > size_limit)` failed with a cryptic
  # "argument is of length zero" / "missing value" error. Skip the check
  # when the size is unknown instead.
  if (length(gb_size) == 1 && !is.na(gb_size) && gb_size > size_limit) {
    cli::cli_abort(c(
      "{zip_url} has a size of {gb_size} GB and exceeds the size limit of {size_limit} GB.",
      "i" = "To download the file, set {.arg size_limit} to a value greater than {gb_size}"
    ))
  }
  resp <- make_request(zip_url, token, method = "GET")
  # NOTE(review): the whole body is held in memory before being written;
  # netzschleuder hosts multi-GB datasets, so streaming to disk via
  # httr2::req_perform(path = ...) would be safer — TODO confirm it
  # composes with the throttling in get_base_req().
  writeBin(httr2::resp_body_raw(resp), file)
  invisible(NULL)
}
|
|
||||||
#' Download and Convert Graph Data from Netzschleuder
#'
#' These functions provide tools to interact with the Netzschleuder network
#' dataset archive. Netzschleuder (<https://networks.skewed.de/>) is a large
#' online repository for network datasets, aimed at aiding scientific
#' research.
#' \describe{
#'   \item{`ns_metadata()`}{ retrieves metadata about a network or network collection.}
#'   \item{`ns_df()`}{downloads the graph data as data frames (nodes, edges, and graph properties).}
#'   \item{`ns_graph()`}{creates an `igraph` object directly from Netzschleuder.}
#' }
#'
#' @param name Character. The name of the network dataset. To get a network
#'   from a collection, use the format `<collection_name>/<network_name>`.
#' @param collection Logical. If TRUE, get the metadata of a whole
#'   collection of networks.
#' @param token Character. Some networks have restricted access and require
#'   a token. See <https://networks.skewed.de/restricted>.
#' @param size_limit Numeric. Maximum allowed file size in GB. Larger files
#'   will be prevented from being downloaded.
#'
#' @return
#' \describe{
#'   \item{`ns_metadata()`}{A list containing metadata for the dataset.}
#'   \item{`ns_df()`}{A named list with `nodes`, `edges`, `gprops`, and `meta`.}
#'   \item{`ns_graph()`}{An `igraph` object.}
#' }
#' @examples
#' \dontrun{
#' # Get metadata
#' ns_metadata("copenhagen/calls")
#'
#' # Download network as data frames
#' graph_data <- ns_df("copenhagen/calls")
#'
#' # Create an igraph object
#' g <- ns_graph("copenhagen/calls")
#' }
#'
#' @seealso <https://networks.skewed.de/>
#' @rdname netzschleuder
#' @export
ns_metadata <- function(name, collection = FALSE) {
  rlang::check_installed("cli")
  net_ident <- resolve_name(name)
  api_path <- sprintf("api/net/%s", net_ident[[1]])
  collection_url <- sprintf("https://networks.skewed.de/net/%s", net_ident[[1]])

  meta <- httr2::resp_body_json(make_request(api_path))
  class(meta) <- c("ns_meta", class(meta))
  meta[["is_collection"]] <- collection
  meta[["collection_name"]] <- net_ident[[1]]

  # Whole-collection metadata was explicitly requested.
  if (collection) {
    return(meta)
  }

  if (net_ident[[1]] == net_ident[[2]]) {
    # A bare name: only valid if the collection holds a single network.
    if (length(unlist(meta$nets)) > 1) {
      cli::cli_abort(
        c(
          "{net_ident[[1]]} is a collection and downloading a whole collection is not permitted.",
          "i" = "see {.url {collection_url}}"
        )
      )
    }
    return(meta)
  }

  # <collection>/<network>: narrow the metadata down to that one network.
  idx <- which(unlist(meta[["nets"]]) == net_ident[[2]])
  if (length(idx) == 0) {
    cli::cli_abort(
      c(
        "{net_ident[[2]]} is not part of the collection {net_ident[[1]]}.",
        "i" = "see {.url {collection_url}}"
      )
    )
  }
  meta[["analyses"]] <- meta[["analyses"]][[net_ident[[2]]]]
  meta[["nets"]] <- meta[["nets"]][idx]
  meta
}
|
|
||||||
#' @rdname netzschleuder
#' @export
ns_df <- function(name, token = NULL, size_limit = 1) {
  rlang::check_installed("minty")
  if (is.character(name)) {
    meta <- ns_metadata(name, collection = FALSE)
    net_ident <- resolve_name(name)
  } else if (inherits(name, "ns_meta")) {
    if (name[["is_collection"]]) {
      cli::cli_abort(c(
        "{.arg name} contains the meta data of a whole collection and downloading a whole collection is not permitted.",
        "i" = "set collection = FALSE in `ns_metadata()`"
      ))
    }
    meta <- name
    net_ident <- c(meta[["collection_name"]], meta[["nets"]])
  } else {
    cli::cli_abort("{.arg name} must be a string or a `ns_meta` object.")
  }

  zip_url <- sprintf(
    "net/%s/files/%s.csv.zip",
    net_ident[[1]],
    net_ident[[2]]
  )

  # Fix: fileext needs a leading dot ("zip" produced extension-less files).
  temp <- tempfile(fileext = ".zip")
  # Fix: register cleanup immediately so the temp file is removed even if
  # the download or any of the read steps below fails.
  on.exit(unlink(temp), add = TRUE)

  download_file(zip_url, token = token, file = temp, size_limit = size_limit)

  zip_contents <- utils::unzip(temp, list = TRUE)

  # NOTE(review): archives are assumed to contain exactly one file matching
  # each of these patterns — TODO confirm against the server's CSV layout.
  edge_file_name <- grep("edge", zip_contents$Name, value = TRUE)
  node_file_name <- grep("node", zip_contents$Name, value = TRUE)
  gprops_file_name <- grep("gprops", zip_contents$Name, value = TRUE)

  edges_df_raw <- utils::read.csv(unz(temp, edge_file_name))
  # suppress warning if no character columns found
  edges_df <- suppressWarnings(minty::type_convert(edges_df_raw))
  source_loc <- grep("source", names(edges_df))
  target_loc <- grep("target", names(edges_df))

  names(edges_df)[c(source_loc, target_loc)] <- c("from", "to")

  # netzschleuder uses 0-indexing, igraph uses 1-indexing
  edges_df[["from"]] <- edges_df[["from"]] + 1L
  edges_df[["to"]] <- edges_df[["to"]] + 1L

  nodes_df_raw <- utils::read.csv(unz(temp, node_file_name))
  # suppress warning if no character columns found
  nodes_df <- suppressWarnings(minty::type_convert(nodes_df_raw))
  names(nodes_df)[1] <- "id"

  # netzschleuder uses 0-indexing, igraph uses 1-indexing
  nodes_df[["id"]] <- nodes_df[["id"]] + 1L
  if ("X_pos" %in% names(nodes_df)) {
    # "_pos" holds coordinates as e.g. "array([1.2, -3.4])"; pull out the
    # two floats and expose them as x/y columns instead.
    regex <- gregexpr("-?\\d+\\.\\d+", nodes_df[["X_pos"]])
    matches <- regmatches(nodes_df[["X_pos"]], regex)

    mat <- vapply(matches, as.numeric, numeric(2))

    nodes_df[["X_pos"]] <- NULL
    nodes_df[["x"]] <- mat[1, ]
    nodes_df[["y"]] <- mat[2, ]
  }

  gprops_df <- readLines(unz(temp, gprops_file_name))

  list(nodes = nodes_df, edges = edges_df, gprops = gprops_df, meta = meta)
}
|
|
||||||
#' @rdname netzschleuder
#' @export
ns_graph <- function(name, token = NULL, size_limit = 1) {
  graph_data <- ns_df(name, token = token, size_limit = size_limit)
  directed <- graph_data$meta[["analyses"]][["is_directed"]]
  bipartite <- graph_data$meta[["analyses"]][["is_bipartite"]]

  g <- igraph::graph_from_data_frame(
    graph_data$edges,
    directed = directed,
    vertices = graph_data$nodes
  )

  # Fix: guard with isTRUE() — if the metadata lacks `is_bipartite`, the
  # original `if (bipartite)` failed with "argument is of length zero".
  if (isTRUE(bipartite)) {
    # Mark vertices that appear in the edge list's first (source) column
    # as one side of the bipartition.
    types <- rep(FALSE, igraph::vcount(g))
    types[graph_data$nodes$id %in% graph_data$edges[[1]]] <- TRUE
    g <- igraph::set_vertex_attr(g, "type", value = types)
  }

  g
}
|
|
||||||
#' @export
print.ns_meta <- function(x, ...) {
  if (x[["is_collection"]]) {
    cat("Netzschleuder Metadata for the collection:", x[["collection_name"]])
    # Fix: missing newline made both lines run together on one line.
    cat("\n")
    cat("Number of Networks:", length(x[["nets"]]))
    cat("\n")
  } else {
    cat(
      "Netzschleuder Metadata for: ",
      x[["collection_name"]],
      "/",
      x[["nets"]][[1]],
      sep = ""
    )
    cat("\n")
    cat("Number of vertices:", x$analyses$num_vertices)
    cat("\n")
    cat("Number of Edges:", x$analyses$num_edges)
    cat("\n")
    cat("Directed:", x$analyses$is_directed)
    cat("\n")
    cat("Bipartite:", x$analyses$is_bipartite)
    cat("\n")
  }
  # print methods conventionally return their argument invisibly
  invisible(x)
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.