Skip to content

Commit 9d25615

Browse files
authored
Add function to append filesystem metadata to a given tar archive (#40)
* Add and export make_tar_index function * Use `make_tar_index()` when building R packages * Update NEWS.md * Append VFS metadata to R package .tgz output * Improve VFS metadata encoding in .tgz file * Rename make_tar_index to add_tar_index * Update pkgdown documentation * Explicitly write metadata values as integer type * Embed filesystem metadata as a tar entry * Set highest compression level when repacking tar * Early exit tar processing on existing metadata * Deal with hard and symbolic links in tar indexing
1 parent 478609e commit 9d25615

18 files changed

+451
-67
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
export(add_list)
44
export(add_pkg)
55
export(add_repo)
6+
export(add_tar_index)
67
export(build)
78
export(file_packager)
89
export(make_library)

NEWS.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
# rwasm (development version)
22

3-
* Support for a new `compression` argument in `build()`, `add_pkg()`, `make_vfs_library()`, and other related functions. When enabled, VFS images will be compressed using `gzip`. Note: Loading compressed VFS images requires at least version 0.4.1 of webR (#39).
3+
## New features
4+
5+
* When building R packages with `compress` set to `TRUE`, use the binary R package `.tgz` file for the Emscripten filesystem image data and generate custom metadata rather than using Emscripten's `file_packager` tool.
6+
7+
* Support for a new `compress` argument in `file_packager()`, `make_vfs_library()`, and other related functions. When enabled, VFS images will be compressed using `gzip` (#39).
8+
9+
Note: Mounting processed `.tgz` archives or compressed VFS images requires at least version 0.4.2 of webR.
410

511
# rwasm 0.1.0
612

R/build.R

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ build <- function(packages,
1818
out_dir = ".",
1919
remotes = NULL,
2020
dependencies = FALSE,
21-
compress = FALSE) {
21+
compress = TRUE) {
2222
tmp_dir <- tempfile()
2323
on.exit(unlink(tmp_dir, recursive = TRUE))
2424
dir.create(tmp_dir)
@@ -215,16 +215,21 @@ wasm_build <- function(pkg, tarball_path, contrib_bin, compress) {
215215
bin_dest <- fs::path(contrib_bin, paste0(pkg, "_", bin_ver, ".tgz"))
216216
fs::file_copy(bin_path, bin_dest, overwrite = TRUE)
217217

218-
# Build an Emscripten filesystem image for the package
219-
tmp_bin_dir <- fs::path(tempfile())
220-
on.exit(unlink(tmp_bin_dir, recursive = TRUE), add = TRUE)
221-
untar(bin_dest, exdir = tmp_bin_dir)
222-
file_packager(
223-
fs::dir_ls(tmp_bin_dir)[[1]],
224-
contrib_bin,
225-
fs::path_file(bin_dest),
226-
compress
227-
)
218+
if (compress) {
219+
# Use binary .tgz file to build Emscripten filesystem image metadata
220+
add_tar_index(bin_dest, strip = 1)
221+
} else {
222+
# Build an uncompressed Emscripten filesystem image for the package
223+
tmp_bin_dir <- fs::path(tempfile())
224+
on.exit(unlink(tmp_bin_dir, recursive = TRUE), add = TRUE)
225+
untar(bin_dest, exdir = tmp_bin_dir)
226+
file_packager(
227+
fs::dir_ls(tmp_bin_dir)[[1]],
228+
contrib_bin,
229+
fs::path_file(bin_dest),
230+
compress = FALSE
231+
)
232+
}
228233

229234
invisible(NULL)
230235
}

R/lib.R

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,14 @@ make_library <- function(repo_dir = "./repo", lib_dir = "./lib", strip = NULL) {
4848
#'
4949
#' Each filesystem image is generated using Emscripten's [file_packager()] tool
5050
#' and the output `.data` and `.js.metadata` filesystem image files are written
51-
#' to the repository in the same directory as the package binary `.tar.gz`
52-
#' files.
51+
#' to the repository in the same directory as the package binary `.tgz` files.
5352
#'
5453
#' The resulting filesystem images may then be used by webR to download and
55-
#' install R packages faster by mounting the `.data` images to the Emscripten
56-
#' virtual filesystem, rather than decompressing and extracting the equivalent
57-
#' `.tar.gz` files.
54+
#' install R packages by mounting the `.data` images to the Emscripten virtual
55+
#' filesystem.
56+
#'
57+
#' When `compress` is `TRUE`, an additional file with extension `".data.gz"` is
58+
#' also output containing a compressed version of the filesystem data.
5859
#'
5960
#' @inheritParams add_pkg
6061
#'
@@ -100,6 +101,9 @@ make_vfs_repo <- function(repo_dir = "./repo", compress = FALSE) {
100101
#' tool and the output `.data` and `.js.metadata` filesystem image files are
101102
#' written to the directory `out_dir`.
102103
#'
104+
#' When `compress` is `TRUE`, an additional file with extension `".data.gz"` is
105+
#' also output containing a compressed version of the filesystem data.
106+
#'
103107
#' The resulting image can be downloaded by webR and mounted on the Emscripten
104108
#' virtual filesystem as an efficient way to provide a pre-configured R library,
105109
#' without installing each R package individually.

R/repo.R

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,20 +76,22 @@ add_list <- function(list_file, ...) {
7676
#' source. Defaults to `NA`, meaning prefer a built-in list of references to
7777
#' packages pre-modified for use with webR.
7878
#' @param dependencies Dependency specification for packages to additionally
79-
#' add to the repository. Defaults to `FALSE`, meaning no additional packages.
80-
#' Use `NA` to install only hard dependencies whereas `TRUE` installs all
81-
#' optional dependencies as well. See [pkgdepends::as_pkg_dependencies]
82-
#' for details.
83-
#' @inheritParams file_packager
84-
#'
79+
#' add to the repository. Defaults to `FALSE`, meaning no additional packages.
80+
#' Use `NA` to install only hard dependencies whereas `TRUE` installs all
81+
#' optional dependencies as well. See [pkgdepends::as_pkg_dependencies]
82+
#' for details.
83+
#' @param compress When `TRUE`, add and compress Emscripten virtual filesystem
84+
#' metadata in the resulting R package binary `.tgz` files. Otherwise,
85+
#' [file_packager()] is used to create uncompressed virtual filesystem images
86+
#' included in the output binary package repository. Defaults to `TRUE`.
8587
#' @importFrom dplyr rows_update select
8688
#' @importFrom pkgdepends new_pkg_download_proposal
8789
#' @export
8890
add_pkg <- function(packages,
8991
repo_dir = "./repo",
9092
remotes = NA,
9193
dependencies = FALSE,
92-
compress = FALSE) {
94+
compress = TRUE) {
9395
# Set up pkgdepends configuration
9496
config <- ppm_config
9597
config$dependencies <- dependencies
@@ -185,7 +187,7 @@ prefer_remotes <- function(package_info, remotes = NA) {
185187
update_repo <- function(package_info,
186188
remotes = NA,
187189
repo_dir = "./repo",
188-
compress = FALSE) {
190+
compress = TRUE) {
189191
r_version <- R_system_version(getOption("rwasm.webr_version"))
190192

191193
writeLines(sprintf("Processing %d package(s).", nrow(package_info)))

R/tar.R

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
#' Add Emscripten virtual filesystem metadata to a given `tar` archive
2+
#'
3+
#' Calculates file offsets and other metadata for content stored in an
4+
#' (optionally gzip compressed) `tar` archive. Once added, the `tar` archive
5+
#' with metadata can be mounted as an Emscripten filesystem image, making the
6+
#' contents of the archive available to the WebAssembly R process.
7+
#'
8+
#' The virtual filesystem metadata is appended to the end of the `tar` archive,
9+
#' with the output replacing the original file. The resulting archive should be
10+
#' hosted online so that its URL can be provided to webR for mounting on the
11+
#' virtual filesystem.
12+
#'
13+
#' If `strip` is greater than `0` the virtual filesystem metadata is generated
14+
#' such that when mounted by webR the specified number of leading path elements
15+
#' are removed. Useful for R package binaries where data files are stored in the
16+
#' original `.tgz` file under a subdirectory. Files with fewer path name
17+
#' elements than the specified amount are skipped.
18+
#'
19+
#' @param file Filename of the `tar` archive for which metadata is to be added.
20+
#' @param strip Remove the specified number of leading path elements when
21+
#' mounting with webR. Defaults to `0`.
22+
#' @export
23+
add_tar_index <- function(file, strip = 0) {
24+
file <- fs::path_norm(file)
25+
file_ext <- tolower(fs::path_ext(file))
26+
file_base <- fs::path_ext_remove(file)
27+
28+
message(paste("Appending virtual filesystem metadata for:", file))
29+
30+
# Check if our tar is compatible
31+
if (!any(file_ext == c("tgz", "gz", "tar"))) {
32+
stop(paste0("Can't make index for \"", file,
33+
"\". Only uncompressed or `gzip` compressed tar files can be indexed."))
34+
}
35+
36+
# Handle two-component extensions
37+
if (file_ext == "gz") {
38+
file_base <- fs::path_ext_remove(file_base)
39+
}
40+
41+
# Read archive contents, decompressing if necessary
42+
gzip <- any(file_ext == c("tgz", "gz"))
43+
data <- readBin(file, "raw", n = file.size(file))
44+
if (gzip) {
45+
data <- memDecompress(data)
46+
}
47+
48+
# Build metadata from source .tar file
49+
con <- rawConnection(data, open = "rb")
50+
on.exit(close(con), add = TRUE)
51+
entries <- read_tar_offsets(con, strip)
52+
tar_end <- seek(con)
53+
54+
metadata <- list(
55+
files = entries,
56+
gzip = gzip,
57+
remote_package_size = length(data)
58+
)
59+
60+
# Add metadata as additional .tar entry
61+
entry <- create_metadata_entry(metadata)
62+
json_block <- as.integer(tar_end / 512) + 1L
63+
64+
# Append additional metadata hint for webR
65+
magic <- charToRaw('webR')
66+
reserved <- raw(4) # reserved for future use
67+
block <- writeBin(json_block, raw(), size = 4, endian = "big")
68+
len <- writeBin(entry$length, raw(), size = 4, endian = "big")
69+
hint <- c(magic, reserved, block, len)
70+
71+
# Build new .tar archive data
72+
data <- c(data[1:tar_end], entry$data, raw(1024), hint)
73+
74+
# Write output and move into place
75+
out <- tempfile()
76+
out_con <- if (gzip) {
77+
gzfile(out, open = "wb", compression = 9)
78+
} else {
79+
file(out, open = "wb")
80+
}
81+
writeBin(data, out_con, size = 1L)
82+
close(out_con)
83+
fs::file_copy(out, file, overwrite = TRUE)
84+
}
85+
86+
create_metadata_entry <- function(metadata) {
87+
# metadata contents
88+
json <- charToRaw(jsonlite::toJSON(metadata, auto_unbox = TRUE))
89+
len <- length(json)
90+
blocks <- ceiling(len/512)
91+
length(json) <- 512 * blocks
92+
93+
# entry header
94+
timestamp <- as.integer(Sys.time())
95+
header <- raw(512)
96+
header[1:15] <- charToRaw('.vfs-index.json') # filename
97+
header[101:108] <- charToRaw('0000644 ') # mode
98+
header[109:116] <- charToRaw('0000000 ') # uid
99+
header[117:124] <- charToRaw('0000000 ') # gid
100+
header[125:136] <- charToRaw(sprintf("%011o ", len)) # length
101+
header[137:148] <- charToRaw(sprintf("%011o ", timestamp)) # timestamp
102+
header[149:156] <- charToRaw(' ') # placeholder
103+
header[157:157] <- charToRaw('0') # type
104+
header[258:262] <- charToRaw('ustar') # ustar magic
105+
header[264:265] <- charToRaw('00') # ustar version
106+
header[266:269] <- charToRaw('root') # user
107+
header[298:302] <- charToRaw('wheel') # group
108+
109+
# populate checksum field
110+
checksum <- raw(8)
111+
checksum[1:6] <- charToRaw(sprintf("%06o", sum(as.integer(header))))
112+
checksum[8] <- charToRaw(' ')
113+
header[149:156] <- checksum
114+
115+
list(data = c(header, json), length = len)
116+
}
117+
118+
read_tar_offsets <- function(con, strip) {
119+
entries <- list()
120+
next_filename <- NULL
121+
122+
while (TRUE) {
123+
# Read tar entry header block
124+
header <- readBin(con, "raw", n = 512)
125+
126+
# Basic tar filename
127+
filename <- rawToChar(header[1:100])
128+
129+
# Empty header indicates end of archive, early exit for existing metadata
130+
if (all(header == 0) || filename == ".vfs-index.json") {
131+
# Return connection position to just before this header
132+
seek(con, -512, origin = "current")
133+
break
134+
}
135+
136+
# Entry size and offset
137+
offset <- seek(con)
138+
size <- strtoi(sub("\\s.*", "", rawToChar(header[125:136])), 8)
139+
file_blocks <- ceiling(size / 512)
140+
141+
# Skip directories, global, and vendor-specific extended headers
142+
type <- rawToChar(header[157])
143+
if (grepl("5|g|[A-Z]", type)) {
144+
next
145+
}
146+
147+
# Handle PAX extended header
148+
if (type == "x") {
149+
pax_data <- readBin(con, "raw", n = 512 * ceiling(size / 512))
150+
pax_data <- pax_data[1:max(which(pax_data != as.raw(0x00)))]
151+
lines <- raw_split(pax_data, "\n")
152+
for (line in lines) {
153+
payload <- raw_split(line, " ")[[2]]
154+
kv <- raw_split(payload, "=")
155+
if (rawToChar(kv[[1]]) == "path") {
156+
next_filename <- rawToChar(kv[[2]])
157+
break
158+
}
159+
}
160+
next
161+
}
162+
163+
# Apply ustar formatted extended filename
164+
magic <- rawToChar(header[258:263])
165+
if (magic == "ustar"){
166+
prefix <- rawToChar(header[346:501])
167+
filename <- paste(prefix, filename, sep = "/")
168+
}
169+
170+
# Apply PAX formatted extended filename
171+
if (!is.null(next_filename)) {
172+
filename <- next_filename
173+
next_filename <- NULL
174+
}
175+
176+
# Strip path elements, ignoring leading slash, skip if no path remains
177+
if (strip > 0) {
178+
filename <- gsub("^/", "", filename)
179+
parts <- fs::path_split(filename)[[1]]
180+
parts <- parts[-strip:-1]
181+
if (length(parts) == 0) {
182+
seek(con, 512 * file_blocks, origin = "current")
183+
next
184+
}
185+
filename <- fs::path_join(c("/", parts))
186+
}
187+
188+
# Calculate file offsets
189+
entry <- list(filename = filename, start = offset, end = offset + size)
190+
191+
# Deal with hard and symbolic links
192+
if (grepl("1|2", type)) {
193+
link_name <- rawToChar(header[158:257])
194+
if (type == "2") {
195+
link_name <- fs::path_norm(fs::path(fs::path_dir(filename), link_name))
196+
}
197+
link_entry <- Find(\(e) e$filename == link_name, entries)
198+
entry$start = link_entry$start
199+
entry$end = link_entry$end
200+
file_blocks <- 0
201+
}
202+
203+
entries <- append(entries, list(entry))
204+
205+
# Skip to next entry header
206+
seek(con, 512 * file_blocks, origin = "current")
207+
}
208+
entries
209+
}
210+
211+
# Split the elements of a raw vector x according to matches of element `split`
212+
raw_split <- function(x, split) {
213+
if (is.character(split)) {
214+
split <- charToRaw(split)
215+
}
216+
217+
start <- 1
218+
out <- list()
219+
for (end in which(x == split)) {
220+
out <- c(out, list(x[start:(end - 1)]))
221+
start <- end + 1
222+
}
223+
224+
if (start <= length(x)) {
225+
out <- c(out, list(x[start:length(x)]))
226+
}
227+
228+
out
229+
}

_pkgdown.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
url: https://r-wasm.github.io/rwasm/
22
template:
33
bootstrap: 5
4-
4+
deploy:
5+
install_metadata: true

inst/pkgdown.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
pandoc: '3.2'
2+
pkgdown: 2.0.9.9000
3+
pkgdown_sha: 34ee692e4ce10c8abfb863cc782da771838558f7
4+
articles:
5+
github-actions: github-actions.html
6+
mount-fs-image: mount-fs-image.html
7+
mount-host-dir: mount-host-dir.html
8+
rwasm: rwasm.html
9+
tar-metadata: tar-metadata.html
10+
last_built: 2024-09-10T15:29Z
11+
urls:
12+
reference: https://r-wasm.github.io/rwasm/reference
13+
article: https://r-wasm.github.io/rwasm/articles

man/add_list.Rd

Lines changed: 4 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)