#' Add Emscripten virtual filesystem metadata to a given `tar` archive
#'
#' Calculates file offsets and other metadata for content stored in an
#' (optionally gzip compressed) `tar` archive. Once added, the `tar` archive
#' with metadata can be mounted as an Emscripten filesystem image, making the
#' contents of the archive available to the WebAssembly R process.
#'
#' The virtual filesystem metadata is appended to the end of the `tar` archive,
#' with the output replacing the original file. The resulting archive should be
#' hosted online so that its URL can be provided to webR for mounting on the
#' virtual filesystem.
#'
#' If `strip` is greater than `0`, the virtual filesystem metadata is generated
#' such that when mounted by webR the specified number of leading path elements
#' is removed. This is useful for R package binaries, where data files are
#' stored in the original `.tgz` file under a subdirectory. Files with fewer
#' path elements than the specified number are skipped.
#'
#' @param file Filename of the `tar` archive to which metadata is to be added.
#' @param strip Remove the specified number of leading path elements when
#'   mounting with webR. Defaults to `0`.
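#' @examples
#' \dontrun{
#' # Hypothetical archive name for illustration; any uncompressed or
#' # gzip-compressed tar file can be indexed in place.
#' add_tar_index("my-library.tar.gz", strip = 1)
#' }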
#' @export
add_tar_index <- function(file, strip = 0) {
  file <- fs::path_norm(file)
  file_ext <- tolower(fs::path_ext(file))
  file_base <- fs::path_ext_remove(file)

  message(paste("Appending virtual filesystem metadata for:", file))

  # Check if our tar is compatible
  if (!file_ext %in% c("tgz", "gz", "tar")) {
    stop(paste0("Can't make index for \"", file,
      "\". Only uncompressed or `gzip` compressed tar files can be indexed."))
  }

  # Handle two-component extensions
  if (file_ext == "gz") {
    file_base <- fs::path_ext_remove(file_base)
  }

  # Read archive contents, decompressing if necessary
  gzip <- file_ext %in% c("tgz", "gz")
  data <- readBin(file, "raw", n = file.size(file))
  if (gzip) {
    data <- memDecompress(data)
  }

  # Build metadata from source .tar file
  con <- rawConnection(data, open = "rb")
  on.exit(close(con), add = TRUE)
  entries <- read_tar_offsets(con, strip)
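  # read_tar_offsets() leaves the connection just before the end-of-archive
  # marker (or any existing metadata entry), so the current position marks
  # where the original entry data ends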
  tar_end <- seek(con)

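  # Note: file offsets and `remote_package_size` refer to the uncompressed
  # archive data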
  metadata <- list(
    files = entries,
    gzip = gzip,
    remote_package_size = length(data)
  )

  # Add metadata as additional .tar entry
  entry <- create_metadata_entry(metadata)
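  # The metadata entry's header occupies the 512-byte block starting at
  # `tar_end`, so the JSON payload begins one block later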
  json_block <- as.integer(tar_end / 512) + 1L

  # Append additional metadata hint for webR
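  # The hint is a 16-byte trailer: a 4-byte magic value, 4 reserved bytes, then
  # the big-endian block index of the JSON payload and its length in bytes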
  magic <- charToRaw("webR")
  reserved <- raw(4) # reserved for future use
  block <- writeBin(json_block, raw(), size = 4, endian = "big")
  len <- writeBin(entry$length, raw(), size = 4, endian = "big")
  hint <- c(magic, reserved, block, len)

  # Build new .tar archive data
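  # Original entries, then the metadata entry, two zero end-of-archive blocks,
  # and finally the webR hint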
  data <- c(data[1:tar_end], entry$data, raw(1024), hint)

  # Write output and move into place
  out <- tempfile()
  out_con <- if (gzip) {
    gzfile(out, open = "wb", compression = 9)
  } else {
    file(out, open = "wb")
  }
  writeBin(data, out_con, size = 1L)
  close(out_con)
  fs::file_copy(out, file, overwrite = TRUE)
}

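# Build a `.vfs-index.json` tar entry containing the virtual filesystem
# metadata, returning the raw entry data and the unpadded JSON length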
create_metadata_entry <- function(metadata) {
  # metadata contents
  json <- charToRaw(jsonlite::toJSON(metadata, auto_unbox = TRUE))
  len <- length(json)
  blocks <- ceiling(len / 512)
  length(json) <- 512 * blocks

  # entry header
  timestamp <- as.integer(Sys.time())
  header <- raw(512)
  header[1:15] <- charToRaw(".vfs-index.json")               # filename
  header[101:108] <- charToRaw("0000644 ")                   # mode
  header[109:116] <- charToRaw("0000000 ")                   # uid
  header[117:124] <- charToRaw("0000000 ")                   # gid
  header[125:136] <- charToRaw(sprintf("%011o ", len))       # length
  header[137:148] <- charToRaw(sprintf("%011o ", timestamp)) # timestamp
  header[149:156] <- charToRaw(" ")                          # checksum (spaces while calculating)
  header[157:157] <- charToRaw("0")                          # type: regular file
  header[258:262] <- charToRaw("ustar")                      # ustar magic
  header[264:265] <- charToRaw("00")                         # ustar version
  header[266:269] <- charToRaw("root")                       # user
  header[298:302] <- charToRaw("wheel")                      # group

  # populate checksum field
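  # Six octal digits followed by a NUL and a space, calculated with the
  # checksum field itself filled with spaces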
  checksum <- raw(8)
  checksum[1:6] <- charToRaw(sprintf("%06o", sum(as.integer(header))))
  checksum[8] <- charToRaw(" ")
  header[149:156] <- checksum

  list(data = c(header, json), length = len)
}

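# Scan the tar entries readable from `con`, returning a list of filenames with
# their start and end byte offsets. The first `strip` leading path elements are
# removed from each filename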
read_tar_offsets <- function(con, strip) {
  entries <- list()
  next_filename <- NULL

  while (TRUE) {
    # Read tar entry header block
    header <- readBin(con, "raw", n = 512)

    # Basic tar filename
    filename <- rawToChar(header[1:100])

    # Empty header indicates end of archive, early exit for existing metadata
    if (all(header == 0) || filename == ".vfs-index.json") {
      # Return connection position to just before this header
      seek(con, -512, origin = "current")
      break
    }

    # Entry size and offset
    offset <- seek(con)
    size <- strtoi(sub("\\s.*", "", rawToChar(header[125:136])), 8)
    file_blocks <- ceiling(size / 512)

    # Skip directories, global, and vendor-specific extended headers
    type <- rawToChar(header[157])
    if (grepl("5|g|[A-Z]", type)) {
      # Skip over any data blocks belonging to the skipped entry
      seek(con, 512 * file_blocks, origin = "current")
      next
    }

    # Handle PAX extended header
    if (type == "x") {
      pax_data <- readBin(con, "raw", n = 512 * file_blocks)
      pax_data <- pax_data[1:max(which(pax_data != as.raw(0x00)))]
      lines <- raw_split(pax_data, "\n")
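      # Each record has the form "<length> <keyword>=<value>\n"; the value is
      # taken up to the next delimiter, so paths containing spaces or "=" are
      # not handled here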
      for (line in lines) {
        payload <- raw_split(line, " ")[[2]]
        kv <- raw_split(payload, "=")
        if (rawToChar(kv[[1]]) == "path") {
          next_filename <- rawToChar(kv[[2]])
          break
        }
      }
      next
    }

    # Apply ustar formatted extended filename
    magic <- rawToChar(header[258:263])
    if (magic == "ustar") {
      prefix <- rawToChar(header[346:501])
      if (prefix != "") {
        filename <- paste(prefix, filename, sep = "/")
      }
    }

    # Apply PAX formatted extended filename
    if (!is.null(next_filename)) {
      filename <- next_filename
      next_filename <- NULL
    }

    # Strip path elements, ignoring leading slash, skip if no path remains
    if (strip > 0) {
      filename <- gsub("^/", "", filename)
      parts <- fs::path_split(filename)[[1]]
      parts <- parts[-strip:-1]
      if (length(parts) == 0) {
        seek(con, 512 * file_blocks, origin = "current")
        next
      }
      filename <- fs::path_join(c("/", parts))
    }

    # Calculate file offsets
    entry <- list(filename = filename, start = offset, end = offset + size)

    # Deal with hard and symbolic links
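    # Link entries carry no data blocks; reuse the byte offsets of the linked
    # target entry seen earlier in the archive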
    if (grepl("1|2", type)) {
      link_name <- rawToChar(header[158:257])
      if (type == "2") {
        link_name <- fs::path_norm(fs::path(fs::path_dir(filename), link_name))
      }
      link_entry <- Find(\(e) e$filename == link_name, entries)
      entry$start <- link_entry$start
      entry$end <- link_entry$end
      file_blocks <- 0
    }

    entries <- append(entries, list(entry))

    # Skip to next entry header
    seek(con, 512 * file_blocks, origin = "current")
  }
  entries
}

# Split the elements of a raw vector x according to matches of element `split`
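# e.g. raw_split(charToRaw("a=b"), "=") returns list(charToRaw("a"), charToRaw("b"))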
raw_split <- function(x, split) {
  if (is.character(split)) {
    split <- charToRaw(split)
  }

  start <- 1
  out <- list()
  for (end in which(x == split)) {
    out <- c(out, list(x[start:(end - 1)]))
    start <- end + 1
  }

  if (start <= length(x)) {
    out <- c(out, list(x[start:length(x)]))
  }

  out
}