diff --git a/Cargo.lock b/Cargo.lock index 6eb7d4544..db656e681 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1968,6 +1968,7 @@ dependencies = [ "dashmap", "derive_more 2.0.1", "docsrs-metadata", + "flate2", "fn-error-context", "font-awesome-as-a-crate", "futures-util", diff --git a/Cargo.toml b/Cargo.toml index c31b27f98..3b5342ba0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ prometheus = { version = "0.14.0", default-features = false } rustwide = { version = "0.19.0", features = ["unstable-toolchain-ci", "unstable"] } mime_guess = "2" zstd = "0.13.0" +flate2 = "1.1.1" hostname = "0.4.0" path-slash = "0.2.0" once_cell = { version = "1.4.0", features = ["parking_lot"] } diff --git a/benches/compression.rs b/benches/compression.rs index 7b5d140e1..2514d6806 100644 --- a/benches/compression.rs +++ b/benches/compression.rs @@ -33,6 +33,18 @@ pub fn regex_capture_matches(c: &mut Criterion) { 5 * 1024 * 1024, ) }); + }) + .bench_function("compress gzip", |b| { + b.iter(|| compress(black_box(html_slice), CompressionAlgorithm::Gzip)); + }) + .bench_function("decompress gzip", |b| { + b.iter(|| { + decompress( + black_box(html_slice), + CompressionAlgorithm::Gzip, + 5 * 1024 * 1024, + ) + }); }); } diff --git a/src/db/delete.rs b/src/db/delete.rs index 1d8311218..6a7135823 100644 --- a/src/db/delete.rs +++ b/src/db/delete.rs @@ -222,7 +222,7 @@ mod tests { use super::*; use crate::db::ReleaseId; use crate::registry_api::{CrateOwner, OwnerKind}; - use crate::storage::rustdoc_json_path; + use crate::storage::{CompressionAlgorithm, rustdoc_json_path}; use crate::test::{async_wrapper, fake_release_that_failed_before_build}; use test_case::test_case; @@ -413,6 +413,7 @@ mod tests { version, "x86_64-unknown-linux-gnu", crate::storage::RustdocJsonFormatVersion::Latest, + Some(CompressionAlgorithm::Zstd), )) .await } diff --git a/src/db/file.rs b/src/db/file.rs index 7dbe1077c..7ad101cf3 100644 --- a/src/db/file.rs +++ b/src/db/file.rs @@ -47,6 +47,8 @@ pub(crate) fn 
detect_mime(file_path: impl AsRef) -> Mime { Some("toml") => mimes::TEXT_TOML.clone(), Some("js") => mime::TEXT_JAVASCRIPT, Some("json") => mime::APPLICATION_JSON, + Some("gz") => mimes::APPLICATION_GZIP.clone(), + Some("zst") => mimes::APPLICATION_ZSTD.clone(), _ => mime, } } @@ -103,3 +105,27 @@ pub(crate) fn file_list_to_json(files: impl IntoIterator) -> V .collect(), ) } + +#[cfg(test)] +mod tests { + use super::*; + use test_case::test_case; + + // some standard mime types that mime-guess handles + #[test_case("txt", &mime::TEXT_PLAIN)] + #[test_case("html", &mime::TEXT_HTML)] + // overrides of other mime types and defaults for + // types mime-guess doesn't know about + #[test_case("md", &mimes::TEXT_MARKDOWN)] + #[test_case("rs", &mimes::TEXT_RUST)] + #[test_case("markdown", &mimes::TEXT_MARKDOWN)] + #[test_case("css", &mime::TEXT_CSS)] + #[test_case("toml", &mimes::TEXT_TOML)] + #[test_case("js", &mime::TEXT_JAVASCRIPT)] + #[test_case("json", &mime::APPLICATION_JSON)] + #[test_case("zst", &mimes::APPLICATION_ZSTD)] + #[test_case("gz", &mimes::APPLICATION_GZIP)] + fn test_detect_mime(ext: &str, expected: &Mime) { + assert_eq!(&detect_mime(format!("something.{ext}")), expected); + } +} diff --git a/src/db/mimes.rs b/src/db/mimes.rs index e5a4c0a14..30f09ac27 100644 --- a/src/db/mimes.rs +++ b/src/db/mimes.rs @@ -8,6 +8,8 @@ macro_rules! 
mime { } mime!(APPLICATION_ZIP, "application/zip"); +mime!(APPLICATION_ZSTD, "application/zstd"); +mime!(APPLICATION_GZIP, "application/gzip"); mime!(TEXT_MARKDOWN, "text/markdown"); mime!(TEXT_RUST, "text/rust"); mime!(TEXT_TOML, "text/toml"); diff --git a/src/docbuilder/mod.rs b/src/docbuilder/mod.rs index 80764d9ec..f09da2d71 100644 --- a/src/docbuilder/mod.rs +++ b/src/docbuilder/mod.rs @@ -4,3 +4,6 @@ mod rustwide_builder; pub(crate) use self::limits::Limits; pub(crate) use self::rustwide_builder::DocCoverage; pub use self::rustwide_builder::{BuildPackageSummary, PackageKind, RustwideBuilder}; + +#[cfg(test)] +pub use self::rustwide_builder::RUSTDOC_JSON_COMPRESSION_ALGORITHMS; diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index 9c2471921..7c0971755 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -13,8 +13,8 @@ use crate::docbuilder::Limits; use crate::error::Result; use crate::repositories::RepositoryStatsUpdater; use crate::storage::{ - RustdocJsonFormatVersion, get_file_list, rustdoc_archive_path, rustdoc_json_path, - source_archive_path, + CompressionAlgorithm, RustdocJsonFormatVersion, compress, get_file_list, rustdoc_archive_path, + rustdoc_json_path, source_archive_path, }; use crate::utils::{ CargoMetadata, ConfigName, copy_dir_all, get_config, parse_rustc_version, report_error, @@ -45,6 +45,9 @@ const COMPONENTS: &[&str] = &["llvm-tools-preview", "rustc-dev", "rustfmt"]; const DUMMY_CRATE_NAME: &str = "empty-library"; const DUMMY_CRATE_VERSION: &str = "1.0.0"; +pub const RUSTDOC_JSON_COMPRESSION_ALGORITHMS: &[CompressionAlgorithm] = + &[CompressionAlgorithm::Zstd, CompressionAlgorithm::Gzip]; + /// read the format version from a rustdoc JSON file. fn read_format_version_from_rustdoc_json( reader: impl std::io::Read, @@ -909,12 +912,25 @@ impl RustwideBuilder { .context("couldn't parse rustdoc json to find format version")? 
}; - for format_version in [format_version, RustdocJsonFormatVersion::Latest] { - let _span = info_span!("store_json", %format_version).entered(); - let path = rustdoc_json_path(name, version, target, format_version); + for alg in RUSTDOC_JSON_COMPRESSION_ALGORITHMS { + let compressed_json: Vec = { + let _span = + info_span!("compress_json", file_size = json_filename.metadata()?.len(), algorithm=%alg) + .entered(); + + compress(BufReader::new(File::open(&json_filename)?), *alg)? + }; - self.storage.store_path(&path, &json_filename)?; - self.storage.set_public_access(&path, true)?; + for format_version in [format_version, RustdocJsonFormatVersion::Latest] { + let path = rustdoc_json_path(name, version, target, format_version, Some(*alg)); + let _span = + info_span!("store_json", %format_version, algorithm=%alg, target_path=%path) + .entered(); + + self.storage + .store_one_uncompressed(&path, compressed_json.clone())?; + self.storage.set_public_access(&path, true)?; + } } Ok(()) @@ -1279,7 +1295,7 @@ mod tests { use super::*; use crate::db::types::Feature; use crate::registry_api::ReleaseData; - use crate::storage::CompressionAlgorithm; + use crate::storage::{CompressionAlgorithm, compression}; use crate::test::{AxumRouterTestExt, TestEnvironment, wrapper}; use std::{io, iter}; use test_case::test_case; @@ -1467,29 +1483,39 @@ mod tests { // other targets too for target in DEFAULT_TARGETS { - // check if rustdoc json files exist for all targets - let path = rustdoc_json_path( - crate_, - version, - target, - RustdocJsonFormatVersion::Latest, - ); - assert!(storage.exists(&path)?); - assert!(storage.get_public_access(&path)?); - - let json_prefix = format!("rustdoc-json/{crate_}/{version}/{target}/"); - let mut json_files: Vec<_> = storage - .list_prefix(&json_prefix) - .filter_map(|res| res.ok()) - .map(|f| f.strip_prefix(&json_prefix).unwrap().to_owned()) - .collect(); - json_files.sort(); - 
assert!(json_files[0].starts_with(&format!("empty-library_1.0.0_{target}_"))); - assert!(json_files[0].ends_with(".json")); - assert_eq!( - json_files[1], - format!("empty-library_1.0.0_{target}_latest.json") - ); + for alg in RUSTDOC_JSON_COMPRESSION_ALGORITHMS { + // check if rustdoc json files exist for all targets + let path = rustdoc_json_path( + crate_, + version, + target, + RustdocJsonFormatVersion::Latest, + Some(*alg), + ); + assert!(storage.exists(&path)?); + assert!(storage.get_public_access(&path)?); + + let ext = compression::file_extension_for(*alg); + + let json_prefix = format!("rustdoc-json/{crate_}/{version}/{target}/"); + let mut json_files: Vec<_> = storage + .list_prefix(&json_prefix) + .filter_map(|res| res.ok()) + .map(|f| f.strip_prefix(&json_prefix).unwrap().to_owned()) + .collect(); + json_files.retain(|f| f.ends_with(&format!(".json.{ext}"))); + json_files.sort(); + dbg!(&json_files); + assert!( + json_files[0].starts_with(&format!("empty-library_1.0.0_{target}_")) + ); + + assert!(json_files[0].ends_with(&format!(".json.{ext}"))); + assert_eq!( + json_files[1], + format!("empty-library_1.0.0_{target}_latest.json.{ext}") + ); + } if target == &default_target { continue; diff --git a/src/storage/compression.rs b/src/storage/compression.rs index 251056532..6ea093231 100644 --- a/src/storage/compression.rs +++ b/src/storage/compression.rs @@ -1,6 +1,6 @@ use anyhow::Error; -use bzip2::Compression; use bzip2::read::{BzDecoder, BzEncoder}; +use flate2::read::{GzDecoder, GzEncoder}; use serde::{Deserialize, Serialize}; use std::{ collections::HashSet, @@ -29,6 +29,13 @@ pub enum CompressionAlgorithm { #[default] Zstd = 0, Bzip2 = 1, + Gzip = 2, +} + +impl CompressionAlgorithm { + pub fn file_extension(&self) -> &'static str { + file_extension_for(*self) + } } impl std::convert::TryFrom for CompressionAlgorithm { @@ -45,17 +52,40 @@ impl std::convert::TryFrom for CompressionAlgorithm { } } +pub(crate) fn file_extension_for(algorithm: 
CompressionAlgorithm) -> &'static str { + match algorithm { + CompressionAlgorithm::Zstd => "zst", + CompressionAlgorithm::Bzip2 => "bz2", + CompressionAlgorithm::Gzip => "gz", + } +} + +pub(crate) fn compression_from_file_extension(ext: &str) -> Option { + match ext { + "zst" => Some(CompressionAlgorithm::Zstd), + "bz2" => Some(CompressionAlgorithm::Bzip2), + "gz" => Some(CompressionAlgorithm::Gzip), + _ => None, + } +} + // public for benchmarking pub fn compress(content: impl Read, algorithm: CompressionAlgorithm) -> Result, Error> { match algorithm { CompressionAlgorithm::Zstd => Ok(zstd::encode_all(content, 9)?), CompressionAlgorithm::Bzip2 => { - let mut compressor = BzEncoder::new(content, Compression::best()); + let mut compressor = BzEncoder::new(content, bzip2::Compression::best()); let mut data = vec![]; compressor.read_to_end(&mut data)?; Ok(data) } + CompressionAlgorithm::Gzip => { + let mut compressor = GzEncoder::new(content, flate2::Compression::default()); + let mut data = vec![]; + compressor.read_to_end(&mut data)?; + Ok(data) + } } } @@ -72,6 +102,9 @@ pub fn decompress( CompressionAlgorithm::Bzip2 => { io::copy(&mut BzDecoder::new(content), &mut buffer)?; } + CompressionAlgorithm::Gzip => { + io::copy(&mut GzDecoder::new(content), &mut buffer)?; + } } Ok(buffer.into_inner()) @@ -81,6 +114,7 @@ pub fn decompress( mod tests { use super::*; use strum::IntoEnumIterator; + use test_case::test_case; #[test] fn test_compression() { @@ -134,9 +168,18 @@ mod tests { } } - #[test] - fn test_enum_display() { - assert_eq!(CompressionAlgorithm::Zstd.to_string(), "Zstd"); - assert_eq!(CompressionAlgorithm::Bzip2.to_string(), "Bzip2"); + #[test_case(CompressionAlgorithm::Zstd, "Zstd")] + #[test_case(CompressionAlgorithm::Bzip2, "Bzip2")] + #[test_case(CompressionAlgorithm::Gzip, "Gzip")] + fn test_enum_display(alg: CompressionAlgorithm, expected: &str) { + assert_eq!(alg.to_string(), expected); + } + + #[test_case(CompressionAlgorithm::Zstd, "zst")] + 
#[test_case(CompressionAlgorithm::Bzip2, "bz2")] + #[test_case(CompressionAlgorithm::Gzip, "gz")] + fn test_file_extensions(alg: CompressionAlgorithm, expected: &str) { + assert_eq!(file_extension_for(alg), expected); + assert_eq!(compression_from_file_extension(expected), Some(alg)); } } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 197266c56..11692a7d6 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1,5 +1,5 @@ mod archive_index; -mod compression; +pub(crate) mod compression; mod database; mod s3; @@ -22,7 +22,6 @@ use fn_error_context::context; use futures_util::stream::BoxStream; use mime::Mime; use path_slash::PathExt; -use serde_with::{DeserializeFromStr, SerializeDisplay}; use std::{ fmt, fs::{self, File}, @@ -543,6 +542,31 @@ impl AsyncStorage { self.store_inner(blobs).await } + // Store file into the backend at the given path, uncompressed. + // The path will also be used to determine the mime type. + #[instrument(skip(self, content))] + pub(crate) async fn store_one_uncompressed( + &self, + path: impl Into + std::fmt::Debug, + content: impl Into>, + ) -> Result<()> { + let path = path.into(); + let content = content.into(); + let mime = detect_mime(&path).to_owned(); + + self.store_inner(vec![Blob { + path, + mime, + content, + compression: None, + // this field is ignored by the backend + date_updated: Utc::now(), + }]) + .await?; + + Ok(()) + } + // Store file into the backend at the given path (also used to detect mime type), returns the // chosen compression algorithm #[instrument(skip(self, content))] @@ -794,6 +818,18 @@ impl Storage { self.runtime.block_on(self.inner.store_blobs(blobs)) } + // Store file into the backend at the given path, uncompressed. + // The path will also be used to determine the mime type. 
+ #[instrument(skip(self, content))] + pub(crate) fn store_one_uncompressed( + &self, + path: impl Into + std::fmt::Debug, + content: impl Into>, + ) -> Result<()> { + self.runtime + .block_on(self.inner.store_one_uncompressed(path, content)) + } + // Store file into the backend at the given path (also used to detect mime type), returns the // chosen compression algorithm #[instrument(skip(self, content))] @@ -857,7 +893,7 @@ pub(crate) fn rustdoc_archive_path(name: &str, version: &str) -> String { format!("rustdoc/{name}/{version}.zip") } -#[derive(strum::Display, Debug, PartialEq, Eq, Clone, SerializeDisplay, DeserializeFromStr)] +#[derive(strum::Display, Debug, PartialEq, Eq, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub(crate) enum RustdocJsonFormatVersion { #[strum(serialize = "{0}")] @@ -881,10 +917,18 @@ pub(crate) fn rustdoc_json_path( version: &str, target: &str, format_version: RustdocJsonFormatVersion, + compression_algorithm: Option, ) -> String { - format!( + let mut path = format!( "rustdoc-json/{name}/{version}/{target}/{name}_{version}_{target}_{format_version}.json" - ) + ); + + if let Some(alg) = compression_algorithm { + path.push('.'); + path.push_str(compression::file_extension_for(alg)); + } + + path } pub(crate) fn source_archive_path(name: &str, version: &str) -> String { @@ -904,15 +948,6 @@ mod test { assert_eq!(expected.to_string(), input); // test FromStr assert_eq!(expected, input.parse().unwrap()); - - let json_input = format!("\"{input}\""); - // test Serialize - assert_eq!(serde_json::to_string(&expected).unwrap(), json_input); - // test Deserialize - assert_eq!( - serde_json::from_str::(&json_input).unwrap(), - expected - ); } #[test] diff --git a/src/test/fakes.rs b/src/test/fakes.rs index 3ce1a3923..109407dfd 100644 --- a/src/test/fakes.rs +++ b/src/test/fakes.rs @@ -5,11 +5,11 @@ use crate::db::types::BuildStatus; use crate::db::{ BuildId, ReleaseId, initialize_build, initialize_crate, initialize_release, 
update_build_status, }; -use crate::docbuilder::DocCoverage; +use crate::docbuilder::{DocCoverage, RUSTDOC_JSON_COMPRESSION_ALGORITHMS}; use crate::error::Result; use crate::registry_api::{CrateData, CrateOwner, ReleaseData}; use crate::storage::{ - AsyncStorage, CompressionAlgorithm, RustdocJsonFormatVersion, rustdoc_archive_path, + AsyncStorage, CompressionAlgorithm, RustdocJsonFormatVersion, compress, rustdoc_archive_path, rustdoc_json_path, source_archive_path, }; use crate::utils::{Dependency, MetadataPackage, Target}; @@ -521,23 +521,30 @@ impl<'a> FakeRelease<'a> { targets.push(default_target.to_owned()); } for target in &targets { - for format_version in [ - RustdocJsonFormatVersion::Version(42), - RustdocJsonFormatVersion::Latest, - ] { - storage - .store_one( - &rustdoc_json_path( - &package.name, - &package.version, - target, - format_version, - ), - serde_json::to_vec(&serde_json::json!({ - "format_version": 42 - }))?, - ) - .await?; + let dummy_rustdoc_json_content = serde_json::to_vec(&serde_json::json!({ + "format_version": 42 + }))?; + + for alg in RUSTDOC_JSON_COMPRESSION_ALGORITHMS { + let compressed_json: Vec = compress(&*dummy_rustdoc_json_content, *alg)?; + + for format_version in [ + RustdocJsonFormatVersion::Version(42), + RustdocJsonFormatVersion::Latest, + ] { + storage + .store_one_uncompressed( + &rustdoc_json_path( + &package.name, + &package.version, + target, + format_version, + Some(*alg), + ), + compressed_json.clone(), + ) + .await?; + } } } } diff --git a/src/web/extractors.rs b/src/web/extractors.rs index a0ba3d70e..b0d06641e 100644 --- a/src/web/extractors.rs +++ b/src/web/extractors.rs @@ -1,5 +1,5 @@ use crate::db::{AsyncPoolClient, Pool}; -use anyhow::Context as _; +use anyhow::{Context as _, anyhow}; use axum::{ RequestPartsExt, extract::{Extension, FromRequestParts, OptionalFromRequestParts}, @@ -89,4 +89,111 @@ impl From for AxumNope { } } +/// extract a potential file extension from a path. 
+/// Axum doesn't support file extension suffixes yet, +/// especially when we have a route like '/something/{parameter}.{ext}' where two +/// parameters are used, one of which is a file extension. +/// +/// This is already solved in matchit 0.8.6, but not yet in axum +/// https://github.com/ibraheemdev/matchit/issues/17 +/// https://github.com/tokio-rs/axum/pull/3143 +/// +/// So our workaround is: +/// 1. we provide explicit routes for all file extensions we need to support (so no `.{ext}`). +/// 2. we extract the file extension from the path manually, using this extractor. +#[derive(Debug)] +pub(crate) struct PathFileExtension(pub(crate) String); + +impl FromRequestParts for PathFileExtension +where + S: Send + Sync, +{ + type Rejection = AxumNope; + + async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result { + parts + .extract::>() + .await + .expect("can never fail") + .ok_or_else(|| AxumNope::BadRequest(anyhow!("file extension not found in path"))) + } +} + +impl OptionalFromRequestParts for PathFileExtension +where + S: Send + Sync, +{ + type Rejection = (); + + async fn from_request_parts( + parts: &mut Parts, + _state: &S, + ) -> Result, Self::Rejection> { + if let Some((_rest, last_component)) = parts.uri.path().rsplit_once('/') { + if let Some((_rest, ext)) = last_component.rsplit_once('.') { + return Ok(Some(PathFileExtension(ext.to_string()))); + } + } + + Ok(None) + } +} + // TODO: we will write tests for this when async db tests are working + +#[cfg(test)] +mod tests { + use super::*; + use crate::test::{AxumResponseTestExt, AxumRouterTestExt}; + use axum::{Router, routing::get}; + use http::StatusCode; + + #[tokio::test] + async fn test_path_file_ext() -> anyhow::Result<()> { + let app = Router::new() + .route( + "/mandatory/something.pdf", + get(|PathFileExtension(ext): PathFileExtension| async move { + format!("mandatory: {ext}") + }), + ) + .route( + "/mandatory_missing/something", + get(|PathFileExtension(_ext): PathFileExtension| 
async move { "never called" }), + ) + .route( + "/", + get(|PathFileExtension(_ext): PathFileExtension| async move { "never called" }), + ) + .route( + "/optional/something.pdf", + get(|ext: Option| async move { format!("option: {:?}", ext) }), + ) + .route( + "/optional_missing/something", + get(|ext: Option| async move { format!("option: {:?}", ext) }), + ); + + let res = app.get("/mandatory/something.pdf").await?; + assert!(res.status().is_success()); + assert_eq!(res.text().await?, "mandatory: pdf"); + + for path in &["/mandatory_missing/something", "/"] { + let res = app.get(path).await?; + assert_eq!(res.status(), StatusCode::BAD_REQUEST); + } + + let res = app.get("/optional/something.pdf").await?; + assert!(res.status().is_success()); + assert_eq!( + res.text().await?, + "option: Some(PathFileExtension(\"pdf\"))" + ); + + let res = app.get("/optional_missing/something").await?; + assert!(res.status().is_success()); + assert_eq!(res.text().await?, "option: None"); + + Ok(()) + } +} diff --git a/src/web/routes.rs b/src/web/routes.rs index 596f861e9..d24876e96 100644 --- a/src/web/routes.rs +++ b/src/web/routes.rs @@ -308,6 +308,14 @@ pub(super) fn build_axum_routes() -> AxumRouter { "/crate/{name}/{version}/download", get_internal(super::rustdoc::download_handler), ) + .route_with_tsr( + "/crate/{name}/{version}/json.gz", + get_internal(super::rustdoc::json_download_handler), + ) + .route_with_tsr( + "/crate/{name}/{version}/json.zst", + get_internal(super::rustdoc::json_download_handler), + ) .route_with_tsr( "/crate/{name}/{version}/json", get_internal(super::rustdoc::json_download_handler), @@ -320,6 +328,14 @@ pub(super) fn build_axum_routes() -> AxumRouter { "/crate/{name}/{version}/target-redirect/{*path}", get_internal(super::rustdoc::target_redirect_handler), ) + .route_with_tsr( + "/crate/{name}/{version}/{target}/json.gz", + get_internal(super::rustdoc::json_download_handler), + ) + .route_with_tsr( + "/crate/{name}/{version}/{target}/json.zst", + 
get_internal(super::rustdoc::json_download_handler), + ) .route_with_tsr( "/crate/{name}/{version}/{target}/json", get_internal(super::rustdoc::json_download_handler), diff --git a/src/web/rustdoc.rs b/src/web/rustdoc.rs index c62f64f9d..1a2e98598 100644 --- a/src/web/rustdoc.rs +++ b/src/web/rustdoc.rs @@ -3,7 +3,10 @@ use crate::{ AsyncStorage, Config, InstanceMetrics, RUSTDOC_STATIC_STORAGE_PREFIX, db::Pool, - storage::{RustdocJsonFormatVersion, rustdoc_archive_path, rustdoc_json_path}, + storage::{ + CompressionAlgorithm, RustdocJsonFormatVersion, + compression::compression_from_file_extension, rustdoc_archive_path, rustdoc_json_path, + }, utils, web::{ MetaData, ReqVersion, axum_cached_redirect, axum_parse_uri_with_params, @@ -38,6 +41,8 @@ use std::{ }; use tracing::{Instrument, debug, error, info_span, instrument, trace}; +use super::extractors::PathFileExtension; + static DOC_RUST_LANG_ORG_REDIRECTS: Lazy> = Lazy::new(|| { HashMap::from([ ("alloc", "stable/alloc"), @@ -822,7 +827,7 @@ pub(crate) struct JsonDownloadParams { pub(crate) name: String, pub(crate) version: ReqVersion, pub(crate) target: Option, - pub(crate) format_version: Option, + pub(crate) format_version: Option, } #[instrument(skip_all)] @@ -831,7 +836,20 @@ pub(crate) async fn json_download_handler( mut conn: DbConnection, Extension(config): Extension>, Extension(storage): Extension>, + file_extension: Option, ) -> AxumResult { + // TODO: we could also additionally read the accept-encoding header here. But especially + // in combination with priorities it's complex to parse correctly. So for now only + // file extensions in the URL. + let wanted_compression = + if let Some(ext) = file_extension.map(|ext| ext.0) { + Some(compression_from_file_extension(&ext).ok_or_else(|| { + AxumNope::BadRequest(anyhow!("unknown compression file extension")) + })?) + } else { + None + }; + let matched_release = match_version(&mut conn, ¶ms.name, ¶ms.version) .await? 
.assume_exact_name()?; @@ -865,27 +883,72 @@ pub(crate) async fn json_download_handler( .to_string() }; - let format_version = params - .format_version - .unwrap_or(RustdocJsonFormatVersion::Latest); + let wanted_format_version = if let Some(request_format_version) = params.format_version { + // axum doesn't support extension suffixes in the route yet, not as parameter, and not + // statically, when combined with a parameter (like `.../{format_version}.gz`). + // This is solved in matchit 0.8.6, but not yet in axum: + // https://github.com/ibraheemdev/matchit/issues/17 + // https://github.com/tokio-rs/axum/pull/3143 + // + // Because of this we have cases where `format_version` also contains a file extension + // suffix like `.zstd`. `wanted_compression` is already extracted above, so we only + // need to strip the extension from the `format_version` before trying to parse it. + let stripped_format_version = if let Some(wanted_compression) = wanted_compression { + request_format_version + .strip_suffix(&format!(".{}", wanted_compression.file_extension())) + .expect("should exist") + } else { + &request_format_version + }; + + stripped_format_version + .parse::() + .context("can't parse format version")? + } else { + RustdocJsonFormatVersion::Latest + }; + + let wanted_compression = wanted_compression.unwrap_or_default(); let storage_path = rustdoc_json_path( &krate.name, &krate.version.to_string(), &target, - format_version, + wanted_format_version, + Some(wanted_compression), ); - if !storage.exists(&storage_path).await? { - return Err(AxumNope::ResourceNotFound); - } + let redirect = |storage_path: &str| { + super::axum_cached_redirect( + format!("{}/{}", config.s3_static_root_path, storage_path), + CachePolicy::ForeverInCdn, + ) + }; - // since we didn't build rustdoc json for all releases yet, - // this redirect might redirect to a location that doesn't exist. 
- Ok(super::axum_cached_redirect( - format!("{}/{}", config.s3_static_root_path, storage_path), - CachePolicy::ForeverInCdn, - )?) + if storage.exists(&storage_path).await? { + Ok(redirect(&storage_path)?) + } else { + // we have old files on the bucket where we stored zstd compressed files, + // with content-encoding=zstd & just a `.json` file extension. + // As a fallback, we redirect to that, if zstd was requested (which is also the default). + if wanted_compression == CompressionAlgorithm::Zstd { + let storage_path = rustdoc_json_path( + &krate.name, + &krate.version.to_string(), + &target, + wanted_format_version, + None, + ); + + if storage.exists(&storage_path).await? { + // we have an old file with a `.json` extension, + // redirect to that as fallback + return Ok(redirect(&storage_path)?); + } + } + + Err(AxumNope::ResourceNotFound) + } } #[instrument(skip_all)] @@ -948,7 +1011,9 @@ mod test { use super::*; use crate::{ Config, + docbuilder::RUSTDOC_JSON_COMPRESSION_ALGORITHMS, registry_api::{CrateOwner, OwnerKind}, + storage::compression::file_extension_for, test::*, utils::Dependency, web::{cache::CachePolicy, encode_url_path}, @@ -3086,50 +3151,93 @@ mod test { "latest/json", "0.2.0", "x86_64-unknown-linux-gnu", - RustdocJsonFormatVersion::Latest + RustdocJsonFormatVersion::Latest, + CompressionAlgorithm::Zstd + )] + #[test_case( + "latest/json.gz", + "0.2.0", + "x86_64-unknown-linux-gnu", + RustdocJsonFormatVersion::Latest, + CompressionAlgorithm::Gzip )] #[test_case( "0.1/json", "0.1.0", "x86_64-unknown-linux-gnu", - RustdocJsonFormatVersion::Latest; + RustdocJsonFormatVersion::Latest, + CompressionAlgorithm::Zstd; "semver" )] #[test_case( "0.1.0/json", "0.1.0", "x86_64-unknown-linux-gnu", - RustdocJsonFormatVersion::Latest + RustdocJsonFormatVersion::Latest, + CompressionAlgorithm::Zstd )] #[test_case( "latest/json/latest", "0.2.0", "x86_64-unknown-linux-gnu", - RustdocJsonFormatVersion::Latest + RustdocJsonFormatVersion::Latest, + 
CompressionAlgorithm::Zstd + )] + #[test_case( + "latest/json/latest.gz", + "0.2.0", + "x86_64-unknown-linux-gnu", + RustdocJsonFormatVersion::Latest, + CompressionAlgorithm::Gzip )] #[test_case( "latest/json/42", "0.2.0", "x86_64-unknown-linux-gnu", - RustdocJsonFormatVersion::Version(42) + RustdocJsonFormatVersion::Version(42), + CompressionAlgorithm::Zstd )] #[test_case( "latest/i686-pc-windows-msvc/json", "0.2.0", "i686-pc-windows-msvc", - RustdocJsonFormatVersion::Latest + RustdocJsonFormatVersion::Latest, + CompressionAlgorithm::Zstd + )] + #[test_case( + "latest/i686-pc-windows-msvc/json.gz", + "0.2.0", + "i686-pc-windows-msvc", + RustdocJsonFormatVersion::Latest, + CompressionAlgorithm::Gzip )] #[test_case( "latest/i686-pc-windows-msvc/json/42", "0.2.0", "i686-pc-windows-msvc", - RustdocJsonFormatVersion::Version(42) + RustdocJsonFormatVersion::Version(42), + CompressionAlgorithm::Zstd + )] + #[test_case( + "latest/i686-pc-windows-msvc/json/42.gz", + "0.2.0", + "i686-pc-windows-msvc", + RustdocJsonFormatVersion::Version(42), + CompressionAlgorithm::Gzip + )] + #[test_case( + "latest/i686-pc-windows-msvc/json/42.zst", + "0.2.0", + "i686-pc-windows-msvc", + RustdocJsonFormatVersion::Version(42), + CompressionAlgorithm::Zstd )] fn json_download( request_path_suffix: &str, redirect_version: &str, redirect_target: &str, redirect_format_version: RustdocJsonFormatVersion, + redirect_compression: CompressionAlgorithm, ) { async_wrapper(|env| async move { env.override_config(|config| { @@ -3157,10 +3265,78 @@ mod test { let web = env.web_app().await; + let compression_ext = file_extension_for(redirect_compression); + web.assert_redirect_cached_unchecked( &format!("/crate/dummy/{request_path_suffix}"), &format!("https://static.docs.rs/rustdoc-json/dummy/{redirect_version}/{redirect_target}/\ - dummy_{redirect_version}_{redirect_target}_{redirect_format_version}.json"), + dummy_{redirect_version}_{redirect_target}_{redirect_format_version}.json.{compression_ext}"), + 
CachePolicy::ForeverInCdn, + &env.config(), + ) + .await?; + Ok(()) + }); + } + + #[test_case("")] + #[test_case(".zst")] + fn test_json_download_fallback_to_old_files_without_compression_extension(ext: &str) { + async_wrapper(|env| async move { + env.override_config(|config| { + config.s3_static_root_path = "https://static.docs.rs".into(); + }); + + const NAME: &str = "dummy"; + const VERSION: &str = "0.1.0"; + const TARGET: &str = "x86_64-unknown-linux-gnu"; + const FORMAT_VERSION: RustdocJsonFormatVersion = RustdocJsonFormatVersion::Latest; + + env.fake_release() + .await + .name(NAME) + .version(VERSION) + .archive_storage(true) + .default_target(TARGET) + .create() + .await?; + + let storage = env.async_storage().await; + + let zstd_blob = storage + .get( + &rustdoc_json_path( + NAME, + VERSION, + TARGET, + FORMAT_VERSION, + Some(CompressionAlgorithm::Zstd), + ), + usize::MAX, + ) + .await?; + + for compression in RUSTDOC_JSON_COMPRESSION_ALGORITHMS { + let path = + rustdoc_json_path(NAME, VERSION, TARGET, FORMAT_VERSION, Some(*compression)); + storage.delete_prefix(&path).await?; + assert!(!storage.exists(&path).await?); + } + storage + .store_one( + &rustdoc_json_path(NAME, VERSION, TARGET, FORMAT_VERSION, None), + zstd_blob.content, + ) + .await?; + + let web = env.web_app().await; + + web.assert_redirect_cached_unchecked( + &format!("/crate/dummy/latest/json{ext}"), + &format!( + "https://static.docs.rs/rustdoc-json/{NAME}/{VERSION}/{TARGET}/\ + {NAME}_{VERSION}_{TARGET}_{FORMAT_VERSION}.json" // without .zstd + ), CachePolicy::ForeverInCdn, &env.config(), ) diff --git a/templates/core/about/rustdoc-json.html b/templates/core/about/rustdoc-json.html index e93e359c9..5a47b7a53 100644 --- a/templates/core/about/rustdoc-json.html +++ b/templates/core/about/rustdoc-json.html @@ -30,6 +30,11 @@

URLs

https://static.docs.rs. Until we rebuilt all releases, the redirect target might not exist.

+

By default we use <code>zstd</code> compression, which is more space-efficient and faster to decompress. For + more limited environments we also started supporting <code>gzip</code> compression. You can receive gzip by + adding <code>.gz</code> to the given URL.

Here some URL examples you can use.

@@ -46,6 +51,10 @@

URLs

https://docs.rs/crate/clap/latest/json latest version, default target, latest format-version + + https://docs.rs/crate/clap/latest/json.gz + latest version, default target, latest format-version, gzip compression + https://docs.rs/crate/clap/latest/json/42 latest version, default target, format-version 42 @@ -54,6 +63,10 @@

URLs

https://docs.rs/crate/clap/~4/json latest v4 via semver, default target, latest format-version + + https://docs.rs/crate/clap/~4/json.zst + latest v4 via semver, default target, latest format-version, zstd compression + https://docs.rs/crate/clap/latest/i686-pc-windows-msvc/json latest version, windows target, latest format-version