diff --git a/Cargo.lock b/Cargo.lock
index 6eb7d4544..db656e681 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1968,6 +1968,7 @@ dependencies = [
"dashmap",
"derive_more 2.0.1",
"docsrs-metadata",
+ "flate2",
"fn-error-context",
"font-awesome-as-a-crate",
"futures-util",
diff --git a/Cargo.toml b/Cargo.toml
index c31b27f98..3b5342ba0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -44,6 +44,7 @@ prometheus = { version = "0.14.0", default-features = false }
rustwide = { version = "0.19.0", features = ["unstable-toolchain-ci", "unstable"] }
mime_guess = "2"
zstd = "0.13.0"
+flate2 = "1.1.1"
hostname = "0.4.0"
path-slash = "0.2.0"
once_cell = { version = "1.4.0", features = ["parking_lot"] }
diff --git a/benches/compression.rs b/benches/compression.rs
index 7b5d140e1..2514d6806 100644
--- a/benches/compression.rs
+++ b/benches/compression.rs
@@ -33,6 +33,18 @@ pub fn regex_capture_matches(c: &mut Criterion) {
5 * 1024 * 1024,
)
});
+ })
+ .bench_function("compress gzip", |b| {
+ b.iter(|| compress(black_box(html_slice), CompressionAlgorithm::Gzip));
+ })
+ .bench_function("decompress gzip", |b| {
+ b.iter(|| {
+ decompress(
+ black_box(html_slice),
+ CompressionAlgorithm::Gzip,
+ 5 * 1024 * 1024,
+ )
+ });
});
}
diff --git a/src/db/delete.rs b/src/db/delete.rs
index 1d8311218..6a7135823 100644
--- a/src/db/delete.rs
+++ b/src/db/delete.rs
@@ -222,7 +222,7 @@ mod tests {
use super::*;
use crate::db::ReleaseId;
use crate::registry_api::{CrateOwner, OwnerKind};
- use crate::storage::rustdoc_json_path;
+ use crate::storage::{CompressionAlgorithm, rustdoc_json_path};
use crate::test::{async_wrapper, fake_release_that_failed_before_build};
use test_case::test_case;
@@ -413,6 +413,7 @@ mod tests {
version,
"x86_64-unknown-linux-gnu",
crate::storage::RustdocJsonFormatVersion::Latest,
+ Some(CompressionAlgorithm::Zstd),
))
.await
}
diff --git a/src/db/file.rs b/src/db/file.rs
index 7dbe1077c..7ad101cf3 100644
--- a/src/db/file.rs
+++ b/src/db/file.rs
@@ -47,6 +47,8 @@ pub(crate) fn detect_mime(file_path: impl AsRef<Path>) -> Mime {
Some("toml") => mimes::TEXT_TOML.clone(),
Some("js") => mime::TEXT_JAVASCRIPT,
Some("json") => mime::APPLICATION_JSON,
+ Some("gz") => mimes::APPLICATION_GZIP.clone(),
+ Some("zst") => mimes::APPLICATION_ZSTD.clone(),
_ => mime,
}
}
@@ -103,3 +105,27 @@ pub(crate) fn file_list_to_json(files: impl IntoIterator) -> Value {
.collect(),
)
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use test_case::test_case;
+
+ // some standard mime types that mime-guess handles
+ #[test_case("txt", &mime::TEXT_PLAIN)]
+ #[test_case("html", &mime::TEXT_HTML)]
+ // overrides of other mime types and defaults for
+ // types mime-guess doesn't know about
+ #[test_case("md", &mimes::TEXT_MARKDOWN)]
+ #[test_case("rs", &mimes::TEXT_RUST)]
+ #[test_case("markdown", &mimes::TEXT_MARKDOWN)]
+ #[test_case("css", &mime::TEXT_CSS)]
+ #[test_case("toml", &mimes::TEXT_TOML)]
+ #[test_case("js", &mime::TEXT_JAVASCRIPT)]
+ #[test_case("json", &mime::APPLICATION_JSON)]
+ #[test_case("zst", &mimes::APPLICATION_ZSTD)]
+ #[test_case("gz", &mimes::APPLICATION_GZIP)]
+ fn test_detect_mime(ext: &str, expected: &Mime) {
+ assert_eq!(&detect_mime(format!("something.{ext}")), expected);
+ }
+}
diff --git a/src/db/mimes.rs b/src/db/mimes.rs
index e5a4c0a14..30f09ac27 100644
--- a/src/db/mimes.rs
+++ b/src/db/mimes.rs
@@ -8,6 +8,8 @@ macro_rules! mime {
}
mime!(APPLICATION_ZIP, "application/zip");
+mime!(APPLICATION_ZSTD, "application/zstd");
+mime!(APPLICATION_GZIP, "application/gzip");
mime!(TEXT_MARKDOWN, "text/markdown");
mime!(TEXT_RUST, "text/rust");
mime!(TEXT_TOML, "text/toml");
diff --git a/src/docbuilder/mod.rs b/src/docbuilder/mod.rs
index 80764d9ec..f09da2d71 100644
--- a/src/docbuilder/mod.rs
+++ b/src/docbuilder/mod.rs
@@ -4,3 +4,6 @@ mod rustwide_builder;
pub(crate) use self::limits::Limits;
pub(crate) use self::rustwide_builder::DocCoverage;
pub use self::rustwide_builder::{BuildPackageSummary, PackageKind, RustwideBuilder};
+
+#[cfg(test)]
+pub use self::rustwide_builder::RUSTDOC_JSON_COMPRESSION_ALGORITHMS;
diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs
index 9c2471921..7c0971755 100644
--- a/src/docbuilder/rustwide_builder.rs
+++ b/src/docbuilder/rustwide_builder.rs
@@ -13,8 +13,8 @@ use crate::docbuilder::Limits;
use crate::error::Result;
use crate::repositories::RepositoryStatsUpdater;
use crate::storage::{
- RustdocJsonFormatVersion, get_file_list, rustdoc_archive_path, rustdoc_json_path,
- source_archive_path,
+ CompressionAlgorithm, RustdocJsonFormatVersion, compress, get_file_list, rustdoc_archive_path,
+ rustdoc_json_path, source_archive_path,
};
use crate::utils::{
CargoMetadata, ConfigName, copy_dir_all, get_config, parse_rustc_version, report_error,
@@ -45,6 +45,9 @@ const COMPONENTS: &[&str] = &["llvm-tools-preview", "rustc-dev", "rustfmt"];
const DUMMY_CRATE_NAME: &str = "empty-library";
const DUMMY_CRATE_VERSION: &str = "1.0.0";
+pub const RUSTDOC_JSON_COMPRESSION_ALGORITHMS: &[CompressionAlgorithm] =
+ &[CompressionAlgorithm::Zstd, CompressionAlgorithm::Gzip];
+
/// read the format version from a rustdoc JSON file.
fn read_format_version_from_rustdoc_json(
reader: impl std::io::Read,
@@ -909,12 +912,25 @@ impl RustwideBuilder {
.context("couldn't parse rustdoc json to find format version")?
};
- for format_version in [format_version, RustdocJsonFormatVersion::Latest] {
- let _span = info_span!("store_json", %format_version).entered();
- let path = rustdoc_json_path(name, version, target, format_version);
+ for alg in RUSTDOC_JSON_COMPRESSION_ALGORITHMS {
+            let compressed_json: Vec<u8> = {
+ let _span =
+ info_span!("compress_json", file_size = json_filename.metadata()?.len(), algorithm=%alg)
+ .entered();
+
+ compress(BufReader::new(File::open(&json_filename)?), *alg)?
+ };
- self.storage.store_path(&path, &json_filename)?;
- self.storage.set_public_access(&path, true)?;
+ for format_version in [format_version, RustdocJsonFormatVersion::Latest] {
+ let path = rustdoc_json_path(name, version, target, format_version, Some(*alg));
+ let _span =
+ info_span!("store_json", %format_version, algorithm=%alg, target_path=%path)
+ .entered();
+
+ self.storage
+ .store_one_uncompressed(&path, compressed_json.clone())?;
+ self.storage.set_public_access(&path, true)?;
+ }
}
Ok(())
@@ -1279,7 +1295,7 @@ mod tests {
use super::*;
use crate::db::types::Feature;
use crate::registry_api::ReleaseData;
- use crate::storage::CompressionAlgorithm;
+ use crate::storage::{CompressionAlgorithm, compression};
use crate::test::{AxumRouterTestExt, TestEnvironment, wrapper};
use std::{io, iter};
use test_case::test_case;
@@ -1467,29 +1483,39 @@ mod tests {
// other targets too
for target in DEFAULT_TARGETS {
- // check if rustdoc json files exist for all targets
- let path = rustdoc_json_path(
- crate_,
- version,
- target,
- RustdocJsonFormatVersion::Latest,
- );
- assert!(storage.exists(&path)?);
- assert!(storage.get_public_access(&path)?);
-
- let json_prefix = format!("rustdoc-json/{crate_}/{version}/{target}/");
- let mut json_files: Vec<_> = storage
- .list_prefix(&json_prefix)
- .filter_map(|res| res.ok())
- .map(|f| f.strip_prefix(&json_prefix).unwrap().to_owned())
- .collect();
- json_files.sort();
- assert!(json_files[0].starts_with(&format!("empty-library_1.0.0_{target}_")));
- assert!(json_files[0].ends_with(".json"));
- assert_eq!(
- json_files[1],
- format!("empty-library_1.0.0_{target}_latest.json")
- );
+ for alg in RUSTDOC_JSON_COMPRESSION_ALGORITHMS {
+ // check if rustdoc json files exist for all targets
+ let path = rustdoc_json_path(
+ crate_,
+ version,
+ target,
+ RustdocJsonFormatVersion::Latest,
+ Some(*alg),
+ );
+ assert!(storage.exists(&path)?);
+ assert!(storage.get_public_access(&path)?);
+
+ let ext = compression::file_extension_for(*alg);
+
+ let json_prefix = format!("rustdoc-json/{crate_}/{version}/{target}/");
+ let mut json_files: Vec<_> = storage
+ .list_prefix(&json_prefix)
+ .filter_map(|res| res.ok())
+ .map(|f| f.strip_prefix(&json_prefix).unwrap().to_owned())
+ .collect();
+ json_files.retain(|f| f.ends_with(&format!(".json.{ext}")));
+ json_files.sort();
+ assert!(
+ json_files[0].starts_with(&format!("empty-library_1.0.0_{target}_"))
+ );
+
+ assert!(json_files[0].ends_with(&format!(".json.{ext}")));
+ assert_eq!(
+ json_files[1],
+ format!("empty-library_1.0.0_{target}_latest.json.{ext}")
+ );
+ }
if target == &default_target {
continue;
diff --git a/src/storage/compression.rs b/src/storage/compression.rs
index 251056532..6ea093231 100644
--- a/src/storage/compression.rs
+++ b/src/storage/compression.rs
@@ -1,6 +1,6 @@
use anyhow::Error;
-use bzip2::Compression;
use bzip2::read::{BzDecoder, BzEncoder};
+use flate2::read::{GzDecoder, GzEncoder};
use serde::{Deserialize, Serialize};
use std::{
collections::HashSet,
@@ -29,6 +29,13 @@ pub enum CompressionAlgorithm {
#[default]
Zstd = 0,
Bzip2 = 1,
+ Gzip = 2,
+}
+
+impl CompressionAlgorithm {
+ pub fn file_extension(&self) -> &'static str {
+ file_extension_for(*self)
+ }
}
impl std::convert::TryFrom<i32> for CompressionAlgorithm {
@@ -45,17 +52,40 @@ impl std::convert::TryFrom<i32> for CompressionAlgorithm {
}
}
+pub(crate) fn file_extension_for(algorithm: CompressionAlgorithm) -> &'static str {
+ match algorithm {
+ CompressionAlgorithm::Zstd => "zst",
+ CompressionAlgorithm::Bzip2 => "bz2",
+ CompressionAlgorithm::Gzip => "gz",
+ }
+}
+
+pub(crate) fn compression_from_file_extension(ext: &str) -> Option<CompressionAlgorithm> {
+ match ext {
+ "zst" => Some(CompressionAlgorithm::Zstd),
+ "bz2" => Some(CompressionAlgorithm::Bzip2),
+ "gz" => Some(CompressionAlgorithm::Gzip),
+ _ => None,
+ }
+}
+
// public for benchmarking
pub fn compress(content: impl Read, algorithm: CompressionAlgorithm) -> Result<Vec<u8>, Error> {
match algorithm {
CompressionAlgorithm::Zstd => Ok(zstd::encode_all(content, 9)?),
CompressionAlgorithm::Bzip2 => {
- let mut compressor = BzEncoder::new(content, Compression::best());
+ let mut compressor = BzEncoder::new(content, bzip2::Compression::best());
let mut data = vec![];
compressor.read_to_end(&mut data)?;
Ok(data)
}
+ CompressionAlgorithm::Gzip => {
+ let mut compressor = GzEncoder::new(content, flate2::Compression::default());
+ let mut data = vec![];
+ compressor.read_to_end(&mut data)?;
+ Ok(data)
+ }
}
}
@@ -72,6 +102,9 @@ pub fn decompress(
CompressionAlgorithm::Bzip2 => {
io::copy(&mut BzDecoder::new(content), &mut buffer)?;
}
+ CompressionAlgorithm::Gzip => {
+ io::copy(&mut GzDecoder::new(content), &mut buffer)?;
+ }
}
Ok(buffer.into_inner())
@@ -81,6 +114,7 @@ pub fn decompress(
mod tests {
use super::*;
use strum::IntoEnumIterator;
+ use test_case::test_case;
#[test]
fn test_compression() {
@@ -134,9 +168,18 @@ mod tests {
}
}
- #[test]
- fn test_enum_display() {
- assert_eq!(CompressionAlgorithm::Zstd.to_string(), "Zstd");
- assert_eq!(CompressionAlgorithm::Bzip2.to_string(), "Bzip2");
+ #[test_case(CompressionAlgorithm::Zstd, "Zstd")]
+ #[test_case(CompressionAlgorithm::Bzip2, "Bzip2")]
+ #[test_case(CompressionAlgorithm::Gzip, "Gzip")]
+ fn test_enum_display(alg: CompressionAlgorithm, expected: &str) {
+ assert_eq!(alg.to_string(), expected);
+ }
+
+ #[test_case(CompressionAlgorithm::Zstd, "zst")]
+ #[test_case(CompressionAlgorithm::Bzip2, "bz2")]
+ #[test_case(CompressionAlgorithm::Gzip, "gz")]
+ fn test_file_extensions(alg: CompressionAlgorithm, expected: &str) {
+ assert_eq!(file_extension_for(alg), expected);
+ assert_eq!(compression_from_file_extension(expected), Some(alg));
}
}
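
To make the new variant concrete, here is a minimal round-trip sketch (not part of the patch): the `docs_rs::storage` import path is an assumption based on how `benches/compression.rs` reaches these helpers, and the 5 MiB output cap mirrors the benchmark above.

```rust
use docs_rs::storage::{CompressionAlgorithm, compress, decompress};

fn main() -> anyhow::Result<()> {
    let original: &[u8] = b"<html>hello docs.rs</html>";

    // `compress` reads from any `impl Read`; a byte slice works directly.
    let compressed = compress(original, CompressionAlgorithm::Gzip)?;

    // `decompress` additionally takes a maximum decompressed size
    // (5 MiB here, as in the benchmark) to guard against decompression bombs.
    let roundtripped = decompress(
        compressed.as_slice(),
        CompressionAlgorithm::Gzip,
        5 * 1024 * 1024,
    )?;

    assert_eq!(roundtripped, original);
    Ok(())
}
```
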
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
index 197266c56..11692a7d6 100644
--- a/src/storage/mod.rs
+++ b/src/storage/mod.rs
@@ -1,5 +1,5 @@
mod archive_index;
-mod compression;
+pub(crate) mod compression;
mod database;
mod s3;
@@ -22,7 +22,6 @@ use fn_error_context::context;
use futures_util::stream::BoxStream;
use mime::Mime;
use path_slash::PathExt;
-use serde_with::{DeserializeFromStr, SerializeDisplay};
use std::{
fmt,
fs::{self, File},
@@ -543,6 +542,31 @@ impl AsyncStorage {
self.store_inner(blobs).await
}
+ // Store file into the backend at the given path, uncompressed.
+ // The path will also be used to determine the mime type.
+ #[instrument(skip(self, content))]
+ pub(crate) async fn store_one_uncompressed(
+ &self,
+        path: impl Into<String> + std::fmt::Debug,
+        content: impl Into<Vec<u8>>,
+ ) -> Result<()> {
+ let path = path.into();
+ let content = content.into();
+ let mime = detect_mime(&path).to_owned();
+
+ self.store_inner(vec![Blob {
+ path,
+ mime,
+ content,
+ compression: None,
+ // this field is ignored by the backend
+ date_updated: Utc::now(),
+ }])
+ .await?;
+
+ Ok(())
+ }
+
// Store file into the backend at the given path (also used to detect mime type), returns the
// chosen compression algorithm
#[instrument(skip(self, content))]
@@ -794,6 +818,18 @@ impl Storage {
self.runtime.block_on(self.inner.store_blobs(blobs))
}
+ // Store file into the backend at the given path, uncompressed.
+ // The path will also be used to determine the mime type.
+ #[instrument(skip(self, content))]
+ pub(crate) fn store_one_uncompressed(
+ &self,
+        path: impl Into<String> + std::fmt::Debug,
+        content: impl Into<Vec<u8>>,
+ ) -> Result<()> {
+ self.runtime
+ .block_on(self.inner.store_one_uncompressed(path, content))
+ }
+
// Store file into the backend at the given path (also used to detect mime type), returns the
// chosen compression algorithm
#[instrument(skip(self, content))]
@@ -857,7 +893,7 @@ pub(crate) fn rustdoc_archive_path(name: &str, version: &str) -> String {
format!("rustdoc/{name}/{version}.zip")
}
-#[derive(strum::Display, Debug, PartialEq, Eq, Clone, SerializeDisplay, DeserializeFromStr)]
+#[derive(strum::Display, Debug, PartialEq, Eq, Clone, Copy)]
#[strum(serialize_all = "snake_case")]
pub(crate) enum RustdocJsonFormatVersion {
#[strum(serialize = "{0}")]
@@ -881,10 +917,18 @@ pub(crate) fn rustdoc_json_path(
version: &str,
target: &str,
format_version: RustdocJsonFormatVersion,
+    compression_algorithm: Option<CompressionAlgorithm>,
) -> String {
- format!(
+ let mut path = format!(
"rustdoc-json/{name}/{version}/{target}/{name}_{version}_{target}_{format_version}.json"
- )
+ );
+
+ if let Some(alg) = compression_algorithm {
+ path.push('.');
+ path.push_str(compression::file_extension_for(alg));
+ }
+
+ path
}
pub(crate) fn source_archive_path(name: &str, version: &str) -> String {
@@ -904,15 +948,6 @@ mod test {
assert_eq!(expected.to_string(), input);
// test FromStr
assert_eq!(expected, input.parse().unwrap());
-
- let json_input = format!("\"{input}\"");
- // test Serialize
- assert_eq!(serde_json::to_string(&expected).unwrap(), json_input);
- // test Deserialize
- assert_eq!(
-            serde_json::from_str::<RustdocJsonFormatVersion>(&json_input).unwrap(),
- expected
- );
}
#[test]
diff --git a/src/test/fakes.rs b/src/test/fakes.rs
index 3ce1a3923..109407dfd 100644
--- a/src/test/fakes.rs
+++ b/src/test/fakes.rs
@@ -5,11 +5,11 @@ use crate::db::types::BuildStatus;
use crate::db::{
BuildId, ReleaseId, initialize_build, initialize_crate, initialize_release, update_build_status,
};
-use crate::docbuilder::DocCoverage;
+use crate::docbuilder::{DocCoverage, RUSTDOC_JSON_COMPRESSION_ALGORITHMS};
use crate::error::Result;
use crate::registry_api::{CrateData, CrateOwner, ReleaseData};
use crate::storage::{
- AsyncStorage, CompressionAlgorithm, RustdocJsonFormatVersion, rustdoc_archive_path,
+ AsyncStorage, CompressionAlgorithm, RustdocJsonFormatVersion, compress, rustdoc_archive_path,
rustdoc_json_path, source_archive_path,
};
use crate::utils::{Dependency, MetadataPackage, Target};
@@ -521,23 +521,30 @@ impl<'a> FakeRelease<'a> {
targets.push(default_target.to_owned());
}
for target in &targets {
- for format_version in [
- RustdocJsonFormatVersion::Version(42),
- RustdocJsonFormatVersion::Latest,
- ] {
- storage
- .store_one(
- &rustdoc_json_path(
- &package.name,
- &package.version,
- target,
- format_version,
- ),
- serde_json::to_vec(&serde_json::json!({
- "format_version": 42
- }))?,
- )
- .await?;
+ let dummy_rustdoc_json_content = serde_json::to_vec(&serde_json::json!({
+ "format_version": 42
+ }))?;
+
+ for alg in RUSTDOC_JSON_COMPRESSION_ALGORITHMS {
+                let compressed_json: Vec<u8> = compress(&*dummy_rustdoc_json_content, *alg)?;
+
+ for format_version in [
+ RustdocJsonFormatVersion::Version(42),
+ RustdocJsonFormatVersion::Latest,
+ ] {
+ storage
+ .store_one_uncompressed(
+ &rustdoc_json_path(
+ &package.name,
+ &package.version,
+ target,
+ format_version,
+ Some(*alg),
+ ),
+ compressed_json.clone(),
+ )
+ .await?;
+ }
}
}
}
diff --git a/src/web/extractors.rs b/src/web/extractors.rs
index a0ba3d70e..b0d06641e 100644
--- a/src/web/extractors.rs
+++ b/src/web/extractors.rs
@@ -1,5 +1,5 @@
use crate::db::{AsyncPoolClient, Pool};
-use anyhow::Context as _;
+use anyhow::{Context as _, anyhow};
use axum::{
RequestPartsExt,
extract::{Extension, FromRequestParts, OptionalFromRequestParts},
@@ -89,4 +89,111 @@ impl From for AxumNope {
}
}
+/// extract a potential file extension from a path.
+/// Axum doesn't support file-extension suffixes in routes yet, i.e. a route like
+/// '/something/{parameter}.{ext}' that uses two parameters, the second being a
+/// file extension.
+///
+/// This is already solved in matchit 0.8.6, but not yet in axum:
+/// https://github.com/ibraheemdev/matchit/issues/17
+/// https://github.com/tokio-rs/axum/pull/3143
+///
+/// So our workaround is:
+/// 1. we provide explicit routes for all file extensions we need to support (so no `.{ext}`).
+/// 2. we extract the file extension from the path manually, using this extractor.
+#[derive(Debug)]
+pub(crate) struct PathFileExtension(pub(crate) String);
+
+impl<S> FromRequestParts<S> for PathFileExtension
+where
+ S: Send + Sync,
+{
+ type Rejection = AxumNope;
+
+    async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
+ parts
+ .extract::
+
+ By default we use `zstd` compression, which is more space-efficient and faster to
+ decompress. For more limited environments we now also support `gzip` compression; you can
+ request it by appending `.gz` to the URL.
+
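
A sketch of the consumer side described above, hedged: the storage key shape is inferred from `rustdoc_json_path` (e.g. `rustdoc-json/{name}/{version}/{target}/{name}_{version}_{target}_latest.json.gz`), while the full public URL and the use of `reqwest` are assumptions for illustration only.

```rust
use flate2::read::GzDecoder;
use std::io::Read;

/// Fetch a gzip-compressed rustdoc JSON blob and parse it.
/// The URL layout is an assumption based on `rustdoc_json_path` above.
fn fetch_rustdoc_json(url: &str) -> anyhow::Result<serde_json::Value> {
    let compressed = reqwest::blocking::get(url)?.bytes()?;

    // Gunzip into a string; rustdoc JSON is valid UTF-8.
    let mut json = String::new();
    GzDecoder::new(compressed.as_ref()).read_to_string(&mut json)?;

    Ok(serde_json::from_str(&json)?)
}
```
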
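And since `from_request_parts` is cut off above, here is an illustrative sketch of the two-step workaround the `PathFileExtension` doc comment describes: explicit per-extension routes feeding one handler. The routes and handler are hypothetical, not the PR's actual routing table.

```rust
use axum::{Router, routing::get};

// Hypothetical handler: the real one would look up the blob in storage and
// use a PathFileExtension-style extractor to tell `.gz` from `.zst`.
async fn json_download() -> &'static str {
    "stub"
}

fn routes() -> Router {
    // Step 1 of the workaround: one explicit route per supported file
    // extension, instead of the unsupported `{parameter}.{ext}` pattern.
    Router::new()
        .route("/crate/{name}/{version}/json.gz", get(json_download))
        .route("/crate/{name}/{version}/json.zst", get(json_download))
}
```
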