diff --git a/src/storage/mod.rs b/src/storage/mod.rs index c680e2996..e4e696474 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -253,40 +253,6 @@ impl AsyncStorage { } } - /// Fetch a rustdoc file from our blob storage. - /// * `name` - the crate name - /// * `version` - the crate version - /// * `latest_build_id` - the id of the most recent build. used purely to invalidate the local archive - /// index cache, when `archive_storage` is `true.` Without it we wouldn't know that we have - /// to invalidate the locally cached file after a rebuild. - /// * `path` - the wanted path inside the documentation. - /// * `archive_storage` - if `true`, we will assume we have a remove ZIP archive and an index - /// where we can fetch the requested path from inside the ZIP file. - #[instrument] - pub(crate) async fn fetch_rustdoc_file( - &self, - name: &str, - version: &str, - latest_build_id: Option, - path: &str, - archive_storage: bool, - ) -> Result { - trace!("fetch rustdoc file"); - Ok(if archive_storage { - self.get_from_archive( - &rustdoc_archive_path(name, version), - latest_build_id, - path, - self.max_file_size_for(path), - ) - .await? - } else { - // Add rustdoc prefix, name and version to the path for accessing the file stored in the database - let remote_path = format!("rustdoc/{name}/{version}/{path}"); - self.get(&remote_path, self.max_file_size_for(path)).await? - }) - } - /// Fetch a rustdoc file from our blob storage. /// * `name` - the crate name /// * `version` - the crate version @@ -840,23 +806,6 @@ impl Storage { .block_on(self.inner.set_public_access(path, public)) } - pub(crate) fn fetch_rustdoc_file( - &self, - name: &str, - version: &str, - latest_build_id: Option, - path: &str, - archive_storage: bool, - ) -> Result { - self.runtime.block_on(self.inner.fetch_rustdoc_file( - name, - version, - latest_build_id, - path, - archive_storage, - )) - } - pub(crate) fn fetch_source_file( &self, name: &str, diff --git a/src/utils/html.rs b/src/utils/html.rs index 965b74f5c..bae9d19c2 100644 --- a/src/utils/html.rs +++ b/src/utils/html.rs @@ -1,9 +1,30 @@ -use crate::web::{ - page::templates::{Body, Head, Vendored}, - rustdoc::RustdocPage, +use crate::{ + InstanceMetrics, + web::{ + page::{ + TemplateData, + templates::{Body, Head, Vendored}, + }, + rustdoc::RustdocPage, + }, }; use askama::Template; +use async_stream::stream; +use axum::body::Bytes; +use futures_util::{Stream, StreamExt as _}; use lol_html::{element, errors::RewritingError}; +use std::sync::Arc; +use tokio::{io::AsyncRead, task::JoinHandle}; +use tokio_util::io::ReaderStream; +use tracing::error; + +#[derive(thiserror::Error, Debug)] +pub(crate) enum RustdocRewritingError { + #[error("HTML rewriter error: {0}")] + RewritingError(#[from] lol_html::errors::RewritingError), + #[error("generic error while rewriting rustdoc HTML: {0}")] + Other(#[from] anyhow::Error), +} /// Rewrite a rustdoc page to have the docs.rs topbar /// @@ -11,99 +32,182 @@ use lol_html::{element, errors::RewritingError}; /// render the `rustdoc/` templates with the `html`. /// The output is an HTML page which has not yet been UTF-8 validated. /// In practice, the output should always be valid UTF-8. -pub(crate) fn rewrite_lol( - html: &[u8], +pub(crate) fn rewrite_rustdoc_html_stream( + template_data: Arc, + mut reader: R, max_allowed_memory_usage: usize, - data: &RustdocPage, -) -> Result, RewritingError> { - use lol_html::html_content::{ContentType, Element}; - use lol_html::{HtmlRewriter, MemorySettings, Settings}; - - let head_html = Head::new(data).render().unwrap(); - let vendored_html = Vendored.render().unwrap(); - let body_html = Body.render().unwrap(); - let topbar_html = data.render().unwrap(); - - // Before: ... rustdoc content ... - // After: - // ```html - //
- // ... rustdoc content ... - //
- // ``` - let body_handler = |rustdoc_body_class: &mut Element| { - // Add the `rustdoc` classes to the html body - let mut tmp; - let klass = if let Some(classes) = rustdoc_body_class.get_attribute("class") { - tmp = classes; - tmp.push_str(" container-rustdoc"); - &tmp - } else { - "container-rustdoc" - }; - rustdoc_body_class.set_attribute("class", klass)?; - rustdoc_body_class.set_attribute("id", "rustdoc_body_wrapper")?; - rustdoc_body_class.set_attribute("tabindex", "-1")?; - // Change the `body` to a `div` - rustdoc_body_class.set_tag_name("div")?; - // Prepend the askama content - rustdoc_body_class.prepend(&body_html, ContentType::Html); - // Wrap the transformed body and topbar into a element - rustdoc_body_class.before(r#""#, ContentType::Html); - // Insert the topbar outside of the rustdoc div - rustdoc_body_class.before(&topbar_html, ContentType::Html); - // Finalize body with - rustdoc_body_class.after("", ContentType::Html); - - Ok(()) - }; - - let settings = Settings { - element_content_handlers: vec![ - // Append `style.css` stylesheet after all head elements. - element!("head", |head: &mut Element| { - head.append(&head_html, ContentType::Html); - Ok(()) - }), - element!("body", body_handler), - // Append `vendored.css` before `rustdoc.css`, so that the duplicate copy of - // `normalize.css` will be overridden by the later version. - // - // Later rustdoc has `#mainThemeStyle` that could be used, but pre-2018 docs - // don't have this: - // - // https://github.com/rust-lang/rust/commit/003b2bc1c65251ec2fc80b78ed91c43fb35402ec - // - // Pre-2018 rustdoc also didn't have the resource suffix, but docs.rs was using a fork - // that had implemented it already then, so we can assume the css files are - // `/rustdoc-.css` and use the `-` to distinguish from the - // `rustdoc.static` path. - element!( - "link[rel='stylesheet'][href*='rustdoc-']", - |rustdoc_css: &mut Element| { - rustdoc_css.before(&vendored_html, ContentType::Html); + data: Arc, + metrics: Arc, +) -> impl Stream> +where + R: AsyncRead + Unpin + 'static, +{ + stream!({ + let (input_sender, input_receiver) = std::sync::mpsc::channel::>>(); + let (result_sender, mut result_receiver) = tokio::sync::mpsc::unbounded_channel::(); + + let join_handle: JoinHandle> = tokio::spawn(async move { + // we're using the rendering threadpool to limit CPU usage on the server, and to + // offload potentially CPU intensive stuff from the tokio runtime. + // Also this lets us limit the threadpool size and through that the CPU usage. + template_data + .render_in_threadpool(move || { + use lol_html::html_content::{ContentType, Element}; + use lol_html::{HtmlRewriter, MemorySettings, Settings}; + + let head_html = Head::new(&data).render().unwrap(); + let vendored_html = Vendored.render().unwrap(); + let body_html = Body.render().unwrap(); + let topbar_html = data.render().unwrap(); + + // Before: ... rustdoc content ... + // After: + // ```html + //
+ // ... rustdoc content ... + //
+ // ``` + let body_handler = |rustdoc_body_class: &mut Element| { + // Add the `rustdoc` classes to the html body + let mut tmp; + let klass = if let Some(classes) = rustdoc_body_class.get_attribute("class") + { + tmp = classes; + tmp.push_str(" container-rustdoc"); + &tmp + } else { + "container-rustdoc" + }; + rustdoc_body_class.set_attribute("class", klass)?; + rustdoc_body_class.set_attribute("id", "rustdoc_body_wrapper")?; + rustdoc_body_class.set_attribute("tabindex", "-1")?; + // Change the `body` to a `div` + rustdoc_body_class.set_tag_name("div")?; + // Prepend the askama content + rustdoc_body_class.prepend(&body_html, ContentType::Html); + // Wrap the transformed body and topbar into a element + rustdoc_body_class + .before(r#""#, ContentType::Html); + // Insert the topbar outside of the rustdoc div + rustdoc_body_class.before(&topbar_html, ContentType::Html); + // Finalize body with + rustdoc_body_class.after("", ContentType::Html); + + Ok(()) + }; + + let settings = Settings { + element_content_handlers: vec![ + // Append `style.css` stylesheet after all head elements. + element!("head", |head: &mut Element| { + head.append(&head_html, ContentType::Html); + Ok(()) + }), + element!("body", body_handler), + // Append `vendored.css` before `rustdoc.css`, so that the duplicate copy of + // `normalize.css` will be overridden by the later version. + // + // Later rustdoc has `#mainThemeStyle` that could be used, but pre-2018 docs + // don't have this: + // + // https://github.com/rust-lang/rust/commit/003b2bc1c65251ec2fc80b78ed91c43fb35402ec + // + // Pre-2018 rustdoc also didn't have the resource suffix, but docs.rs was using a fork + // that had implemented it already then, so we can assume the css files are + // `/rustdoc-.css` and use the `-` to distinguish from the + // `rustdoc.static` path. + element!( + "link[rel='stylesheet'][href*='rustdoc-']", + move |rustdoc_css: &mut Element| { + rustdoc_css.before(&vendored_html, ContentType::Html); + Ok(()) + } + ), + ], + memory_settings: MemorySettings { + max_allowed_memory_usage, + ..MemorySettings::default() + }, + ..Settings::default() + }; + + let mut rewriter = HtmlRewriter::new(settings, move |chunk: &[u8]| { + // send the result back to the main rewriter when its coming in. + // this can fail only when the receiver is dropped, in which case + // we exit this thread anyways. + let _ = result_sender.send(Bytes::from(chunk.to_vec())); + }); + while let Some(chunk) = input_receiver.recv()? { + // receive data from the input receiver. + // `input_receiver` is a non-async one. + // Since we're in a normal background thread, we can use the blocking `.recv` + // here. + // We will get `None` when the reader is done reading, + // so that's our signal to exit this loop and call `rewriter.end()` below. + rewriter.write(&chunk)?; + } + // finalize everything. Will trigger the output sink (and through that, + // sending data to the `result_sender`). + rewriter.end()?; Ok(()) - } - ), - ], - memory_settings: MemorySettings { - max_allowed_memory_usage, - ..MemorySettings::default() - }, - ..Settings::default() - }; + }) + .await?; + Ok(()) + }); + + let mut reader_stream = ReaderStream::new(&mut reader); + while let Some(chunk) = reader_stream.next().await { + let chunk = chunk.map_err(|err| { + error!(?err, "error while reading from rustdoc HTML reader"); + RustdocRewritingError::Other(err.into()) + })?; - // The input and output are always strings, we just use `&[u8]` so we only have to validate once. - let mut buffer = Vec::new(); - // TODO: Make the rewriter persistent? - let mut writer = HtmlRewriter::new(settings, |bytes: &[u8]| { - buffer.extend_from_slice(bytes); - }); + if let Err(err) = input_sender.send(Some(chunk.to_vec())) { + error!( + ?err, + "error when trying to send chunk to html rewriter thread" + ); + yield Err(RustdocRewritingError::Other(err.into())); + break; + } - writer.write(html)?; - writer.end()?; + while let Ok(bytes) = result_receiver.try_recv() { + yield Ok(bytes); + } + } + // This signals the renderer thread to finalize & exit. + if let Err(err) = input_sender.send(None) { + error!( + ?err, + "error when trying to send end signal to html rewriter thread" + ); + yield Err(RustdocRewritingError::Other(err.into())); + } + while let Some(bytes) = result_receiver.recv().await { + yield Ok(bytes); + } - Ok(buffer) + join_handle.await.expect("Task panicked").map_err(|e| { + error!( + ?e, + memory_limit = max_allowed_memory_usage, + "error while rewriting rustdoc HTML" + ); + // our `render_in_threadpool` and so the async tokio task return an `anyhow::Result`. + // In most cases this will be an error from the `HtmlRewriter`, which we'll get as a + // `RewritingError` which we extract here again. The other cases remain an + // `anyhow::Error`. + match e.downcast::() { + Ok(e) => { + if matches!(e, RewritingError::MemoryLimitExceeded(_)) { + metrics.html_rewrite_ooms.inc(); + } + RustdocRewritingError::RewritingError(e) + } + Err(e) => RustdocRewritingError::Other(e), + } + })?; + }) } #[cfg(test)] diff --git a/src/utils/mod.rs b/src/utils/mod.rs index eff293dab..836fad2d3 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -3,7 +3,7 @@ pub(crate) use self::cargo_metadata::{CargoMetadata, Package as MetadataPackage}; pub(crate) use self::copy::copy_dir_all; pub use self::daemon::{start_daemon, watch_registry}; -pub(crate) use self::html::rewrite_lol; +pub(crate) use self::html::rewrite_rustdoc_html_stream; pub use self::queue::{ get_crate_pattern_and_priority, get_crate_priority, list_crate_priorities, remove_crate_priority, set_crate_priority, diff --git a/src/web/rustdoc.rs b/src/web/rustdoc.rs index 46316a4f7..3fb6fb864 100644 --- a/src/web/rustdoc.rs +++ b/src/web/rustdoc.rs @@ -4,7 +4,7 @@ use crate::{ AsyncStorage, Config, InstanceMetrics, RUSTDOC_STATIC_STORAGE_PREFIX, db::Pool, storage::{ - CompressionAlgorithm, RustdocJsonFormatVersion, + CompressionAlgorithm, RustdocJsonFormatVersion, StreamingBlob, compression::compression_from_file_extension, rustdoc_archive_path, rustdoc_json_path, }, utils, @@ -16,7 +16,7 @@ use crate::{ encode_url_path, error::{AxumNope, AxumResult, EscapedURI}, extractors::{DbConnection, Path}, - file::{File, StreamingFile}, + file::StreamingFile, match_version, page::{ TemplateData, @@ -27,11 +27,12 @@ use crate::{ use anyhow::{Context as _, anyhow}; use askama::Template; use axum::{ + body::Body, extract::{Extension, Query}, http::{StatusCode, Uri}, - response::{Html, IntoResponse, Response as AxumResponse}, + response::{IntoResponse, Response as AxumResponse}, }; -use lol_html::errors::RewritingError; +use http::{HeaderValue, header}; use once_cell::sync::Lazy; use semver::Version; use serde::Deserialize; @@ -286,31 +287,15 @@ pub struct RustdocPage { } impl RustdocPage { - fn into_response( - self, - rustdoc_html: &[u8], + async fn into_response( + self: &Arc, + template_data: Arc, + metrics: Arc, + rustdoc_html: StreamingBlob, max_parse_memory: usize, - metrics: &InstanceMetrics, - config: &Config, - file_path: &str, ) -> AxumResult { let is_latest_url = self.is_latest_url; - // Extract the head and body of the rustdoc file so that we can insert it into our own html - // while logging OOM errors from html rewriting - let html = match utils::rewrite_lol(rustdoc_html, max_parse_memory, &self) { - Err(RewritingError::MemoryLimitExceeded(..)) => { - metrics.html_rewrite_ooms.inc(); - - return Err(AxumNope::InternalError(anyhow!( - "Failed to serve the rustdoc file '{}' because rewriting it surpassed the memory limit of {} bytes", - file_path, - config.max_parse_memory, - ))); - } - result => result.context("error rewriting HTML")?, - }; - Ok(( StatusCode::OK, (!is_latest_url).then_some([("X-Robots-Tag", "noindex")]), @@ -319,7 +304,17 @@ impl RustdocPage { } else { CachePolicy::ForeverInCdnAndStaleInBrowser }), - Html(html), + [( + header::CONTENT_TYPE, + HeaderValue::from_static(mime::TEXT_HTML_UTF_8.as_ref()), + )], + Body::from_stream(utils::rewrite_rustdoc_html_stream( + template_data, + rustdoc_html.content, + max_parse_memory, + self.clone(), + metrics, + )), ) .into_response()) } @@ -462,7 +457,7 @@ pub(crate) async fn rustdoc_html_server_handler( // Attempt to load the file from the database let blob = match storage - .fetch_rustdoc_file( + .stream_rustdoc_file( ¶ms.name, &krate.version.to_string(), krate.latest_build_id, @@ -551,7 +546,7 @@ pub(crate) async fn rustdoc_html_server_handler( // default asset caching behaviour is `Cache::ForeverInCdnAndBrowser`. // This is an edge-case when we serve invocation specific static assets under `/latest/`: // https://github.com/rust-lang/docs.rs/issues/1593 - return Ok(File(blob).into_response()); + return Ok(StreamingFile(blob).into_response()); } let latest_release = krate.latest_release()?; @@ -621,33 +616,19 @@ pub(crate) async fn rustdoc_html_server_handler( .record(krate.crate_id, krate.release_id, target); // Build the page of documentation, - templates - .render_in_threadpool({ - let metrics = metrics.clone(); - move || { - let metadata = krate.metadata.clone(); - Ok(RustdocPage { - latest_path, - permalink_path, - inner_path, - is_latest_version, - is_latest_url: params.version.is_latest(), - is_prerelease, - metadata, - krate, - current_target, - } - .into_response( - &blob.content, - config.max_parse_memory, - &metrics, - &config, - &storage_path, - )) - } - }) - .instrument(info_span!("rewrite html")) - .await? + let page = Arc::new(RustdocPage { + latest_path, + permalink_path, + inner_path, + is_latest_version, + is_latest_url: params.version.is_latest(), + is_prerelease, + metadata: krate.metadata.clone(), + krate, + current_target, + }); + page.into_response(templates, metrics, blob, config.max_parse_memory) + .await } /// Checks whether the given path exists.