diff --git a/Cargo.lock b/Cargo.lock
index 4f0816b9d19..c289728f1f9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1026,6 +1026,7 @@ dependencies = [
  "cookie",
  "crates_io_cdn_logs",
  "crates_io_database",
+ "crates_io_database_dump",
  "crates_io_env_vars",
  "crates_io_github",
  "crates_io_index",
@@ -1124,6 +1125,26 @@ dependencies = [
  "diesel_full_text_search",
 ]
 
+[[package]]
+name = "crates_io_database_dump"
+version = "0.0.0"
+dependencies = [
+ "anyhow",
+ "chrono",
+ "crates_io_test_db",
+ "diesel",
+ "flate2",
+ "insta",
+ "minijinja",
+ "serde",
+ "serde_json",
+ "tar",
+ "tempfile",
+ "toml",
+ "tracing",
+ "zip",
+]
+
 [[package]]
 name = "crates_io_env_vars"
 version = "0.0.0"
diff --git a/Cargo.toml b/Cargo.toml
index e7891527fa0..a44b431867e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -51,6 +51,7 @@ bon = "=2.3.0"
 cargo-manifest = "=0.15.2"
 crates_io_cdn_logs = { path = "crates/crates_io_cdn_logs" }
 crates_io_database = { path = "crates/crates_io_database" }
+crates_io_database_dump = { path = "crates/crates_io_database_dump" }
 crates_io_env_vars = { path = "crates/crates_io_env_vars" }
 crates_io_github = { path = "crates/crates_io_github" }
 crates_io_index = { path = "crates/crates_io_index" }
@@ -116,7 +117,6 @@ tracing-subscriber = { version = "=0.3.18", features = ["env-filter", "json"] }
 typomania = { version = "=0.1.2", default-features = false }
 url = "=2.5.2"
 unicode-xid = "=0.2.6"
-zip = { version = "=2.2.0", default-features = false, features = ["deflate"] }
 
 [dev-dependencies]
 bytes = "=1.8.0"
@@ -129,3 +129,4 @@ googletest = "=0.12.0"
 insta = { version = "=1.40.0", features = ["glob", "json", "redactions"] }
 regex = "=1.11.0"
 tokio = "=1.40.0"
+zip = { version = "=2.2.0", default-features = false, features = ["deflate"] }
diff --git a/crates/crates_io_database_dump/Cargo.toml b/crates/crates_io_database_dump/Cargo.toml
new file mode 100644
index 00000000000..cec7ee29eaf
--- /dev/null
+++ b/crates/crates_io_database_dump/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+name = "crates_io_database_dump"
+version = "0.0.0"
+license = "MIT OR Apache-2.0"
+edition = "2021"
+
+[lints]
+workspace = true
+
+[dependencies]
+anyhow = "=1.0.90"
+chrono = { version = "=0.4.38", default-features = false, features = ["clock", "serde"] }
+flate2 = "=1.0.34"
+minijinja = "=2.3.1"
+serde = { version = "=1.0.210", features = ["derive"] }
+serde_json = "=1.0.132"
+tar = "=0.4.42"
+tempfile = "=3.13.0"
+toml = "=0.8.19"
+tracing = "=0.1.40"
+zip = { version = "=2.2.0", default-features = false, features = ["deflate"] }
+
+[dev-dependencies]
+crates_io_test_db = { path = "../crates_io_test_db" }
+diesel = "=2.2.4"
+insta = { version = "=1.40.0", features = ["glob"] }
diff --git a/src/worker/jobs/dump_db/configuration.rs b/crates/crates_io_database_dump/src/configuration.rs
similarity index 94%
rename from src/worker/jobs/dump_db/configuration.rs
rename to crates/crates_io_database_dump/src/configuration.rs
index 277a2074792..6c11b8eddd2 100644
--- a/src/worker/jobs/dump_db/configuration.rs
+++ b/crates/crates_io_database_dump/src/configuration.rs
@@ -1,10 +1,11 @@
+use serde::Deserialize;
 use std::collections::{BTreeMap, VecDeque};
 
 /// An enum indicating whether a column is included in the database dumps.
 /// Public columns are included, private are not.
 #[derive(Clone, Copy, Debug, Deserialize, PartialEq)]
 #[serde(rename_all = "lowercase")]
-pub(super) enum ColumnVisibility {
+pub enum ColumnVisibility {
     Private,
     Public,
 }
@@ -16,7 +17,7 @@ pub(super) enum ColumnVisibility {
 /// used in a `WHERE` clause to filter the rows of the table. The `columns`
 /// field maps column names to their respective visibilities.
 #[derive(Clone, Debug, Default, Deserialize)]
-pub(super) struct TableConfig {
+pub struct TableConfig {
     #[serde(default)]
     pub dependencies: Vec<String>,
     pub filter: Option<String>,
@@ -28,17 +29,17 @@ pub(super) struct TableConfig {
 /// Maps table names to the respective configurations. Used to load `dump_db.toml`.
 #[derive(Clone, Debug, Default, Deserialize)]
 #[serde(transparent)]
-pub(super) struct VisibilityConfig(pub BTreeMap<String, TableConfig>);
+pub struct VisibilityConfig(pub BTreeMap<String, TableConfig>);
 
 impl VisibilityConfig {
-    pub(super) fn get() -> Self {
+    pub fn get() -> Self {
         toml::from_str(include_str!("dump-db.toml")).unwrap()
     }
 
     /// Sort the tables in a way that dependencies come before dependent tables.
     ///
     /// Returns a vector of table names.
-    pub(super) fn topological_sort(&self) -> Vec<&str> {
+    pub fn topological_sort(&self) -> Vec<&str> {
         let mut num_deps = BTreeMap::new();
         let mut rev_deps: BTreeMap<_, Vec<_>> = BTreeMap::new();
         for (table, config) in self.0.iter() {
diff --git a/src/worker/jobs/dump_db/dump-db.toml b/crates/crates_io_database_dump/src/dump-db.toml
similarity index 100%
rename from src/worker/jobs/dump_db/dump-db.toml
rename to crates/crates_io_database_dump/src/dump-db.toml
diff --git a/src/worker/jobs/dump_db/dump-export.sql.j2 b/crates/crates_io_database_dump/src/dump-export.sql.j2
similarity index 100%
rename from src/worker/jobs/dump_db/dump-export.sql.j2
rename to crates/crates_io_database_dump/src/dump-export.sql.j2
diff --git a/src/worker/jobs/dump_db/dump-import.sql.j2 b/crates/crates_io_database_dump/src/dump-import.sql.j2
similarity index 100%
rename from src/worker/jobs/dump_db/dump-import.sql.j2
rename to crates/crates_io_database_dump/src/dump-import.sql.j2
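Note: `topological_sort` is a Kahn-style topological sort over the `dependencies` edges declared in `dump-db.toml`, so referenced tables are exported and imported before the tables that point at them. A minimal sketch of driving the now-public API (the two-table config below is made up for illustration; the real configuration ships as `dump-db.toml`):

```rust
use crates_io_database_dump::VisibilityConfig;

#[test]
fn dependencies_come_before_dependents() {
    // Hypothetical config: `crate_owners` declares a dependency on `crates`.
    let config: VisibilityConfig = toml::from_str(
        r#"
        [crates.columns]
        id = "public"

        [crate_owners]
        dependencies = ["crates"]

        [crate_owners.columns]
        crate_id = "public"
        "#,
    )
    .unwrap();

    // `crates` sorts before the table that references it.
    assert_eq!(config.topological_sort(), ["crates", "crate_owners"]);
}
```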
diff --git a/src/worker/jobs/dump_db/gen_scripts.rs b/crates/crates_io_database_dump/src/gen_scripts.rs
similarity index 95%
rename from src/worker/jobs/dump_db/gen_scripts.rs
rename to crates/crates_io_database_dump/src/gen_scripts.rs
index 8bb900bc83d..56ef913a929 100644
--- a/src/worker/jobs/dump_db/gen_scripts.rs
+++ b/crates/crates_io_database_dump/src/gen_scripts.rs
@@ -1,9 +1,8 @@
+use crate::configuration::{ColumnVisibility, TableConfig, VisibilityConfig};
 use anyhow::Context;
+use serde::Serialize;
 use std::{fs::File, path::Path};
-
-use crate::worker::jobs::dump_db::configuration::{
-    ColumnVisibility, TableConfig, VisibilityConfig,
-};
+use tracing::debug;
 
 pub fn gen_scripts(export_script: &Path, import_script: &Path) -> anyhow::Result<()> {
     let config = VisibilityConfig::get();
@@ -119,7 +118,7 @@ impl VisibilityConfig {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::test_util::test_db_connection;
+    use crates_io_test_db::TestDatabase;
     use diesel::prelude::*;
     use std::collections::HashSet;
     use std::iter::FromIterator;
@@ -128,8 +127,10 @@ mod tests {
     /// test database.
     #[test]
     fn check_visibility_config() {
-        let (_test_db, conn) = &mut test_db_connection();
-        let db_columns = HashSet::<Column>::from_iter(get_db_columns(conn));
+        let test_db = TestDatabase::new();
+        let mut conn = test_db.connect();
+
+        let db_columns = HashSet::<Column>::from_iter(get_db_columns(&mut conn));
         let vis_columns = VisibilityConfig::get()
             .0
             .iter()
@@ -167,6 +168,8 @@ mod tests {
     }
 
     mod information_schema {
+        use diesel::table;
+
         table! {
             information_schema.columns (table_schema, table_name, column_name) {
                 table_schema -> Text,
diff --git a/crates/crates_io_database_dump/src/lib.rs b/crates/crates_io_database_dump/src/lib.rs
new file mode 100644
index 00000000000..865fbd48a4f
--- /dev/null
+++ b/crates/crates_io_database_dump/src/lib.rs
@@ -0,0 +1,322 @@
+use anyhow::{anyhow, Context};
+use serde::Serialize;
+use std::fs;
+use std::fs::File;
+use std::path::{Path, PathBuf};
+use tracing::debug;
+use zip::write::SimpleFileOptions;
+
+mod configuration;
+mod gen_scripts;
+
+pub use configuration::VisibilityConfig;
+pub use gen_scripts::gen_scripts;
+
+/// Manage the export directory.
+///
+/// Create the directory, populate it with the psql scripts and CSV dumps, and
+/// make sure it gets deleted again even in the case of an error.
+#[derive(Debug)]
+pub struct DumpDirectory {
+    /// The temporary directory that contains the export directory.
+    tempdir: tempfile::TempDir,
+    pub timestamp: chrono::DateTime<chrono::Utc>,
+}
+
+impl DumpDirectory {
+    pub fn create() -> anyhow::Result<Self> {
+        debug!("Creating database dump folder…");
+        let tempdir = tempfile::tempdir()?;
+        let timestamp = chrono::Utc::now();
+
+        Ok(Self { tempdir, timestamp })
+    }
+
+    pub fn path(&self) -> &Path {
+        self.tempdir.path()
+    }
+
+    pub fn populate(&self, database_url: &str) -> anyhow::Result<()> {
+        self.add_readme()
+            .context("Failed to write README.md file")?;
+
+        self.add_metadata()
+            .context("Failed to write metadata.json file")?;
+
+        self.dump_schema(database_url)
+            .context("Failed to generate schema.sql file")?;
+
+        self.dump_db(database_url)
+            .context("Failed to create database dump")
+    }
+
+    fn add_readme(&self) -> anyhow::Result<()> {
+        use std::io::Write;
+
+        let path = self.path().join("README.md");
+        debug!(?path, "Writing README.md file…");
+        let mut readme = File::create(path)?;
+        readme.write_all(include_bytes!("readme_for_tarball.md"))?;
+        Ok(())
+    }
+
+    fn add_metadata(&self) -> anyhow::Result<()> {
+        #[derive(Serialize)]
+        struct Metadata<'a> {
+            timestamp: &'a chrono::DateTime<chrono::Utc>,
+            crates_io_commit: String,
+        }
+        let metadata = Metadata {
+            timestamp: &self.timestamp,
+            crates_io_commit: std::env::var("HEROKU_SLUG_COMMIT")
+                .unwrap_or_else(|_| "unknown".to_owned()),
+        };
+        let path = self.path().join("metadata.json");
+        debug!(?path, "Writing metadata.json file…");
+        let file = File::create(path)?;
+        serde_json::to_writer_pretty(file, &metadata)?;
+        Ok(())
+    }
+
+    pub fn dump_schema(&self, database_url: &str) -> anyhow::Result<()> {
+        let path = self.path().join("schema.sql");
+        debug!(?path, "Writing schema.sql file…");
+        let schema_sql =
+            File::create(&path).with_context(|| format!("Failed to create {}", path.display()))?;
+
+        let status = std::process::Command::new("pg_dump")
+            .arg("--schema-only")
+            .arg("--no-owner")
+            .arg("--no-acl")
+            .arg(database_url)
+            .stdout(schema_sql)
+            .spawn()
+            .context("Failed to run `pg_dump` command")?
+            .wait()
+            .context("Failed to wait for `pg_dump` to exit")?;
+
+        if !status.success() {
+            return Err(anyhow!(
+                "pg_dump did not finish successfully (exit code: {}).",
+                status
+            ));
+        }
+
+        Ok(())
+    }
+
+    pub fn dump_db(&self, database_url: &str) -> anyhow::Result<()> {
+        debug!("Generating export.sql and import.sql files…");
+        let export_script = self.path().join("export.sql");
+        let import_script = self.path().join("import.sql");
+        gen_scripts(&export_script, &import_script)
+            .context("Failed to generate export/import scripts")?;
+
+        debug!("Filling data folder…");
+        fs::create_dir(self.path().join("data")).context("Failed to create `data` directory")?;
+
+        run_psql(&export_script, database_url)
+    }
+}
+
+pub fn run_psql(script: &Path, database_url: &str) -> anyhow::Result<()> {
+    debug!(?script, "Running psql script…");
+    let psql_script =
+        File::open(script).with_context(|| format!("Failed to open {}", script.display()))?;
+
+    let psql = std::process::Command::new("psql")
+        .arg("--no-psqlrc")
+        .arg(database_url)
+        .current_dir(script.parent().unwrap())
+        .stdin(psql_script)
+        .stdout(std::process::Stdio::null())
+        .stderr(std::process::Stdio::piped())
+        .spawn()
+        .context("Failed to run psql command")?;
+
+    let output = psql
+        .wait_with_output()
+        .context("Failed to wait for psql command to exit")?;
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    if stderr.contains("ERROR") {
+        return Err(anyhow!("Error while executing psql: {stderr}"));
+    }
+    if !output.status.success() {
+        return Err(anyhow!("psql did not finish successfully."));
+    }
+    Ok(())
+}
+
+pub struct Archives {
+    pub tar: tempfile::NamedTempFile,
+    pub zip: tempfile::NamedTempFile,
+}
+
+pub fn create_archives(export_dir: &Path, tarball_prefix: &Path) -> anyhow::Result<Archives> {
+    debug!("Creating tarball file…");
+    let tar_tempfile = tempfile::NamedTempFile::new()?;
+    let encoder =
+        flate2::write::GzEncoder::new(tar_tempfile.as_file(), flate2::Compression::default());
+    let mut tar = tar::Builder::new(encoder);
+
+    debug!("Creating zip file…");
+    let zip_tempfile = tempfile::NamedTempFile::new()?;
+    let mut zip = zip::ZipWriter::new(zip_tempfile.as_file());
+
+    debug!("Appending `{tarball_prefix:?}` directory to tarball…");
+    tar.append_dir(tarball_prefix, export_dir)?;
+
+    // Append readme, metadata, schemas.
+    let mut paths = Vec::new();
+    for entry in fs::read_dir(export_dir)? {
+        let entry = entry?;
+        let file_type = entry.file_type()?;
+        if file_type.is_file() {
+            paths.push((entry.path(), entry.file_name()));
+        }
+    }
+    // Sort paths to make the tarball deterministic.
+    paths.sort();
+    for (path, file_name) in paths {
+        let name = tarball_prefix.join(&file_name);
+        debug!("Appending `{name:?}` file to tarball…");
+        tar.append_path_with_name(&path, name)?;
+
+        debug!("Appending `{file_name:?}` file to zip file…");
+        zip.start_file_from_path(&file_name, SimpleFileOptions::default())?;
+        std::io::copy(&mut File::open(path)?, &mut zip)?;
+    }
+
+    // Append topologically sorted tables to make it possible to pipeline
+    // importing with gz extraction.
+
+    debug!("Sorting database tables");
+    let visibility_config = VisibilityConfig::get();
+    let sorted_tables = visibility_config.topological_sort();
+
+    let path = tarball_prefix.join("data");
+    debug!("Appending `data` directory to tarball…");
+    tar.append_dir(path, export_dir.join("data"))?;
+
+    debug!("Appending `data` directory to zip file…");
+    zip.add_directory("data", SimpleFileOptions::default())?;
+
+    for table in sorted_tables {
+        let csv_path = export_dir.join("data").join(table).with_extension("csv");
+        if csv_path.exists() {
+            let name = tarball_prefix
+                .join("data")
+                .join(table)
+                .with_extension("csv");
+            debug!("Appending `{name:?}` file to tarball…");
+            tar.append_path_with_name(&csv_path, name)?;
+
+            let name = PathBuf::from("data").join(table).with_extension("csv");
+            debug!("Appending `{name:?}` file to zip file…");
+            zip.start_file_from_path(&name, SimpleFileOptions::default())?;
+            std::io::copy(&mut File::open(csv_path)?, &mut zip)?;
+        }
+    }
+
+    drop(tar);
+    zip.finish()?;
+
+    Ok(Archives {
+        tar: tar_tempfile,
+        zip: zip_tempfile,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crates_io_test_db::TestDatabase;
+    use flate2::read::GzDecoder;
+    use insta::{assert_debug_snapshot, assert_snapshot};
+    use std::io::BufReader;
+    use tar::Archive;
+
+    #[test]
+    fn test_dump_tarball() {
+        let tempdir = tempfile::Builder::new()
+            .prefix("DumpTarball")
+            .tempdir()
+            .unwrap();
+        let p = tempdir.path();
+
+        fs::write(p.join("README.md"), "# crates.io Database Dump\n").unwrap();
+        fs::create_dir(p.join("data")).unwrap();
+        fs::write(p.join("data").join("crates.csv"), "").unwrap();
+        fs::write(p.join("data").join("crate_owners.csv"), "").unwrap();
+        fs::write(p.join("data").join("users.csv"), "").unwrap();
+
+        let archives = create_archives(p, &PathBuf::from("0000-00-00")).unwrap();
+        let gz = GzDecoder::new(File::open(archives.tar.path()).unwrap());
+        let mut tar = Archive::new(gz);
+
+        let entries = tar.entries().unwrap();
+        let paths = entries
+            .map(|entry| entry.unwrap().path().unwrap().display().to_string())
+            .collect::<Vec<_>>();
+
+        assert_debug_snapshot!(paths, @r###"
+        [
+            "0000-00-00",
+            "0000-00-00/README.md",
+            "0000-00-00/data",
+            "0000-00-00/data/crates.csv",
+            "0000-00-00/data/users.csv",
+            "0000-00-00/data/crate_owners.csv",
+        ]
+        "###);
+
+        let file = File::open(archives.zip.path()).unwrap();
+        let reader = BufReader::new(file);
+
+        let archive = zip::ZipArchive::new(reader).unwrap();
+        let zip_paths = archive.file_names().collect::<Vec<_>>();
+        assert_debug_snapshot!(zip_paths, @r###"
+        [
+            "README.md",
+            "data/",
+            "data/crates.csv",
+            "data/users.csv",
+            "data/crate_owners.csv",
+        ]
+        "###);
+    }
+
+    #[test]
+    fn dump_db_and_reimport_dump() {
+        let db_one = TestDatabase::new();
+
+        // TODO prefill database with some data
+
+        let directory = DumpDirectory::create().unwrap();
+        directory.populate(db_one.url()).unwrap();
+
+        let db_two = TestDatabase::empty();
+
+        let schema_script = directory.path().join("schema.sql");
+        run_psql(&schema_script, db_two.url()).unwrap();
+
+        let import_script = directory.path().join("import.sql");
+        run_psql(&import_script, db_two.url()).unwrap();
+
+        // TODO: Consistency checks on the re-imported data?
+    }
+
+    #[test]
+    fn test_sql_scripts() {
+        let db = TestDatabase::new();
+
+        let directory = DumpDirectory::create().unwrap();
+        directory.populate(db.url()).unwrap();
+
+        insta::glob!(directory.path(), "{import,export}.sql", |path| {
+            let content = fs::read_to_string(path).unwrap();
+            assert_snapshot!(content);
+        });
+    }
+}
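Note: `create_archives` appends the `data/*.csv` entries in topological order specifically so that a consumer can import each table while the tarball is still being decompressed. A rough sketch of such a pipelined consumer (the `import_table` helper is hypothetical):

```rust
use flate2::read::GzDecoder;
use std::fs::File;
use std::io::Read;
use std::path::Path;
use tar::Archive;

// Import each CSV as soon as it is decompressed instead of unpacking the
// whole tarball first; this only works because the entries arrive in
// dependency order.
fn pipelined_import(tarball: &Path) -> anyhow::Result<()> {
    let mut archive = Archive::new(GzDecoder::new(File::open(tarball)?));
    for entry in archive.entries()? {
        let mut entry = entry?;
        let path = entry.path()?.into_owned();
        if path.extension() == Some(std::ffi::OsStr::new("csv")) {
            let mut csv = String::new();
            entry.read_to_string(&mut csv)?;
            import_table(&path, &csv)?;
        }
    }
    Ok(())
}

// Stub for illustration; a real importer would run `COPY <table> FROM STDIN`.
fn import_table(path: &Path, _csv: &str) -> anyhow::Result<()> {
    println!("importing {}", path.display());
    Ok(())
}
```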
diff --git a/src/worker/jobs/dump_db/readme_for_tarball.md b/crates/crates_io_database_dump/src/readme_for_tarball.md
similarity index 100%
rename from src/worker/jobs/dump_db/readme_for_tarball.md
rename to crates/crates_io_database_dump/src/readme_for_tarball.md
diff --git a/src/tests/snapshots/crates_io__tests__dump_db__sql_scripts@export.sql.snap b/crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@export.sql.snap
similarity index 97%
rename from src/tests/snapshots/crates_io__tests__dump_db__sql_scripts@export.sql.snap
rename to crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@export.sql.snap
index 871dcce41cf..e661d868e42 100644
--- a/src/tests/snapshots/crates_io__tests__dump_db__sql_scripts@export.sql.snap
+++ b/crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@export.sql.snap
@@ -1,5 +1,5 @@
 ---
-source: src/tests/dump_db.rs
+source: crates/crates_io_database_dump/src/lib.rs
 expression: content
 ---
 BEGIN ISOLATION LEVEL REPEATABLE READ, READ ONLY;
diff --git a/src/tests/snapshots/crates_io__tests__dump_db__sql_scripts@import.sql.snap b/crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@import.sql.snap
similarity index 98%
rename from src/tests/snapshots/crates_io__tests__dump_db__sql_scripts@import.sql.snap
rename to crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@import.sql.snap
index 63111067f75..9b5c8681290 100644
--- a/src/tests/snapshots/crates_io__tests__dump_db__sql_scripts@import.sql.snap
+++ b/crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@import.sql.snap
@@ -1,5 +1,5 @@
 ---
-source: src/tests/dump_db.rs
+source: crates/crates_io_database_dump/src/lib.rs
 expression: content
 ---
 BEGIN;
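Note: `run_psql` (moved verbatim into the new crate above) scans stderr for `ERROR` because plain `psql` exits with status 0 even when individual statements fail. An alternative worth considering would be to let `psql` fail fast itself; a sketch, relying on psql's documented `ON_ERROR_STOP` variable (exit status 3 on the first failed statement):

```rust
use anyhow::ensure;
use std::fs::File;
use std::path::Path;
use std::process::Command;

// With ON_ERROR_STOP=1 the exit status alone is a reliable error signal,
// so no stderr scraping is needed.
fn run_psql_strict(script: &Path, database_url: &str) -> anyhow::Result<()> {
    let status = Command::new("psql")
        .arg("--no-psqlrc")
        .arg("--set=ON_ERROR_STOP=1")
        .arg(database_url)
        .stdin(File::open(script)?)
        .status()?;
    ensure!(status.success(), "psql failed with {status}");
    Ok(())
}
```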
diff --git a/src/tests/dump_db.rs b/src/tests/dump_db.rs
index 65e7a5fbf49..59e05e3cf4e 100644
--- a/src/tests/dump_db.rs
+++ b/src/tests/dump_db.rs
@@ -1,8 +1,7 @@
 use crate::tests::builders::CrateBuilder;
 use crate::tests::util::TestApp;
-use crate::worker::jobs::{dump_db, DumpDb};
+use crate::worker::jobs::DumpDb;
 use bytes::Buf;
-use crates_io_test_db::TestDatabase;
 use crates_io_worker::BackgroundJob;
 use flate2::read::GzDecoder;
 use insta::{assert_debug_snapshot, assert_snapshot};
@@ -106,40 +105,3 @@ fn tar_paths<R: Read>(archive: &mut Archive<R>) -> Vec<String> {
         .map(|path| PATH_DATE_RE.replace(&path, "YYYY-MM-DD-HHMMSS").to_string())
         .collect()
 }
-
-#[test]
-fn dump_db_and_reimport_dump() {
-    crate::util::tracing::init_for_test();
-
-    let db_one = TestDatabase::new();
-
-    // TODO prefill database with some data
-
-    let directory = dump_db::DumpDirectory::create().unwrap();
-    directory.populate(db_one.url()).unwrap();
-
-    let db_two = TestDatabase::empty();
-
-    let schema_script = directory.path().join("schema.sql");
-    dump_db::run_psql(&schema_script, db_two.url()).unwrap();
-
-    let import_script = directory.path().join("import.sql");
-    dump_db::run_psql(&import_script, db_two.url()).unwrap();
-
-    // TODO: Consistency checks on the re-imported data?
-}
-
-#[test]
-fn test_sql_scripts() {
-    crate::util::tracing::init_for_test();
-
-    let db = TestDatabase::new();
-
-    let directory = dump_db::DumpDirectory::create().unwrap();
-    directory.populate(db.url()).unwrap();
-
-    insta::glob!(directory.path(), "{import,export}.sql", |path| {
-        let content = std::fs::read_to_string(path).unwrap();
-        assert_snapshot!(content);
-    });
-}
diff --git a/src/worker/jobs/dump_db.rs b/src/worker/jobs/dump_db.rs
index 1b603e9704b..2b02815ad15 100644
--- a/src/worker/jobs/dump_db.rs
+++ b/src/worker/jobs/dump_db.rs
@@ -1,13 +1,10 @@
-use self::configuration::VisibilityConfig;
 use crate::tasks::spawn_blocking;
 use crate::worker::Environment;
-use anyhow::{anyhow, Context};
+use crates_io_database_dump::{create_archives, DumpDirectory};
 use crates_io_worker::BackgroundJob;
 use secrecy::ExposeSecret;
-use std::fs::{self, File};
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
 use std::sync::Arc;
-use zip::write::SimpleFileOptions;
 
 #[derive(Clone, Serialize, Deserialize)]
 pub struct DumpDb;
@@ -67,281 +64,3 @@ impl BackgroundJob for DumpDb {
         Ok(())
     }
 }
-
-/// Manage the export directory.
-///
-/// Create the directory, populate it with the psql scripts and CSV dumps, and
-/// make sure it gets deleted again even in the case of an error.
-#[derive(Debug)]
-pub struct DumpDirectory {
-    /// The temporary directory that contains the export directory.
-    tempdir: tempfile::TempDir,
-    pub timestamp: chrono::DateTime<chrono::Utc>,
-}
-
-impl DumpDirectory {
-    pub fn create() -> anyhow::Result<Self> {
-        debug!("Creating database dump folder…");
-        let tempdir = tempfile::tempdir()?;
-        let timestamp = chrono::Utc::now();
-
-        Ok(Self { tempdir, timestamp })
-    }
-
-    pub fn path(&self) -> &Path {
-        self.tempdir.path()
-    }
-
-    pub fn populate(&self, database_url: &str) -> anyhow::Result<()> {
-        self.add_readme()
-            .context("Failed to write README.md file")?;
-
-        self.add_metadata()
-            .context("Failed to write metadata.json file")?;
-
-        self.dump_schema(database_url)
-            .context("Failed to generate schema.sql file")?;
-
-        self.dump_db(database_url)
-            .context("Failed to create database dump")
-    }
-
-    fn add_readme(&self) -> anyhow::Result<()> {
-        use std::io::Write;
-
-        let path = self.path().join("README.md");
-        debug!(?path, "Writing README.md file…");
-        let mut readme = File::create(path)?;
-        readme.write_all(include_bytes!("dump_db/readme_for_tarball.md"))?;
-        Ok(())
-    }
-
-    fn add_metadata(&self) -> anyhow::Result<()> {
-        #[derive(Serialize)]
-        struct Metadata<'a> {
-            timestamp: &'a chrono::DateTime<chrono::Utc>,
-            crates_io_commit: String,
-        }
-        let metadata = Metadata {
-            timestamp: &self.timestamp,
-            crates_io_commit: dotenvy::var("HEROKU_SLUG_COMMIT")
-                .unwrap_or_else(|_| "unknown".to_owned()),
-        };
-        let path = self.path().join("metadata.json");
-        debug!(?path, "Writing metadata.json file…");
-        let file = File::create(path)?;
-        serde_json::to_writer_pretty(file, &metadata)?;
-        Ok(())
-    }
-
-    pub fn dump_schema(&self, database_url: &str) -> anyhow::Result<()> {
-        let path = self.path().join("schema.sql");
-        debug!(?path, "Writing schema.sql file…");
-        let schema_sql =
-            File::create(&path).with_context(|| format!("Failed to create {}", path.display()))?;
-
-        let status = std::process::Command::new("pg_dump")
-            .arg("--schema-only")
-            .arg("--no-owner")
-            .arg("--no-acl")
-            .arg(database_url)
-            .stdout(schema_sql)
-            .spawn()
-            .context("Failed to run `pg_dump` command")?
-            .wait()
-            .context("Failed to wait for `pg_dump` to exit")?;
-
-        if !status.success() {
-            return Err(anyhow!(
-                "pg_dump did not finish successfully (exit code: {}).",
-                status
-            ));
-        }
-
-        Ok(())
-    }
-
-    pub fn dump_db(&self, database_url: &str) -> anyhow::Result<()> {
-        debug!("Generating export.sql and import.sql files…");
-        let export_script = self.path().join("export.sql");
-        let import_script = self.path().join("import.sql");
-        gen_scripts::gen_scripts(&export_script, &import_script)
-            .context("Failed to generate export/import scripts")?;
-
-        debug!("Filling data folder…");
-        fs::create_dir(self.path().join("data")).context("Failed to create `data` directory")?;
-
-        run_psql(&export_script, database_url)
-    }
-}
-
-pub fn run_psql(script: &Path, database_url: &str) -> anyhow::Result<()> {
-    debug!(?script, "Running psql script…");
-    let psql_script =
-        File::open(script).with_context(|| format!("Failed to open {}", script.display()))?;
-
-    let psql = std::process::Command::new("psql")
-        .arg("--no-psqlrc")
-        .arg(database_url)
-        .current_dir(script.parent().unwrap())
-        .stdin(psql_script)
-        .stdout(std::process::Stdio::null())
-        .stderr(std::process::Stdio::piped())
-        .spawn()
-        .context("Failed to run psql command")?;
-
-    let output = psql
-        .wait_with_output()
-        .context("Failed to wait for psql command to exit")?;
-
-    let stderr = String::from_utf8_lossy(&output.stderr);
-    if stderr.contains("ERROR") {
-        return Err(anyhow!("Error while executing psql: {stderr}"));
-    }
-    if !output.status.success() {
-        return Err(anyhow!("psql did not finish successfully."));
-    }
-    Ok(())
-}
-
-struct Archives {
-    tar: tempfile::NamedTempFile,
-    zip: tempfile::NamedTempFile,
-}
-
-fn create_archives(export_dir: &Path, tarball_prefix: &Path) -> anyhow::Result<Archives> {
-    debug!("Creating tarball file…");
-    let tar_tempfile = tempfile::NamedTempFile::new()?;
-    let encoder =
-        flate2::write::GzEncoder::new(tar_tempfile.as_file(), flate2::Compression::default());
-    let mut tar = tar::Builder::new(encoder);
-
-    debug!("Creating zip file…");
-    let zip_tempfile = tempfile::NamedTempFile::new()?;
-    let mut zip = zip::ZipWriter::new(zip_tempfile.as_file());
-
-    debug!("Appending `{tarball_prefix:?}` directory to tarball…");
-    tar.append_dir(tarball_prefix, export_dir)?;
-
-    // Append readme, metadata, schemas.
-    let mut paths = Vec::new();
-    for entry in fs::read_dir(export_dir)? {
-        let entry = entry?;
-        let file_type = entry.file_type()?;
-        if file_type.is_file() {
-            paths.push((entry.path(), entry.file_name()));
-        }
-    }
-    // Sort paths to make the tarball deterministic.
-    paths.sort();
-    for (path, file_name) in paths {
-        let name = tarball_prefix.join(&file_name);
-        debug!("Appending `{name:?}` file to tarball…");
-        tar.append_path_with_name(&path, name)?;
-
-        debug!("Appending `{file_name:?}` file to zip file…");
-        zip.start_file_from_path(&file_name, SimpleFileOptions::default())?;
-        std::io::copy(&mut File::open(path)?, &mut zip)?;
-    }
-
-    // Append topologically sorted tables to make it possible to pipeline
-    // importing with gz extraction.
-
-    debug!("Sorting database tables");
-    let visibility_config = VisibilityConfig::get();
-    let sorted_tables = visibility_config.topological_sort();
-
-    let path = tarball_prefix.join("data");
-    debug!("Appending `data` directory to tarball…");
-    tar.append_dir(path, export_dir.join("data"))?;
-
-    debug!("Appending `data` directory to zip file…");
-    zip.add_directory("data", SimpleFileOptions::default())?;
-
-    for table in sorted_tables {
-        let csv_path = export_dir.join("data").join(table).with_extension("csv");
-        if csv_path.exists() {
-            let name = tarball_prefix
-                .join("data")
-                .join(table)
-                .with_extension("csv");
-            debug!("Appending `{name:?}` file to tarball…");
-            tar.append_path_with_name(&csv_path, name)?;
-
-            let name = PathBuf::from("data").join(table).with_extension("csv");
-            debug!("Appending `{name:?}` file to zip file…");
-            zip.start_file_from_path(&name, SimpleFileOptions::default())?;
-            std::io::copy(&mut File::open(csv_path)?, &mut zip)?;
-        }
-    }
-
-    drop(tar);
-    zip.finish()?;
-
-    Ok(Archives {
-        tar: tar_tempfile,
-        zip: zip_tempfile,
-    })
-}
-
-mod configuration;
-mod gen_scripts;
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use flate2::read::GzDecoder;
-    use insta::assert_debug_snapshot;
-    use std::io::BufReader;
-    use tar::Archive;
-
-    #[test]
-    fn test_dump_tarball() {
-        let tempdir = tempfile::Builder::new()
-            .prefix("DumpTarball")
-            .tempdir()
-            .unwrap();
-        let p = tempdir.path();
-
-        fs::write(p.join("README.md"), "# crates.io Database Dump\n").unwrap();
-        fs::create_dir(p.join("data")).unwrap();
-        fs::write(p.join("data").join("crates.csv"), "").unwrap();
-        fs::write(p.join("data").join("crate_owners.csv"), "").unwrap();
-        fs::write(p.join("data").join("users.csv"), "").unwrap();
-
-        let archives = create_archives(p, &PathBuf::from("0000-00-00")).unwrap();
-        let gz = GzDecoder::new(File::open(archives.tar.path()).unwrap());
-        let mut tar = Archive::new(gz);
-
-        let entries = tar.entries().unwrap();
-        let paths = entries
-            .map(|entry| entry.unwrap().path().unwrap().display().to_string())
-            .collect::<Vec<_>>();
-
-        assert_debug_snapshot!(paths, @r###"
-        [
-            "0000-00-00",
-            "0000-00-00/README.md",
-            "0000-00-00/data",
-            "0000-00-00/data/crates.csv",
-            "0000-00-00/data/users.csv",
-            "0000-00-00/data/crate_owners.csv",
-        ]
-        "###);
-
-        let file = File::open(archives.zip.path()).unwrap();
-        let reader = BufReader::new(file);
-
-        let archive = zip::ZipArchive::new(reader).unwrap();
-        let zip_paths = archive.file_names().collect::<Vec<_>>();
-        assert_debug_snapshot!(zip_paths, @r###"
-        [
-            "README.md",
-            "data/",
-            "data/crates.csv",
-            "data/users.csv",
-            "data/crate_owners.csv",
-        ]
-        "###);
-    }
-}
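With the extraction in place, the retained `DumpDb` job body (unchanged by this hunk, so not shown above) only has to drive the new crate's public API. Roughly, with the prefix format and the surrounding upload step as assumptions about the job code:

```rust
use crates_io_database_dump::{create_archives, Archives, DumpDirectory};
use std::path::PathBuf;

// Sketch of the worker-side flow after the refactoring; the real job runs
// this inside `spawn_blocking` and uploads the resulting archives afterwards.
fn dump_and_archive(database_url: &str) -> anyhow::Result<Archives> {
    let directory = DumpDirectory::create()?;
    directory.populate(database_url)?;

    // Illustrative prefix; the real job derives it from `directory.timestamp`.
    let prefix = PathBuf::from(directory.timestamp.format("%Y-%m-%d-%H%M%S").to_string());
    create_archives(directory.path(), &prefix)
}
```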