|
3 | 3 | #![deny(unused_crate_dependencies)] |
4 | 4 |
|
5 | 5 | use anyhow::{Error, Result, anyhow}; |
| 6 | +use async_stream::try_stream; |
6 | 7 | use clap::{Parser, Subcommand}; |
7 | | -use stac::{Collection, Href, Item, Links, Migrate, geoparquet::Compression}; |
| 8 | +use futures_core::TryStream; |
| 9 | +use futures_util::{TryStreamExt, pin_mut}; |
| 10 | +use stac::{Assets, Collection, Href, Item, Links, Migrate, SelfHref, geoparquet::Compression}; |
8 | 11 | use stac_api::{GetItems, GetSearch, Search}; |
9 | | -use stac_io::{Format, Validate}; |
| 12 | +use stac_io::{Format, StacStore, Validate}; |
10 | 13 | use stac_server::Backend; |
11 | | -use std::{collections::HashMap, io::Write, str::FromStr}; |
12 | | -use tokio::{io::AsyncReadExt, net::TcpListener, runtime::Handle}; |
| 14 | +use std::{ |
| 15 | + collections::{HashMap, VecDeque}, |
| 16 | + io::Write, |
| 17 | + str::FromStr, |
| 18 | +}; |
| 19 | +use tokio::{io::AsyncReadExt, net::TcpListener, runtime::Handle, task::JoinSet}; |
13 | 20 | use tracing::metadata::Level; |
14 | | -use tracing_subscriber::EnvFilter; |
| 21 | +use tracing_indicatif::IndicatifLayer; |
| 22 | +use tracing_subscriber::{ |
| 23 | + fmt::writer::MakeWriterExt, layer::SubscriberExt, util::SubscriberInitExt, |
| 24 | +}; |
| 25 | +use url::Url; |
15 | 26 |
|
16 | 27 | const DEFAULT_COLLECTION_ID: &str = "default-collection-id"; |
17 | 28 |
|
@@ -232,6 +243,19 @@ pub enum Command { |
232 | 243 | create_collections: bool, |
233 | 244 | }, |
234 | 245 |
|
| 246 | + /// Crawls a STAC Catalog or Collection by following its links. |
| 247 | + /// |
| 248 | + /// Items are saved as item collections (in the output format) in the output directory. |
| 249 | + Crawl { |
| 250 | + /// The href of a STAC Catalog or Collection |
| 251 | + href: String, |
| 252 | + |
| 253 | + /// The output directory |
| 254 | + /// |
| 255 | + /// This doesn't have to be local, by the way. |
| 256 | + directory: String, |
| 257 | + }, |
| 258 | + |
235 | 259 | /// Validates a STAC value. |
236 | 260 | /// |
237 | 261 | /// The default output format is plain text — use `--output-format=json` to |
@@ -264,11 +288,16 @@ impl Rustac { |
264 | 288 | /// is setting up the appropriate logging (e.g. Python). |
265 | 289 | pub async fn run(self, init_tracing_subscriber: bool) -> Result<()> { |
266 | 290 | if init_tracing_subscriber { |
267 | | - tracing_subscriber::fmt() |
268 | | - .with_env_filter(EnvFilter::from_default_env()) |
269 | | - .with_max_level(self.log_level()) |
270 | | - .with_writer(std::io::stderr) |
271 | | - .pretty() |
| 291 | + let indicatif_layer = IndicatifLayer::new(); |
| 292 | + tracing_subscriber::registry() |
| 293 | + .with( |
| 294 | + tracing_subscriber::fmt::layer().with_writer( |
| 295 | + indicatif_layer |
| 296 | + .get_stderr_writer() |
| 297 | + .with_max_level(self.log_level().unwrap_or(Level::WARN)), |
| 298 | + ), |
| 299 | + ) |
| 300 | + .with(indicatif_layer) |
272 | 301 | .init(); |
273 | 302 | } |
274 | 303 | match self.command { |
@@ -418,6 +447,45 @@ impl Rustac { |
418 | 447 | load_and_serve(addr, backend, collections, items, create_collections).await |
419 | 448 | } |
420 | 449 | } |
| 450 | + Command::Crawl { |
| 451 | + ref href, |
| 452 | + ref directory, |
| 453 | + } => { |
| 454 | + let opts = self.opts(); |
| 455 | + let (store, path) = stac_io::parse_href_opts(href.clone(), opts.clone())?; |
| 456 | + let value: stac::Value = store.get(path).await.unwrap(); |
| 457 | + let mut items: HashMap<Option<String>, Vec<Item>> = HashMap::new(); |
| 458 | + let crawl = crawl(value, store).await; |
| 459 | + pin_mut!(crawl); |
| 460 | + let mut warned = false; |
| 461 | + while let Some(item) = crawl.try_next().await? { |
| 462 | + let collection = item.collection.clone(); |
| 463 | + if collection.as_deref() == Some(DEFAULT_COLLECTION_ID) && !warned { |
| 464 | + warned = true; |
| 465 | + tracing::warn!( |
| 466 | + "collection id matches the default collection id, so any collection-less items will be grouped into this collection: {DEFAULT_COLLECTION_ID}" |
| 467 | + ) |
| 468 | + } |
| 469 | + items.entry(collection).or_default().push(item); |
| 470 | + } |
| 471 | + let (store, path) = stac_io::parse_href_opts(directory.clone(), opts)?; |
| 472 | + let format = self.output_format(None); |
| 473 | + for (collection, items) in items { |
| 474 | + let file_name = format!( |
| 475 | + "{}.{}", |
| 476 | + collection.as_deref().unwrap_or(DEFAULT_COLLECTION_ID), |
| 477 | + format.extension() |
| 478 | + ); |
| 479 | + store |
| 480 | + .put_format( |
| 481 | + path.child(file_name), |
| 482 | + stac::ItemCollection::from(items), |
| 483 | + format, |
| 484 | + ) |
| 485 | + .await?; |
| 486 | + } |
| 487 | + Ok(()) |
| 488 | + } |
421 | 489 | Command::Validate { ref infile } => { |
422 | 490 | let value = self.get(infile.as_deref()).await?; |
423 | 491 | let result = Handle::current() |
@@ -651,6 +719,59 @@ fn level_value(level: Option<Level>) -> i8 { |
651 | 719 | } |
652 | 720 | } |
653 | 721 |
|
| 722 | +async fn crawl(value: stac::Value, store: StacStore) -> impl TryStream<Item = Result<Item>> { |
| 723 | + use stac::Value::*; |
| 724 | + |
| 725 | + try_stream! { |
| 726 | + let mut values = VecDeque::from([value]); |
| 727 | + while let Some(mut value) = values.pop_front() { |
| 728 | + value.make_links_absolute()?; |
| 729 | + match value { |
| 730 | + Catalog(_) | Collection(_) => { |
| 731 | + if let Catalog(ref catalog) = value { |
| 732 | + tracing::info!("got catalog={}", catalog.id); |
| 733 | + } |
| 734 | + if let Collection(ref collection) = value { |
| 735 | + tracing::info!("got collection={}", collection.id); |
| 736 | + } |
| 737 | + let mut join_set: JoinSet<Result<stac::Value>> = JoinSet::new(); |
| 738 | + for link in value |
| 739 | + .links() |
| 740 | + .iter() |
| 741 | + .filter(|link| link.is_child() || link.is_item()) |
| 742 | + .cloned() |
| 743 | + { |
| 744 | + let store = store.clone(); |
| 745 | + let url = Url::try_from(link.href)?; |
| 746 | + join_set.spawn(async move { |
| 747 | + let value: stac::Value = store.get(url.path()).await?; |
| 748 | + Ok(value) |
| 749 | + }); |
| 750 | + } |
| 751 | + while let Some(result) = join_set.join_next().await { |
| 752 | + let value = result??; |
| 753 | + values.push_back(value); |
| 754 | + } |
| 755 | + } |
| 756 | + Item(mut item) => { |
| 757 | + if let Some(self_href) = item.self_href().cloned() { |
| 758 | + item.make_assets_absolute(self_href)?; |
| 759 | + } |
| 760 | + yield item; |
| 761 | + } |
| 762 | + ItemCollection(item_collection) => { |
| 763 | + for mut item in item_collection.items { |
| 764 | + if let Some(self_href) = item.self_href().cloned() { |
| 765 | + item.make_assets_absolute(self_href)?; |
| 766 | + } |
| 767 | + yield item; |
| 768 | + } |
| 769 | + } |
| 770 | + } |
| 771 | + } |
| 772 | + } |
| 773 | +} |
| 774 | + |
654 | 775 | #[cfg(test)] |
655 | 776 | mod tests { |
656 | 777 | use super::Rustac; |
|
0 commit comments