Commit 378abec

410 edits (#61)

* fix vrt dir
* fix refs
* remove header substitutes
* typo, internal refs, superscript, remove substitutions
* fix typo in s1 nb1
* typo
* move env files, add vector data desc, open_zarr, some typos
* standardize datacube -> data cube
* fix create_vrt fn
* add intermediate asf s1 cube
* fix s1 code to support other data locs
* small fixes
* only calc season gb for one pol in pc nb
* few more fixes

1 parent c84ca00

File tree

143 files changed: +11958 −213958 lines changed

book/_config.yml (2 additions, 184 deletions)

@@ -1,7 +1,7 @@
 # Book settings
 # Learn more at https://jupyterbook.org/customize/config.html
 
-title: Cloud-native geospatial datacube workflows with open-source tools
+title: Cloud-native geospatial data cube workflows with open-source tools
 author: Emma Marshall
 copyright: "2025" #, Emma Marshall
 #logo: logo.png
@@ -33,7 +33,7 @@ bibtex_bibfiles:
 
 # Information about where the book exists on the web
 repository:
-  url: https://github.com/e-marshall/cloud-open-source-geospatial-datacube-workflows
+  url: https://github.com/e-marshall/cloud-open-source-geospatial-data-cube-workflows
   branch: main
 
 launch_buttons:
@@ -74,192 +74,10 @@ sphinx:
     - substitution
 
   myst_substitutions:
-    part1_title: "Part 2: Background"
-    part2_title: "ITS_LIVE ice velocity data tutorial"
-    #part2_title: "Using Xarray to examine cloud-based glacier surface velocity data"
-    part3_title: "Sentinel-1 RTC imagery tutorial"
-    #part3_title: "Sentinel-1 RTC data workflows with xarray"
-    part4_title: "Part 5: Conclusion"
-
-    #tutorial 1 nb titles
-    title_its_nb1: "# 3.1 Accessing cloud-hosted ITS_LIVE data"
-    title_its_nb2: "# 3.2 Working with larger than memory data"
-    title_its_nb3: "# 3.3 Handling raster and vector data"
-    title_its_nb4: "# 3.4 Exploratory data analysis of a single glacier"
-    title_its_nb5: "# 3.5 Exploratory data analysis of multiple glaciers"
-
-    #tutorial 2 nb titles
-    title_s1_1: "# 4.1 Read Sentinel-1 data processed by ASF"
-    title_s1_2: "# 4.2 Wrangle metadata"
-    title_s1_3: "# 4.3 Exploratory analysis of ASF S1 imagery"
-    title_s1_4: "# 4.4 Read Sentinel-1 RTC data from Microsoft Planetary Computer"
-    title_s1_5: "# 4.5 Comparing Sentinel-1 RTC datasets"
-    #title_s1_6: "# 6. Example of Sentinel-1 RTC time series analysis"
 
     #global nb sections
-    intro: "## Introduction"
-    overview: "### Overview"
-    outline: "### Outline"
-    learning_goals: "### Learning goals"
-    concepts: "#### Concepts"
-    techniques: "#### Techniques"
-    conclusion: "## Conclusion"
     break: "----"
 
-    #nb1
-    #can't get subs + links to headings to work
-    # so not using lettered headings for now
-    # but still using numbered subsections (a1_...)
-    a_its_nb1: "A. Overview of ITS_LIVE data"
-    a1_its_nb1: "1) Data structure overview"
-    a2_its_nb1: "2) Climate Forecast (CF) Metadata Conventions"
-
-    b_its_nb1: "B. Read ITS_LIVE data from AWS S3 using Xarray"
-    b1_its_nb1: "1) Overview of ITS_LIVE data storage and catalog"
-    b2_its_nb1: "2) Read ITS_LIVE data from S3 storage into memory"
-    b3_its_nb1: "3) Check spatial footprint of data"
-
-    c_its_nb1: "C. Query ITS_LIVE catalog"
-    c1_its_nb1: "1) Find ITS_LIVE granule for a point of interest"
-    c2_its_nb1: "2) Read + visualize spatial footprint of ITS_LIVE data"
-
-
-    #nb2
-    a_its_nb2: "A. Compare approaches for reading larger than memory data"
-    a1_its_nb2: "1) `chunks = 'auto'`"
-    a2_its_nb2: "2) `chunks = {}`"
-    a3_its_nb2: "3) An out-of-order time dimension"
-    a4_its_nb2: "4) Read the dataset without Dask"
-    b_its_nb2: "B. Organize data once it's in memory"
-    b1_its_nb2: "1) Arrange dataset in chronological order"
-    b2_its_nb2: "2) Convert to a Dask-backed `Xarray.Dataset`"
-
-    #nb3
-    a_its_nb3: "Read data using strategy identified in previous notebook"
-    b_its_nb3: "Incorporate glacier outline (vector) data"
-    b1_its_nb3: "1) Read and reproject vector data"
-    b2_its_nb3: "2) Visualize spatial extents of glacier outlines and ITS_LIVE data cube"
-    b3_its_nb3: "3) Crop vector data to spatial extent of raster data"
-    c_its_nb3: "C. Combine raster and vector data"
-    c1_its_nb3: "1) Use vector data to crop raster data"
-    c2_its_nb3: "2) Write clipped raster data cube to disk"
-
-    #nb4
-    a_its_nb4: "A. Data exploration"
-    a1_its_nb4: "1) Load raster data and visualize with vector data"
-    a2_its_nb4: "2) Examine data coverage along the time dimension"
-    a3_its_nb4: "3) Look at data by sensor"
-    b_its_nb4: "B. Comparing different satellites"
-    b1_its_nb4: "1) DataTree approach"
-    b2_its_nb4: "2) GroupBy approach"
-    c_its_nb4: "C. Examine velocity variability"
-    c1_its_nb4: "1) Histograms and summary statistics"
-    c2_its_nb4: "2) Spatial velocity variablity"
-    c3_its_nb4: "3) Temporal velocity variability"
-    d_its_nb4: "D. Dimensional computations"
-    d1_its_nb4: "1) Temporal resampling"
-    d2_its_nb4: "2) Grouped analysis by season"
-
-    #nb5
-    a_its_nb5: "A. Read and organize data"
-    a1_its_nb5: "1) Raster data"
-    a2_its_nb5: "2) Vector data"
-
-    b_its_nb5: "B. Combine raster and vector to create a vector data cube"
-    b1_its_nb5: "1) Make a vector data cube"
-    b2_its_nb5: "2) Add attribute data to vector cube"
-    b3_its_nb5: "3) Write vector cube to disk"
-
-    c_its_nb5: "C. Data visualization"
-    c1_its_nb5: "1) Read vector data cube into memory"
-    c2_its_nb5: "2) Visualize velocity data"
-    c3_its_nb5: "3) Visualize associations between velocity and attribute data"
-
-
-    #sentinel nb1
-    a_s1_nb1: "A. Prepare to read data into memory"
-    a1_s1_nb1: "1) Build lists of file names and paths needed for VRT objects"
-    a2_s1_nb1: "2) Create VRT objects"
-    b_s1_nb1: "B. Read data"
-    b1_s1_nb1: "1) Take a look at chunking"
-    # sentinel nb2
-    a_s1_nb2: "A. Read and inspect initial metadata"
-    a1_s1_nb2: "1) Add appropriate names to variables"
-    a2_s1_nb2: "2) What metadata currently exists?"
-
-    b_s1_nb2: "B. Add metadata from file name"
-    b1_s1_nb2: "1) Parse file name"
-    b2_s1_nb2: "2) Extract and format acquisition dates"
-    b3_s1_nb2: "3) Combine data cubes"
-
-    c_s1_nb2: "C. Time-varying metadata"
-    c1_s1_nb2: "1) Extract attributes as list of dictionaries"
-    c2_s1_nb2: "2) Create tuple of metadata for each type of information"
-    c3_s1_nb2: "3) Assign metadata tuple to Xarray dataset as a coordinate variable"
-
-    d_s1_nb2: "D. Add metadata from a markdown file"
-    d1_s1_nb2: "1) Extract granule ID"
-    d2_s1_nb2: "2) Build coordinate `xr.DataArray`"
-
-    #sentinel nb3
-    a_s1_nb3: "A. Read and prepare data"
-    a1_s1_nb3: "1) Clip to spatial area of interest"
-
-    b_s1_nb3: "B. Layover-shadow map"
-    b1_s1_nb3: "1) Interactive visualization of layover-shadow maps"
-
-    c_s1_nb3: "C. Orbital direction"
-    c1_s1_nb3: "1) Is a pass ascending or descending?"
-    c2_s1_nb3: "2) Assign orbital direction as a coordinate variable"
-
-    d_s1_nb3: "D. Duplicate time steps"
-    d1_s1_nb3: "1) Identify duplicate time steps"
-    d2_s1_nb3: "2) Visualize duplicates"
-    d3_s1_nb3: "3) Drop duplicates"
-
-    e_s1_nb3: "E. Examine coverage over time series"
-
-    f_s1_nb3: "F. Data visualization"
-    f1_s1_nb3: "1) Mean backscatter over time"
-    f2_s1_nb3: "2) Seasonal backscatter variability"
-    f3_s1_nb3: "3) Backscatter time series"
-
-
-
-
-    #s1 nb4
-    a_s1_nb4: "A. Connect to Microsoft Planetary Computer"
-    a1_s1_nb4: "1) Explore STAC metadata"
-
-    b_s1_nb4: "B. Read data and create Xarray data cube"
-    b1_s1_nb4: "1) Create a Dask distributed cluster"
-    b2_s1_nb4: "2) Use `stackstac` to pull queried data from Planetary Computer"
-    b3_s1_nb4: "3) Inspect dataset"
-    #b4_s1_nb4: "4) Convert a 4-d `xr.DataArray` to a 3-d `xr.Dataset`"
-
-    c_s1_nb4: "C. Visualize data"
-    c1_s1_nb4: "1) Ascending and descending pass acquisitions"
-    c2_s1_nb4: "2) Variability over time"
-    c3_s1_nb4: "3) Seasonal variability"
-
-    #s1 nb5
-    a_s1_nb5: "A. Read and prepare data"
-    a1_s1_nb5: "1) Check coordinate reference system information"
-
-    b_s1_nb5: "B. Ensure direct comparison between datasets"
-    b1_s1_nb5: "1) Subset time series to common time steps"
-    b2_s1_nb5: "2) Handle differences in spatial resolution"
-    b3_s1_nb5: "3) Mask missing data from one dataset"
-
-    c_s1_nb5: "C. Combine objects"
-    c1_s1_nb5: "1) `expand_dims()` to add 'source' dimension"
-    c2_s1_nb5: "2) `combine_by_coords()`"
-
-    d_s1_nb5: "D. Visualize comparison"
-    d1_s1_nb5: "1) Mean over time"
-    d2_s1_nb5: "2) Mean over space"
-    d3_s1_nb5: "3) Difference"
-
 # Not sure why but uncommenting these causes all of the md
 # substitution variables and formatting like tabs to not work
 #sphinx:
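For context, entries under `myst_substitutions` are consumed in the book's markdown via MyST substitution syntax (e.g. `{{part2_title}}`, which this commit replaces with literal headings in `3_tutorials_overview.md`). A minimal sketch of how such an entry is defined and used; the exact nesting inside a Jupyter Book `_config.yml` may differ from this:

```yaml
# _config.yml (sketch; exact nesting may differ)
sphinx:
  config:
    myst_enable_extensions:
      - substitution
    myst_substitutions:
      part2_title: "ITS_LIVE ice velocity data tutorial"
```

In a markdown page, `## *Part 1: {{part2_title}}*` then renders with the substituted title, which is why removing these entries requires the literal-text replacements seen in the diffs below.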

book/background/2_data_cubes.md (4 additions, 1 deletion)

@@ -5,8 +5,9 @@ The term **data cube** is used frequently throughout this book. This page contai
 
 [^mynote2]: Geffner et al. frame this distinction as *measure attributes* ("attributes whose values are of interest") and *functional attributes* that contextualize the measure attribute values {cite:t}`geffner_2000_dynamic`.
 
+The key object of analysis in this book is a data cube. Many scientific workflows examine how a given variable (such as temperature, wind speed, relative humidity, etc.) varies over time and/or space. Data cubes are a way of organizing geospatial data that allow us to ask these questions. Most of the examples are [raster data cubes](https://openeo.org/documentation/1.0/datacubes.html). Raster data cubes are n-dimensional objects that store continuous measurements or estimates of physical quantities that exist along a given dimension(s).
 
-The key object of analysis in this book is a [raster data cube](https://openeo.org/documentation/1.0/datacubes.html). Raster data cubes are n-dimensional objects that store continuous measurements or estimates of physical quantities that exist along given dimension(s). Many scientific workflows involve examining how a variable (such as temperature, windspeed, relative humidity, etc.) varies over time and/or space. Data cubes are a way of organizing geospatial data that let us ask these questions.
+Many examples in the book also include vector data. In contrast to raster data, where continuous measurements are stored on a grid, vector data represent geographic features such as roads, rivers, and political borders using points, lines, and polygons. Vector data are often stored as table-like data frames, where geometry and attribute information for individual features are stored in each row of the table. A relatively new development in the Xarray and Python ecosystem is support for vector data cubes. Vector data cubes are similar to raster data cubes, except that one of the cube's dimensions is an array of geometry objects. This allows you to store multi-dimensional data associated with each geometry.
 
 A very common data cube structure is a 3-dimensional object with (`x`,`y`,`time`) dimensions ({cite:t}`Baumann_2019_datacube,giuliani_2019_EarthObservationOpen,mahecha_2020_EarthSystemData,montero_2024_EarthSystemData`). While this is a relatively intuitive concept,in practice, the amount and types of information contained within a single dataset and the operations involved in managing them, can become complicated and unwieldy. As analysts, we access data (usually from providers such as Distributed Active Archive Center or [DAACs](https://nssdc.gsfc.nasa.gov/earth/daacs.html)), and then we are responsible for organizing the data in a way that let's us ask questions of it. While some of these decisions are straightforward (eg. *It makes sense to stack observations from different points in time along a time dimension*), some can be more open-ended (*Where and how should important metadata be stored so that it will propagate across appropriate operations and be accessible when it is needed?*).
 
@@ -70,6 +71,8 @@ In the second [tutorial](../sentinel1/s1_intro.md), we work with two Sentinel-1
 
 ### See also
 - [OpenEO - Data Cubes](https://openeo.org/documentation/1.0/datacubes.html)
+- [r-spatial - Vector Data Cubes](https://r-spatial.org/r/2022/09/12/vdc.html)
+- [Xvec - Vector data cubes for Xarray](https://xvec.readthedocs.io/en/stable/)
 - [Open Data Cube initiative](https://www.opendatacube.org/about-draft)
 - [The Datacube Manifesto](http://www.earthserver.eu/tech/datacube-manifesto/The-Datacube-Manifesto.pdf)
 - [ARCO: The smartest way to access big geospatial data - Lobelia Earth](https://blog.lobelia.earth/arco-the-smartest-way-to-access-big-geospatial-data-eaf689eff3c9)
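The (`x`, `y`, `time`) raster data cube structure described in the changed text above can be sketched directly with Xarray; the dimension sizes, coordinate values, and the `velocity` variable name here are illustrative, not taken from the book's datasets:

```python
import numpy as np
import pandas as pd
import xarray as xr

# Illustrative (time, y, x) raster data cube: 5 monthly time steps on a
# 3 x 4 spatial grid, holding a hypothetical "velocity" variable.
rng = np.random.default_rng(0)
cube = xr.Dataset(
    data_vars={"velocity": (("time", "y", "x"), rng.random((5, 3, 4)))},
    coords={
        "time": pd.date_range("2020-01-01", periods=5, freq="MS"),
        "y": np.arange(3) * 100.0,
        "x": np.arange(4) * 100.0,
    },
)

# Typical data cube operations: reduce along time, select one time slice.
mean_velocity = cube["velocity"].mean(dim="time")  # 2-d (y, x) map
first_step = cube["velocity"].isel(time=0)         # one time slice
```

Stacking observations along a shared `time` dimension like this is exactly the "straightforward decision" the page refers to; the harder, open-ended choices concern where metadata lives on such an object.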

book/background/3_tutorials_overview.md (2 additions, 2 deletions)

@@ -2,7 +2,7 @@
 
 This book contains two distinct tutorials, each of which focuses on a different cloud-optimized geospatial dataset and different cloud-computing resources. Read more about the datasets used [here](4_tutorial_data.md).
 
-## *Part 1: {{part2_title}}*
+## *Part 1: ITS_LIVE ice velocity data tutorial*
 
 This tutorial focuses on a dataset of ice velocity observations derived from satellite image pairs, using a number of different satellite sensors. This dataset is accessed as Zarr data cubes from AWS S3 cloud object storage. The notebooks in this tutorial focus on:
 
@@ -12,7 +12,7 @@ This tutorial focuses on a dataset of ice velocity observations derived from sat
 4) Inspecting metadata and using metadata to subset and visualize the dataset,
 5) Exploratory data analysis and visualization at the scale of a single glacier
 
-## *Part 2: {{part3_title}}*
+## *Part 2: Sentinel-1 RTC imagery tutorial*
 
 This tutorial focuses on data from Sentinel-1, a synthetic aperture radar (SAR) dataset containing imagery collected at C-band. Specifically, we are looking at Sentinel-1 Radiometric Terrain Corrected (RTC) imagery (for more detail on this, see [tutorial data](4_tutorial_data.md)). We demonstrate how to access and work with two Sentinel-1 RTC datasets as well as how to set up and perform an initial comparison between the two and time series analysis of Sentinel-1 backscatter variability. These notebooks cover:

book/background/4_tutorial_data.md (1 addition, 1 deletion)

@@ -1,6 +1,6 @@
 # 2.4 Data used in tutorials
 
-We use a many different datasets throughout these tutorials. While each tutorial is focused on a different raster time series (ITS_LIVE ice velocity data and Sentinel-1 imagery), we also use vector data to represent points of interest.
+We use many different datasets throughout these tutorials. While each tutorial is focused on a different raster time series (ITS_LIVE ice velocity data and Sentinel-1 imagery), we also use vector data to represent points of interest.
 
 Most of the examples in this book use data accessed programmatically from cloud-object storage. We make subset of the data available in this books Github repository to remove the need for computationally-intensive operations in the tutorials. In one example, working with Sentinel-1 data processed by Alaska Satellite Facility, we start with data downloaded locally. Users who would like to complete this processing step on their own may do so (and access the data [here](https://zenodo.org/records/15036782)), but a smaller subset of this data is stored in the repository.

book/background/5_software.md (11 additions, 9 deletions)

@@ -1,20 +1,21 @@
 # 2.5 Software and computing environment
 
-On this page you'll find information about the computing environment that will be used in both of the tutorials in this book. We provide instructions for Running locally (on laptop), or on a hosted JupyterHub in AWS us-west-2.
+On this page you'll find information about the computing environment that will be used for both of the tutorials in this book. We provide instructions for running locally (on a laptop), or on a hosted JupyterHub in AWS us-west-2.
 
 ## *Running tutorial materials locally*
 
 There are two options for creating a software environment: [pixi](https://pixi.sh/latest/) or [mamba](https://mamba.readthedocs.io/en/latest/) / [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html). We recommend using pixi to create a consistent environment on different operating systems. If you have pixi installed, follow the steps below, otherwise, follow the steps for conda/mamba below.
 
 ### To use pixi
 1. Clone the book's GitHub repository:
-```git clone https://github.com/e-marshall/cloud-open-source-geospatial-datacube-workflows.git```
+```git clone https://github.com/e-marshall/cloud-open-source-geospatial-data-cube-workflows.git```
 
 2. Navigate into the repo environment:
-```cd cloud-open-source-geospatial-datacube-workflows```
+```cd cloud-open-source-geospatial-data-cube-workflows```
 
-3. There is a small data cube included in the repo that is used in the tutorials. We don't want git to track this so we tell it to ignore this file path.
-```git update-index --assume-unchanged book/itslive/data/raster_data/regional_glacier_velocity_vector_cube.zarr/.```
+3. There are two small data cubes included in the repo that are used in the tutorials. We don't want git to track these so we tell git to ignore these file paths:
+
+```git update-index --assume-unchanged book/itslive/data/raster_data/regional_glacier_velocity_vector_cube.zarr/. book/sentinel/data/raster_data/full_timeseries/intermediate_cubes/s1_asf_clipped_cube.zarr/.```
 
 4. Execute `pixi run` for each tutorial:
 ```pixi run itslive```
@@ -25,17 +26,18 @@ Note that the first `pixi run` will download specific versions of all required P
 ### To use conda/mamba
 
 1. Clone this book's GitHub repository:
-```git clone https://github.com/e-marshall/cloud-open-source-geospatial-datacube-workflows.git```
+```git clone https://github.com/e-marshall/cloud-open-source-geospatial-data-cube-workflows.git```
 
 2. Navigate into the `book` sub-directory:
-```cd cloud-open-source-geospatial-datacube-workflows/book```
+```cd cloud-open-source-geospatial-data-cube-workflows/book```
 
 3. Create and activate a conda environment from the `environment.yml` file located in the repo:
 ```conda env create -f environment.yml```
 ```conda activate book```
 
-4. There is a small data cube included in the repo that is used in the tutorials. We don't want git to track this so we tell it to ignore this file path.
-```git update-index --assume-unchanged book/itslive/data/raster_data/regional_glacier_velocity_vector_cube.zarr/.```
+4. There are two small data cubes included in the repo that are used in the tutorials. We don't want git to track these so we tell git to ignore these file paths:
+
+```git update-index --assume-unchanged book/itslive/data/raster_data/regional_glacier_velocity_vector_cube.zarr/. book/sentinel/data/raster_data/full_timeseries/intermediate_cubes/s1_asf_clipped_cube.zarr/.```
 
 5. Start Jupyterlab and navigate to the directories containing the Jupyter notebooks (`itslive/nbs` and `s1/nbs`):
 ```jupyterlab```
