2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ have been removed to keep the changelog focused on Yeehaw's history.
- rename benchmark imports to use the `yeehaw` crate.
- update examples to import the `yeehaw` crate instead of `tantivy`.
- run preflight tests without enabling the `unstable` feature.
- handle unknown column codes gracefully in `ColumnarReader::iter_columns`.

## Features/Improvements
- add docs/example and Vec<u32> values to sstable [#2660](https://github.com/quickwit-oss/yeehaw/pull/2660)(@PSeitz)
@@ -28,3 +29,4 @@ have been removed to keep the changelog focused on Yeehaw's history.
- remove outdated TODO for columnar `list_columns` and document error handling follow-up.
- remove unused `std::iter` imports from test modules.
- expand documentation for document deserialization traits.
- reorder inventory tasks to prioritize fixing doctest regressions.
6 changes: 3 additions & 3 deletions INVENTORY.md
@@ -51,9 +51,9 @@ This inventory captures the direction of the rewrite and the major tasks require
- Update types and crates (e.g. `TantivyDocument`, `tantivy-fst`) to use `yeehaw` naming.
11. **Rename subcrate benchmarks**
- Update benchmark crates (e.g. bitpacker, common, sstable) to use `yeehaw` naming once subcrate packages are renamed.
12. **Improve error handling in `ColumnarReader::iter_columns`**
- Replace `unwrap` usage with a fallible API that reports unknown column types.
13. **Fix failing doctests after crate rename**
12. **Fix failing doctests after crate rename**
- Update examples referencing `tantivy` to `yeehaw` so `cargo test --doc` succeeds.
13. **Improve error handling in `ColumnarReader::iter_columns`** *(done)*
- Replace `unwrap` usage with a fallible API that reports unknown column types.
14. **Replace nightly benches using `test::Bencher`**
- Migrate inline benchmarks to a stable harness so the `unstable` feature can be tested on stable Rust.
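For task 14, a stable-harness migration would typically swap `test::Bencher` for an external harness such as `criterion`. The sketch below is illustrative only; the bench name, fixture, and file layout are assumptions, not part of this PR.

```rust
// benches/iter_columns.rs — illustrative only; real bench targets and setup
// will differ once task 14 is actually implemented.
use criterion::{criterion_group, criterion_main, Criterion};

fn bench_iter_columns(c: &mut Criterion) {
    c.bench_function("iter_columns", |b| {
        b.iter(|| {
            // Build a small columnar and walk its columns here; the fixture is
            // elided because it is not decided by this PR.
        });
    });
}

criterion_group!(benches, bench_iter_columns);
criterion_main!(benches);
```

Criterion benches also need `harness = false` on the corresponding `[[bench]]` target in the crate's `Cargo.toml`.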
3 changes: 2 additions & 1 deletion columnar/src/columnar/merge/mod.rs
@@ -410,7 +410,8 @@ fn group_columns_for_merge<'a>(
for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
let column_name_and_handle = columnar_reader.iter_columns()?;

for (column_name, handle) in column_name_and_handle {
for result in column_name_and_handle {
let (column_name, handle) = result?;
let column_category: ColumnTypeCategory = handle.column_type().into();
columns
.entry((column_name, column_category))
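The merge loop now destructures each item with `?` instead of assuming the pair is infallible. A simplified model of the grouping logic, with placeholder types standing in for the real column handles and categories:

```rust
use std::collections::BTreeMap;
use std::io;

// Simplified stand-in for the grouping loop above: column descriptors are
// fallible, and the first unknown column code aborts the merge with an io::Error.
fn group_columns(
    per_reader_columns: Vec<Vec<io::Result<(String, u8)>>>,
) -> io::Result<BTreeMap<(String, u8), Vec<usize>>> {
    let mut columns: BTreeMap<(String, u8), Vec<usize>> = BTreeMap::new();
    for (columnar_id, reader_columns) in per_reader_columns.into_iter().enumerate() {
        for result in reader_columns {
            // `?` propagates the per-item error instead of unwrapping it.
            let (column_name, column_category) = result?;
            columns
                .entry((column_name, column_category))
                .or_default()
                .push(columnar_id);
        }
    }
    Ok(columns)
}
```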
23 changes: 13 additions & 10 deletions columnar/src/columnar/reader/mod.rs
@@ -125,21 +125,24 @@ impl ColumnarReader {
// Iterate over the columns in a sorted way
pub fn iter_columns(
&self,
) -> io::Result<impl Iterator<Item = (String, DynamicColumnHandle)> + '_> {
) -> io::Result<impl Iterator<Item = io::Result<(String, DynamicColumnHandle)>> + '_> {
let mut stream = self.column_dictionary.stream()?;
Ok(std::iter::from_fn(move || {
if stream.advance() {
let key_bytes: &[u8] = stream.key();
let column_code: u8 = key_bytes.last().cloned().unwrap();
// TODO Error Handling. The API gets quite ugly when returning the error here, so
// instead we could just check the first N columns upfront.
let column_type: ColumnType = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))
.unwrap();
let column_type = match ColumnType::try_from_code(column_code) {
Ok(column_type) => column_type,
Err(_) => {
return Some(Err(io_invalid_data(format!(
"Unknown column code `{column_code}`"
))));
}
};
let range = stream.value().clone();
let column_name =
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);
@@ -148,15 +151,15 @@ impl ColumnarReader {
column_type,
format_version: self.format_version,
};
Some((column_name, column_handle))
Some(Ok((column_name, column_handle)))
} else {
None
}
}))
}

pub fn list_columns(&self) -> io::Result<Vec<(String, DynamicColumnHandle)>> {
Ok(self.iter_columns()?.collect())
self.iter_columns()?.collect()
}

pub async fn read_columns_async(
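With items now wrapped in `io::Result`, callers propagate per-column errors instead of hitting an `unwrap` panic, and `list_columns` can collect straight into `io::Result<Vec<_>>` because `Result` implements `FromIterator`. A minimal, self-contained sketch of the consumption pattern, generic over the handle type so it compiles without the crate's `DynamicColumnHandle` in scope:

```rust
use std::io;

// With the real reader this would be called as `column_names(reader.iter_columns()?)?`.
fn column_names<H>(
    columns: impl Iterator<Item = io::Result<(String, H)>>,
) -> io::Result<Vec<String>> {
    columns
        // Keep only the name; `map` on the inner Result preserves any error.
        .map(|item| item.map(|(name, _handle)| name))
        // Collecting into io::Result<Vec<_>> short-circuits on the first error.
        .collect()
}
```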
14 changes: 5 additions & 9 deletions src/aggregation/metric/top_hits.rs
@@ -191,21 +191,18 @@ impl TopHitsAggregationReq {
unsupported_err("version")?;
}

let columns = reader.list_columns()?;
self.doc_value_fields = self
.doc_value_fields
.iter()
.map(|field| {
if !field.contains('*')
&& reader
.iter_columns()?
.any(|(name, _)| name.as_str() == field)
{
if !field.contains('*') && columns.iter().any(|(name, _)| name.as_str() == field) {
return Ok(vec![field.to_owned()]);
}

let pattern = globbed_string_to_regex(field)?;
let fields = reader
.iter_columns()?
let fields = columns
.iter()
.map(|(name, _)| {
// normalize path from internal fast field repr
name.replace(JSON_PATH_SEGMENT_SEP_STR, ".")
@@ -214,15 +211,14 @@
.collect::<Vec<_>>();
assert!(
!fields.is_empty(),
"No fields matched the glob '{field}' in docvalue_fields"
"No fields matched the glob '{field}' in docvalue_fields",
);
Ok(fields)
})
.collect::<crate::Result<Vec<_>>>()?
.into_iter()
.flatten()
.collect();

Ok(())
}

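The change above hoists the single `list_columns()?` call out of the per-field closure, so the error surfaces once up front and the column dictionary is not re-read for every `doc_value_fields` entry. A sketch of that hoisting pattern with hypothetical names; the glob branch is elided:

```rust
use std::io;

// `columns` is the result of one up-front list_columns() call, reused for
// every requested field instead of re-iterating the reader inside the closure.
fn resolve_doc_value_fields(
    requested: &[String],
    columns: &[(String, u8)],
) -> io::Result<Vec<String>> {
    requested
        .iter()
        .map(|field| {
            if !field.contains('*') && columns.iter().any(|(name, _)| name == field) {
                return Ok(field.clone());
            }
            // Glob expansion elided: the real code turns `field` into a regex and
            // collects every matching column name instead of erroring out.
            Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("no exact column match for `{field}`"),
            ))
        })
        .collect()
}
```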
9 changes: 5 additions & 4 deletions src/index/segment_reader.rs
@@ -355,23 +355,24 @@ impl SegmentReader {
.fast_fields()
.columnar()
.iter_columns()?
.map(|(mut field_name, handle)| {
.map(|res| {
let (mut field_name, handle) = res?;
json_path_sep_to_dot(&mut field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.' separated for all cases.
let field_name = map_to_canonical
.get(&field_name)
.unwrap_or(&field_name)
.to_string();
FieldMetadata {
Ok(FieldMetadata {
indexed: false,
stored: false,
field_name,
fast: true,
typ: Type::from(handle.column_type()),
}
})
})
.collect();
.collect::<io::Result<Vec<_>>>()?;
// Since the type is encoded differently in the fast field and in the inverted index,
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
// If we are sure that the order is the same, we can remove this sort.
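Here the mapping closure returns `io::Result<FieldMetadata>` and the iterator is collected into `io::Result<Vec<_>>`, which short-circuits on the first bad column. A minimal stand-in model; the struct fields and the JSON path separator byte are assumptions for illustration:

```rust
use std::io;

// `FieldMeta` stands in for the real FieldMetadata type.
struct FieldMeta {
    field_name: String,
    fast: bool,
}

fn collect_fast_field_metadata<H>(
    columns: impl Iterator<Item = io::Result<(String, H)>>,
) -> io::Result<Vec<FieldMeta>> {
    let mut fields: Vec<FieldMeta> = columns
        .map(|res| {
            let (mut field_name, _handle) = res?;
            // Normalize the internal JSON path separator to '.' (assumed 0x01 here).
            field_name = field_name.replace('\u{1}', ".");
            Ok(FieldMeta { field_name, fast: true })
        })
        .collect::<io::Result<Vec<_>>>()?;
    // Column order from the fast-field store is not guaranteed, so sort by name,
    // mirroring the sort the surrounding function performs.
    fields.sort_by(|a, b| a.field_name.cmp(&b.field_name));
    Ok(fields)
}
```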