diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e505d9911..c5f65aebbc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ have been removed to keep the changelog focused on Yeehaw's history.
 - rename benchmark imports to use the `yeehaw` crate.
 - update examples to import the `yeehaw` crate instead of `tantivy`.
 - run preflight tests without enabling the `unstable` feature.
+- handle unknown column codes gracefully in `ColumnarReader::iter_columns`.
 
 ## Features/Improvements
 - add docs/example and Vec values to sstable [#2660](https://github.com/quickwit-oss/yeehaw/pull/2660)(@PSeitz)
@@ -28,3 +29,4 @@ have been removed to keep the changelog focused on Yeehaw's history.
 - remove outdated TODO for columnar `list_columns` and document error handling follow-up.
 - remove unused `std::iter` imports from test modules.
 - expand documentation for document deserialization traits.
+- reorder inventory tasks to prioritize fixing doctest regressions.
diff --git a/INVENTORY.md b/INVENTORY.md
index b49026f66e..199bf354c4 100644
--- a/INVENTORY.md
+++ b/INVENTORY.md
@@ -51,9 +51,9 @@ This inventory captures the direction of the rewrite and the major tasks require
    - Update types and crates (e.g. `TantivyDocument`, `tantivy-fst`) to use `yeehaw` naming.
 11. **Rename subcrate benchmarks**
    - Update benchmark crates (e.g. bitpacker, common, sstable) to use `yeehaw` naming once subcrate packages are renamed.
-12. **Improve error handling in `ColumnarReader::iter_columns`**
-   - Replace `unwrap` usage with a fallible API that reports unknown column types.
-13. **Fix failing doctests after crate rename**
+12. **Fix failing doctests after crate rename**
    - Update examples referencing `tantivy` to `yeehaw` so `cargo test --doc` succeeds.
+13. **Improve error handling in `ColumnarReader::iter_columns`** *(done)*
+   - Replace `unwrap` usage with a fallible API that reports unknown column types.
 14. **Replace nightly benches using `test::Bencher`**
    - Migrate inline benchmarks to a stable harness so the `unstable` feature can be tested on stable Rust.
diff --git a/columnar/src/columnar/merge/mod.rs b/columnar/src/columnar/merge/mod.rs
index 54449f8a49..71672d3903 100644
--- a/columnar/src/columnar/merge/mod.rs
+++ b/columnar/src/columnar/merge/mod.rs
@@ -410,7 +410,8 @@ fn group_columns_for_merge<'a>(
     for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
         let column_name_and_handle = columnar_reader.iter_columns()?;
-        for (column_name, handle) in column_name_and_handle {
+        for result in column_name_and_handle {
+            let (column_name, handle) = result?;
             let column_category: ColumnTypeCategory = handle.column_type().into();
             columns
                 .entry((column_name, column_category))
diff --git a/columnar/src/columnar/reader/mod.rs b/columnar/src/columnar/reader/mod.rs
index 42e0abf0a7..5c0aafcee3 100644
--- a/columnar/src/columnar/reader/mod.rs
+++ b/columnar/src/columnar/reader/mod.rs
@@ -125,21 +125,24 @@ impl ColumnarReader {
     // Iterate over the columns in a sorted way
     pub fn iter_columns(
         &self,
-    ) -> io::Result<impl Iterator<Item = (String, DynamicColumnHandle)> + '_> {
+    ) -> io::Result<impl Iterator<Item = io::Result<(String, DynamicColumnHandle)>> + '_> {
         let mut stream = self.column_dictionary.stream()?;
         Ok(std::iter::from_fn(move || {
             if stream.advance() {
                 let key_bytes: &[u8] = stream.key();
                 let column_code: u8 = key_bytes.last().cloned().unwrap();
-                // TODO Error Handling. The API gets quite ugly when returning the error here, so
-                // instead we could just check the first N columns upfront.
-                let column_type: ColumnType = ColumnType::try_from_code(column_code)
-                    .map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))
-                    .unwrap();
+                let column_type = match ColumnType::try_from_code(column_code) {
+                    Ok(column_type) => column_type,
+                    Err(_) => {
+                        return Some(Err(io_invalid_data(format!(
+                            "Unknown column code `{column_code}`"
+                        ))));
+                    }
+                };
                 let range = stream.value().clone();
                 let column_name =
-            // The last two bytes are respectively the 0u8 separator and the column_type.
-            String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
+                    // The last two bytes are respectively the 0u8 separator and the column_type.
+                    String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
                 let file_slice = self
                     .column_data
                     .slice(range.start as usize..range.end as usize);
@@ -148,7 +151,7 @@ impl ColumnarReader {
                     column_type,
                     format_version: self.format_version,
                 };
-                Some((column_name, column_handle))
+                Some(Ok((column_name, column_handle)))
             } else {
                 None
             }
@@ -156,7 +159,7 @@ impl ColumnarReader {
     }
 
     pub fn list_columns(&self) -> io::Result<Vec<(String, DynamicColumnHandle)>> {
-        Ok(self.iter_columns()?.collect())
+        self.iter_columns()?.collect()
     }
 
     pub async fn read_columns_async(
diff --git a/src/aggregation/metric/top_hits.rs b/src/aggregation/metric/top_hits.rs
index f776bdb080..60e4a6573b 100644
--- a/src/aggregation/metric/top_hits.rs
+++ b/src/aggregation/metric/top_hits.rs
@@ -191,21 +191,18 @@ impl TopHitsAggregationReq {
             unsupported_err("version")?;
         }
 
+        let columns = reader.list_columns()?;
         self.doc_value_fields = self
            .doc_value_fields
            .iter()
            .map(|field| {
-                if !field.contains('*')
-                    && reader
-                        .iter_columns()?
-                        .any(|(name, _)| name.as_str() == field)
-                {
+                if !field.contains('*') && columns.iter().any(|(name, _)| name.as_str() == field) {
                     return Ok(vec![field.to_owned()]);
                 }
 
                 let pattern = globbed_string_to_regex(field)?;
-                let fields = reader
-                    .iter_columns()?
+                let fields = columns
+                    .iter()
                     .map(|(name, _)| {
                         // normalize path from internal fast field repr
                         name.replace(JSON_PATH_SEGMENT_SEP_STR, ".")
@@ -214,7 +211,7 @@ impl TopHitsAggregationReq {
                     .collect::<Vec<_>>();
                 assert!(
                     !fields.is_empty(),
-                    "No fields matched the glob '{field}' in docvalue_fields"
+                    "No fields matched the glob '{field}' in docvalue_fields",
                 );
                 Ok(fields)
             })
@@ -222,7 +219,6 @@ impl TopHitsAggregationReq {
            .into_iter()
            .flatten()
            .collect();
-
         Ok(())
     }
diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs
index afb12bf006..8088fa9045 100644
--- a/src/index/segment_reader.rs
+++ b/src/index/segment_reader.rs
@@ -355,7 +355,8 @@ impl SegmentReader {
             .fast_fields()
             .columnar()
             .iter_columns()?
-            .map(|(mut field_name, handle)| {
+            .map(|res| {
+                let (mut field_name, handle) = res?;
                 json_path_sep_to_dot(&mut field_name);
                 // map to canonical path, to avoid similar but different entries.
                 // Eventually we should just accept '.' separated for all cases.
@@ -363,15 +364,15 @@ impl SegmentReader {
                     .get(&field_name)
                     .unwrap_or(&field_name)
                     .to_string();
-                FieldMetadata {
+                Ok(FieldMetadata {
                     indexed: false,
                     stored: false,
                     field_name,
                     fast: true,
                     typ: Type::from(handle.column_type()),
-                }
+                })
             })
-            .collect();
+            .collect::<crate::Result<Vec<_>>>()?;
         // Since the type is encoded differently in the fast field and in the inverted index,
         // the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
         // If we are sure that the order is the same, we can remove this sort.
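Reviewer note: below is a minimal sketch of how downstream code consumes the now-fallible `iter_columns`, mirroring the updated call sites in `group_columns_for_merge` and the segment reader. It is illustrative only: the `print_columns` helper is hypothetical, and the `yeehaw_columnar` crate name is an assumption based on the pre-rename `tantivy_columnar` layout.

```rust
use std::io;

// Assumed import path (pre-rename: `tantivy_columnar`); adjust to the
// actual subcrate name once the rename lands.
use yeehaw_columnar::{ColumnarReader, DynamicColumnHandle};

// Hypothetical helper (not part of this diff): walk every column and
// propagate an unknown-column-code error to the caller instead of
// panicking, which is what the old `unwrap` inside the iterator did.
fn print_columns(reader: &ColumnarReader) -> io::Result<()> {
    for entry in reader.iter_columns()? {
        // Each item is now an io::Result<(String, DynamicColumnHandle)>.
        let (column_name, handle): (String, DynamicColumnHandle) = entry?;
        println!("{column_name}: {:?}", handle.column_type());
    }
    Ok(())
}
```

Call sites that want all columns up front can use `list_columns()?` instead, which now surfaces the same error once; the `top_hits.rs` hunk above takes that route so the column dictionary is not re-streamed for every field.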