Skip to content

Commit c5e5a15

Browse files
Merge pull request #17 from triblespace/codex/plan-next-steps-for-project-advancement
Improve columnar error handling
2 parents fffe7b1 + cbe7731 commit c5e5a15

File tree

6 files changed

+30
-27
lines changed

6 files changed

+30
-27
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ have been removed to keep the changelog focused on Yeehaw's history.
1111
- rename benchmark imports to use the `yeehaw` crate.
1212
- update examples to import the `yeehaw` crate instead of `tantivy`.
1313
- run preflight tests without enabling the `unstable` feature.
14+
- handle unknown column codes gracefully in `ColumnarReader::iter_columns`.
1415

1516
## Features/Improvements
1617
- add docs/example and Vec<u32> values to sstable [#2660](https://github.com/quickwit-oss/yeehaw/pull/2660)(@PSeitz)
@@ -28,3 +29,4 @@ have been removed to keep the changelog focused on Yeehaw's history.
2829
- remove outdated TODO for columnar `list_columns` and document error handling follow-up.
2930
- remove unused `std::iter` imports from test modules.
3031
- expand documentation for document deserialization traits.
32+
- reorder inventory tasks to prioritize fixing doctest regressions.

INVENTORY.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,9 @@ This inventory captures the direction of the rewrite and the major tasks require
5151
- Update types and crates (e.g. `TantivyDocument`, `tantivy-fst`) to use `yeehaw` naming.
5252
11. **Rename subcrate benchmarks**
5353
- Update benchmark crates (e.g. bitpacker, common, sstable) to use `yeehaw` naming once subcrate packages are renamed.
54-
12. **Improve error handling in `ColumnarReader::iter_columns`**
55-
- Replace `unwrap` usage with a fallible API that reports unknown column types.
56-
13. **Fix failing doctests after crate rename**
54+
12. **Fix failing doctests after crate rename**
5755
- Update examples referencing `tantivy` to `yeehaw` so `cargo test --doc` succeeds.
56+
13. **Improve error handling in `ColumnarReader::iter_columns`** *(done)*
57+
- Replace `unwrap` usage with a fallible API that reports unknown column types.
5858
14. **Replace nightly benches using `test::Bencher`**
5959
- Migrate inline benchmarks to a stable harness so the `unstable` feature can be tested on stable Rust.

columnar/src/columnar/merge/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,8 @@ fn group_columns_for_merge<'a>(
410410
for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
411411
let column_name_and_handle = columnar_reader.iter_columns()?;
412412

413-
for (column_name, handle) in column_name_and_handle {
413+
for result in column_name_and_handle {
414+
let (column_name, handle) = result?;
414415
let column_category: ColumnTypeCategory = handle.column_type().into();
415416
columns
416417
.entry((column_name, column_category))

columnar/src/columnar/reader/mod.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -125,21 +125,24 @@ impl ColumnarReader {
125125
// Iterate over the columns in a sorted way
126126
pub fn iter_columns(
127127
&self,
128-
) -> io::Result<impl Iterator<Item = (String, DynamicColumnHandle)> + '_> {
128+
) -> io::Result<impl Iterator<Item = io::Result<(String, DynamicColumnHandle)>> + '_> {
129129
let mut stream = self.column_dictionary.stream()?;
130130
Ok(std::iter::from_fn(move || {
131131
if stream.advance() {
132132
let key_bytes: &[u8] = stream.key();
133133
let column_code: u8 = key_bytes.last().cloned().unwrap();
134-
// TODO Error Handling. The API gets quite ugly when returning the error here, so
135-
// instead we could just check the first N columns upfront.
136-
let column_type: ColumnType = ColumnType::try_from_code(column_code)
137-
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))
138-
.unwrap();
134+
let column_type = match ColumnType::try_from_code(column_code) {
135+
Ok(column_type) => column_type,
136+
Err(_) => {
137+
return Some(Err(io_invalid_data(format!(
138+
"Unknown column code `{column_code}`"
139+
))));
140+
}
141+
};
139142
let range = stream.value().clone();
140143
let column_name =
141-
// The last two bytes are respectively the 0u8 separator and the column_type.
142-
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
144+
// The last two bytes are respectively the 0u8 separator and the column_type.
145+
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
143146
let file_slice = self
144147
.column_data
145148
.slice(range.start as usize..range.end as usize);
@@ -148,15 +151,15 @@ impl ColumnarReader {
148151
column_type,
149152
format_version: self.format_version,
150153
};
151-
Some((column_name, column_handle))
154+
Some(Ok((column_name, column_handle)))
152155
} else {
153156
None
154157
}
155158
}))
156159
}
157160

158161
pub fn list_columns(&self) -> io::Result<Vec<(String, DynamicColumnHandle)>> {
159-
Ok(self.iter_columns()?.collect())
162+
self.iter_columns()?.collect()
160163
}
161164

162165
pub async fn read_columns_async(

src/aggregation/metric/top_hits.rs

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -191,21 +191,18 @@ impl TopHitsAggregationReq {
191191
unsupported_err("version")?;
192192
}
193193

194+
let columns = reader.list_columns()?;
194195
self.doc_value_fields = self
195196
.doc_value_fields
196197
.iter()
197198
.map(|field| {
198-
if !field.contains('*')
199-
&& reader
200-
.iter_columns()?
201-
.any(|(name, _)| name.as_str() == field)
202-
{
199+
if !field.contains('*') && columns.iter().any(|(name, _)| name.as_str() == field) {
203200
return Ok(vec![field.to_owned()]);
204201
}
205202

206203
let pattern = globbed_string_to_regex(field)?;
207-
let fields = reader
208-
.iter_columns()?
204+
let fields = columns
205+
.iter()
209206
.map(|(name, _)| {
210207
// normalize path from internal fast field repr
211208
name.replace(JSON_PATH_SEGMENT_SEP_STR, ".")
@@ -214,15 +211,14 @@ impl TopHitsAggregationReq {
214211
.collect::<Vec<_>>();
215212
assert!(
216213
!fields.is_empty(),
217-
"No fields matched the glob '{field}' in docvalue_fields"
214+
"No fields matched the glob '{field}' in docvalue_fields",
218215
);
219216
Ok(fields)
220217
})
221218
.collect::<crate::Result<Vec<_>>>()?
222219
.into_iter()
223220
.flatten()
224221
.collect();
225-
226222
Ok(())
227223
}
228224

src/index/segment_reader.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -355,23 +355,24 @@ impl SegmentReader {
355355
.fast_fields()
356356
.columnar()
357357
.iter_columns()?
358-
.map(|(mut field_name, handle)| {
358+
.map(|res| {
359+
let (mut field_name, handle) = res?;
359360
json_path_sep_to_dot(&mut field_name);
360361
// map to canonical path, to avoid similar but different entries.
361362
// Eventually we should just accept '.' separated for all cases.
362363
let field_name = map_to_canonical
363364
.get(&field_name)
364365
.unwrap_or(&field_name)
365366
.to_string();
366-
FieldMetadata {
367+
Ok(FieldMetadata {
367368
indexed: false,
368369
stored: false,
369370
field_name,
370371
fast: true,
371372
typ: Type::from(handle.column_type()),
372-
}
373+
})
373374
})
374-
.collect();
375+
.collect::<io::Result<Vec<_>>>()?;
375376
// Since the type is encoded differently in the fast field and in the inverted index,
376377
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
377378
// If we are sure that the order is the same, we can remove this sort.

0 commit comments

Comments
 (0)