2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ have been removed to keep the changelog focused on Yeehaw's history.
- rename benchmark imports to use the `yeehaw` crate.
- update examples to import the `yeehaw` crate instead of `tantivy`.
- run preflight tests without enabling the `unstable` feature.
- handle unknown column codes gracefully in `ColumnarReader::iter_columns`.

## Features/Improvements
- add docs/example and Vec<u32> values to sstable [#2660](https://github.com/quickwit-oss/yeehaw/pull/2660)(@PSeitz)
@@ -28,3 +29,4 @@ have been removed to keep the changelog focused on Yeehaw's history.
- remove outdated TODO for columnar `list_columns` and document error handling follow-up.
- remove unused `std::iter` imports from test modules.
- expand documentation for document deserialization traits.
- reorder inventory tasks to prioritize fixing doctest regressions.
6 changes: 3 additions & 3 deletions INVENTORY.md
@@ -51,9 +51,9 @@ This inventory captures the direction of the rewrite and the major tasks require
- Update types and crates (e.g. `TantivyDocument`, `tantivy-fst`) to use `yeehaw` naming.
11. **Rename subcrate benchmarks**
- Update benchmark crates (e.g. bitpacker, common, sstable) to use `yeehaw` naming once subcrate packages are renamed.
12. **Improve error handling in `ColumnarReader::iter_columns`**
- Replace `unwrap` usage with a fallible API that reports unknown column types.
13. **Fix failing doctests after crate rename**
12. **Fix failing doctests after crate rename**
- Update examples referencing `tantivy` to `yeehaw` so `cargo test --doc` succeeds.
13. **Improve error handling in `ColumnarReader::iter_columns`** *(done)*
- Replace `unwrap` usage with a fallible API that reports unknown column types.
14. **Replace nightly benches using `test::Bencher`**
- Migrate inline benchmarks to a stable harness so the `unstable` feature can be tested on stable Rust.
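For task 14, a stable-harness migration would typically swap `test::Bencher` for an external harness such as `criterion`. The sketch below is illustrative only; the bench name, fixture, and file layout are assumptions, not part of this PR.

```rust
// benches/iter_columns.rs — illustrative only; real bench targets and setup
// will differ once task 14 is actually implemented.
use criterion::{criterion_group, criterion_main, Criterion};

fn bench_iter_columns(c: &mut Criterion) {
    c.bench_function("iter_columns", |b| {
        b.iter(|| {
            // Build a small columnar and walk its columns here; the fixture is
            // elided because it is not decided by this PR.
        });
    });
}

criterion_group!(benches, bench_iter_columns);
criterion_main!(benches);
```

Criterion benches also need `harness = false` on the corresponding `[[bench]]` target in the crate's `Cargo.toml`.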
3 changes: 2 additions & 1 deletion columnar/src/columnar/merge/mod.rs
@@ -410,7 +410,8 @@ fn group_columns_for_merge<'a>(
for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
let column_name_and_handle = columnar_reader.iter_columns()?;

for (column_name, handle) in column_name_and_handle {
for result in column_name_and_handle {
let (column_name, handle) = result?;
let column_category: ColumnTypeCategory = handle.column_type().into();
columns
.entry((column_name, column_category))
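The merge loop now destructures each item with `?` instead of assuming the pair is infallible. A simplified model of the grouping logic, with placeholder types standing in for the real column handles and categories:

```rust
use std::collections::BTreeMap;
use std::io;

// Simplified stand-in for the grouping loop above: column descriptors are
// fallible, and the first unknown column code aborts the merge with an io::Error.
fn group_columns(
    per_reader_columns: Vec<Vec<io::Result<(String, u8)>>>,
) -> io::Result<BTreeMap<(String, u8), Vec<usize>>> {
    let mut columns: BTreeMap<(String, u8), Vec<usize>> = BTreeMap::new();
    for (columnar_id, reader_columns) in per_reader_columns.into_iter().enumerate() {
        for result in reader_columns {
            // `?` propagates the per-item error instead of unwrapping it.
            let (column_name, column_category) = result?;
            columns
                .entry((column_name, column_category))
                .or_default()
                .push(columnar_id);
        }
    }
    Ok(columns)
}
```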
23 changes: 13 additions & 10 deletions columnar/src/columnar/reader/mod.rs
@@ -125,21 +125,24 @@ impl ColumnarReader {
// Iterate over the columns in a sorted way
pub fn iter_columns(
&self,
) -> io::Result<impl Iterator<Item = (String, DynamicColumnHandle)> + '_> {
) -> io::Result<impl Iterator<Item = io::Result<(String, DynamicColumnHandle)>> + '_> {
let mut stream = self.column_dictionary.stream()?;
Ok(std::iter::from_fn(move || {
if stream.advance() {
let key_bytes: &[u8] = stream.key();
let column_code: u8 = key_bytes.last().cloned().unwrap();
// TODO Error Handling. The API gets quite ugly when returning the error here, so
// instead we could just check the first N columns upfront.
let column_type: ColumnType = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))
.unwrap();
let column_type = match ColumnType::try_from_code(column_code) {
Ok(column_type) => column_type,
Err(_) => {
return Some(Err(io_invalid_data(format!(
"Unknown column code `{column_code}`"
))));
}
};
let range = stream.value().clone();
let column_name =
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);
@@ -148,15 +151,15 @@ impl ColumnarReader {
column_type,
format_version: self.format_version,
};
Some((column_name, column_handle))
Some(Ok((column_name, column_handle)))
} else {
None
}
}))
}

pub fn list_columns(&self) -> io::Result<Vec<(String, DynamicColumnHandle)>> {
Ok(self.iter_columns()?.collect())
self.iter_columns()?.collect()
}

pub async fn read_columns_async(
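With items now wrapped in `io::Result`, callers propagate per-column errors instead of hitting an `unwrap` panic, and `list_columns` can collect straight into `io::Result<Vec<_>>` because `Result` implements `FromIterator`. A minimal, self-contained sketch of the consumption pattern, generic over the handle type so it compiles without the crate's `DynamicColumnHandle` in scope:

```rust
use std::io;

// With the real reader this would be called as `column_names(reader.iter_columns()?)?`.
fn column_names<H>(
    columns: impl Iterator<Item = io::Result<(String, H)>>,
) -> io::Result<Vec<String>> {
    columns
        // Keep only the name; `map` on the inner Result preserves any error.
        .map(|item| item.map(|(name, _handle)| name))
        // Collecting into io::Result<Vec<_>> short-circuits on the first error.
        .collect()
}
```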
14 changes: 5 additions & 9 deletions src/aggregation/metric/top_hits.rs
@@ -191,21 +191,18 @@ impl TopHitsAggregationReq {
unsupported_err("version")?;
}

let columns = reader.list_columns()?;
self.doc_value_fields = self
.doc_value_fields
.iter()
.map(|field| {
if !field.contains('*')
&& reader
.iter_columns()?
.any(|(name, _)| name.as_str() == field)
{
if !field.contains('*') && columns.iter().any(|(name, _)| name.as_str() == field) {
return Ok(vec![field.to_owned()]);
}

let pattern = globbed_string_to_regex(field)?;
let fields = reader
.iter_columns()?
let fields = columns
.iter()
.map(|(name, _)| {
// normalize path from internal fast field repr
name.replace(JSON_PATH_SEGMENT_SEP_STR, ".")
@@ -214,15 +211,14 @@
.collect::<Vec<_>>();
assert!(
!fields.is_empty(),
"No fields matched the glob '{field}' in docvalue_fields"
"No fields matched the glob '{field}' in docvalue_fields",
);
Ok(fields)
})
.collect::<crate::Result<Vec<_>>>()?
.into_iter()
.flatten()
.collect();

Ok(())
}

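The change above hoists the single `list_columns()?` call out of the per-field closure, so the error surfaces once up front and the column dictionary is not re-read for every `doc_value_fields` entry. A sketch of that hoisting pattern with hypothetical names; the glob branch is elided:

```rust
use std::io;

// `columns` is the result of one up-front list_columns() call, reused for
// every requested field instead of re-iterating the reader inside the closure.
fn resolve_doc_value_fields(
    requested: &[String],
    columns: &[(String, u8)],
) -> io::Result<Vec<String>> {
    requested
        .iter()
        .map(|field| {
            if !field.contains('*') && columns.iter().any(|(name, _)| name == field) {
                return Ok(field.clone());
            }
            // Glob expansion elided: the real code turns `field` into a regex and
            // collects every matching column name instead of erroring out.
            Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("no exact column match for `{field}`"),
            ))
        })
        .collect()
}
```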
9 changes: 5 additions & 4 deletions src/index/segment_reader.rs
@@ -355,23 +355,24 @@ impl SegmentReader {
.fast_fields()
.columnar()
.iter_columns()?
.map(|(mut field_name, handle)| {
.map(|res| {
let (mut field_name, handle) = res?;
json_path_sep_to_dot(&mut field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.' separated for all cases.
let field_name = map_to_canonical
.get(&field_name)
.unwrap_or(&field_name)
.to_string();
FieldMetadata {
Ok(FieldMetadata {
indexed: false,
stored: false,
field_name,
fast: true,
typ: Type::from(handle.column_type()),
}
})
})
.collect();
.collect::<io::Result<Vec<_>>>()?;
// Since the type is encoded differently in the fast field and in the inverted index,
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
// If we are sure that the order is the same, we can remove this sort.
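Here the mapping closure returns `io::Result<FieldMetadata>` and the iterator is collected into `io::Result<Vec<_>>`, which short-circuits on the first bad column. A minimal stand-in model; the struct fields and the JSON path separator byte are assumptions for illustration:

```rust
use std::io;

// `FieldMeta` stands in for the real FieldMetadata type.
struct FieldMeta {
    field_name: String,
    fast: bool,
}

fn collect_fast_field_metadata<H>(
    columns: impl Iterator<Item = io::Result<(String, H)>>,
) -> io::Result<Vec<FieldMeta>> {
    let mut fields: Vec<FieldMeta> = columns
        .map(|res| {
            let (mut field_name, _handle) = res?;
            // Normalize the internal JSON path separator to '.' (assumed 0x01 here).
            field_name = field_name.replace('\u{1}', ".");
            Ok(FieldMeta { field_name, fast: true })
        })
        .collect::<io::Result<Vec<_>>>()?;
    // Column order from the fast-field store is not guaranteed, so sort by name,
    // mirroring the sort the surrounding function performs.
    fields.sort_by(|a, b| a.field_name.cmp(&b.field_name));
    Ok(fields)
}
```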