Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion bin/ciff2bmp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ struct Args {
bsize: usize,
#[structopt(short, long, help = "Compress range data")]
compress_range: bool,
#[structopt(short, long, help = "Range pruning ratio", default_value = "0.0")]
range_pruning_ratio: f32,
}

fn main() {
Expand All @@ -31,7 +33,8 @@ fn main() {
.input_path(args.ciff_file)
.output_path(args.output)
.compress_range(args.compress_range)
.bsize(args.bsize);
.bsize(args.bsize)
.range_pruning_ratio(args.range_pruning_ratio);

// Convert the Ciff file to BMP format
if let Err(error) = converter.to_bmp() {
Expand Down
11 changes: 8 additions & 3 deletions src/ciff/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ pub struct CiffToBmp {
output: Option<PathBuf>,
bsize: Option<usize>,
compress_range: bool,
range_pruning_ratio: f32,
}

Copy link

Copilot AI Jul 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The range_pruning_ratio field lacks initialization in the struct. This could cause issues when creating new instances without explicitly setting this field.

Suggested change
/// Default state for the `CiffToBmp` builder: `input`, `output` and `bsize`
/// are unset (`None`), `compress_range` is off, and `range_pruning_ratio` is `0.0`.
impl Default for CiffToBmp {
/// Returns a builder with every optional field unset and pruning disabled-by-value (`0.0`).
fn default() -> Self {
Self {
input: None,
output: None,
bsize: None,
compress_range: false,
range_pruning_ratio: 0.0, // Default value for range_pruning_ratio
}
}
}

Copilot uses AI. Check for mistakes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this nonsense?

impl CiffToBmp {
Expand All @@ -92,6 +93,10 @@ impl CiffToBmp {
self.compress_range = compress_range;
self
}
/// Stores the range pruning ratio that is later passed on to `convert_to_bmp`.
///
/// Returns `&mut Self` so the call can be chained with the other builder setters.
pub fn range_pruning_ratio(&mut self, range_pruning_ratio: f32) -> &mut Self {
self.range_pruning_ratio = range_pruning_ratio;
self
}
/// Builds a BMP index using the previously defined parameters.
///
/// # Errors
Expand All @@ -110,11 +115,11 @@ impl CiffToBmp {
.as_ref()
.ok_or_else(|| anyhow!("input path undefined"))?;
let bsize = self.bsize.ok_or_else(|| anyhow!("bsize undefined"))?;
convert_to_bmp(input, output, bsize, self.compress_range)
convert_to_bmp(input, output, bsize, self.compress_range, self.range_pruning_ratio)
}
}

fn convert_to_bmp(input: &Path, output: &Path, bsize: usize, compress_range: bool) -> Result<()> {
fn convert_to_bmp(input: &Path, output: &Path, bsize: usize, compress_range: bool, range_pruning_ratio: f32) -> Result<()> {
println!("{:?}", output);
let mut ciff_reader =
File::open(input).with_context(|| format!("Unable to open {}", input.display()))?;
Expand All @@ -126,7 +131,7 @@ fn convert_to_bmp(input: &Path, output: &Path, bsize: usize, compress_range: boo
let header: Header = Header::from_stream(&mut input)?;
println!("{}", header);

builder = IndexBuilder::new(header.num_documents as usize, bsize);
builder = IndexBuilder::new(header.num_documents as usize, bsize, range_pruning_ratio);

eprintln!("Processing postings");
let progress = ProgressBar::new(u64::try_from(header.num_postings_lists)?);
Expand Down
18 changes: 13 additions & 5 deletions src/index/inverted_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,16 +75,18 @@ pub struct IndexBuilder {
posting_lists: Vec<Vec<(u32, u32)>>,
terms: Vec<String>,
documents: Vec<String>,
range_pruning_ratio: f32
}

impl IndexBuilder {
pub fn new(num_documents: usize, bsize: usize) -> Self {
pub fn new(num_documents: usize, bsize: usize, range_pruning_ratio: f32) -> Self {
IndexBuilder {
num_documents,
bsize,
posting_lists: Vec::new(),
terms: Vec::new(),
documents: Vec::new(),
range_pruning_ratio,
}
}

Expand Down Expand Up @@ -134,13 +136,19 @@ impl IndexBuilder {
let range_size = self.bsize;
let blocks_num = div_ceil(num_docs, range_size);
let mut range_maxes: Vec<u8> = vec![0; blocks_num];
p_list.iter().for_each(|&(docid, score)| {
let current_max = &mut range_maxes[docid as usize / range_size];
*current_max = cmp::max(*current_max, score as u8);
});
let mut sorted_scores: Vec<u32> = p_list.iter().map(|&(_, score)| score).collect();
sorted_scores.sort_by(|a, b| b.cmp(&a));

let pruning_threshold = sorted_scores.get(((1.0 - self.range_pruning_ratio) * sorted_scores.len() as f32) as usize).copied().unwrap_or(0) as u32;


p_list.iter().for_each(|&(docid, score)| {
if score >= pruning_threshold {
let current_max = &mut range_maxes[docid as usize / range_size];
*current_max = cmp::max(*current_max, score as u8);
}
});

// Retrieve the 10th, 100th and 1000th elements
let s10th = sorted_scores.get(9).copied().unwrap_or(0) as u8;
let s100th = sorted_scores.get(99).copied().unwrap_or(0) as u8;
Expand Down
Loading