Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions rust/worker/src/execution/operators/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ pub mod select;
pub mod source_record_segment;
pub mod sparse_index_knn;
pub mod sparse_log_knn;
pub mod transform_log;
60 changes: 60 additions & 0 deletions rust/worker/src/execution/operators/transform_log.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
use async_trait::async_trait;
use chroma_error::{ChromaError, ErrorCodes};
use chroma_system::Operator;
use chroma_types::{Chunk, LogRecord};
use thiserror::Error;

#[derive(Debug)]
pub struct TransformOperator {}
Comment on lines +7 to +8
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Documentation]

A doc comment explaining the purpose of this operator, especially that it's currently a placeholder, would be helpful for future maintainers.

For example:

Suggested change
#[derive(Debug)]
pub struct TransformOperator {}
/// A transformation operator for log records.
///
/// This is currently a placeholder and performs an identity transformation (cloning the input).
/// It is intended to be extended with actual transformation logic.
#[derive(Debug)]
pub struct TransformOperator {}

Committable suggestion

Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Context for Agents
[**Documentation**]

A doc comment explaining the purpose of this operator, especially that it's currently a placeholder, would be helpful for future maintainers.

For example:
```suggestion
/// A transformation operator for log records.
///
/// This is currently a placeholder and performs an identity transformation (cloning the input).
/// It is intended to be extended with actual transformation logic.
#[derive(Debug)]
pub struct TransformOperator {}
```

⚡ **Committable suggestion**

Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

File: rust/worker/src/execution/operators/transform_log.rs
Line: 8


#[derive(Debug)]
pub struct TransformInput {
pub(crate) records: Chunk<LogRecord>,
}

impl TransformInput {
pub fn new(records: Chunk<LogRecord>) -> Self {
TransformInput { records }
}
}

#[derive(Debug)]
pub struct TransformOutput {
pub(crate) records: Chunk<LogRecord>,
}

#[derive(Debug, Error)]
#[error("Failed to transform records.")]
pub struct TransformError;

impl ChromaError for TransformError {
fn code(&self) -> ErrorCodes {
ErrorCodes::Internal
}
}

impl TransformOperator {
pub fn new() -> Box<Self> {
Box::new(TransformOperator {})
}

pub fn transform(&self, records: &Chunk<LogRecord>) -> Chunk<LogRecord> {
records.clone()
Comment on lines +41 to +42
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[BestPractice]

The transform method performs a shallow clone of the entire chunk of records, which can be expensive for large datasets. Since this is currently a no-op transformation (just returns records.clone()), consider if this allocation is necessary. If this is placeholder code, consider adding a TODO comment explaining the intended transformation logic.

Context for Agents
[**BestPractice**]

The `transform` method performs a shallow clone of the entire chunk of records, which can be expensive for large datasets. Since this is currently a no-op transformation (just returns `records.clone()`), consider if this allocation is necessary. If this is placeholder code, consider adding a TODO comment explaining the intended transformation logic.

File: rust/worker/src/execution/operators/transform_log.rs
Line: 42

}
}

#[async_trait]
impl Operator<TransformInput, TransformOutput> for TransformOperator {
type Error = TransformError;

fn get_name(&self) -> &'static str {
"TransformOperator"
}

async fn run(&self, input: &TransformInput) -> Result<TransformOutput, TransformError> {
let transformed_records = self.transform(&input.records);
Ok(TransformOutput {
records: transformed_records,
})
}
}
62 changes: 60 additions & 2 deletions rust/worker/src/execution/orchestration/compact.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ use crate::execution::operators::{
SourceRecordSegmentError, SourceRecordSegmentInput, SourceRecordSegmentOperator,
SourceRecordSegmentOutput,
},
transform_log::{TransformError, TransformInput, TransformOperator, TransformOutput},
};

/** The state of the orchestrator.
Expand Down Expand Up @@ -111,6 +112,7 @@ impl Default for CompactOrchestratorMetrics {
#[derive(Debug)]
enum ExecutionState {
Pending,
Transform,
Partition,
MaterializeApplyCommitFlush,
Register,
Expand Down Expand Up @@ -141,6 +143,7 @@ pub struct CompactOrchestrator {
hnsw_provider: HnswIndexProvider,
spann_provider: SpannProvider,

// TODO(jobs): Split this into source and dest collections.
collection: OnceCell<Collection>,
writers: OnceCell<CompactWriters>,
flush_results: Vec<SegmentFlushInfo>,
Expand All @@ -162,6 +165,9 @@ pub struct CompactOrchestrator {
segment_spans: HashMap<SegmentUuid, Span>,

metrics: CompactOrchestratorMetrics,

// Functions
function: Option<()>,
Comment on lines +169 to +170
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[BestPractice]

The function field is defined as Option<()> which only indicates presence/absence but provides no meaningful data. Consider using a more descriptive enum or boolean field name that clearly indicates what functionality is being toggled:

Suggested change
// Functions
function: Option<()>,
// Transform function configuration
transform_enabled: bool,

Committable suggestion

Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Context for Agents
[**BestPractice**]

The `function` field is defined as `Option<()>` which only indicates presence/absence but provides no meaningful data. Consider using a more descriptive enum or boolean field name that clearly indicates what functionality is being toggled:

```suggestion
// Transform function configuration
transform_enabled: bool,
```

⚡ **Committable suggestion**

Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

File: rust/worker/src/execution/orchestration/compact.rs
Line: 170

}

#[derive(Error, Debug)]
Expand Down Expand Up @@ -192,6 +198,8 @@ pub enum CompactionError {
Panic(#[from] PanicError),
#[error("Error partitioning logs: {0}")]
Partition(#[from] PartitionError),
#[error("Error transforming logs: {0}")]
Transform(#[from] TransformError),
#[error("Error prefetching segment: {0}")]
PrefetchSegment(#[from] PrefetchSegmentError),
#[error("Error creating record segment reader: {0}")]
Expand Down Expand Up @@ -249,6 +257,7 @@ impl ChromaError for CompactionError {
Self::MetadataSegment(e) => e.should_trace_error(),
Self::Panic(e) => e.should_trace_error(),
Self::Partition(e) => e.should_trace_error(),
Self::Transform(e) => e.should_trace_error(),
Self::PrefetchSegment(e) => e.should_trace_error(),
Self::RecordSegmentReader(e) => e.should_trace_error(),
Self::RecordSegmentWriter(e) => e.should_trace_error(),
Expand Down Expand Up @@ -276,6 +285,7 @@ pub enum CompactionResponse {
impl CompactOrchestrator {
#[allow(clippy::too_many_arguments)]
pub fn new(
// TODO(jobs): Split this into source and dest collection IDs.
collection_id: CollectionUuid,
rebuild: bool,
fetch_log_batch_size: u32,
Expand Down Expand Up @@ -316,6 +326,7 @@ impl CompactOrchestrator {
num_materialized_logs: 0,
segment_spans: HashMap::new(),
metrics: CompactOrchestratorMetrics::default(),
function: Some(()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[BestPractice]

Hardcoded Some(()) value provides no semantic meaning. If this is meant to enable transform functionality by default, use a more explicit boolean:

Suggested change
function: Some(()),
transform_enabled: true, // Default to transform mode

Committable suggestion

Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Context for Agents
[**BestPractice**]

Hardcoded `Some(())` value provides no semantic meaning. If this is meant to enable transform functionality by default, use a more explicit boolean:

```suggestion
transform_enabled: true, // Default to transform mode
```

⚡ **Committable suggestion**

Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

File: rust/worker/src/execution/orchestration/compact.rs
Line: 329

}
}

Expand All @@ -325,6 +336,36 @@ impl CompactOrchestrator {
}
}

async fn transform_or_partition(
&mut self,
records: Chunk<LogRecord>,
ctx: &ComponentContext<CompactOrchestrator>,
) {
if let Some(_fn) = self.function {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[BestPractice]

Variable name _fn is unclear and doesn't follow Rust naming conventions. Consider using a more descriptive name that indicates what this represents:

Suggested change
if let Some(_fn) = self.function {
if self.transform_enabled {

Committable suggestion

Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Context for Agents
[**BestPractice**]

Variable name `_fn` is unclear and doesn't follow Rust naming conventions. Consider using a more descriptive name that indicates what this represents:

```suggestion
if self.transform_enabled {
```

⚡ **Committable suggestion**

Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

File: rust/worker/src/execution/orchestration/compact.rs
Line: 344

self.transform(records, ctx).await;
} else {
self.partition(records, ctx).await;
}
}

async fn transform(
&mut self,
records: Chunk<LogRecord>,
ctx: &ComponentContext<CompactOrchestrator>,
) {
self.state = ExecutionState::Transform;
let operator = TransformOperator::new();
tracing::info!("Transforming {} records", records.len());
let input = TransformInput::new(records);
let task = wrap(
operator,
input,
ctx.receiver(),
self.context.task_cancellation_token.clone(),
);
self.send(task, ctx, Some(Span::current())).await;
}

async fn partition(
&mut self,
records: Chunk<LogRecord>,
Expand Down Expand Up @@ -974,7 +1015,7 @@ impl Handler<TaskResult<FetchLogOutput, FetchLogError>> for CompactOrchestrator
return;
}
}
self.partition(output, ctx).await;
self.transform_or_partition(output, ctx).await;
}
}

Expand Down Expand Up @@ -1012,11 +1053,28 @@ impl Handler<TaskResult<SourceRecordSegmentOutput, SourceRecordSegmentError>>
)
.await;
} else {
self.partition(output, ctx).await;
self.transform_or_partition(output, ctx).await;
}
}
}

#[async_trait]
impl Handler<TaskResult<TransformOutput, TransformError>> for CompactOrchestrator {
type Result = ();

async fn handle(
&mut self,
message: TaskResult<TransformOutput, TransformError>,
ctx: &ComponentContext<CompactOrchestrator>,
) {
let output = match self.ok_or_terminate(message.into_inner(), ctx).await {
Some(recs) => recs.records,
None => return,
};
self.partition(output, ctx).await;
}
}

#[async_trait]
impl Handler<TaskResult<PartitionOutput, PartitionError>> for CompactOrchestrator {
type Result = ();
Expand Down
Loading